kiwi/scripts/backfill_texture_profiles.py
pyr0ball 1a493e0ad9 feat: recipe engine — assembly templates, prep notes, FTS fixes, texture backfill
- Assembly template system (13 templates: burrito, fried rice, omelette, stir fry,
  pasta, sandwich, grain bowl, soup/stew, casserole, pancakes, porridge, pie, pudding)
  with role-based matching, whole-word single-keyword guard, deterministic titles
  via MD5 pantry hash
- Prep-state stripping: strips 'melted butter' → 'butter' for coverage checks;
  reconstructs actionable states as 'Before you start:' cooking instructions
  (NutritionPanel prep_notes field + RecipesView.vue display block)
- FTS5 fixes: always double-quote all terms; strip apostrophes to prevent
  syntax errors on brands like "Stouffer's"; 'plant-based' → bare 'based' crash
- Bidirectional synonym expansion: alt-meat, alt-chicken, alt-beef, alt-pork
  mapped to canonical texture class; pantry expansion covers 'hamburger' from
  'burger patties' etc.
- Texture profile backfill script (378K ingredient_profiles rows) with macro-derived
  classification in priority order (fatty → creamy → starchy → firm → fibrous →
  tender → liquid → neutral); oats/legumes starchy-first fix
- LLM prompt: ban flavoured/sweetened ingredients (vanilla yoghurt) from savoury
- Migrations 014 (nutrition macros) + 015 (recipe FTS index)
- Nutrition estimation pipeline script
- gitignore MagicMock sqlite test artifacts
2026-04-02 22:12:35 -07:00

134 lines
4.3 KiB
Python

#!/usr/bin/env python3
"""
Backfill texture_profile in ingredient_profiles from existing macro data.
Texture categories and their macro signatures (all values g/100g):
fatty - fat > 60 (oils, lard, pure butter)
creamy - fat 15-60 (cream, cheese, fatty meats, nut butter)
firm - protein > 15, fat < 15 (lean meats, fish, legumes, firm tofu)
starchy - carbs > 40, fat < 10 (flour, oats, rice, bread, potatoes)
fibrous - fiber > 4, carbs < 40 (brassicas, leafy greens, whole grains)
tender - protein 2-15, fat < 10, (soft veg, eggs, soft tofu, cooked beans)
carbs < 40
liquid - calories < 25, fat < 1, (broth, juice, dilute sauces)
protein < 3
neutral - fallthrough default
Rules are applied in priority order: fatty → creamy → firm → starchy →
fibrous → tender → liquid → neutral.
Run:
python scripts/backfill_texture_profiles.py [path/to/kiwi.db]
Or inside the container:
docker exec kiwi-cloud-api-1 python /app/kiwi/scripts/backfill_texture_profiles.py
"""
from __future__ import annotations
import sqlite3
import sys
from pathlib import Path
# Default DB paths to try
_DEFAULT_PATHS = [
"/devl/kiwi-cloud-data/local-dev/kiwi.db",
"/devl/kiwi-data/kiwi.db",
]
BATCH_SIZE = 5_000
def _classify(fat: float, protein: float, carbs: float,
fiber: float, calories: float) -> str:
# Cap runaway values — data quality issue in some branded entries
fat = min(fat or 0.0, 100.0)
protein = min(protein or 0.0, 100.0)
carbs = min(carbs or 0.0, 100.0)
fiber = min(fiber or 0.0, 50.0)
calories = min(calories or 0.0, 900.0)
if fat > 60:
return "fatty"
if fat > 15:
return "creamy"
# Starchy before firm: oats/legumes have high protein AND high carbs — carbs win
if carbs > 40 and fat < 10:
return "starchy"
# Firm: lean proteins with low carbs (meats, fish, hard tofu)
# Lower protein threshold (>7) catches tofu (9%) and similar plant proteins
if protein > 7 and fat < 12 and carbs < 20:
return "firm"
if fiber > 4 and carbs < 40:
return "fibrous"
if 2 < protein <= 15 and fat < 10 and carbs < 40:
return "tender"
if calories < 25 and fat < 1 and protein < 3:
return "liquid"
return "neutral"
def backfill(db_path: str) -> None:
conn = sqlite3.connect(db_path)
conn.row_factory = sqlite3.Row
total = conn.execute("SELECT COUNT(*) FROM ingredient_profiles").fetchone()[0]
print(f"Total rows: {total:,}")
updated = 0
offset = 0
counts: dict[str, int] = {}
while True:
rows = conn.execute(
"""SELECT id, fat_pct, protein_pct, carbs_g_per_100g,
fiber_g_per_100g, calories_per_100g
FROM ingredient_profiles
LIMIT ? OFFSET ?""",
(BATCH_SIZE, offset),
).fetchall()
if not rows:
break
batch: list[tuple[str, int]] = []
for row in rows:
texture = _classify(
row["fat_pct"],
row["protein_pct"],
row["carbs_g_per_100g"],
row["fiber_g_per_100g"],
row["calories_per_100g"],
)
counts[texture] = counts.get(texture, 0) + 1
batch.append((texture, row["id"]))
conn.executemany(
"UPDATE ingredient_profiles SET texture_profile = ? WHERE id = ?",
batch,
)
conn.commit()
updated += len(batch)
offset += BATCH_SIZE
print(f" {updated:,} / {total:,} updated...", end="\r")
print(f"\nDone. {updated:,} rows updated.\n")
print("Texture distribution:")
for texture, count in sorted(counts.items(), key=lambda x: -x[1]):
pct = count / updated * 100
print(f" {texture:10s} {count:8,} ({pct:.1f}%)")
conn.close()
if __name__ == "__main__":
if len(sys.argv) > 1:
path = sys.argv[1]
else:
path = next((p for p in _DEFAULT_PATHS if Path(p).exists()), None)
if not path:
print(f"No DB found. Pass path as argument or create one of: {_DEFAULT_PATHS}")
sys.exit(1)
print(f"Backfilling texture profiles in: {path}")
backfill(path)