- Assembly template system (13 templates: burrito, fried rice, omelette, stir fry, pasta, sandwich, grain bowl, soup/stew, casserole, pancakes, porridge, pie, pudding) with role-based matching, whole-word single-keyword guard, deterministic titles via MD5 pantry hash - Prep-state stripping: strips 'melted butter' → 'butter' for coverage checks; reconstructs actionable states as 'Before you start:' cooking instructions (NutritionPanel prep_notes field + RecipesView.vue display block) - FTS5 fixes: always double-quote all terms; strip apostrophes to prevent syntax errors on brands like "Stouffer's"; 'plant-based' → bare 'based' crash - Bidirectional synonym expansion: alt-meat, alt-chicken, alt-beef, alt-pork mapped to canonical texture class; pantry expansion covers 'hamburger' from 'burger patties' etc. - Texture profile backfill script (378K ingredient_profiles rows) with macro-derived classification in priority order (fatty → creamy → starchy → firm → fibrous → tender → liquid → neutral); oats/legumes starchy-first fix - LLM prompt: ban flavoured/sweetened ingredients (vanilla yoghurt) from savoury - Migrations 014 (nutrition macros) + 015 (recipe FTS index) - Nutrition estimation pipeline script - gitignore MagicMock sqlite test artifacts
170 lines
6.8 KiB
Python
170 lines
6.8 KiB
Python
"""
|
|
Build ingredient_profiles table from USDA FDC (Food Data Central) data.
|
|
|
|
Usage:
|
|
conda run -n job-seeker python scripts/pipeline/build_ingredient_index.py \
|
|
--db /path/to/kiwi.db \
|
|
--usda-fdc data/usda_fdc_cleaned.parquet \
|
|
--usda-branded data/usda_branded.parquet
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
|
|
# ── Element derivation rules (threshold-based) ────────────────────────────
|
|
|
|
_ELEMENT_RULES: list[tuple[str, callable]] = [
|
|
("Richness", lambda r: r.get("fat_pct", 0) > 5.0),
|
|
("Seasoning", lambda r: r.get("sodium_mg_per_100g", 0) > 200),
|
|
("Depth", lambda r: r.get("glutamate_mg", 0) > 1.0),
|
|
("Structure", lambda r: r.get("starch_pct", 0) > 10.0 or r.get("binding_score", 0) >= 2),
|
|
("Texture", lambda r: r.get("water_activity", 1.0) < 0.6), # low water = likely crunchy/dry
|
|
]
|
|
|
|
_ACID_KEYWORDS = ["vinegar", "lemon", "lime", "citric", "tartaric", "kombucha", "kefir",
|
|
"yogurt", "buttermilk", "wine", "tomato"]
|
|
_AROMA_KEYWORDS = ["garlic", "onion", "herb", "spice", "basil", "oregano", "cumin",
|
|
"ginger", "cinnamon", "pepper", "chili", "paprika", "thyme", "rosemary",
|
|
"cilantro", "parsley", "dill", "fennel", "cardamom", "turmeric"]
|
|
_FERMENTED_KEYWORDS = ["miso", "soy sauce", "kimchi", "sauerkraut", "kefir", "yogurt",
|
|
"kombucha", "tempeh", "natto", "vinegar", "nutritional yeast"]
|
|
|
|
|
|
def normalize_name(raw: str) -> str:
|
|
"""Lowercase, strip parentheticals and trailing descriptors."""
|
|
name = raw.lower().strip()
|
|
name = re.sub(r"\(.*?\)", "", name) # remove (85% lean)
|
|
name = re.sub(r",.*$", "", name) # remove ,shredded
|
|
name = re.sub(r"\s+", " ", name).strip()
|
|
return name
|
|
|
|
|
|
def derive_elements(row: dict) -> list[str]:
|
|
elements = [elem for elem, check in _ELEMENT_RULES if check(row)]
|
|
name = row.get("name", "").lower()
|
|
if any(k in name for k in _ACID_KEYWORDS):
|
|
elements.append("Brightness")
|
|
if any(k in name for k in _AROMA_KEYWORDS):
|
|
elements.append("Aroma")
|
|
return list(dict.fromkeys(elements)) # dedup, preserve order
|
|
|
|
|
|
def derive_binding_score(row: dict) -> int:
|
|
protein = row.get("protein_pct", 0)
|
|
starch = row.get("starch_pct", 0)
|
|
if starch > 50 or (protein > 10 and starch > 20):
|
|
return 3
|
|
if starch > 20 or protein > 12:
|
|
return 2
|
|
if starch > 5 or protein > 6:
|
|
return 1
|
|
return 0
|
|
|
|
|
|
def build(db_path: Path, usda_fdc_path: Path, usda_branded_path: Path) -> None:
|
|
conn = sqlite3.connect(db_path)
|
|
conn.execute("PRAGMA foreign_keys=ON")
|
|
|
|
df_fdc = pd.read_parquet(usda_fdc_path)
|
|
df_branded = pd.read_parquet(usda_branded_path)
|
|
|
|
# Rename columns to unified schema
|
|
fdc_col_map = {
|
|
"food_item": "name",
|
|
"Total lipid (fat)": "fat_pct",
|
|
"Protein": "protein_pct",
|
|
"Carbohydrate, by difference": "carb_pct",
|
|
"Fiber, total dietary": "fiber_pct",
|
|
"Sodium, Na": "sodium_mg_per_100g",
|
|
"Water": "moisture_pct",
|
|
"Energy": "calories_per_100g",
|
|
}
|
|
df = df_fdc.rename(columns={k: v for k, v in fdc_col_map.items() if k in df_fdc.columns})
|
|
|
|
# Build a sugar lookup from the branded parquet (keyed by normalized name).
|
|
# usda_branded has SUGARS, TOTAL (G) for processed/packaged foods.
|
|
branded_col_map = {
|
|
"FOOD_NAME": "name",
|
|
"SUGARS, TOTAL (G)": "sugar_g_per_100g",
|
|
}
|
|
df_branded_slim = df_branded.rename(
|
|
columns={k: v for k, v in branded_col_map.items() if k in df_branded.columns}
|
|
)[list(set(branded_col_map.values()) & set(df_branded.rename(columns=branded_col_map).columns))]
|
|
sugar_lookup: dict[str, float] = {}
|
|
for _, brow in df_branded_slim.iterrows():
|
|
bname = normalize_name(str(brow.get("name", "")))
|
|
val = brow.get("sugar_g_per_100g")
|
|
try:
|
|
fval = float(val) # type: ignore[arg-type]
|
|
if fval > 0 and bname not in sugar_lookup:
|
|
sugar_lookup[bname] = fval
|
|
except (TypeError, ValueError):
|
|
pass
|
|
|
|
inserted = 0
|
|
for _, row in df.iterrows():
|
|
name = normalize_name(str(row.get("name", "")))
|
|
if not name or len(name) < 2:
|
|
continue
|
|
r = {
|
|
"name": name,
|
|
"fat_pct": float(row.get("fat_pct") or 0),
|
|
"protein_pct": float(row.get("protein_pct") or 0),
|
|
"moisture_pct": float(row.get("moisture_pct") or 0),
|
|
"sodium_mg_per_100g": float(row.get("sodium_mg_per_100g") or 0),
|
|
"starch_pct": 0.0,
|
|
"carbs_g_per_100g": float(row.get("carb_pct") or 0),
|
|
"fiber_g_per_100g": float(row.get("fiber_pct") or 0),
|
|
"calories_per_100g": float(row.get("calories_per_100g") or 0),
|
|
"sugar_g_per_100g": sugar_lookup.get(name, 0.0),
|
|
}
|
|
r["binding_score"] = derive_binding_score(r)
|
|
r["elements"] = derive_elements(r)
|
|
r["is_fermented"] = int(any(k in name for k in _FERMENTED_KEYWORDS))
|
|
|
|
try:
|
|
# Insert new profile or update macro columns on existing one.
|
|
conn.execute("""
|
|
INSERT INTO ingredient_profiles
|
|
(name, elements, fat_pct, fat_saturated_pct, moisture_pct,
|
|
protein_pct, starch_pct, binding_score, sodium_mg_per_100g,
|
|
is_fermented,
|
|
carbs_g_per_100g, fiber_g_per_100g, calories_per_100g, sugar_g_per_100g,
|
|
source)
|
|
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
|
ON CONFLICT(name) DO UPDATE SET
|
|
carbs_g_per_100g = excluded.carbs_g_per_100g,
|
|
fiber_g_per_100g = excluded.fiber_g_per_100g,
|
|
calories_per_100g = excluded.calories_per_100g,
|
|
sugar_g_per_100g = excluded.sugar_g_per_100g
|
|
""", (
|
|
r["name"], json.dumps(r["elements"]),
|
|
r["fat_pct"], 0.0, r["moisture_pct"],
|
|
r["protein_pct"], r["starch_pct"], r["binding_score"],
|
|
r["sodium_mg_per_100g"], r["is_fermented"],
|
|
r["carbs_g_per_100g"], r["fiber_g_per_100g"],
|
|
r["calories_per_100g"], r["sugar_g_per_100g"],
|
|
"usda_fdc",
|
|
))
|
|
inserted += 1
|
|
except Exception:
|
|
continue
|
|
|
|
conn.commit()
|
|
conn.close()
|
|
print(f"Inserted {inserted} ingredient profiles from USDA FDC")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument("--db", required=True, type=Path)
|
|
parser.add_argument("--usda-fdc", required=True, type=Path)
|
|
parser.add_argument("--usda-branded", required=True, type=Path)
|
|
args = parser.parse_args()
|
|
build(args.db, args.usda_fdc, args.usda_branded)
|