feat: data pipeline -- USDA FDC ingredient index builder
This commit is contained in:
parent
e56881d943
commit
97203313c1
7 changed files with 201 additions and 0 deletions
0
scripts/__init__.py
Normal file
0
scripts/__init__.py
Normal file
0
scripts/pipeline/__init__.py
Normal file
0
scripts/pipeline/__init__.py
Normal file
134
scripts/pipeline/build_ingredient_index.py
Normal file
134
scripts/pipeline/build_ingredient_index.py
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
"""
|
||||
Build ingredient_profiles table from USDA FDC (Food Data Central) data.
|
||||
|
||||
Usage:
|
||||
conda run -n job-seeker python scripts/pipeline/build_ingredient_index.py \
|
||||
--db /path/to/kiwi.db \
|
||||
--usda-fdc data/usda_fdc_cleaned.parquet \
|
||||
--usda-branded data/usda_branded.parquet
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# ── Element derivation rules (threshold-based) ────────────────────────────

# Each rule pairs an element label with a predicate over a nutrient-row dict
# (per-100g values). Rules are evaluated in order by derive_elements().
# NOTE(review): "glutamate_mg" and "water_activity" are never populated by
# build() below — those rules only fire if a caller supplies the keys; confirm
# whether a data source for them is planned.
_ELEMENT_RULES: list[tuple[str, callable]] = [
    ("Richness", lambda r: r.get("fat_pct", 0) > 5.0),
    ("Seasoning", lambda r: r.get("sodium_mg_per_100g", 0) > 200),
    ("Depth", lambda r: r.get("glutamate_mg", 0) > 1.0),
    ("Structure", lambda r: r.get("starch_pct", 0) > 10.0 or r.get("binding_score", 0) >= 2),
    ("Texture", lambda r: r.get("water_activity", 1.0) < 0.6),  # low water = likely crunchy/dry
]

# Substring keywords matched against the lowercased ingredient name.
# Acidic / sour ingredients → "Brightness" element.
_ACID_KEYWORDS = ["vinegar", "lemon", "lime", "citric", "tartaric", "kombucha", "kefir",
                  "yogurt", "buttermilk", "wine", "tomato"]
# Aromatics, herbs and spices → "Aroma" element.
_AROMA_KEYWORDS = ["garlic", "onion", "herb", "spice", "basil", "oregano", "cumin",
                   "ginger", "cinnamon", "pepper", "chili", "paprika", "thyme", "rosemary",
                   "cilantro", "parsley", "dill", "fennel", "cardamom", "turmeric"]
# Fermented foods → is_fermented flag in build() (not an element).
_FERMENTED_KEYWORDS = ["miso", "soy sauce", "kimchi", "sauerkraut", "kefir", "yogurt",
                       "kombucha", "tempeh", "natto", "vinegar", "nutritional yeast"]
|
||||
|
||||
|
||||
def normalize_name(raw: str) -> str:
    """Canonicalize an ingredient name.

    Lowercases, removes parenthetical qualifiers (e.g. "(85% lean)"),
    drops everything after the first comma (e.g. ", shredded"), and
    collapses runs of whitespace.
    """
    cleaned = raw.lower().strip()
    for pattern in (r"\(.*?\)", r",.*$"):
        cleaned = re.sub(pattern, "", cleaned)
    # " ".join(split()) collapses internal whitespace and trims both ends.
    return " ".join(cleaned.split())
|
||||
|
||||
|
||||
def derive_elements(row: dict) -> list[str]:
    """Return the flavor elements a nutrient row qualifies for.

    Threshold rules from _ELEMENT_RULES fire first (in order), then
    name-keyword matches may add "Brightness" (acids) and "Aroma"
    (aromatics). The result is de-duplicated, preserving first-seen order.
    """
    found: list[str] = []
    for label, predicate in _ELEMENT_RULES:
        if predicate(row):
            found.append(label)

    lowered = row.get("name", "").lower()
    for keywords, label in ((_ACID_KEYWORDS, "Brightness"), (_AROMA_KEYWORDS, "Aroma")):
        if any(word in lowered for word in keywords):
            found.append(label)

    # Deduplicate while keeping the original ordering.
    unique: list[str] = []
    for elem in found:
        if elem not in unique:
            unique.append(elem)
    return unique
|
||||
|
||||
|
||||
def derive_binding_score(row: dict) -> int:
    """Score 0-3 for how strongly an ingredient binds a mixture.

    Driven by starch and protein content per 100g: flours and starches
    score highest, watery ingredients score 0. Missing keys count as 0.
    """
    protein_pct = row.get("protein_pct", 0)
    starch_pct = row.get("starch_pct", 0)

    # Evaluate tiers from weakest to strongest; later tiers overwrite.
    score = 0
    if starch_pct > 5 or protein_pct > 6:
        score = 1
    if starch_pct > 20 or protein_pct > 12:
        score = 2
    if starch_pct > 50 or (protein_pct > 10 and starch_pct > 20):
        score = 3
    return score
|
||||
|
||||
|
||||
def build(db_path: Path, usda_fdc_path: Path, usda_branded_path: Path) -> None:
    """Populate the ingredient_profiles table from USDA FDC parquet data.

    Inserts use INSERT OR IGNORE, so re-running against the same database
    is idempotent; rows the database rejects are skipped, not fatal.

    Args:
        db_path: SQLite database that already contains an
            ``ingredient_profiles`` table.
        usda_fdc_path: Parquet file with the cleaned USDA FDC foods dataset.
        usda_branded_path: Parquet file with the USDA branded foods dataset.
            NOTE(review): this file is read but its rows are never ingested —
            confirm whether branded foods should also be indexed.
    """

    def _num(value) -> float:
        # Coerce a cell to float, mapping None/NaN/garbage to 0.0.
        # pandas yields NaN for missing cells; the previous ``or 0`` idiom
        # let NaN through because NaN is truthy.
        try:
            out = float(value)
        except (TypeError, ValueError):
            return 0.0
        return out if out == out else 0.0  # NaN != NaN

    # Raw USDA FDC column names → unified ingredient schema.
    fdc_col_map = {
        "food_item": "name",
        "Total lipid (fat)": "fat_pct",
        "Protein": "protein_pct",
        "Carbohydrate, by difference": "carb_pct",
        "Fiber, total dietary": "fiber_pct",
        "Sodium, Na": "sodium_mg_per_100g",
        "Water": "moisture_pct",
    }

    df_fdc = pd.read_parquet(usda_fdc_path)
    df_branded = pd.read_parquet(usda_branded_path)  # unused — see docstring

    df = df_fdc.rename(columns={k: v for k, v in fdc_col_map.items() if k in df_fdc.columns})

    inserted = 0
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA foreign_keys=ON")

        for _, row in df.iterrows():
            name = normalize_name(str(row.get("name", "")))
            if len(name) < 2:  # also rejects the empty string
                continue
            r = {
                "name": name,
                "fat_pct": _num(row.get("fat_pct")),
                "protein_pct": _num(row.get("protein_pct")),
                "moisture_pct": _num(row.get("moisture_pct")),
                "sodium_mg_per_100g": _num(row.get("sodium_mg_per_100g")),
                # USDA FDC exposes no starch column; binding/Structure are
                # therefore driven by protein alone for these rows.
                "starch_pct": 0.0,
            }
            # binding_score must be derived first: derive_elements' Structure
            # rule reads it from the row dict.
            r["binding_score"] = derive_binding_score(r)
            r["elements"] = derive_elements(r)
            r["is_fermented"] = int(any(k in name for k in _FERMENTED_KEYWORDS))

            try:
                conn.execute("""
                    INSERT OR IGNORE INTO ingredient_profiles
                    (name, elements, fat_pct, fat_saturated_pct, moisture_pct,
                     protein_pct, starch_pct, binding_score, sodium_mg_per_100g,
                     is_fermented, source)
                    VALUES (?,?,?,?,?,?,?,?,?,?,?)
                """, (
                    r["name"], json.dumps(r["elements"]),
                    r["fat_pct"], 0.0, r["moisture_pct"],
                    r["protein_pct"], r["starch_pct"], r["binding_score"],
                    r["sodium_mg_per_100g"], r["is_fermented"], "usda_fdc",
                ))
                # changes() is 1 for a real insert, 0 when OR IGNORE skipped it.
                inserted += conn.execute("SELECT changes()").fetchone()[0]
            except sqlite3.Error:
                # Best-effort ingest: skip rows the database rejects instead of
                # aborting (narrowed from a blanket ``except Exception``).
                continue

        conn.commit()
    finally:
        # Always release the connection (previously leaked on mid-loop failure).
        conn.close()
    print(f"Inserted {inserted} ingredient profiles from USDA FDC")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: wire the three required paths straight into build().
    cli = argparse.ArgumentParser()
    for flag in ("--db", "--usda-fdc", "--usda-branded"):
        cli.add_argument(flag, required=True, type=Path)
    ns = cli.parse_args()
    build(ns.db, ns.usda_fdc, ns.usda_branded)
|
||||
44
scripts/pipeline/download_datasets.py
Normal file
44
scripts/pipeline/download_datasets.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
"""
|
||||
Download recipe engine datasets from HuggingFace.
|
||||
|
||||
Usage:
|
||||
conda run -n job-seeker python scripts/pipeline/download_datasets.py --data-dir /path/to/data
|
||||
|
||||
Downloads:
|
||||
- AkashPS11/recipes_data_food.com (MIT) → data/recipes_foodcom.parquet
|
||||
- omid5/usda-fdc-foods-cleaned (CC0) → data/usda_fdc_cleaned.parquet
|
||||
- jacktol/usda-branded-food-data (MIT) → data/usda_branded.parquet
|
||||
- lishuyang/recipepairs (GPL-3.0 ⚠) → data/recipepairs.parquet [derive only, don't ship]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datasets import load_dataset
|
||||
|
||||
|
||||
# Datasets to mirror locally: (HuggingFace repo id, split, output filename).
# Licenses per the module docstring: first three MIT/CC0; recipepairs is
# GPL-3.0 — derive from it only, don't ship it.
DATASETS = [
    ("AkashPS11/recipes_data_food.com", "train", "recipes_foodcom.parquet"),
    ("omid5/usda-fdc-foods-cleaned", "train", "usda_fdc_cleaned.parquet"),
    ("jacktol/usda-branded-food-data", "train", "usda_branded.parquet"),
    ("lishuyang/recipepairs", "train", "recipepairs.parquet"),
]
|
||||
|
||||
|
||||
def download_all(data_dir: Path) -> None:
    """Download every dataset in DATASETS into *data_dir* as parquet files.

    Files that already exist are skipped, so the function is safe to re-run
    after a partial download.

    Args:
        data_dir: Target directory for the parquet files; created if missing.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    for hf_path, split, filename in DATASETS:
        out = data_dir / filename
        if out.exists():
            # Fix: previously printed the literal "(unknown)" instead of the
            # name of the file being skipped.
            print(f" skip {filename} (already exists)")
            continue
        print(f" downloading {hf_path} ...")
        ds = load_dataset(hf_path, split=split)
        ds.to_parquet(str(out))
        print(f" saved → {out}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI: a single required --data-dir argument, passed through verbatim.
    cli = argparse.ArgumentParser()
    cli.add_argument("--data-dir", required=True, type=Path)
    download_all(cli.parse_args().data_dir)
|
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
23
tests/pipeline/test_build_ingredient_index.py
Normal file
23
tests/pipeline/test_build_ingredient_index.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
import pytest
from pathlib import Path
import sys

# Make the repo root importable so `scripts.pipeline.*` resolves regardless
# of where pytest is invoked from (this file sits two levels below the root).
sys.path.insert(0, str(Path(__file__).parents[2]))
|
||||
|
||||
def test_normalize_ingredient_name():
    """normalize_name lowercases and strips parentheticals/descriptors."""
    from scripts.pipeline.build_ingredient_index import normalize_name

    cases = {
        "Ground Beef (85% lean)": "ground beef",
        " Olive Oil ": "olive oil",
        "Cheddar Cheese, shredded": "cheddar cheese",
    }
    for raw, expected in cases.items():
        assert normalize_name(raw) == expected
|
||||
|
||||
def test_derive_elements_from_usda_row():
    """A fatty, glutamate-rich row yields Richness and Depth."""
    from scripts.pipeline.build_ingredient_index import derive_elements

    row = {
        "fat_pct": 20.0,
        "protein_pct": 17.0,
        "moisture_pct": 60.0,
        "sodium_mg_per_100g": 65.0,
        "glutamate_mg": 2.8,
        "starch_pct": 0.0,
    }
    derived = derive_elements(row)
    assert "Richness" in derived  # fat_pct above the 5% threshold
    assert "Depth" in derived  # glutamate_mg above 1.0
|
||||
|
||||
def test_derive_binding_score():
    """Flour-like rows bind strongly; watery rows don't bind at all."""
    from scripts.pipeline.build_ingredient_index import derive_binding_score

    flour_like = {"protein_pct": 12.0, "starch_pct": 68.0}
    water_like = {"protein_pct": 1.0, "starch_pct": 0.5}
    assert derive_binding_score(flour_like) == 3
    assert derive_binding_score(water_like) == 0
|
||||
Loading…
Reference in a new issue