# Added by this patch as: scripts/pipeline/build_ingredient_index.py
"""
Build ingredient_profiles table from USDA FDC (Food Data Central) data.

Usage:
    conda run -n job-seeker python scripts/pipeline/build_ingredient_index.py \
        --db /path/to/kiwi.db \
        --usda-fdc data/usda_fdc_cleaned.parquet \
        --usda-branded data/usda_branded.parquet
"""
from __future__ import annotations

import argparse
import json
import math
import re
import sqlite3
from contextlib import closing
from pathlib import Path
from typing import Any, Callable, Mapping

import pandas as pd


# ── Element derivation rules (threshold-based) ────────────────────────────

# A rule receives the profile mapping and says whether its element applies.
_ElementRule = Callable[[Mapping[str, Any]], bool]

_ELEMENT_RULES: list[tuple[str, _ElementRule]] = [
    ("Richness", lambda r: r.get("fat_pct", 0) > 5.0),
    ("Seasoning", lambda r: r.get("sodium_mg_per_100g", 0) > 200),
    ("Depth", lambda r: r.get("glutamate_mg", 0) > 1.0),
    ("Structure", lambda r: r.get("starch_pct", 0) > 10.0 or r.get("binding_score", 0) >= 2),
    ("Texture", lambda r: r.get("water_activity", 1.0) < 0.6),  # low water = likely crunchy/dry
]

_ACID_KEYWORDS = ["vinegar", "lemon", "lime", "citric", "tartaric", "kombucha", "kefir",
                  "yogurt", "buttermilk", "wine", "tomato"]
_AROMA_KEYWORDS = ["garlic", "onion", "herb", "spice", "basil", "oregano", "cumin",
                   "ginger", "cinnamon", "pepper", "chili", "paprika", "thyme", "rosemary",
                   "cilantro", "parsley", "dill", "fennel", "cardamom", "turmeric"]
_FERMENTED_KEYWORDS = ["miso", "soy sauce", "kimchi", "sauerkraut", "kefir", "yogurt",
                       "kombucha", "tempeh", "natto", "vinegar", "nutritional yeast"]

# Compiled once; normalize_name() runs for every row of the dataset.
_PARENTHETICAL_RE = re.compile(r"\(.*?\)")
_TRAILING_DESCRIPTOR_RE = re.compile(r",.*$")
_WHITESPACE_RE = re.compile(r"\s+")

# Source column name → unified schema name.
_FDC_COLUMN_MAP = {
    "food_item": "name",
    "Total lipid (fat)": "fat_pct",
    "Protein": "protein_pct",
    "Carbohydrate, by difference": "carb_pct",
    "Fiber, total dietary": "fiber_pct",
    "Sodium, Na": "sodium_mg_per_100g",
    "Water": "moisture_pct",
}

_INSERT_SQL = """
    INSERT OR IGNORE INTO ingredient_profiles
        (name, elements, fat_pct, fat_saturated_pct, moisture_pct,
         protein_pct, starch_pct, binding_score, sodium_mg_per_100g,
         is_fermented, source)
    VALUES (?,?,?,?,?,?,?,?,?,?,?)
"""


def normalize_name(raw: str) -> str:
    """Lowercase, strip parentheticals and trailing descriptors.

    Everything from the first comma onward is dropped and runs of
    whitespace collapse to a single space, e.g.
    ``"Ground Beef (85% lean)"`` becomes ``"ground beef"``.
    """
    name = raw.lower().strip()
    name = _PARENTHETICAL_RE.sub("", name)        # remove (85% lean)
    name = _TRAILING_DESCRIPTOR_RE.sub("", name)  # remove ,shredded
    return _WHITESPACE_RE.sub(" ", name).strip()


def derive_elements(row: dict) -> list[str]:
    """Return the flavor elements that apply to *row*, in rule order.

    Threshold rules from ``_ELEMENT_RULES`` are evaluated first; then
    "Brightness" and "Aroma" are added when the row's ``name`` contains
    an acid or aroma keyword. A missing or ``None`` name is treated as
    empty instead of raising.

    NOTE: keyword matching is plain substring containment, so a keyword
    also matches inside a longer word.
    """
    elements = [element for element, applies in _ELEMENT_RULES if applies(row)]
    name = str(row.get("name") or "").lower()
    if any(keyword in name for keyword in _ACID_KEYWORDS):
        elements.append("Brightness")
    if any(keyword in name for keyword in _AROMA_KEYWORDS):
        elements.append("Aroma")
    return list(dict.fromkeys(elements))  # dedup, preserve order


def derive_binding_score(row: dict) -> int:
    """Score binding ability from 0 (none) to 3 (strong).

    Reads ``protein_pct`` and ``starch_pct`` from *row*; a missing or
    ``None`` value counts as 0.
    """
    protein = row.get("protein_pct") or 0
    starch = row.get("starch_pct") or 0
    if starch > 50 or (protein > 10 and starch > 20):
        return 3
    if starch > 20 or protein > 12:
        return 2
    if starch > 5 or protein > 6:
        return 1
    return 0


def _is_missing(value: Any) -> bool:
    """Return True for the scalar placeholders used for absent cells."""
    if value is None or value is pd.NA:
        return True
    return isinstance(value, float) and math.isnan(value)


def _to_float(value: Any) -> float:
    """Coerce *value* to a finite float; absent or unparseable values give 0.0.

    ``float(value or 0)`` is not enough here: NaN is truthy, so it would
    pass straight through into the numeric columns.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return 0.0
    return number if math.isfinite(number) else 0.0


def _profile_from_row(row: Mapping[str, Any]) -> dict[str, Any] | None:
    """Build one ingredient profile from a renamed row, or None to skip it."""
    raw_name = row.get("name")
    if _is_missing(raw_name):
        # str() on a missing cell would otherwise create an ingredient
        # literally called "nan" or "none".
        return None
    name = normalize_name(str(raw_name))
    if len(name) < 2:
        return None

    profile: dict[str, Any] = {
        "name": name,
        "fat_pct": _to_float(row.get("fat_pct")),
        "protein_pct": _to_float(row.get("protein_pct")),
        "moisture_pct": _to_float(row.get("moisture_pct")),
        "sodium_mg_per_100g": _to_float(row.get("sodium_mg_per_100g")),
        # NOTE(review): starch is not derived from the source columns yet,
        # so binding_score currently depends on protein alone.
        "starch_pct": 0.0,
    }
    # binding_score must be set before derive_elements(): the "Structure"
    # rule reads it.
    profile["binding_score"] = derive_binding_score(profile)
    profile["elements"] = derive_elements(profile)
    profile["is_fermented"] = int(any(k in name for k in _FERMENTED_KEYWORDS))
    return profile


def build(db_path: Path, usda_fdc_path: Path, usda_branded_path: Path) -> None:
    """Populate ``ingredient_profiles`` in the SQLite DB from the FDC parquet.

    Args:
        db_path: SQLite database that already contains the
            ``ingredient_profiles`` table.
        usda_fdc_path: Parquet file with the USDA FDC rows.
        usda_branded_path: Accepted for CLI compatibility. Nothing is
            derived from the branded data yet, so the file is no longer
            loaded.

    Raises:
        sqlite3.Error: For database failures other than a per-row
            constraint violation (for example a missing table). The
            transaction is rolled back and the connection closed.

    Rows whose name is missing or shorter than two characters after
    normalization are skipped. Rows rejected by a table constraint are
    counted and reported instead of being dropped silently.
    """
    df_fdc = pd.read_parquet(usda_fdc_path)
    # rename() ignores mapping keys that are not present in the frame.
    df = df_fdc.rename(columns=_FDC_COLUMN_MAP)

    inserted = 0
    rejected = 0
    with closing(sqlite3.connect(db_path)) as conn:
        conn.execute("PRAGMA foreign_keys=ON")
        with conn:  # single transaction: commit on success, roll back on error
            for _, row in df.iterrows():
                profile = _profile_from_row(row)
                if profile is None:
                    continue
                params = (
                    profile["name"], json.dumps(profile["elements"]),
                    profile["fat_pct"], 0.0, profile["moisture_pct"],
                    profile["protein_pct"], profile["starch_pct"],
                    profile["binding_score"], profile["sodium_mg_per_100g"],
                    profile["is_fermented"], "usda_fdc",
                )
                try:
                    cursor = conn.execute(_INSERT_SQL, params)
                except sqlite3.IntegrityError:
                    # Best effort per row: one bad row must not abort the load.
                    rejected += 1
                    continue
                # rowcount is 0 when INSERT OR IGNORE hit an existing name.
                inserted += cursor.rowcount

    print(f"Inserted {inserted} ingredient profiles from USDA FDC")
    if rejected:
        print(f"Skipped {rejected} rows rejected by table constraints")


def main() -> None:
    """Parse command-line arguments and run the build."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--db", required=True, type=Path)
    parser.add_argument("--usda-fdc", required=True, type=Path)
    parser.add_argument("--usda-branded", required=True, type=Path)
    args = parser.parse_args()
    build(args.db, args.usda_fdc, args.usda_branded)


if __name__ == "__main__":
    main()
# Added by this patch as: scripts/pipeline/download_datasets.py
"""
Download recipe engine datasets from HuggingFace.

Usage:
    conda run -n job-seeker python scripts/pipeline/download_datasets.py --data-dir /path/to/data

Downloads:
  - AkashPS11/recipes_data_food.com (MIT) → data/recipes_foodcom.parquet
  - omid5/usda-fdc-foods-cleaned (CC0) → data/usda_fdc_cleaned.parquet
  - jacktol/usda-branded-food-data (MIT) → data/usda_branded.parquet
  - lishuyang/recipepairs (GPL-3.0 ⚠) → data/recipepairs.parquet [derive only, don't ship]
"""
from __future__ import annotations

import argparse
from pathlib import Path

from datasets import load_dataset


# (HuggingFace dataset id, split, output filename inside the data dir)
DATASETS = [
    ("AkashPS11/recipes_data_food.com", "train", "recipes_foodcom.parquet"),
    ("omid5/usda-fdc-foods-cleaned", "train", "usda_fdc_cleaned.parquet"),
    ("jacktol/usda-branded-food-data", "train", "usda_branded.parquet"),
    ("lishuyang/recipepairs", "train", "recipepairs.parquet"),
]


def download_all(data_dir: Path) -> None:
    """Download every dataset in ``DATASETS`` into *data_dir* as parquet.

    The directory is created if needed. A dataset whose output file
    already exists is skipped. Each file is written under a temporary
    ``.part`` name and renamed into place only after the export
    finishes; otherwise an interrupted run would leave a truncated
    parquet file that the existence check skips on every later run.

    Args:
        data_dir: Directory that receives the parquet files.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    for hf_path, split, filename in DATASETS:
        out = data_dir / filename
        if out.exists():
            print(f" skip {filename} (already exists)")
            continue
        print(f" downloading {hf_path} ...")
        ds = load_dataset(hf_path, split=split)
        partial = out.with_name(out.name + ".part")
        try:
            ds.to_parquet(str(partial))
            partial.replace(out)
        finally:
            # After a successful rename the temp file is gone; this only
            # cleans up a failed or interrupted export.
            if partial.exists():
                partial.unlink()
        print(f" saved → {out}")


def main() -> None:
    """Parse command-line arguments and download all datasets."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data-dir", required=True, type=Path)
    args = parser.parse_args()
    download_all(args.data_dir)


if __name__ == "__main__":
    main()
# Added by this patch as: tests/pipeline/test_build_ingredient_index.py
"""Unit tests for the pure helpers in scripts/pipeline/build_ingredient_index.py."""
import pytest
from pathlib import Path
import sys

# Put the repository root on the import path so ``scripts`` resolves when
# the tests run from a checkout.
sys.path.insert(0, str(Path(__file__).parents[2]))


@pytest.mark.parametrize(
    ("raw", "expected"),
    [
        ("Ground Beef (85% lean)", "ground beef"),       # parenthetical removed
        (" Olive Oil ", "olive oil"),                    # outer whitespace stripped
        ("Cheddar Cheese, shredded", "cheddar cheese"),  # descriptor after comma removed
        ("Whole   Milk", "whole milk"),                  # inner whitespace collapsed
    ],
)
def test_normalize_ingredient_name(raw, expected):
    from scripts.pipeline.build_ingredient_index import normalize_name
    assert normalize_name(raw) == expected


def test_derive_elements_from_usda_row():
    from scripts.pipeline.build_ingredient_index import derive_elements
    row = {"fat_pct": 20.0, "protein_pct": 17.0, "moisture_pct": 60.0,
           "sodium_mg_per_100g": 65.0, "glutamate_mg": 2.8, "starch_pct": 0.0}
    elements = derive_elements(row)
    assert "Richness" in elements  # high fat
    assert "Depth" in elements  # notable glutamate
    # Nothing else crosses a threshold, and rule order is preserved.
    assert elements == ["Richness", "Depth"]


@pytest.mark.parametrize(
    ("name", "expected"),
    [
        ("lemon juice", ["Brightness"]),  # acid keyword
        ("garlic powder", ["Aroma"]),     # aroma keyword
    ],
)
def test_derive_elements_from_name_keywords(name, expected):
    from scripts.pipeline.build_ingredient_index import derive_elements
    assert derive_elements({"name": name}) == expected


@pytest.mark.parametrize(
    ("row", "expected"),
    [
        ({"protein_pct": 12.0, "starch_pct": 68.0}, 3),  # flour
        ({"protein_pct": 13.0, "starch_pct": 0.0}, 2),   # protein above 12
        ({"protein_pct": 7.0, "starch_pct": 0.0}, 1),    # protein above 6
        ({"protein_pct": 1.0, "starch_pct": 0.5}, 0),    # water
    ],
)
def test_derive_binding_score(row, expected):
    from scripts.pipeline.build_ingredient_index import derive_binding_score
    assert derive_binding_score(row) == expected