feat: data pipeline -- USDA FDC ingredient index builder
This commit is contained in:
parent
e56881d943
commit
97203313c1
7 changed files with 201 additions and 0 deletions
0
scripts/__init__.py
Normal file
0
scripts/__init__.py
Normal file
0
scripts/pipeline/__init__.py
Normal file
0
scripts/pipeline/__init__.py
Normal file
134
scripts/pipeline/build_ingredient_index.py
Normal file
134
scripts/pipeline/build_ingredient_index.py
Normal file
|
|
@ -0,0 +1,134 @@
|
|||
"""
|
||||
Build ingredient_profiles table from USDA FDC (Food Data Central) data.
|
||||
|
||||
Usage:
|
||||
conda run -n job-seeker python scripts/pipeline/build_ingredient_index.py \
|
||||
--db /path/to/kiwi.db \
|
||||
--usda-fdc data/usda_fdc_cleaned.parquet \
|
||||
--usda-branded data/usda_branded.parquet
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
|
||||
# ── Element derivation rules (threshold-based) ────────────────────────────

# Each rule pairs an element label with a predicate over a nutrient-row dict
# (per-100g values). Rules are evaluated in order by derive_elements().
# NOTE(review): "glutamate_mg" and "water_activity" are never populated by
# build() below — those rules only fire if a caller supplies the keys; confirm
# whether a data source for them is planned.
_ELEMENT_RULES: list[tuple[str, callable]] = [
    ("Richness", lambda r: r.get("fat_pct", 0) > 5.0),
    ("Seasoning", lambda r: r.get("sodium_mg_per_100g", 0) > 200),
    ("Depth", lambda r: r.get("glutamate_mg", 0) > 1.0),
    ("Structure", lambda r: r.get("starch_pct", 0) > 10.0 or r.get("binding_score", 0) >= 2),
    ("Texture", lambda r: r.get("water_activity", 1.0) < 0.6),  # low water = likely crunchy/dry
]

# Substring keywords matched against the lowercased ingredient name.
# Acidic / sour ingredients → "Brightness" element.
_ACID_KEYWORDS = ["vinegar", "lemon", "lime", "citric", "tartaric", "kombucha", "kefir",
                  "yogurt", "buttermilk", "wine", "tomato"]
# Aromatics, herbs and spices → "Aroma" element.
_AROMA_KEYWORDS = ["garlic", "onion", "herb", "spice", "basil", "oregano", "cumin",
                   "ginger", "cinnamon", "pepper", "chili", "paprika", "thyme", "rosemary",
                   "cilantro", "parsley", "dill", "fennel", "cardamom", "turmeric"]
# Fermented foods → is_fermented flag in build() (not an element).
_FERMENTED_KEYWORDS = ["miso", "soy sauce", "kimchi", "sauerkraut", "kefir", "yogurt",
                       "kombucha", "tempeh", "natto", "vinegar", "nutritional yeast"]
|
||||
|
||||
|
||||
def normalize_name(raw: str) -> str:
    """Canonicalize an ingredient name.

    Lowercases, removes parenthetical qualifiers (e.g. "(85% lean)"),
    drops everything after the first comma (e.g. ", shredded"), and
    collapses runs of whitespace.
    """
    cleaned = raw.lower().strip()
    for pattern in (r"\(.*?\)", r",.*$"):
        cleaned = re.sub(pattern, "", cleaned)
    # " ".join(split()) collapses internal whitespace and trims both ends.
    return " ".join(cleaned.split())
|
||||
|
||||
|
||||
def derive_elements(row: dict) -> list[str]:
    """Return the flavor elements a nutrient row qualifies for.

    Threshold rules from _ELEMENT_RULES fire first (in order), then
    name-keyword matches may add "Brightness" (acids) and "Aroma"
    (aromatics). The result is de-duplicated, preserving first-seen order.
    """
    found: list[str] = []
    for label, predicate in _ELEMENT_RULES:
        if predicate(row):
            found.append(label)

    lowered = row.get("name", "").lower()
    for keywords, label in ((_ACID_KEYWORDS, "Brightness"), (_AROMA_KEYWORDS, "Aroma")):
        if any(word in lowered for word in keywords):
            found.append(label)

    # Deduplicate while keeping the original ordering.
    unique: list[str] = []
    for elem in found:
        if elem not in unique:
            unique.append(elem)
    return unique
|
||||
|
||||
|
||||
def derive_binding_score(row: dict) -> int:
    """Score 0-3 for how strongly an ingredient binds a mixture.

    Driven by starch and protein content per 100g: flours and starches
    score highest, watery ingredients score 0. Missing keys count as 0.
    """
    protein_pct = row.get("protein_pct", 0)
    starch_pct = row.get("starch_pct", 0)

    # Evaluate tiers from weakest to strongest; later tiers overwrite.
    score = 0
    if starch_pct > 5 or protein_pct > 6:
        score = 1
    if starch_pct > 20 or protein_pct > 12:
        score = 2
    if starch_pct > 50 or (protein_pct > 10 and starch_pct > 20):
        score = 3
    return score
|
||||
|
||||
|
||||
def build(db_path: Path, usda_fdc_path: Path, usda_branded_path: Path) -> None:
    """Populate the ingredient_profiles table from USDA FDC parquet data.

    Inserts use INSERT OR IGNORE, so re-running against the same database
    is idempotent; rows the database rejects are skipped, not fatal.

    Args:
        db_path: SQLite database that already contains an
            ``ingredient_profiles`` table.
        usda_fdc_path: Parquet file with the cleaned USDA FDC foods dataset.
        usda_branded_path: Parquet file with the USDA branded foods dataset.
            NOTE(review): this file is read but its rows are never ingested —
            confirm whether branded foods should also be indexed.
    """

    def _num(value) -> float:
        # Coerce a cell to float, mapping None/NaN/garbage to 0.0.
        # pandas yields NaN for missing cells; the previous ``or 0`` idiom
        # let NaN through because NaN is truthy.
        try:
            out = float(value)
        except (TypeError, ValueError):
            return 0.0
        return out if out == out else 0.0  # NaN != NaN

    # Raw USDA FDC column names → unified ingredient schema.
    fdc_col_map = {
        "food_item": "name",
        "Total lipid (fat)": "fat_pct",
        "Protein": "protein_pct",
        "Carbohydrate, by difference": "carb_pct",
        "Fiber, total dietary": "fiber_pct",
        "Sodium, Na": "sodium_mg_per_100g",
        "Water": "moisture_pct",
    }

    df_fdc = pd.read_parquet(usda_fdc_path)
    df_branded = pd.read_parquet(usda_branded_path)  # unused — see docstring

    df = df_fdc.rename(columns={k: v for k, v in fdc_col_map.items() if k in df_fdc.columns})

    inserted = 0
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA foreign_keys=ON")

        for _, row in df.iterrows():
            name = normalize_name(str(row.get("name", "")))
            if len(name) < 2:  # also rejects the empty string
                continue
            r = {
                "name": name,
                "fat_pct": _num(row.get("fat_pct")),
                "protein_pct": _num(row.get("protein_pct")),
                "moisture_pct": _num(row.get("moisture_pct")),
                "sodium_mg_per_100g": _num(row.get("sodium_mg_per_100g")),
                # USDA FDC exposes no starch column; binding/Structure are
                # therefore driven by protein alone for these rows.
                "starch_pct": 0.0,
            }
            # binding_score must be derived first: derive_elements' Structure
            # rule reads it from the row dict.
            r["binding_score"] = derive_binding_score(r)
            r["elements"] = derive_elements(r)
            r["is_fermented"] = int(any(k in name for k in _FERMENTED_KEYWORDS))

            try:
                conn.execute("""
                    INSERT OR IGNORE INTO ingredient_profiles
                    (name, elements, fat_pct, fat_saturated_pct, moisture_pct,
                     protein_pct, starch_pct, binding_score, sodium_mg_per_100g,
                     is_fermented, source)
                    VALUES (?,?,?,?,?,?,?,?,?,?,?)
                """, (
                    r["name"], json.dumps(r["elements"]),
                    r["fat_pct"], 0.0, r["moisture_pct"],
                    r["protein_pct"], r["starch_pct"], r["binding_score"],
                    r["sodium_mg_per_100g"], r["is_fermented"], "usda_fdc",
                ))
                # changes() is 1 for a real insert, 0 when OR IGNORE skipped it.
                inserted += conn.execute("SELECT changes()").fetchone()[0]
            except sqlite3.Error:
                # Best-effort ingest: skip rows the database rejects instead of
                # aborting (narrowed from a blanket ``except Exception``).
                continue

        conn.commit()
    finally:
        # Always release the connection (previously leaked on mid-loop failure).
        conn.close()
    print(f"Inserted {inserted} ingredient profiles from USDA FDC")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: wire the three required paths straight into build().
    cli = argparse.ArgumentParser()
    for flag in ("--db", "--usda-fdc", "--usda-branded"):
        cli.add_argument(flag, required=True, type=Path)
    ns = cli.parse_args()
    build(ns.db, ns.usda_fdc, ns.usda_branded)
|
||||
44
scripts/pipeline/download_datasets.py
Normal file
44
scripts/pipeline/download_datasets.py
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
"""
|
||||
Download recipe engine datasets from HuggingFace.
|
||||
|
||||
Usage:
|
||||
conda run -n job-seeker python scripts/pipeline/download_datasets.py --data-dir /path/to/data
|
||||
|
||||
Downloads:
|
||||
- AkashPS11/recipes_data_food.com (MIT) → data/recipes_foodcom.parquet
|
||||
- omid5/usda-fdc-foods-cleaned (CC0) → data/usda_fdc_cleaned.parquet
|
||||
- jacktol/usda-branded-food-data (MIT) → data/usda_branded.parquet
|
||||
- lishuyang/recipepairs (GPL-3.0 ⚠) → data/recipepairs.parquet [derive only, don't ship]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from datasets import load_dataset
|
||||
|
||||
|
||||
# Datasets to mirror locally: (HuggingFace repo id, split, output filename).
# Licenses per the module docstring: first three MIT/CC0; recipepairs is
# GPL-3.0 — derive from it only, don't ship it.
DATASETS = [
    ("AkashPS11/recipes_data_food.com", "train", "recipes_foodcom.parquet"),
    ("omid5/usda-fdc-foods-cleaned", "train", "usda_fdc_cleaned.parquet"),
    ("jacktol/usda-branded-food-data", "train", "usda_branded.parquet"),
    ("lishuyang/recipepairs", "train", "recipepairs.parquet"),
]
|
||||
|
||||
|
||||
def download_all(data_dir: Path) -> None:
    """Download every dataset in DATASETS into *data_dir* as parquet files.

    Files that already exist are skipped, so the function is safe to re-run
    after a partial download.

    Args:
        data_dir: Target directory for the parquet files; created if missing.
    """
    data_dir.mkdir(parents=True, exist_ok=True)
    for hf_path, split, filename in DATASETS:
        out = data_dir / filename
        if out.exists():
            # Fix: previously printed the literal "(unknown)" instead of the
            # name of the file being skipped.
            print(f" skip {filename} (already exists)")
            continue
        print(f" downloading {hf_path} ...")
        ds = load_dataset(hf_path, split=split)
        ds.to_parquet(str(out))
        print(f" saved → {out}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI: a single required --data-dir argument, passed through verbatim.
    cli = argparse.ArgumentParser()
    cli.add_argument("--data-dir", required=True, type=Path)
    download_all(cli.parse_args().data_dir)
|
||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
0
tests/pipeline/__init__.py
Normal file
23
tests/pipeline/test_build_ingredient_index.py
Normal file
23
tests/pipeline/test_build_ingredient_index.py
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
import pytest
from pathlib import Path
import sys

# Make the repo root importable so `scripts.pipeline.*` resolves regardless
# of where pytest is invoked from (this file sits two levels below the root).
sys.path.insert(0, str(Path(__file__).parents[2]))
|
||||
|
||||
def test_normalize_ingredient_name():
    """normalize_name lowercases and strips parentheticals/descriptors."""
    from scripts.pipeline.build_ingredient_index import normalize_name

    cases = {
        "Ground Beef (85% lean)": "ground beef",
        " Olive Oil ": "olive oil",
        "Cheddar Cheese, shredded": "cheddar cheese",
    }
    for raw, expected in cases.items():
        assert normalize_name(raw) == expected
|
||||
|
||||
def test_derive_elements_from_usda_row():
    """A fatty, glutamate-rich row yields Richness and Depth."""
    from scripts.pipeline.build_ingredient_index import derive_elements

    row = {
        "fat_pct": 20.0,
        "protein_pct": 17.0,
        "moisture_pct": 60.0,
        "sodium_mg_per_100g": 65.0,
        "glutamate_mg": 2.8,
        "starch_pct": 0.0,
    }
    derived = derive_elements(row)
    assert "Richness" in derived  # fat_pct above the 5% threshold
    assert "Depth" in derived  # glutamate_mg above 1.0
|
||||
|
||||
def test_derive_binding_score():
    """Flour-like rows bind strongly; watery rows don't bind at all."""
    from scripts.pipeline.build_ingredient_index import derive_binding_score

    flour_like = {"protein_pct": 12.0, "starch_pct": 68.0}
    water_like = {"protein_pct": 1.0, "starch_pct": 0.5}
    assert derive_binding_score(flour_like) == 3
    assert derive_binding_score(water_like) == 0
|
||||
Loading…
Reference in a new issue