feat: data pipeline -- recipe corpus + substitution pair derivation
This commit is contained in:
parent
27ec14b40f
commit
bad6dd175c
4 changed files with 274 additions and 0 deletions
136
scripts/pipeline/build_recipe_index.py
Normal file
136
scripts/pipeline/build_recipe_index.py
Normal file
|
|
@ -0,0 +1,136 @@
|
||||||
|
"""
|
||||||
|
Import food.com recipe corpus into recipes table.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/pipeline/build_recipe_index.py \
|
||||||
|
--db /path/to/kiwi.db \
|
||||||
|
--recipes data/recipes_foodcom.parquet \
|
||||||
|
--batch-size 10000
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Leading quantity + unit prefix, e.g. "2 cups", "1/2 tsp"; the character class
# includes unicode vulgar fractions (1/4, 1/2, 3/4, 1/3, 2/3).
_MEASURE_PATTERN = re.compile(
    r"^\d[\d\s/\u00bc\u00bd\u00be\u2153\u2154]*\s*(cup|tbsp|tsp|oz|lb|g|kg|ml|l|clove|slice|piece|can|pkg|package|bunch|head|stalk|sprig|pinch|dash|to taste|as needed)s?\b",
    re.IGNORECASE,
)
# Bare leading number with no recognized unit (stripped after _MEASURE_PATTERN).
_LEAD_NUMBER = re.compile(r"^\d[\d\s/\u00bc\u00bd\u00be\u2153\u2154]*\s*")
# Trailing qualifiers that carry no ingredient identity.
_TRAILING_QUALIFIER = re.compile(
    r"\s*(to taste|as needed|or more|or less|optional|if desired|if needed)\s*$",
    re.IGNORECASE,
)
# Parenthetical notes like "(85/15)" or "(about 3)" -- not part of the name.
_PARENTHETICAL = re.compile(r"\(.*?\)")
# Everything after the first comma is preparation detail ("onion, diced").
_COMMA_TAIL = re.compile(r",.*$")


def _clean_ingredient(raw: str) -> str:
    """Reduce one raw ingredient string to a bare, lowercase ingredient name."""
    s = raw.lower().strip()
    s = _MEASURE_PATTERN.sub("", s)   # "2 cups flour" -> " flour"
    s = _LEAD_NUMBER.sub("", s)       # residual bare quantity, if any
    s = _PARENTHETICAL.sub("", s)
    s = _COMMA_TAIL.sub("", s)
    s = _TRAILING_QUALIFIER.sub("", s)
    return s.strip(" -.,")


def extract_ingredient_names(raw_list: list[str]) -> list[str]:
    """Strip quantities and units from ingredient strings -> normalized names.

    Results shorter than two characters are dropped (regex residue rather
    than real ingredients). Output order follows the input list.
    """
    cleaned = (_clean_ingredient(raw) for raw in raw_list)
    return [name for name in cleaned if len(name) > 1]
|
||||||
|
|
||||||
|
|
||||||
|
def compute_element_coverage(profiles: list[dict]) -> dict[str, float]:
    """Fraction of profiles that contain each flavor element, rounded to 3 places.

    Returns an empty dict when no profiles are given.
    """
    if not profiles:
        return {}
    tallies: dict[str, int] = {}
    for profile in profiles:
        for element in profile.get("elements", []):
            tallies[element] = tallies.get(element, 0) + 1
    total = len(profiles)
    return {element: round(n / total, 3) for element, n in tallies.items()}
|
||||||
|
|
||||||
|
|
||||||
|
_INSERT_RECIPE_SQL = """
    INSERT OR IGNORE INTO recipes
        (external_id, title, ingredients, ingredient_names, directions,
         category, keywords, calories, fat_g, protein_g, sodium_mg, element_coverage)
    VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
"""


def _as_str_list(value) -> list[str]:
    """Coerce a parquet cell (JSON-encoded string, sequence, or scalar) to list[str]."""
    if isinstance(value, str):
        try:
            value = json.loads(value)
        except Exception:  # not valid JSON -- treat the whole string as one entry
            value = [value]
    return [str(item) for item in (value or [])]


def _float_or_none(value) -> float | None:
    """Nutrition cell -> float, with 0/missing collapsed to NULL."""
    return float(value or 0) or None


def _element_coverage_for(conn: sqlite3.Connection, names: list[str]) -> dict[str, float]:
    """Look up known ingredient profiles by name and compute element coverage."""
    profiles = []
    for name in names:
        hit = conn.execute(
            "SELECT elements FROM ingredient_profiles WHERE name = ?", (name,)
        ).fetchone()
        if hit:
            profiles.append({"elements": json.loads(hit[0])})
    return compute_element_coverage(profiles)


def _flush(conn: sqlite3.Connection, batch: list[tuple]) -> int:
    """Insert accumulated recipe rows, commit, and return how many were flushed."""
    conn.executemany(_INSERT_RECIPE_SQL, batch)
    conn.commit()
    return len(batch)


def build(db_path: Path, recipes_path: Path, batch_size: int = 10000) -> None:
    """Import the food.com recipe corpus into the recipes table.

    Reads the parquet export row by row, normalizes ingredient strings,
    annotates each recipe with flavor-element coverage from the
    ingredient_profiles table, and bulk-inserts in batches of ``batch_size``.
    Duplicate external ids are ignored (INSERT OR IGNORE).
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA journal_mode=WAL")  # allow readers during the bulk load
        df = pd.read_parquet(recipes_path)
        inserted = 0
        batch: list[tuple] = []

        for _, row in df.iterrows():
            raw_ingredients = _as_str_list(row.get("RecipeIngredientParts", []))
            ingredient_names = extract_ingredient_names(raw_ingredients)
            coverage = _element_coverage_for(conn, ingredient_names)
            directions = _as_str_list(row.get("RecipeInstructions", []))

            batch.append((
                str(row.get("RecipeId", "")),
                str(row.get("Name", ""))[:500],  # cap titles to the column width
                json.dumps(raw_ingredients),
                json.dumps(ingredient_names),
                json.dumps(directions),
                str(row.get("RecipeCategory", "") or ""),
                json.dumps(list(row.get("Keywords", []) or [])),
                _float_or_none(row.get("Calories")),
                _float_or_none(row.get("FatContent")),
                _float_or_none(row.get("ProteinContent")),
                _float_or_none(row.get("SodiumContent")),
                json.dumps(coverage),
            ))

            if len(batch) >= batch_size:
                inserted += _flush(conn, batch)
                print(f" {inserted} recipes inserted...")
                batch = []

        # Final partial batch.
        if batch:
            inserted += _flush(conn, batch)
    finally:
        # Close even if the parquet read or an insert raises.
        conn.close()
    print(f"Total: {inserted} recipes inserted")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: see module docstring for usage.
    cli = argparse.ArgumentParser()
    cli.add_argument("--db", type=Path, required=True)
    cli.add_argument("--recipes", type=Path, required=True)
    cli.add_argument("--batch-size", default=10000, type=int)
    ns = cli.parse_args()
    build(ns.db, ns.recipes, ns.batch_size)
|
||||||
109
scripts/pipeline/derive_substitutions.py
Normal file
109
scripts/pipeline/derive_substitutions.py
Normal file
|
|
@ -0,0 +1,109 @@
|
||||||
|
"""
|
||||||
|
Derive substitution pairs by diffing lishuyang/recipepairs.
|
||||||
|
GPL-3.0 source -- derived annotations only, raw pairs not shipped.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/pipeline/derive_substitutions.py \
|
||||||
|
--db /path/to/kiwi.db \
|
||||||
|
--recipepairs data/recipepairs.parquet \
|
||||||
|
--recipes data/recipes_foodcom.parquet
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
from scripts.pipeline.build_recipe_index import extract_ingredient_names
|
||||||
|
|
||||||
|
# Boolean columns in the recipepairs dataset; one substitution pair may satisfy
# several dietary constraints and is counted once per satisfied constraint.
CONSTRAINT_COLS = ["vegan", "vegetarian", "dairy_free", "low_calorie",
                   "low_carb", "low_fat", "low_sodium", "gluten_free"]
|
||||||
|
|
||||||
|
|
||||||
|
def diff_ingredients(base: list[str], target: list[str]) -> tuple[list[str], list[str]]:
    """Set-diff two ingredient lists.

    Returns (removed, added): names present only in ``base`` and names present
    only in ``target``. Duplicates collapse; ordering is unspecified.
    """
    before, after = set(base), set(target)
    return list(before - after), list(after - before)
|
||||||
|
|
||||||
|
|
||||||
|
def build(db_path: Path, recipepairs_path: Path, recipes_path: Path,
          min_count: int = 3) -> None:
    """Derive single-ingredient substitution pairs and persist them.

    Diffs each (base, target) pair from the recipepairs dataset against the
    ingredient index previously loaded into the recipes table. Pairs whose
    ingredient lists differ by exactly one removal and one addition are treated
    as a substitution, tagged with every satisfied dietary constraint column,
    and written to substitution_pairs once they occur at least ``min_count``
    times.

    NOTE(review): recipes_path is currently unused -- the ingredient index is
    read back from the recipes table built by build_recipe_index.py. Kept so
    existing CLI invocations keep working; confirm before removing.
    """
    conn = sqlite3.connect(db_path)
    try:
        print("Loading recipe ingredient index...")
        recipe_ingredients: dict[str, list[str]] = {}
        for row in conn.execute("SELECT external_id, ingredient_names FROM recipes"):
            recipe_ingredients[str(row[0])] = json.loads(row[1])

        df = pd.read_parquet(recipepairs_path)
        pair_counts: dict[tuple, dict] = defaultdict(lambda: {"count": 0})

        print("Diffing recipe pairs...")
        for _, row in df.iterrows():
            base_ings = recipe_ingredients.get(str(row.get("base", "")), [])
            target_ings = recipe_ingredients.get(str(row.get("target", "")), [])
            if not base_ings or not target_ings:
                continue  # pair references a recipe missing from our index

            removed, added = diff_ingredients(base_ings, target_ings)
            # Only clean one-for-one swaps are unambiguous substitutions.
            if len(removed) != 1 or len(added) != 1:
                continue

            original, substitute = removed[0], added[0]
            for constraint in (c for c in CONSTRAINT_COLS if row.get(c, 0)):
                pair_counts[(original, substitute, constraint)]["count"] += 1

        # Memoize profile lookups: the same ingredient recurs across many pairs,
        # so avoid re-running the same SELECT per occurrence.
        _profile_cache: dict[str, dict] = {}

        def get_profile(name: str) -> dict:
            """Fat/moisture/glutamate/protein profile for a name (zeros if unknown)."""
            cached = _profile_cache.get(name)
            if cached is None:
                hit = conn.execute(
                    "SELECT fat_pct, moisture_pct, glutamate_mg, protein_pct "
                    "FROM ingredient_profiles WHERE name = ?", (name,)
                ).fetchone()
                if hit:
                    cached = {"fat": hit[0] or 0, "moisture": hit[1] or 0,
                              "glutamate": hit[2] or 0, "protein": hit[3] or 0}
                else:
                    cached = {"fat": 0, "moisture": 0, "glutamate": 0, "protein": 0}
                _profile_cache[name] = cached
            return cached

        print("Writing substitution pairs...")
        inserted = 0
        for (original, substitute, constraint), data in pair_counts.items():
            if data["count"] < min_count:
                continue  # rare pairs are likely noise rather than substitutions
            p_orig = get_profile(original)
            p_sub = get_profile(substitute)
            conn.execute("""
                INSERT OR REPLACE INTO substitution_pairs
                    (original_name, substitute_name, constraint_label,
                     fat_delta, moisture_delta, glutamate_delta, protein_delta,
                     occurrence_count, source)
                VALUES (?,?,?,?,?,?,?,?,?)
            """, (
                original, substitute, constraint,
                # Deltas are substitute minus original.
                round(p_sub["fat"] - p_orig["fat"], 2),
                round(p_sub["moisture"] - p_orig["moisture"], 2),
                round(p_sub["glutamate"] - p_orig["glutamate"], 2),
                round(p_sub["protein"] - p_orig["protein"], 2),
                data["count"], "derived",
            ))
            inserted += 1

        conn.commit()
        print(f"Inserted {inserted} substitution pairs (min {min_count} occurrences)")
    finally:
        # Close even if the parquet read or a statement raises.
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: see module docstring for usage.
    cli = argparse.ArgumentParser()
    cli.add_argument("--db", type=Path, required=True)
    cli.add_argument("--recipepairs", type=Path, required=True)
    cli.add_argument("--recipes", type=Path, required=True)
    ns = cli.parse_args()
    build(ns.db, ns.recipepairs, ns.recipes)
|
||||||
19
tests/pipeline/test_build_recipe_index.py
Normal file
19
tests/pipeline/test_build_recipe_index.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
def test_extract_ingredient_names():
    from scripts.pipeline.build_recipe_index import extract_ingredient_names

    # Quantities, units, parentheticals, and "to taste" should all be stripped.
    samples = [
        "2 cups all-purpose flour",
        "1 lb ground beef (85/15)",
        "salt to taste",
    ]
    extracted = extract_ingredient_names(samples)
    assert "flour" in extracted or "all-purpose flour" in extracted
    assert "ground beef" in extracted
    assert "salt" in extracted
|
||||||
|
|
||||||
|
def test_compute_element_coverage():
    from scripts.pipeline.build_recipe_index import compute_element_coverage

    sample_profiles = [
        {"elements": ["Richness", "Depth"]},
        {"elements": ["Brightness"]},
        {"elements": ["Seasoning"]},
    ]
    result = compute_element_coverage(sample_profiles)
    # Every observed element gets a positive fraction...
    assert result["Richness"] > 0
    assert result["Brightness"] > 0
    # ...and elements never seen are absent (zero coverage).
    assert result.get("Aroma", 0) == 0
|
||||||
10
tests/pipeline/test_derive_substitutions.py
Normal file
10
tests/pipeline/test_derive_substitutions.py
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
def test_diff_ingredient_lists():
    from scripts.pipeline.derive_substitutions import diff_ingredients

    meat_version = ["ground beef", "chicken broth", "olive oil", "onion"]
    veg_version = ["lentils", "vegetable broth", "olive oil", "onion"]
    removed, added = diff_ingredients(meat_version, veg_version)
    # Swapped ingredients land on the correct side of the diff.
    assert "ground beef" in removed and "chicken broth" in removed
    assert "lentils" in added and "vegetable broth" in added
    # Shared ingredients are not reported as removed.
    assert "olive oil" not in removed
|
||||||
Loading…
Reference in a new issue