- Assembly template system (13 templates: burrito, fried rice, omelette, stir fry, pasta, sandwich, grain bowl, soup/stew, casserole, pancakes, porridge, pie, pudding) with role-based matching, whole-word single-keyword guard, deterministic titles via MD5 pantry hash - Prep-state stripping: strips 'melted butter' → 'butter' for coverage checks; reconstructs actionable states as 'Before you start:' cooking instructions (NutritionPanel prep_notes field + RecipesView.vue display block) - FTS5 fixes: always double-quote all terms; strip apostrophes to prevent syntax errors on brands like "Stouffer's"; 'plant-based' → bare 'based' crash - Bidirectional synonym expansion: alt-meat, alt-chicken, alt-beef, alt-pork mapped to canonical texture class; pantry expansion covers 'hamburger' from 'burger patties' etc. - Texture profile backfill script (378K ingredient_profiles rows) with macro-derived classification in priority order (fatty → creamy → starchy → firm → fibrous → tender → liquid → neutral); oats/legumes starchy-first fix - LLM prompt: ban flavoured/sweetened ingredients (vanilla yoghurt) from savoury - Migrations 014 (nutrition macros) + 015 (recipe FTS index) - Nutrition estimation pipeline script - gitignore MagicMock sqlite test artifacts
225 lines
8.2 KiB
Python
225 lines
8.2 KiB
Python
"""
|
|
Import food.com recipe corpus into recipes table.
|
|
|
|
Usage:
|
|
conda run -n job-seeker python scripts/pipeline/build_recipe_index.py \
|
|
--db /path/to/kiwi.db \
|
|
--recipes data/recipes_foodcom.parquet \
|
|
--batch-size 10000
|
|
"""
|
|
from __future__ import annotations
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sqlite3
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
_MEASURE_PATTERN = re.compile(
|
|
r"^\d[\d\s/\u00bc\u00bd\u00be\u2153\u2154]*\s*(cup|tbsp|tsp|oz|lb|g|kg|ml|l|clove|slice|piece|can|pkg|package|bunch|head|stalk|sprig|pinch|dash|to taste|as needed)s?\b",
|
|
re.IGNORECASE,
|
|
)
|
|
_LEAD_NUMBER = re.compile(r"^\d[\d\s/\u00bc\u00bd\u00be\u2153\u2154]*\s*")
|
|
_TRAILING_QUALIFIER = re.compile(
|
|
r"\s*(to taste|as needed|or more|or less|optional|if desired|if needed)\s*$",
|
|
re.IGNORECASE,
|
|
)
|
|
_QUOTED = re.compile(r'"([^"]*)"')
|
|
|
|
|
|
def _float_or_none(val: object) -> float | None:
|
|
"""Return float > 0, or None for missing / zero values."""
|
|
try:
|
|
v = float(val) # type: ignore[arg-type]
|
|
return v if v > 0 else None
|
|
except (TypeError, ValueError):
|
|
return None
|
|
|
|
|
|
def _safe_list(val: object) -> list:
|
|
"""Convert a value to a list, handling NaN/float/None gracefully."""
|
|
if val is None:
|
|
return []
|
|
try:
|
|
import math
|
|
if isinstance(val, float) and math.isnan(val):
|
|
return []
|
|
except Exception:
|
|
pass
|
|
if isinstance(val, list):
|
|
return val
|
|
return []
|
|
|
|
|
|
def _parse_r_vector(s: str) -> list[str]:
|
|
"""Parse R character vector format: c("a", "b") -> ["a", "b"]."""
|
|
return _QUOTED.findall(s)
|
|
|
|
|
|
def extract_ingredient_names(raw_list: list[str]) -> list[str]:
|
|
"""Strip quantities and units from ingredient strings -> normalized names."""
|
|
names = []
|
|
for raw in raw_list:
|
|
s = raw.lower().strip()
|
|
s = _MEASURE_PATTERN.sub("", s)
|
|
s = _LEAD_NUMBER.sub("", s)
|
|
s = re.sub(r"\(.*?\)", "", s)
|
|
s = re.sub(r",.*$", "", s)
|
|
s = _TRAILING_QUALIFIER.sub("", s)
|
|
s = s.strip(" -.,")
|
|
if s and len(s) > 1:
|
|
names.append(s)
|
|
return names
|
|
|
|
|
|
def compute_element_coverage(profiles: list[dict]) -> dict[str, float]:
|
|
counts: dict[str, int] = {}
|
|
for p in profiles:
|
|
for elem in p.get("elements", []):
|
|
counts[elem] = counts.get(elem, 0) + 1
|
|
if not profiles:
|
|
return {}
|
|
return {e: round(c / len(profiles), 3) for e, c in counts.items()}
|
|
|
|
|
|
def _parse_allrecipes_text(text: str) -> tuple[str, list[str], list[str]]:
|
|
"""Parse corbt/all-recipes text format into (title, ingredients, directions)."""
|
|
lines = text.strip().split('\n')
|
|
title = lines[0].strip()
|
|
ingredients: list[str] = []
|
|
directions: list[str] = []
|
|
section: str | None = None
|
|
for line in lines[1:]:
|
|
stripped = line.strip()
|
|
if stripped.lower() == 'ingredients:':
|
|
section = 'ingredients'
|
|
elif stripped.lower() in ('directions:', 'steps:', 'instructions:'):
|
|
section = 'directions'
|
|
elif stripped.startswith('- ') and section == 'ingredients':
|
|
ingredients.append(stripped[2:].strip())
|
|
elif stripped.startswith('- ') and section == 'directions':
|
|
directions.append(stripped[2:].strip())
|
|
return title, ingredients, directions
|
|
|
|
|
|
def _row_to_fields(row: pd.Series) -> tuple[str, str, list[str], list[str]]:
|
|
"""Extract (external_id, title, raw_ingredients, directions) from a parquet row.
|
|
|
|
Handles both corbt/all-recipes (single 'input' text column) and the
|
|
food.com columnar format (RecipeId, Name, RecipeIngredientParts, ...).
|
|
"""
|
|
if "input" in row.index and pd.notna(row.get("input")):
|
|
title, raw_ingredients, directions = _parse_allrecipes_text(str(row["input"]))
|
|
external_id = f"ar_{hash(title) & 0xFFFFFFFF}"
|
|
else:
|
|
raw_parts = row.get("RecipeIngredientParts", [])
|
|
if isinstance(raw_parts, str):
|
|
parsed = _parse_r_vector(raw_parts)
|
|
raw_parts = parsed if parsed else [raw_parts]
|
|
raw_ingredients = [str(i) for i in (_safe_list(raw_parts))]
|
|
|
|
raw_dirs = row.get("RecipeInstructions", [])
|
|
if isinstance(raw_dirs, str):
|
|
parsed_dirs = _parse_r_vector(raw_dirs)
|
|
directions = parsed_dirs if parsed_dirs else [raw_dirs]
|
|
else:
|
|
directions = [str(d) for d in (_safe_list(raw_dirs))]
|
|
|
|
title = str(row.get("Name", ""))[:500]
|
|
external_id = str(row.get("RecipeId", ""))
|
|
|
|
return external_id, title, raw_ingredients, directions
|
|
|
|
|
|
def build(db_path: Path, recipes_path: Path, batch_size: int = 10000) -> None:
|
|
conn = sqlite3.connect(db_path)
|
|
try:
|
|
conn.execute("PRAGMA journal_mode=WAL")
|
|
|
|
# Pre-load ingredient element profiles to avoid N+1 queries
|
|
profile_index: dict[str, list[str]] = {}
|
|
for row in conn.execute("SELECT name, elements FROM ingredient_profiles"):
|
|
try:
|
|
profile_index[row[0]] = json.loads(row[1])
|
|
except Exception:
|
|
pass
|
|
|
|
df = pd.read_parquet(recipes_path)
|
|
inserted = 0
|
|
batch = []
|
|
|
|
for _, row in df.iterrows():
|
|
external_id, title, raw_ingredients, directions = _row_to_fields(row)
|
|
if not title:
|
|
continue
|
|
ingredient_names = extract_ingredient_names(raw_ingredients)
|
|
|
|
profiles = []
|
|
for name in ingredient_names:
|
|
if name in profile_index:
|
|
profiles.append({"elements": profile_index[name]})
|
|
coverage = compute_element_coverage(profiles)
|
|
|
|
batch.append((
|
|
external_id,
|
|
title,
|
|
json.dumps(raw_ingredients),
|
|
json.dumps(ingredient_names),
|
|
json.dumps(directions),
|
|
str(row.get("RecipeCategory", "") or ""),
|
|
json.dumps(_safe_list(row.get("Keywords"))),
|
|
_float_or_none(row.get("Calories")),
|
|
_float_or_none(row.get("FatContent")),
|
|
_float_or_none(row.get("ProteinContent")),
|
|
_float_or_none(row.get("SodiumContent")),
|
|
json.dumps(coverage),
|
|
# New macro columns (migration 014)
|
|
_float_or_none(row.get("SugarContent")),
|
|
_float_or_none(row.get("CarbohydrateContent")),
|
|
_float_or_none(row.get("FiberContent")),
|
|
_float_or_none(row.get("RecipeServings")),
|
|
0, # nutrition_estimated — food.com direct data is authoritative
|
|
))
|
|
|
|
if len(batch) >= batch_size:
|
|
before = conn.total_changes
|
|
conn.executemany("""
|
|
INSERT OR REPLACE INTO recipes
|
|
(external_id, title, ingredients, ingredient_names, directions,
|
|
category, keywords, calories, fat_g, protein_g, sodium_mg,
|
|
element_coverage,
|
|
sugar_g, carbs_g, fiber_g, servings, nutrition_estimated)
|
|
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
|
""", batch)
|
|
conn.commit()
|
|
inserted += conn.total_changes - before
|
|
print(f" {inserted} recipes inserted...")
|
|
batch = []
|
|
|
|
if batch:
|
|
before = conn.total_changes
|
|
conn.executemany("""
|
|
INSERT OR REPLACE INTO recipes
|
|
(external_id, title, ingredients, ingredient_names, directions,
|
|
category, keywords, calories, fat_g, protein_g, sodium_mg,
|
|
element_coverage,
|
|
sugar_g, carbs_g, fiber_g, servings, nutrition_estimated)
|
|
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
|
|
""", batch)
|
|
conn.commit()
|
|
inserted += conn.total_changes - before
|
|
|
|
conn.commit()
|
|
finally:
|
|
conn.close()
|
|
print(f"Total: {inserted} recipes inserted")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
parser = argparse.ArgumentParser()
|
|
parser.add_argument("--db", required=True, type=Path)
|
|
parser.add_argument("--recipes", required=True, type=Path)
|
|
parser.add_argument("--batch-size", type=int, default=10000)
|
|
args = parser.parse_args()
|
|
build(args.db, args.recipes, args.batch_size)
|