kiwi/scripts/pipeline/build_recipe_index.py
pyr0ball 144d1dc6c4 chore: commit in-progress work -- tag inferrer, imitate endpoint, hall-of-chaos easter egg, migration files, Dockerfile .env defense
- app/services/recipe/tag_inferrer.py: infer tags from recipe ingredient text
- app/db/migrations/022_recipe_generic_flag.sql, 029_inferred_tags.sql: schema migrations
- app/api/endpoints/imitate.py: recipe imitation endpoint stub
- app/api/endpoints/community.py: hall-of-chaos easter egg endpoint
- scripts/pipeline/infer_recipe_tags.py, backfill_keywords.py: pipeline scripts
- scripts/pipeline/build_recipe_index.py: extended index builder
- Dockerfile: explicit .env removal as defense-in-depth
- frontend/src/components/FeedbackButton.vue: feedback UX improvements
- frontend/src/style.css: minor style tweaks
- app/cloud_session.py: cloud session improvements
- tests/api/test_community_endpoints.py: additional test coverage
2026-04-14 13:23:15 -07:00

253 lines
9.2 KiB
Python

"""
Import food.com recipe corpus into recipes table.
Usage:
conda run -n job-seeker python scripts/pipeline/build_recipe_index.py \
--db /path/to/kiwi.db \
--recipes data/recipes_foodcom.parquet \
--batch-size 10000
"""
from __future__ import annotations

import argparse
import json
import re
import sqlite3
import zlib
from pathlib import Path

import pandas as pd
# Leading quantity + unit prefix on an ingredient line, e.g. "1 1/2 cups".
# The escaped characters cover the vulgar-fraction glyphs ¼ ½ ¾ ⅓ ⅔.
_MEASURE_PATTERN = re.compile(
    r"^\d[\d\s/\u00bc\u00bd\u00be\u2153\u2154]*\s*(cup|tbsp|tsp|oz|lb|g|kg|ml|l|clove|slice|piece|can|pkg|package|bunch|head|stalk|sprig|pinch|dash|to taste|as needed)s?\b",
    re.IGNORECASE,
)
# Bare leading number (possibly with fractions) when no recognized unit follows.
_LEAD_NUMBER = re.compile(r"^\d[\d\s/\u00bc\u00bd\u00be\u2153\u2154]*\s*")
# Trailing qualifier phrases ("to taste", "optional", ...) at the end of a name.
_TRAILING_QUALIFIER = re.compile(
    r"\s*(to taste|as needed|or more|or less|optional|if desired|if needed)\s*$",
    re.IGNORECASE,
)
# Double-quoted tokens inside an R character vector: c("a", "b") -> a, b.
_QUOTED = re.compile(r'"([^"]*)"')
def _float_or_none(val: object) -> float | None:
"""Return float > 0, or None for missing / zero values."""
try:
v = float(val) # type: ignore[arg-type]
return v if v > 0 else None
except (TypeError, ValueError):
return None
def _safe_list(val: object) -> list:
"""Convert a value to a list, handling NaN/float/None gracefully."""
if val is None:
return []
try:
import math
if isinstance(val, float) and math.isnan(val):
return []
except Exception:
pass
if isinstance(val, list):
return val
return []
def _parse_r_vector(s: str) -> list[str]:
    """Extract the double-quoted tokens of an R character vector.

    Example: ``c("a", "b")`` -> ``["a", "b"]``.
    """
    return re.findall(r'"([^"]*)"', s)
def _parse_keywords(val: object) -> list[str]:
"""Parse the food.com Keywords column into a proper list of keyword strings.
The raw parquet value can arrive in three forms:
- None / NaN → []
- str: c("Italian", ...) → parse quoted tokens via _parse_r_vector
- list of single chars → the R-vector was character-split during dataset
export; rejoin then re-parse
- list of strings → already correct, use as-is
"""
import math
if val is None:
return []
if isinstance(val, float) and math.isnan(val):
return []
if isinstance(val, str):
return _parse_r_vector(val)
if isinstance(val, list):
if not val:
return []
# Detect character-split R-vector: every element is a single character
if all(isinstance(e, str) and len(e) == 1 for e in val):
return _parse_r_vector("".join(val))
# Already a proper list of keyword strings
return [str(e) for e in val if e]
return []
def extract_ingredient_names(raw_list: list[str]) -> list[str]:
    """Strip quantities, units, and qualifiers from ingredient strings.

    Returns lowercase, normalized ingredient names; entries that reduce to
    one character or less are discarded.
    """

    def _clean(text: str) -> str:
        # Order matters: unit prefix first, then any bare leading number,
        # then parenthetical notes, trailing clauses, and qualifier phrases.
        name = text.lower().strip()
        name = _MEASURE_PATTERN.sub("", name)
        name = _LEAD_NUMBER.sub("", name)
        name = re.sub(r"\(.*?\)", "", name)
        name = re.sub(r",.*$", "", name)
        name = _TRAILING_QUALIFIER.sub("", name)
        return name.strip(" -.,")

    cleaned = (_clean(entry) for entry in raw_list)
    return [name for name in cleaned if len(name) > 1]
def compute_element_coverage(profiles: list[dict]) -> dict[str, float]:
    """Map each element to the fraction of profiles containing it (3 d.p.)."""
    if not profiles:
        return {}
    tally: dict[str, int] = {}
    for profile in profiles:
        for element in profile.get("elements", []):
            tally[element] = tally.get(element, 0) + 1
    total = len(profiles)
    return {element: round(count / total, 3) for element, count in tally.items()}
def _parse_allrecipes_text(text: str) -> tuple[str, list[str], list[str]]:
"""Parse corbt/all-recipes text format into (title, ingredients, directions)."""
lines = text.strip().split('\n')
title = lines[0].strip()
ingredients: list[str] = []
directions: list[str] = []
section: str | None = None
for line in lines[1:]:
stripped = line.strip()
if stripped.lower() == 'ingredients:':
section = 'ingredients'
elif stripped.lower() in ('directions:', 'steps:', 'instructions:'):
section = 'directions'
elif stripped.startswith('- ') and section == 'ingredients':
ingredients.append(stripped[2:].strip())
elif stripped.startswith('- ') and section == 'directions':
directions.append(stripped[2:].strip())
return title, ingredients, directions
def _row_to_fields(row: pd.Series) -> tuple[str, str, list[str], list[str]]:
    """Extract (external_id, title, raw_ingredients, directions) from a parquet row.

    Handles both corbt/all-recipes (single 'input' text column) and the
    food.com columnar format (RecipeId, Name, RecipeIngredientParts, ...).
    """
    if "input" in row.index and pd.notna(row.get("input")):
        title, raw_ingredients, directions = _parse_allrecipes_text(str(row["input"]))
        # Bug fix: str hashes are randomized per process (PYTHONHASHSEED), so
        # the previous f"ar_{hash(title) & 0xFFFFFFFF}" yielded a different
        # external_id on every run, defeating INSERT OR REPLACE dedup when the
        # corpus is re-imported. crc32 is deterministic and stays 32-bit.
        external_id = f"ar_{zlib.crc32(title.encode('utf-8'))}"
    else:
        raw_parts = row.get("RecipeIngredientParts", [])
        if isinstance(raw_parts, str):
            # R character-vector string; fall back to the raw string if no
            # quoted tokens were found.
            parsed = _parse_r_vector(raw_parts)
            raw_parts = parsed if parsed else [raw_parts]
        raw_ingredients = [str(i) for i in _safe_list(raw_parts)]
        raw_dirs = row.get("RecipeInstructions", [])
        if isinstance(raw_dirs, str):
            parsed_dirs = _parse_r_vector(raw_dirs)
            directions = parsed_dirs if parsed_dirs else [raw_dirs]
        else:
            directions = [str(d) for d in _safe_list(raw_dirs)]
        title = str(row.get("Name", ""))[:500]  # keep within the column width
        external_id = str(row.get("RecipeId", ""))
    return external_id, title, raw_ingredients, directions
def build(db_path: Path, recipes_path: Path, batch_size: int = 10000) -> None:
    """Import the recipe parquet corpus at *recipes_path* into *db_path*.

    Rows are upserted via INSERT OR REPLACE (keyed on external_id) in batches
    of *batch_size*; progress is printed after each flush. Rows without a
    title are skipped.

    Fix: the 17-column INSERT statement and flush bookkeeping were duplicated
    verbatim for the mid-loop and final flush — now shared via _flush — and a
    redundant trailing commit is gone.
    """
    insert_sql = """
        INSERT OR REPLACE INTO recipes
        (external_id, title, ingredients, ingredient_names, directions,
         category, keywords, calories, fat_g, protein_g, sodium_mg,
         element_coverage,
         sugar_g, carbs_g, fiber_g, servings, nutrition_estimated)
        VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
    """

    def _flush(conn: sqlite3.Connection, rows: list[tuple]) -> int:
        """Write one batch, commit, and return the number of rows changed."""
        before = conn.total_changes
        conn.executemany(insert_sql, rows)
        conn.commit()
        return conn.total_changes - before

    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        # Pre-load ingredient element profiles to avoid N+1 queries.
        profile_index: dict[str, list[str]] = {}
        for row in conn.execute("SELECT name, elements FROM ingredient_profiles"):
            try:
                profile_index[row[0]] = json.loads(row[1])
            except Exception:
                pass  # best-effort: skip profiles whose JSON is malformed
        df = pd.read_parquet(recipes_path)
        inserted = 0
        batch: list[tuple] = []
        for _, row in df.iterrows():
            external_id, title, raw_ingredients, directions = _row_to_fields(row)
            if not title:
                continue
            ingredient_names = extract_ingredient_names(raw_ingredients)
            profiles = [
                {"elements": profile_index[name]}
                for name in ingredient_names
                if name in profile_index
            ]
            coverage = compute_element_coverage(profiles)
            batch.append((
                external_id,
                title,
                json.dumps(raw_ingredients),
                json.dumps(ingredient_names),
                json.dumps(directions),
                str(row.get("RecipeCategory", "") or ""),
                json.dumps(_parse_keywords(row.get("Keywords"))),
                _float_or_none(row.get("Calories")),
                _float_or_none(row.get("FatContent")),
                _float_or_none(row.get("ProteinContent")),
                _float_or_none(row.get("SodiumContent")),
                json.dumps(coverage),
                # New macro columns (migration 014)
                _float_or_none(row.get("SugarContent")),
                _float_or_none(row.get("CarbohydrateContent")),
                _float_or_none(row.get("FiberContent")),
                _float_or_none(row.get("RecipeServings")),
                0,  # nutrition_estimated — food.com direct data is authoritative
            ))
            if len(batch) >= batch_size:
                inserted += _flush(conn, batch)
                print(f" {inserted} recipes inserted...")
                batch = []
        if batch:
            inserted += _flush(conn, batch)
    finally:
        conn.close()
    print(f"Total: {inserted} recipes inserted")
if __name__ == "__main__":
    # CLI entry point: build the recipe index from a parquet corpus.
    cli = argparse.ArgumentParser()
    cli.add_argument("--db", type=Path, required=True)
    cli.add_argument("--recipes", type=Path, required=True)
    cli.add_argument("--batch-size", default=10000, type=int)
    opts = cli.parse_args()
    build(opts.db, opts.recipes, opts.batch_size)