feat: recipe engine — assembly templates, prep notes, FTS fixes, texture backfill

- Assembly template system (13 templates: burrito, fried rice, omelette, stir fry,
  pasta, sandwich, grain bowl, soup/stew, casserole, pancakes, porridge, pie, pudding)
  with role-based matching, whole-word single-keyword guard, deterministic titles
  via MD5 pantry hash
- Prep-state stripping: strips 'melted butter' → 'butter' for coverage checks;
  reconstructs actionable states as 'Before you start:' cooking instructions
  (NutritionPanel prep_notes field + RecipesView.vue display block)
- FTS5 fixes: always double-quote all terms; strip apostrophes to prevent
  syntax errors on brands like "Stouffer's"; 'plant-based' → bare 'based' crash
- Bidirectional synonym expansion: alt-meat, alt-chicken, alt-beef, alt-pork
  mapped to canonical texture class; pantry expansion covers 'hamburger' from
  'burger patties' etc.
- Texture profile backfill script (378K ingredient_profiles rows) with macro-derived
  classification in priority order (fatty → creamy → starchy → firm → fibrous →
  tender → liquid → neutral); oats/legumes starchy-first fix
- LLM prompt: ban flavoured/sweetened ingredients (vanilla yoghurt) from savoury
- Migrations 014 (nutrition macros) + 015 (recipe FTS index)
- Nutrition estimation pipeline script
- gitignore MagicMock sqlite test artifacts
This commit is contained in:
pyr0ball 2026-04-02 22:12:35 -07:00
parent b9c308ab28
commit 1a493e0ad9
11 changed files with 1888 additions and 49 deletions

View file

@ -0,0 +1,18 @@
-- Migration 014: Add macro nutrition columns to recipes and ingredient_profiles.
--
-- recipes: sugar, carbs, fiber, servings, and an estimated flag.
-- ingredient_profiles: carbs, fiber, calories, sugar per 100g (for estimation fallback).
--
-- recipes macro columns are nullable with no default, so rows that existed
-- before this migration read as NULL (no data) until they are populated.
ALTER TABLE recipes ADD COLUMN sugar_g REAL;
ALTER TABLE recipes ADD COLUMN carbs_g REAL;
ALTER TABLE recipes ADD COLUMN fiber_g REAL;
ALTER TABLE recipes ADD COLUMN servings REAL;
-- 0/1 flag; existing rows get 0.
ALTER TABLE recipes ADD COLUMN nutrition_estimated INTEGER NOT NULL DEFAULT 0;
-- ingredient_profiles columns default to 0.0 rather than NULL.
-- NOTE(review): a defaulted 0.0 cannot be told apart from a measured zero;
-- confirm the estimation code does not need to distinguish "unknown".
ALTER TABLE ingredient_profiles ADD COLUMN carbs_g_per_100g REAL DEFAULT 0.0;
ALTER TABLE ingredient_profiles ADD COLUMN fiber_g_per_100g REAL DEFAULT 0.0;
ALTER TABLE ingredient_profiles ADD COLUMN calories_per_100g REAL DEFAULT 0.0;
ALTER TABLE ingredient_profiles ADD COLUMN sugar_g_per_100g REAL DEFAULT 0.0;
-- Indexes on the two recipe macro columns.
-- NOTE(review): presumably to support upper-bound nutrition filters; confirm
-- the query planner actually uses them.
CREATE INDEX idx_recipes_sugar_g ON recipes (sugar_g);
CREATE INDEX idx_recipes_carbs_g ON recipes (carbs_g);

View file

@ -0,0 +1,16 @@
-- Migration 015: FTS5 inverted index for recipe ingredient lookup.
--
-- Content table backed by `recipes` — stores only the inverted index, no text duplication.
-- MATCH queries replace O(N) LIKE scans with O(log N) token lookups.
--
-- One-time rebuild cost on 3.2M rows: ~15-30 seconds at startup.
-- Subsequent startups skip this migration entirely.
--
-- NOTE(review): this is an external-content FTS5 table and no triggers are
-- created here, so rows inserted, updated or deleted in `recipes` after this
-- migration are not reflected in the index until another 'rebuild' is run.
-- Confirm the recipe corpus is static, or add sync triggers.
CREATE VIRTUAL TABLE IF NOT EXISTS recipes_fts USING fts5(
-- The only indexed column; its text is read from recipes.ingredient_names.
ingredient_names,
-- External content: FTS5 reads column values from `recipes` instead of storing its own copy.
content=recipes,
-- recipes.id is used as the FTS rowid, so recipes_fts.rowid joins directly to recipes.id.
content_rowid=id,
tokenize="unicode61"
);
-- Special FTS5 command: discard the index and repopulate it from the content table.
INSERT INTO recipes_fts(recipes_fts) VALUES('rebuild');

View file

@ -232,6 +232,72 @@ class Store:
(str(days),),
)
def recalculate_expiry(
self,
tier: str = "local",
has_byok: bool = False,
) -> tuple[int, int]:
"""Re-run the expiration predictor over all available inventory items.

Uses each item's existing purchase_date (falls back to today if NULL or
not a valid ISO date) and its current location (falls back to "pantry"
when empty).

Items that have an explicit expiration_date from a source other than
auto-prediction (i.e. items whose expiry was found on a receipt or entered
by the user) cannot be distinguished here, so all available items are
recalculated and their expiration_date is overwritten.

tier and has_byok are passed through unchanged to the predictor.

Returns (updated_count, skipped_count); an item is counted as skipped when
the predictor returns None for it.
"""
# Imported locally rather than at module level.
from datetime import date
from app.services.expiration_predictor import ExpirationPredictor
predictor = ExpirationPredictor()
# Every item with status 'available', joined to its product for name/category.
rows = self._fetch_all(
"""SELECT i.id, i.location, i.purchase_date,
p.name AS product_name, p.category AS product_category
FROM inventory_items i
JOIN products p ON p.id = i.product_id
WHERE i.status = 'available'""",
(),
)
updated = skipped = 0
for row in rows:
# Resolve the predictor's category from product name, stored category and location.
cat = predictor.get_category_from_product(
row["product_name"] or "",
product_category=row.get("product_category"),
location=row.get("location"),
)
purchase_date_raw = row.get("purchase_date")
# A missing or malformed purchase date falls back to today rather than failing the run.
try:
purchase_date = (
date.fromisoformat(purchase_date_raw)
if purchase_date_raw
else date.today()
)
except (ValueError, TypeError):
purchase_date = date.today()
exp = predictor.predict_expiration(
cat,
row["location"] or "pantry",
purchase_date=purchase_date,
product_name=row["product_name"],
tier=tier,
has_byok=has_byok,
)
# No prediction available: leave the stored expiration_date untouched.
if exp is None:
skipped += 1
continue
self.conn.execute(
"UPDATE inventory_items SET expiration_date = ?, updated_at = datetime('now') WHERE id = ?",
(str(exp), row["id"]),
)
updated += 1
self.conn.commit()
return updated, skipped
# ── receipt_data ──────────────────────────────────────────────────────
def upsert_receipt_data(self, receipt_id: int, data: dict) -> dict[str, Any]:
@ -266,16 +332,323 @@ class Store:
# ── recipes ───────────────────────────────────────────────────────────
def _fts_ready(self) -> bool:
"""Return True if the recipes_fts virtual table exists."""
row = self._fetch_one(
"SELECT 1 FROM sqlite_master WHERE type='table' AND name='recipes_fts'"
)
return row is not None
# Words that carry no recipe-ingredient signal and should be filtered
# out when tokenising multi-word product names for FTS expansion.
_FTS_TOKEN_STOPWORDS: frozenset[str] = frozenset({
# Common English stopwords
"a", "an", "the", "of", "in", "for", "with", "and", "or", "to",
"from", "at", "by", "as", "on", "into",
# Brand / marketing words that appear in product names
"lean", "cuisine", "healthy", "choice", "stouffer", "original",
"classic", "deluxe", "homestyle", "family", "style", "grade",
"premium", "select", "natural", "organic", "fresh", "lite",
"ready", "quick", "easy", "instant", "microwave", "frozen",
"brand", "size", "large", "small", "medium", "extra",
# Plant-based / alt-meat brand names
"daring", "gardein", "morningstar", "lightlife", "tofurky",
"quorn", "omni", "nuggs", "simulate", "simulate",
# Preparation states — "cut up chicken" is still chicken
"cut", "diced", "sliced", "chopped", "minced", "shredded",
"cooked", "raw", "whole", "boneless", "skinless", "trimmed",
"pre", "prepared", "marinated", "seasoned", "breaded", "battered",
"grilled", "roasted", "smoked", "canned", "dried", "dehydrated",
"pieces", "piece", "strips", "strip", "chunks", "chunk",
"fillets", "fillet", "cutlets", "cutlet", "tenders", "nuggets",
# Units / packaging
"oz", "lb", "lbs", "pkg", "pack", "box", "can", "bag", "jar",
})
# Maps substrings found in product-label names to canonical recipe-corpus
# ingredient terms. _normalize_for_fts checks every key as a substring of the
# lower-cased, stripped full product name, then looks each individual token
# up as an exact key. Several keys are deliberate stems ("burger patt",
# "chicken patt") so that singular and plural labels both match.
_FTS_SYNONYMS: dict[str, str] = {
# Ground / minced beef
"burger patt": "hamburger",
"beef patt": "hamburger",
"ground beef": "hamburger",
"ground chuck": "hamburger",
"ground round": "hamburger",
# NOTE(review): as a substring this also fires for "minced ..." names
# (e.g. "minced garlic" contains "mince") — confirm that is intended.
"mince": "hamburger",
"veggie burger": "hamburger",
"beyond burger": "hamburger",
"impossible burger": "hamburger",
"plant burger": "hamburger",
"chicken patt": "hamburger", # FTS match only — recipe scoring still works
# Sausages
"kielbasa": "sausage",
"bratwurst": "sausage",
# Trailing space: the name is stripped before matching, so this only matches
# "brat" followed by another word, never a name that ends in "brat".
"brat ": "sausage",
"frankfurter": "hotdog",
"wiener": "hotdog",
# Chicken cuts + plant-based chicken → generic chicken for broader matching
"chicken breast": "chicken",
"chicken thigh": "chicken",
"chicken drumstick": "chicken",
"chicken wing": "chicken",
"rotisserie chicken": "chicken",
"chicken tender": "chicken",
"chicken strip": "chicken",
"chicken piece": "chicken",
"fake chicken": "chicken",
"plant chicken": "chicken",
"vegan chicken": "chicken",
"daring": "chicken", # Daring Foods brand
"gardein chick": "chicken",
"quorn chick": "chicken",
"chick'n": "chicken",
"chikn": "chicken",
"not-chicken": "chicken",
"no-chicken": "chicken",
# Plant-based beef subs — map to broad "beef" not "hamburger"
# (texture varies: strips ≠ ground; let corpus handle the specific form)
"not-beef": "beef",
"no-beef": "beef",
"plant beef": "beef",
"vegan beef": "beef",
# Plant-based pork subs
"not-pork": "pork",
"no-pork": "pork",
"plant pork": "pork",
"vegan pork": "pork",
"omnipork": "pork",
"omni pork": "pork",
# Generic alt-meat catch-alls → broad "beef" (safer than hamburger)
"fake meat": "beef",
"plant meat": "beef",
"vegan meat": "beef",
"meat-free": "beef",
"meatless": "beef",
# Pork cuts
"pork chop": "pork",
"pork loin": "pork",
"pork tenderloin": "pork",
# Tomato-based sauces
"marinara": "tomato sauce",
"pasta sauce": "tomato sauce",
"spaghetti sauce": "tomato sauce",
"pizza sauce": "tomato sauce",
# Pasta shapes — map to generic "pasta" so FTS finds any pasta recipe
"macaroni": "pasta",
"noodles": "pasta",
"spaghetti": "pasta",
"penne": "pasta",
"fettuccine": "pasta",
"rigatoni": "pasta",
"linguine": "pasta",
"rotini": "pasta",
"farfalle": "pasta",
# Cheese variants → "cheese" for broad matching
"shredded cheese": "cheese",
"sliced cheese": "cheese",
"american cheese": "cheese",
"cheddar": "cheese",
"mozzarella": "cheese",
# Cream variants
"heavy cream": "cream",
"whipping cream": "cream",
"half and half": "cream",
# Buns / rolls
"burger bun": "buns",
"hamburger bun": "buns",
"hot dog bun": "buns",
"bread roll": "buns",
"dinner roll": "buns",
# Tortillas / wraps
"flour tortilla": "tortillas",
"corn tortilla": "tortillas",
"tortilla wrap": "tortillas",
# "soft taco shell" also contains the next key, so such a name contributes
# both "tortillas" and "taco shells".
"soft taco shell": "tortillas",
"taco shell": "taco shells",
"pita bread": "pita",
# Identity mapping: adds nothing beyond the term itself.
"flatbread": "flatbread",
# Canned beans
"black bean": "beans",
"pinto bean": "beans",
"kidney bean": "beans",
"refried bean": "beans",
"chickpea": "beans",
"garbanzo": "beans",
# Rice variants
"white rice": "rice",
"brown rice": "rice",
"jasmine rice": "rice",
"basmati rice": "rice",
"instant rice": "rice",
"microwavable rice": "rice",
# Salsa / hot sauce
"hot sauce": "salsa",
"taco sauce": "salsa",
"enchilada sauce": "salsa",
# Sour cream substitute
"greek yogurt": "sour cream",
# Prepackaged meals
"lean cuisine": "casserole",
"stouffer": "casserole",
"healthy choice": "casserole",
"marie callender": "casserole",
}
@staticmethod
def _normalize_for_fts(name: str) -> list[str]:
    """Expand one pantry item into every FTS search term it should contribute.

    The result always starts with the lower-cased, stripped name, followed by:

    - the canonical term of every ``_FTS_SYNONYMS`` pattern that occurs as a
      substring of the name (maps product-label wording to recipe-corpus
      wording, e.g. "burger patties" -> "hamburger");
    - for multi-word names, each whitespace-separated token longer than
      three characters that is not in ``_FTS_TOKEN_STOPWORDS`` (so
      "Lean Cuisine Chicken Alfredo" also searches for "chicken" and
      "alfredo" independently), plus that token's canonical term when the
      token is itself an exact ``_FTS_SYNONYMS`` key.

    Terms are returned in first-seen order with no duplicates. Previously
    the full-name synonym pass could repeat a term (e.g. "flatbread" came
    back twice, and a name matching two "chicken ..." patterns listed
    "chicken" twice).

    Returns an empty list when *name* is empty or whitespace only.
    """
    lower = name.lower().strip()
    if not lower:
        return []

    terms: list[str] = [lower]
    seen: set[str] = {lower}

    def add(term: str) -> None:
        # Keep first-seen order while guaranteeing uniqueness.
        if term not in seen:
            seen.add(term)
            terms.append(term)

    # Substring synonym check on the full name.
    for pattern, canonical in Store._FTS_SYNONYMS.items():
        if pattern in lower:
            add(canonical)

    # For multi-word product names, also add individual significant tokens.
    if " " in lower:
        for token in lower.split():
            # Very short tokens and stopwords carry no ingredient signal.
            if len(token) <= 3 or token in Store._FTS_TOKEN_STOPWORDS:
                continue
            add(token)
            # Synonym-expand individual tokens too (exact key match only).
            canonical = Store._FTS_SYNONYMS.get(token)
            if canonical is not None:
                add(canonical)
    return terms
@staticmethod
def _build_fts_query(ingredient_names: list[str]) -> str:
    """Build an FTS5 MATCH expression that ORs together all ingredient terms.

    Every pantry item is first expanded with ``_normalize_for_fts``, so a
    product-label name (e.g. "burger patties") also searches for its
    recipe-corpus equivalent (e.g. "hamburger") and multi-word packaged
    product names contribute their individual ingredient tokens.

    Each term has double and single quotes removed, is emitted at most once
    (first occurrence wins), and is wrapped in double quotes. Returns an
    empty string when no usable term remains.
    """
    # A dict doubles as an insertion-ordered set of cleaned terms.
    unique_terms: dict[str, None] = {}
    for pantry_item in ingredient_names:
        for expanded in Store._normalize_for_fts(pantry_item):
            # Quote characters would break the quoted FTS5 string syntax.
            bare = expanded.replace('"', "").replace("'", "")
            if bare:
                unique_terms.setdefault(bare, None)
    return " OR ".join(f'"{term}"' for term in unique_terms)
def search_recipes_by_ingredients(
    self,
    ingredient_names: list[str],
    limit: int = 20,
    category: str | None = None,
    max_calories: float | None = None,
    max_sugar_g: float | None = None,
    max_carbs_g: float | None = None,
    max_sodium_mg: float | None = None,
    excluded_ids: list[int] | None = None,
) -> list[dict]:
    """Find recipes containing any of the given ingredient names.

    Scores by match count and returns highest-scoring first.

    Uses the FTS5 index (migration 015) when available — O(log N) per query.
    Falls back to LIKE scans on older databases.

    Nutrition filters use NULL-passthrough: rows without nutrition data
    always pass (they may be estimated or absent entirely).

    Args:
        ingredient_names: pantry item names to search for; an empty list
            short-circuits to an empty result.
        limit: maximum number of recipes to return.
        category: when truthy, only recipes with exactly this category.
        max_calories, max_sugar_g, max_carbs_g, max_sodium_mg: optional
            upper bounds; ``None`` disables the respective filter.
        excluded_ids: recipe ids to leave out of the result.

    Returns:
        Whatever the selected backend (``_search_recipes_fts`` or
        ``_search_recipes_like``) returns for the assembled filters.
    """
    if not ingredient_names:
        return []

    # Extra WHERE fragments and their bound parameters, kept in lock-step.
    extra_clauses: list[str] = []
    extra_params: list = []

    if category:
        extra_clauses.append("r.category = ?")
        extra_params.append(category)

    # Column names below are fixed literals, never caller input; only the
    # bound value comes from the caller.
    nutrition_caps = (
        ("calories", max_calories),
        ("sugar_g", max_sugar_g),
        ("carbs_g", max_carbs_g),
        ("sodium_mg", max_sodium_mg),
    )
    for column, cap in nutrition_caps:
        if cap is not None:
            extra_clauses.append(f"(r.{column} IS NULL OR r.{column} <= ?)")
            extra_params.append(cap)

    if excluded_ids:
        placeholders = ",".join("?" * len(excluded_ids))
        extra_clauses.append(f"r.id NOT IN ({placeholders})")
        extra_params.extend(excluded_ids)

    where_extra = (" AND " + " AND ".join(extra_clauses)) if extra_clauses else ""

    if self._fts_ready():
        return self._search_recipes_fts(
            ingredient_names, limit, where_extra, extra_params
        )
    return self._search_recipes_like(
        ingredient_names, limit, where_extra, extra_params
    )
def _search_recipes_fts(
self,
ingredient_names: list[str],
limit: int,
where_extra: str,
extra_params: list,
) -> list[dict]:
"""FTS5-backed ingredient search. Candidates fetched via inverted index;
match_count computed in Python over the small candidate set."""
fts_query = self._build_fts_query(ingredient_names)
if not fts_query:
return []
# Pull up to 10× limit candidates so ranking has enough headroom.
sql = f"""
SELECT r.*
FROM recipes_fts
JOIN recipes r ON r.id = recipes_fts.rowid
WHERE recipes_fts MATCH ?
{where_extra}
LIMIT ?
"""
rows = self._fetch_all(sql, (fts_query, *extra_params, limit * 10))
pantry_set = {n.lower().strip() for n in ingredient_names}
scored: list[dict] = []
for row in rows:
raw = row.get("ingredient_names") or []
names: list[str] = raw if isinstance(raw, list) else json.loads(raw or "[]")
match_count = sum(1 for n in names if n.lower() in pantry_set)
scored.append({**row, "match_count": match_count})
scored.sort(key=lambda r: (-r["match_count"], r["id"]))
return scored[:limit]
def _search_recipes_like(
self,
ingredient_names: list[str],
limit: int,
where_extra: str,
extra_params: list,
) -> list[dict]:
"""Legacy LIKE-based ingredient search (O(N×rows) — slow on large corpora)."""
like_params = [f'%"{n}"%' for n in ingredient_names]
like_clauses = " OR ".join(
"r.ingredient_names LIKE ?" for _ in ingredient_names
@ -284,20 +657,15 @@ class Store:
"CASE WHEN r.ingredient_names LIKE ? THEN 1 ELSE 0 END"
for _ in ingredient_names
)
category_clause = ""
category_params: list = []
if category:
category_clause = "AND r.category = ?"
category_params = [category]
sql = f"""
SELECT r.*, ({match_score}) AS match_count
FROM recipes r
WHERE ({like_clauses})
{category_clause}
{where_extra}
ORDER BY match_count DESC, r.id ASC
LIMIT ?
"""
all_params = like_params + like_params + category_params + [limit]
all_params = like_params + like_params + extra_params + [limit]
return self._fetch_all(sql, tuple(all_params))
def get_recipe(self, recipe_id: int) -> dict | None:

View file

@ -12,6 +12,20 @@ class SwapCandidate(BaseModel):
compensation_hints: list[dict] = Field(default_factory=list)
class NutritionPanel(BaseModel):
"""Per-recipe macro summary. All values are per-serving when servings is known,
otherwise for the full recipe. None means data is unavailable."""
# Field suffixes name the unit (_g / _mg) — presumably grams and milligrams.
calories: float | None = None
fat_g: float | None = None
protein_g: float | None = None
carbs_g: float | None = None
fiber_g: float | None = None
sugar_g: float | None = None
sodium_mg: float | None = None
# When None, the values above describe the full recipe (see class docstring).
servings: float | None = None
estimated: bool = False # True when nutrition was inferred from ingredient profiles
class RecipeSuggestion(BaseModel):
id: int
title: str
@ -20,9 +34,11 @@ class RecipeSuggestion(BaseModel):
swap_candidates: list[SwapCandidate] = Field(default_factory=list)
missing_ingredients: list[str] = Field(default_factory=list)
directions: list[str] = Field(default_factory=list)
prep_notes: list[str] = Field(default_factory=list)
notes: str = ""
level: int = 1
is_wildcard: bool = False
nutrition: NutritionPanel | None = None
class GroceryLink(BaseModel):
@ -40,6 +56,14 @@ class RecipeResult(BaseModel):
rate_limit_count: int = 0
class NutritionFilters(BaseModel):
"""Optional per-serving upper bounds for macro filtering. None = no filter."""
# Field names mirror the max_* keyword arguments accepted by
# Store.search_recipes_by_ingredients.
max_calories: float | None = None
max_sugar_g: float | None = None
max_carbs_g: float | None = None
max_sodium_mg: float | None = None
class RecipeRequest(BaseModel):
pantry_items: list[str]
level: int = Field(default=1, ge=1, le=4)
@ -48,7 +72,10 @@ class RecipeRequest(BaseModel):
hard_day_mode: bool = False
max_missing: int | None = None
style_id: str | None = None
category: str | None = None
tier: str = "free"
has_byok: bool = False
wildcard_confirmed: bool = False
allergies: list[str] = Field(default_factory=list)
nutrition_filters: NutritionFilters = Field(default_factory=NutritionFilters)
excluded_ids: list[int] = Field(default_factory=list)

View file

@ -0,0 +1,647 @@
"""
Assembly-dish template matcher for Level 1/2.
Assembly dishes (burritos, stir fry, fried rice, omelettes, sandwiches, etc.)
are defined by structural roles -- container + filler + sauce -- not by a fixed
ingredient list. The corpus can never fully cover them.
This module fires when the pantry covers all *required* roles of a template.
Results are injected at the top of the Level 1/2 suggestion list with negative
ids (client displays them identically to corpus recipes).
Templates define:
- required: list of role sets -- ALL must have at least one pantry match
- optional: role sets whose matched items are shown as extras
- directions: short cooking instructions
- notes: serving suggestions / variations
"""
from __future__ import annotations
import hashlib
from dataclasses import dataclass
from app.models.schemas.recipe import RecipeSuggestion
# IDs in range -100..-1 are reserved for assembly-generated suggestions
_ASSEMBLY_ID_START = -1
@dataclass
class AssemblyRole:
"""One role in a template (e.g. 'protein').

display: human-readable role label; _personalized_title also selects roles
by this label when building a title
keywords: lower-case terms tested against each lower-cased pantry item by
_matches_role — multi-word keywords as substrings, single-word keywords
as whole words
"""
display: str
keywords: list[str]
@dataclass
class AssemblyTemplate:
"""A template assembly dish.

id: template identifier; the templates in this module use negative ids
title: base dish name, used as-is or prefixed with matched pantry labels
required: roles that must ALL have at least one pantry match for the
template to fire
optional: roles whose matched items are shown as extras
directions: short cooking instructions, one step per entry
notes: serving suggestions / variations
"""
id: int
title: str
required: list[AssemblyRole]
optional: list[AssemblyRole]
directions: list[str]
notes: str = ""
def _matches_role(role: AssemblyRole, pantry_set: set[str]) -> list[str]:
"""Return pantry items that satisfy this role.
Single-word keywords use whole-word matching (word must appear as a
discrete token) so short words like 'pea', 'ham', 'egg' don't false-match
inside longer words like 'peanut', 'hamburger', 'eggnog'.
Multi-word keywords (e.g. 'burger patt') use substring matching.
"""
hits: list[str] = []
for item in pantry_set:
item_lower = item.lower()
item_words = set(item_lower.split())
for kw in role.keywords:
if " " in kw:
# Multi-word: substring match
if kw in item_lower:
hits.append(item)
break
else:
# Single-word: whole-word match only
if kw in item_words:
hits.append(item)
break
return hits
def _pick_one(items: list[str], seed: int) -> str:
"""Deterministically pick one item from a list using a seed."""
return sorted(items)[seed % len(items)]
def _pantry_hash(pantry_set: set[str]) -> int:
"""Stable integer derived from pantry contents — used for deterministic picks."""
key = ",".join(sorted(pantry_set))
return int(hashlib.md5(key.encode()).hexdigest(), 16) # noqa: S324 — non-crypto use
def _keyword_label(item: str, role: AssemblyRole) -> str:
"""Return a short, clean label derived from the keyword that matched.
Uses the longest matching keyword (most specific) as the base label,
then title-cases it. This avoids pasting full raw pantry names like
'Organic Extra Firm Tofu' into titles just 'Tofu' instead.
"""
lower = item.lower()
best_kw = ""
for kw in role.keywords:
if kw in lower and len(kw) > len(best_kw):
best_kw = kw
label = (best_kw or item).strip().title()
# Drop trailing 's' from keywords like "beans" → "Bean" when it reads better
return label
def _personalized_title(tmpl: AssemblyTemplate, pantry_set: set[str], seed: int) -> str:
    """Build a specific title from actual pantry items, e.g. 'Chicken & Broccoli Burrito'.

    Labels come from the matched keyword rather than the full pantry item
    name, so 'Organic Extra Firm Tofu Block' appears as 'Tofu' in the title.

    Optional roles are visited in priority order (protein, vegetables,
    sauce base, cheese); the scan stops once two labels have been collected.
    Falls back to the plain template title when nothing matches.
    """
    labels: list[str] = []
    for wanted in ("protein", "vegetables", "sauce base", "cheese"):
        for role in tmpl.optional:
            if role.display != wanted:
                continue
            matched = _matches_role(role, pantry_set)
            if not matched:
                continue
            label = _keyword_label(_pick_one(matched, seed), role)
            if label not in labels:
                labels.append(label)
        # Checked after each priority display has been scanned.
        if len(labels) >= 2:
            break

    if labels:
        return f"{' & '.join(labels)} {tmpl.title}"
    return tmpl.title
# ---------------------------------------------------------------------------
# Template definitions
# ---------------------------------------------------------------------------
ASSEMBLY_TEMPLATES: list[AssemblyTemplate] = [
AssemblyTemplate(
id=-1,
title="Burrito / Taco",
required=[
AssemblyRole("tortilla or wrap", [
"tortilla", "wrap", "taco shell", "flatbread", "pita",
]),
],
optional=[
AssemblyRole("protein", [
"chicken", "beef", "steak", "pork", "sausage", "hamburger",
"burger patt", "shrimp", "egg", "tofu", "beans", "bean",
]),
AssemblyRole("rice or starch", ["rice", "quinoa", "potato"]),
AssemblyRole("cheese", [
"cheese", "cheddar", "mozzarella", "monterey", "queso",
]),
AssemblyRole("salsa or sauce", [
"salsa", "hot sauce", "taco sauce", "enchilada", "guacamole",
]),
AssemblyRole("sour cream or yogurt", ["sour cream", "greek yogurt", "crema"]),
AssemblyRole("vegetables", [
"pepper", "onion", "tomato", "lettuce", "corn", "avocado",
"spinach", "broccoli", "zucchini",
]),
],
directions=[
"Warm the tortilla in a dry skillet or microwave for 20 seconds.",
"Heat any proteins or vegetables in a pan until cooked through.",
"Layer ingredients down the center: rice first, then protein, then vegetables.",
"Add cheese, salsa, and sour cream last so they stay cool.",
"Fold in the sides and roll tightly. Optionally toast seam-side down 1-2 minutes.",
],
notes="Works as a burrito (rolled), taco (folded), or quesadilla (cheese only, pressed flat).",
),
AssemblyTemplate(
id=-2,
title="Fried Rice",
required=[
AssemblyRole("cooked rice", [
"rice", "leftover rice", "instant rice", "microwavable rice",
]),
],
optional=[
AssemblyRole("protein", [
"chicken", "beef", "pork", "shrimp", "egg", "tofu",
"sausage", "ham", "spam",
]),
AssemblyRole("soy sauce or seasoning", [
"soy sauce", "tamari", "teriyaki", "oyster sauce", "fish sauce",
]),
AssemblyRole("oil", ["oil", "butter", "sesame"]),
AssemblyRole("egg", ["egg"]),
AssemblyRole("vegetables", [
"carrot", "peas", "corn", "onion", "scallion", "green onion",
"broccoli", "bok choy", "bean sprout", "zucchini", "spinach",
]),
AssemblyRole("garlic or ginger", ["garlic", "ginger"]),
],
directions=[
"Use day-old cold rice if available -- it fries better than fresh.",
"Heat oil in a large skillet or wok over high heat.",
"Add garlic/ginger and any raw vegetables; stir fry 2-3 minutes.",
"Push to the side, scramble eggs in the same pan if using.",
"Add protein (pre-cooked or raw) and cook through.",
"Add rice, breaking up clumps. Stir fry until heated and lightly toasted.",
"Season with soy sauce and any other sauces. Toss to combine.",
],
notes="Add a fried egg on top. A drizzle of sesame oil at the end adds a lot.",
),
AssemblyTemplate(
id=-3,
title="Omelette / Scramble",
required=[
AssemblyRole("eggs", ["egg"]),
],
optional=[
AssemblyRole("cheese", [
"cheese", "cheddar", "mozzarella", "feta", "parmesan",
]),
AssemblyRole("vegetables", [
"pepper", "onion", "tomato", "spinach", "mushroom",
"broccoli", "zucchini", "scallion", "avocado",
]),
AssemblyRole("protein", [
"ham", "bacon", "sausage", "chicken", "turkey",
"smoked salmon",
]),
AssemblyRole("herbs or seasoning", [
"herb", "basil", "chive", "parsley", "salt", "pepper",
"hot sauce", "salsa",
]),
],
directions=[
"Beat eggs with a splash of water or milk and a pinch of salt.",
"Saute any vegetables and proteins in butter or oil over medium heat until softened.",
"Pour eggs over fillings (scramble) or pour into a clean buttered pan (omelette).",
"For omelette: cook until nearly set, add fillings to one side, fold over.",
"For scramble: stir gently over medium-low heat until just set.",
"Season and serve immediately.",
],
notes="Works for breakfast, lunch, or a quick dinner. Any leftover vegetables work well.",
),
AssemblyTemplate(
id=-4,
title="Stir Fry",
required=[
AssemblyRole("vegetables", [
"pepper", "broccoli", "carrot", "snap pea", "bok choy",
"zucchini", "mushroom", "corn", "onion", "bean sprout",
"cabbage", "spinach", "asparagus",
]),
],
optional=[
AssemblyRole("protein", [
"chicken", "beef", "pork", "shrimp", "tofu", "egg",
]),
AssemblyRole("sauce", [
"soy sauce", "teriyaki", "oyster sauce", "hoisin",
"stir fry sauce", "sesame",
]),
AssemblyRole("starch base", ["rice", "noodle", "pasta", "ramen"]),
AssemblyRole("garlic or ginger", ["garlic", "ginger"]),
AssemblyRole("oil", ["oil", "sesame"]),
],
directions=[
"Cut all proteins and vegetables into similar-sized pieces for even cooking.",
"Heat oil in a wok or large skillet over the highest heat your stove allows.",
"Cook protein first until nearly done; remove and set aside.",
"Add dense vegetables (carrots, broccoli) first; quick-cooking veg last.",
"Return protein, add sauce, and toss everything together for 1-2 minutes.",
"Serve over rice or noodles.",
],
notes="High heat is the key. Do not crowd the pan -- cook in batches if needed.",
),
AssemblyTemplate(
id=-5,
title="Pasta with Whatever You Have",
required=[
AssemblyRole("pasta", [
"pasta", "spaghetti", "penne", "fettuccine", "rigatoni",
"linguine", "rotini", "farfalle", "macaroni", "noodle",
]),
],
optional=[
AssemblyRole("sauce base", [
"tomato", "marinara", "pasta sauce", "cream", "butter",
"olive oil", "pesto",
]),
AssemblyRole("protein", [
"chicken", "beef", "pork", "shrimp", "sausage", "bacon",
"ham", "tuna", "canned fish",
]),
AssemblyRole("cheese", [
"parmesan", "romano", "mozzarella", "ricotta", "feta",
]),
AssemblyRole("vegetables", [
"tomato", "spinach", "mushroom", "pepper", "zucchini",
"broccoli", "artichoke", "olive", "onion",
]),
AssemblyRole("garlic", ["garlic"]),
],
directions=[
"Cook pasta in well-salted boiling water until al dente. Reserve 1 cup pasta water.",
"While pasta cooks, saute garlic in olive oil over medium heat.",
"Add proteins and cook through; add vegetables until tender.",
"Add sauce base and simmer 5 minutes. Add pasta water to loosen if needed.",
"Toss cooked pasta with sauce. Finish with cheese if using.",
],
notes="Pasta water is the secret -- the starch thickens and binds any sauce.",
),
AssemblyTemplate(
id=-6,
title="Sandwich / Wrap",
required=[
AssemblyRole("bread or wrap", [
"bread", "roll", "bun", "wrap", "tortilla", "pita",
"bagel", "english muffin", "croissant", "flatbread",
]),
],
optional=[
AssemblyRole("protein", [
"chicken", "turkey", "ham", "roast beef", "tuna", "egg",
"bacon", "salami", "pepperoni", "tofu", "tempeh",
]),
AssemblyRole("cheese", [
"cheese", "cheddar", "swiss", "provolone", "mozzarella",
]),
AssemblyRole("condiment", [
"mayo", "mustard", "ketchup", "hot sauce", "ranch",
"hummus", "pesto", "aioli",
]),
AssemblyRole("vegetables", [
"lettuce", "tomato", "onion", "cucumber", "avocado",
"pepper", "sprout", "arugula",
]),
],
directions=[
"Toast bread if desired.",
"Spread condiments on both inner surfaces.",
"Layer protein first, then cheese, then vegetables.",
"Press together and cut diagonally.",
],
notes="Leftovers, deli meat, canned fish -- nearly anything works between bread.",
),
AssemblyTemplate(
id=-7,
title="Grain Bowl",
required=[
AssemblyRole("grain base", [
"rice", "quinoa", "farro", "barley", "couscous",
"bulgur", "freekeh", "polenta",
]),
],
optional=[
AssemblyRole("protein", [
"chicken", "beef", "pork", "tofu", "egg", "shrimp",
"beans", "bean", "lentil", "chickpea",
]),
AssemblyRole("vegetables", [
"roasted", "broccoli", "carrot", "kale", "spinach",
"cucumber", "tomato", "corn", "edamame", "avocado",
"beet", "sweet potato",
]),
AssemblyRole("dressing or sauce", [
"dressing", "tahini", "vinaigrette", "sauce",
"olive oil", "lemon", "soy sauce",
]),
AssemblyRole("toppings", [
"nut", "seed", "feta", "parmesan", "herb",
]),
],
directions=[
"Cook grain base according to package directions; season with salt.",
"Roast or saute vegetables with oil, salt, and pepper until tender.",
"Cook or slice protein.",
"Arrange grain in a bowl, top with protein and vegetables.",
"Drizzle with dressing and add toppings.",
],
notes="Great for meal prep -- cook grains and proteins in bulk, assemble bowls all week.",
),
AssemblyTemplate(
id=-8,
title="Soup / Stew",
required=[
AssemblyRole("broth or liquid base", [
"broth", "stock", "bouillon",
"tomato sauce", "coconut milk", "cream of",
]),
],
optional=[
AssemblyRole("protein", [
"chicken", "beef", "pork", "sausage", "shrimp",
"beans", "bean", "lentil", "tofu",
]),
AssemblyRole("vegetables", [
"carrot", "celery", "onion", "potato", "tomato",
"spinach", "kale", "corn", "pea", "zucchini",
]),
AssemblyRole("starch thickener", [
"potato", "pasta", "noodle", "rice", "barley",
"flour", "cornstarch",
]),
AssemblyRole("seasoning", [
"garlic", "herb", "bay leaf", "thyme", "rosemary",
"cumin", "paprika", "chili",
]),
],
directions=[
"Saute onion, celery, and garlic in oil until softened, about 5 minutes.",
"Add any raw proteins and cook until browned.",
"Add broth or liquid base and bring to a simmer.",
"Add dense vegetables (carrots, potatoes) first; quick-cooking veg in the last 10 minutes.",
"Add starches and cook until tender.",
"Season to taste and simmer at least 20 minutes for flavors to develop.",
],
notes="Soups and stews improve overnight in the fridge. Almost any combination works.",
),
AssemblyTemplate(
id=-9,
title="Casserole / Bake",
required=[
AssemblyRole("starch or base", [
"pasta", "rice", "potato", "noodle", "bread",
"tortilla", "polenta", "grits", "macaroni",
]),
AssemblyRole("binder or sauce", [
"cream of", "cheese", "cream cheese", "sour cream",
"soup mix", "gravy", "tomato sauce", "marinara",
"broth", "stock", "milk", "cream",
]),
],
optional=[
AssemblyRole("protein", [
"chicken", "beef", "pork", "tuna", "ham", "sausage",
"ground", "shrimp", "beans", "bean", "lentil",
]),
AssemblyRole("vegetables", [
"broccoli", "corn", "pea", "onion", "mushroom",
"spinach", "zucchini", "tomato", "pepper", "carrot",
]),
AssemblyRole("cheese topping", [
"cheddar", "mozzarella", "parmesan", "swiss",
"cheese", "breadcrumb",
]),
AssemblyRole("seasoning", [
"garlic", "herb", "thyme", "rosemary", "paprika",
"onion powder", "salt", "pepper",
]),
],
directions=[
"Preheat oven to 375 F (190 C). Grease a 9x13 baking dish.",
"Cook starch base (pasta, rice, potato) until just underdone -- it finishes in the oven.",
"Mix cooked starch with sauce/binder, protein, and vegetables in the dish.",
"Season generously -- casseroles need salt.",
"Top with cheese or breadcrumbs if using.",
"Bake covered 25 minutes, then uncovered 15 minutes until golden and bubbly.",
],
notes="Classic pantry dump dinner. Cream of anything soup is the universal binder.",
),
AssemblyTemplate(
id=-10,
title="Pancakes / Waffles / Quick Bread",
required=[
AssemblyRole("flour or baking mix", [
"flour", "bisquick", "pancake mix", "waffle mix",
"baking mix", "cornmeal", "oats",
]),
AssemblyRole("leavening or egg", [
"egg", "baking powder", "baking soda", "yeast",
]),
],
optional=[
AssemblyRole("liquid", [
"milk", "buttermilk", "water", "juice",
"almond milk", "oat milk", "sour cream",
]),
AssemblyRole("fat", [
"butter", "oil", "margarine",
]),
AssemblyRole("sweetener", [
"sugar", "honey", "maple syrup", "brown sugar",
]),
AssemblyRole("mix-ins", [
"blueberr", "banana", "apple", "chocolate chip",
"nut", "berry", "cinnamon", "vanilla",
]),
],
directions=[
"Whisk dry ingredients (flour, leavening, sugar, salt) together in a bowl.",
"Whisk wet ingredients (egg, milk, melted butter) in a separate bowl.",
"Fold wet into dry until just combined -- lumps are fine, do not overmix.",
"For pancakes: cook on a buttered griddle over medium heat, flip when bubbles form.",
"For waffles: pour into preheated waffle iron according to manufacturer instructions.",
"For muffins or quick bread: pour into greased pan, bake at 375 F until a toothpick comes out clean.",
],
notes="Overmixing develops gluten and makes pancakes tough. Stop when just combined.",
),
AssemblyTemplate(
id=-11,
title="Porridge / Oatmeal",
required=[
AssemblyRole("oats or grain porridge", [
"oat", "porridge", "grits", "semolina", "cream of wheat",
"polenta", "congee", "rice porridge",
]),
],
optional=[
AssemblyRole("liquid", ["milk", "water", "almond milk", "oat milk", "coconut milk"]),
AssemblyRole("sweetener", ["sugar", "honey", "maple syrup", "brown sugar", "agave"]),
AssemblyRole("fruit", ["banana", "berry", "apple", "raisin", "date", "mango"]),
AssemblyRole("toppings", ["nut", "seed", "granola", "coconut", "chocolate"]),
AssemblyRole("spice", ["cinnamon", "nutmeg", "vanilla", "cardamom"]),
],
directions=[
"Combine oats with liquid in a pot — typically 1 part oats to 2 parts liquid.",
"Bring to a gentle simmer over medium heat, stirring occasionally.",
"Cook 5 minutes (rolled oats) or 2 minutes (quick oats) until thickened to your liking.",
"Stir in sweetener and spices.",
"Top with fruit, nuts, or seeds and serve immediately.",
],
notes="Overnight oats: skip cooking — soak oats in cold milk overnight in the fridge.",
),
AssemblyTemplate(
id=-12,
title="Pie / Pot Pie",
required=[
AssemblyRole("pastry or crust", [
"pastry", "puff pastry", "pie crust", "shortcrust",
"pie shell", "phyllo", "filo", "biscuit",
]),
],
optional=[
AssemblyRole("protein filling", [
"chicken", "beef", "pork", "lamb", "turkey", "tofu",
"mushroom", "beans", "bean", "lentil", "tuna", "salmon",
]),
AssemblyRole("vegetables", [
"carrot", "pea", "corn", "potato", "onion", "leek",
"broccoli", "spinach", "mushroom", "parsnip", "swede",
]),
AssemblyRole("sauce or binder", [
"gravy", "cream of", "stock", "broth", "cream",
"white sauce", "bechamel", "cheese sauce",
]),
AssemblyRole("seasoning", [
"thyme", "rosemary", "sage", "garlic", "herb",
"mustard", "worcestershire",
]),
AssemblyRole("sweet filling", [
"apple", "berry", "cherry", "pear", "peach",
"rhubarb", "plum", "custard",
]),
],
directions=[
"For pot pie: make a sauce by combining stock or cream-of-something with cooked vegetables and protein.",
"Season generously — fillings need more salt than you think.",
"Pour filling into a baking dish and top with pastry, pressing edges to seal.",
"Cut a few slits in the top to release steam. Brush with egg wash or milk if available.",
"Bake at 400 F (200 C) for 25-35 minutes until pastry is golden brown.",
"For sweet pie: fill unbaked crust with fruit filling, top with second crust or crumble, bake similarly.",
],
notes="Puff pastry from the freezer is the shortcut to impressive pot pies. Thaw in the fridge overnight.",
),
AssemblyTemplate(
id=-13,
title="Pudding / Custard",
required=[
AssemblyRole("dairy or dairy-free milk", [
"milk", "cream", "oat milk", "almond milk",
"soy milk", "coconut milk",
]),
AssemblyRole("thickener or set", [
"egg", "cornstarch", "custard powder", "gelatin",
"agar", "tapioca", "arrowroot",
]),
],
optional=[
AssemblyRole("sweetener", ["sugar", "honey", "maple syrup", "condensed milk"]),
AssemblyRole("flavouring", [
"vanilla", "chocolate", "cocoa", "caramel",
"lemon", "orange", "cinnamon", "nutmeg",
]),
AssemblyRole("starchy base", [
"rice", "bread", "sponge", "cake", "biscuit",
]),
AssemblyRole("fruit", ["raisin", "sultana", "berry", "banana", "apple"]),
],
directions=[
"For stovetop custard: whisk eggs and sugar together, heat milk until steaming.",
"Slowly pour hot milk into egg mixture while whisking constantly (tempering).",
"Return to low heat and stir until mixture coats the back of a spoon.",
"For cornstarch pudding: whisk cornstarch into cold milk first, then heat while stirring.",
"Add flavourings (vanilla, cocoa) once off heat.",
"Pour into dishes and refrigerate at least 2 hours to set.",
],
notes="UK-style pudding is broad — bread pudding, rice pudding, spotted dick, treacle sponge all count.",
),
]
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def match_assembly_templates(
    pantry_items: list[str],
    pantry_set: set[str],
    excluded_ids: list[int],
) -> list[RecipeSuggestion]:
    """Build assembly-dish suggestions for every template the pantry can satisfy.

    A template qualifies only when each of its required roles is matched by
    ``pantry_set``; matched optional roles only raise its ``match_count``.
    Titles are personalized from the pantry contents using a seed derived from
    the pantry hash plus the template id, so the same pantry always produces
    the same title.  Templates whose id is listed in ``excluded_ids`` are left
    out (dismiss / load-more support).

    ``pantry_items`` is accepted for interface compatibility; the matching
    below reads only ``pantry_set``.

    Returns the suggestions ordered by ``match_count`` (required roles plus
    matched optional roles), highest first.
    """
    skipped = set(excluded_ids)
    pantry_seed = _pantry_hash(pantry_set)

    suggestions: list[RecipeSuggestion] = []
    for template in ASSEMBLY_TEMPLATES:
        if template.id in skipped:
            continue

        # Every required role has to be covered before the template is offered.
        if not all(_matches_role(needed, pantry_set) for needed in template.required):
            continue

        matched_optional = [
            extra for extra in template.optional if _matches_role(extra, pantry_set)
        ]

        suggestions.append(RecipeSuggestion(
            id=template.id,
            title=_personalized_title(template, pantry_set, pantry_seed + template.id),
            match_count=len(template.required) + len(matched_optional),
            element_coverage={},
            swap_candidates=[],
            missing_ingredients=[],
            directions=template.directions,
            notes=template.notes,
            level=1,
            is_wildcard=False,
            nutrition=None,
        ))

    # Best-covered templates first; the sort is stable, so ties keep template order.
    return sorted(suggestions, key=lambda s: s.match_count, reverse=True)

View file

@ -3,6 +3,7 @@ from __future__ import annotations
import logging
import os
import re
from contextlib import nullcontext
from typing import TYPE_CHECKING
@ -54,6 +55,9 @@ class LLMRecipeGenerator:
lines: list[str] = [
"You are a creative chef. Generate a recipe using the ingredients below.",
"IMPORTANT: When you use a pantry item, list it in Ingredients using its exact name from the pantry list. Do not add adjectives, quantities, or cooking states (e.g. use 'butter', not 'unsalted butter' or '2 tbsp butter').",
"IMPORTANT: Only use pantry items that make culinary sense for the dish. Do NOT force flavoured/sweetened items (vanilla yoghurt, fruit yoghurt, jam, dessert sauces, flavoured syrups) into savoury dishes. Plain yoghurt, plain cream, and plain dairy are fine in savoury cooking.",
"IMPORTANT: Do not default to the same ingredient repeatedly across dishes. If a pantry item does not genuinely improve this specific dish, leave it out.",
"",
f"Pantry items: {', '.join(safe_pantry)}",
]
@ -82,10 +86,13 @@ class LLMRecipeGenerator:
lines += [
"",
"Reply in this format:",
"Title: <recipe name>",
"Reply using EXACTLY this plain-text format — no markdown, no bold, no extra commentary:",
"Title: <name of the dish>",
"Ingredients: <comma-separated list>",
"Directions: <numbered steps>",
"Directions:",
"1. <first step>",
"2. <second step>",
"3. <continue for each step>",
"Notes: <optional tips>",
]
@ -101,6 +108,7 @@ class LLMRecipeGenerator:
lines: list[str] = [
"Surprise me with a creative, unexpected recipe.",
"Only use ingredients that make culinary sense together. Do not force flavoured/sweetened items (vanilla yoghurt, flavoured syrups, jam) into savoury dishes.",
f"Ingredients available: {', '.join(safe_pantry)}",
]
@ -112,7 +120,13 @@ class LLMRecipeGenerator:
lines += [
"Treat any mystery ingredient as a wildcard — use your imagination.",
"Title: <name> | Ingredients: <list> | Directions: <steps>",
"Reply using EXACTLY this plain-text format — no markdown, no bold:",
"Title: <name of the dish>",
"Ingredients: <comma-separated list>",
"Directions:",
"1. <first step>",
"2. <second step>",
"Notes: <optional tips>",
]
return "\n".join(lines)
@ -169,8 +183,18 @@ class LLMRecipeGenerator:
logger.error("LLM call failed: %s", exc)
return ""
# Strips markdown bold/italic markers so "**Directions:**" parses like "Directions:"
_MD_BOLD = re.compile(r"\*{1,2}([^*]+)\*{1,2}")
def _strip_md(self, text: str) -> str:
    """Return *text* without markdown emphasis asterisks and outer whitespace.

    Each span matched by ``self._MD_BOLD`` is replaced by its inner text, so
    "**Directions:**" comes back as "Directions:".
    """
    without_markers = self._MD_BOLD.sub(r"\1", text)
    return without_markers.strip()
def _parse_response(self, response: str) -> dict[str, str | list[str]]:
"""Parse LLM response text into structured recipe fields."""
"""Parse LLM response text into structured recipe fields.
Handles both plain-text and markdown-formatted responses. Directions are
preserved as newline-separated text so the caller can split on step numbers.
"""
result: dict[str, str | list[str]] = {
"title": "",
"ingredients": [],
@ -184,14 +208,17 @@ class LLMRecipeGenerator:
def _flush(key: str | None, buf: list[str]) -> None:
if key is None or not buf:
return
text = " ".join(buf).strip()
if key == "ingredients":
if key == "directions":
result["directions"] = "\n".join(buf)
elif key == "ingredients":
text = " ".join(buf)
result["ingredients"] = [i.strip() for i in text.split(",") if i.strip()]
else:
result[key] = text
result[key] = " ".join(buf).strip()
for line in response.splitlines():
lower = line.lower().strip()
for raw_line in response.splitlines():
line = self._strip_md(raw_line)
lower = line.lower()
if lower.startswith("title:"):
_flush(current_key, buffer)
current_key, buffer = "title", [line.split(":", 1)[1].strip()]
@ -200,12 +227,18 @@ class LLMRecipeGenerator:
current_key, buffer = "ingredients", [line.split(":", 1)[1].strip()]
elif lower.startswith("directions:"):
_flush(current_key, buffer)
current_key, buffer = "directions", [line.split(":", 1)[1].strip()]
rest = line.split(":", 1)[1].strip()
current_key, buffer = "directions", ([rest] if rest else [])
elif lower.startswith("notes:"):
_flush(current_key, buffer)
current_key, buffer = "notes", [line.split(":", 1)[1].strip()]
elif current_key and line.strip():
buffer.append(line.strip())
elif current_key is None and line.strip() and ":" not in line:
# Before any section header: a 2-10 word colon-free line is the dish name
words = line.split()
if 2 <= len(words) <= 10 and not result["title"]:
result["title"] = line.strip()
_flush(current_key, buffer)
return result
@ -230,17 +263,37 @@ class LLMRecipeGenerator:
parsed = self._parse_response(response)
raw_directions = parsed.get("directions", "")
directions_list: list[str] = (
[s.strip() for s in raw_directions.split(".") if s.strip()]
if isinstance(raw_directions, str)
else list(raw_directions)
)
if isinstance(raw_directions, str):
# Split on newlines; strip leading step numbers ("1.", "2.", "- ", "* ")
_step_prefix = re.compile(r"^\s*(?:\d+[.)]\s*|[-*]\s+)")
directions_list = [
_step_prefix.sub("", s).strip()
for s in raw_directions.splitlines()
if s.strip()
]
else:
directions_list = list(raw_directions)
raw_notes = parsed.get("notes", "")
notes_str: str = raw_notes if isinstance(raw_notes, str) else ""
all_ingredients: list[str] = list(parsed.get("ingredients", []))
pantry_set = {item.lower() for item in (req.pantry_items or [])}
missing = [i for i in all_ingredients if i.lower() not in pantry_set]
# Strip leading quantities/units (e.g. "2 cups rice" → "rice") before
# checking against pantry, since LLMs return formatted ingredient strings.
_qty_re = re.compile(
r"^\s*[\d½¼¾⅓⅔]+[\s/\-]*" # leading digits or fractions
r"(?:cup|cups|tbsp|tsp|tablespoon|teaspoon|oz|lb|lbs|g|kg|"
r"can|cans|clove|cloves|bunch|package|pkg|slice|slices|"
r"piece|pieces|pinch|dash|handful|head|heads|large|small|medium"
r")s?\b[,\s]*",
re.IGNORECASE,
)
missing = []
for ing in all_ingredients:
bare = _qty_re.sub("", ing).strip().lower()
if bare not in pantry_set and ing.lower() not in pantry_set:
missing.append(bare or ing)
suggestion = RecipeSuggestion(
id=0,

View file

@ -20,13 +20,353 @@ from typing import TYPE_CHECKING
if TYPE_CHECKING:
from app.db.store import Store
from app.models.schemas.recipe import GroceryLink, RecipeRequest, RecipeResult, RecipeSuggestion, SwapCandidate
from app.models.schemas.recipe import GroceryLink, NutritionPanel, RecipeRequest, RecipeResult, RecipeSuggestion, SwapCandidate
from app.services.recipe.assembly_recipes import match_assembly_templates
from app.services.recipe.element_classifier import ElementClassifier
from app.services.recipe.grocery_links import GroceryLinkBuilder
from app.services.recipe.substitution_engine import SubstitutionEngine
_LEFTOVER_DAILY_MAX_FREE = 5
# Function words that say nothing about which ingredient a phrase names;
# they are dropped before token-overlap scoring.
_SWAP_STOPWORDS = frozenset((
    "a", "an", "the",
    "of", "in", "for", "with",
    "and", "or",
    "to", "from", "at", "by", "as", "on",
))
# Maps product-label substrings to recipe-corpus canonical terms.
# Kept in sync with Store._FTS_SYNONYMS — both must agree on canonical names.
# Used to expand pantry_set so single-word recipe ingredients can match
# multi-word product names (e.g. "hamburger" satisfied by "burger patties").
#
# Keys are tested with a plain substring check (`pattern in <lowercased text>`),
# which is why stems such as "burger patt" cover both "patty" and "patties".
# NOTE(review): the substring test also means a short key can fire inside an
# unrelated word — e.g. "mince" is a substring of "minced garlic". Confirm
# whether that is acceptable before adding more short keys.
_PANTRY_LABEL_SYNONYMS: dict[str, str] = {
    # Ground beef and burger-style products (including plant-based burgers)
    "burger patt": "hamburger",
    "beef patt": "hamburger",
    "ground beef": "hamburger",
    "ground chuck": "hamburger",
    "ground round": "hamburger",
    "mince": "hamburger",
    "veggie burger": "hamburger",
    "beyond burger": "hamburger",
    "impossible burger": "hamburger",
    "plant burger": "hamburger",
    "chicken patt": "chicken patty",
    # Sausages and hot dogs
    "kielbasa": "sausage",
    "bratwurst": "sausage",
    "frankfurter": "hotdog",
    "wiener": "hotdog",
    # Chicken cuts and plant-based chicken substitutes
    "chicken breast": "chicken",
    "chicken thigh": "chicken",
    "chicken drumstick": "chicken",
    "chicken wing": "chicken",
    "rotisserie chicken": "chicken",
    "chicken tender": "chicken",
    "chicken strip": "chicken",
    "chicken piece": "chicken",
    "fake chicken": "chicken",
    "plant chicken": "chicken",
    "vegan chicken": "chicken",
    "daring": "chicken",
    "gardein chick": "chicken",
    "quorn chick": "chicken",
    "chick'n": "chicken",
    "chikn": "chicken",
    "not-chicken": "chicken",
    "no-chicken": "chicken",
    # Plant-based beef subs → broad "beef" (strips ≠ ground; texture matters)
    "not-beef": "beef",
    "no-beef": "beef",
    "plant beef": "beef",
    "vegan beef": "beef",
    # Plant-based pork subs
    "not-pork": "pork",
    "no-pork": "pork",
    "plant pork": "pork",
    "vegan pork": "pork",
    "omnipork": "pork",
    "omni pork": "pork",
    # Generic alt-meat catch-alls → broad "beef"
    "fake meat": "beef",
    "plant meat": "beef",
    "vegan meat": "beef",
    "meat-free": "beef",
    "meatless": "beef",
    # Pork cuts
    "pork chop": "pork",
    "pork loin": "pork",
    "pork tenderloin": "pork",
    # Tomato-based sauces
    "marinara": "tomato sauce",
    "pasta sauce": "tomato sauce",
    "spaghetti sauce": "tomato sauce",
    "pizza sauce": "tomato sauce",
    # Pasta shapes
    "macaroni": "pasta",
    "noodles": "pasta",
    "spaghetti": "pasta",
    "penne": "pasta",
    "fettuccine": "pasta",
    "rigatoni": "pasta",
    "linguine": "pasta",
    "rotini": "pasta",
    "farfalle": "pasta",
    # Cheese forms and varieties
    "shredded cheese": "cheese",
    "sliced cheese": "cheese",
    "american cheese": "cheese",
    "cheddar": "cheese",
    "mozzarella": "cheese",
    # Cream variants
    "heavy cream": "cream",
    "whipping cream": "cream",
    "half and half": "cream",
    # Buns and rolls
    "burger bun": "buns",
    "hamburger bun": "buns",
    "hot dog bun": "buns",
    "bread roll": "buns",
    "dinner roll": "buns",
    # Tortillas / wraps — assembly dishes (burritos, tacos, quesadillas)
    "flour tortilla": "tortillas",
    "corn tortilla": "tortillas",
    "tortilla wrap": "tortillas",
    "soft taco shell": "tortillas",
    # "soft taco shell" also contains "taco shell", so it adds both canonicals
    "taco shell": "taco shells",
    "pita bread": "pita",
    # Identity mapping: adds the bare word "flatbread" for longer labels containing it
    "flatbread": "flatbread",
    # Canned beans — extremely interchangeable in assembly dishes
    "black bean": "beans",
    "pinto bean": "beans",
    "kidney bean": "beans",
    "refried bean": "beans",
    "chickpea": "beans",
    "garbanzo": "beans",
    # Rice variants
    "white rice": "rice",
    "brown rice": "rice",
    "jasmine rice": "rice",
    "basmati rice": "rice",
    "instant rice": "rice",
    "microwavable rice": "rice",
    # Salsa / hot sauce
    "hot sauce": "salsa",
    "taco sauce": "salsa",
    "enchilada sauce": "salsa",
    # Sour cream / Greek yogurt — functional substitutes
    "greek yogurt": "sour cream",
    # Frozen/prepackaged meal token extraction — handled by individual token
    # fallback in _normalize_for_fts; these are the most common single-serve meal types
    "lean cuisine": "casserole",
    "stouffer": "casserole",
    "healthy choice": "casserole",
    "marie callender": "casserole",
}
# Matches a leading quantity/unit prefix in a recipe ingredient string,
# e.g. "2 cups flour" -> "flour", "1/2 c. ketchup" -> "ketchup",
#      "3 oz. butter" -> "butter", "1 1/2 cups sugar" -> "sugar".
# A unit word is required for a match; a bare count ("3 eggs") is handled
# separately inside _strip_quantity.
_QUANTITY_PREFIX = re.compile(
    r"^\s*(?:\d+\s+)?"                  # optional whole part of a mixed number ("1 " in "1 1/2")
    r"(?:\d+(?:[./]\d+)?\s*)?"          # optional number (1, 1/2, 2.5)
    r"(?:to\s+\d+\s*)?"                 # optional "to N" range
    r"(?:c\.|cup|cups|tbsp|tsp|oz|lb|lbs|g|kg|ml|l|"
    r"can|cans|pkg|pkg\.|package|slice|slices|clove|cloves|"
    r"small|medium|large|bunch|head|piece|pieces|"
    r"pinch|dash|handful|sprig|sprigs)"
    r"\.?\s*\b",                        # abbreviation period ("oz.", "tbsp.") is part of the unit
    re.IGNORECASE,
)

# Preparation-state words that modify an ingredient without changing what it is.
# Stripped wherever they occur so "melted butter" and "butter, melted" both -> "butter".
_PREP_STATES = re.compile(
    r"\b(melted|softened|cold|warm|hot|room.temperature|"
    r"diced|sliced|chopped|minced|grated|shredded|beaten|whipped|"
    r"cooked|raw|frozen|canned|dried|dehydrated|marinated|seasoned|"
    r"roasted|toasted|ground|crushed|pressed|peeled|seeded|pitted|"
    r"boneless|skinless|trimmed|halved|quartered|julienned|"
    r"thinly|finely|roughly|coarsely|freshly|lightly|"
    r"packed|heaping|level|sifted|divided|optional)\b",
    re.IGNORECASE,
)

# Trailing comma + a single word (e.g. "butter, melted")
_TRAILING_PREP = re.compile(r",\s*\w+$")

# Maps prep-state words to human-readable instruction templates.
# {ingredient} is replaced with the actual ingredient name.
# None means the state is passive (frozen, canned) — no note needed.
_PREP_INSTRUCTIONS: dict[str, str | None] = {
    "melted": "Melt the {ingredient} before starting.",
    "softened": "Let the {ingredient} soften to room temperature before using.",
    "room temperature": "Bring the {ingredient} to room temperature before using.",
    "beaten": "Beat the {ingredient} lightly before adding.",
    "whipped": "Whip the {ingredient} until soft peaks form.",
    "sifted": "Sift the {ingredient} before measuring.",
    "toasted": "Toast the {ingredient} in a dry pan until fragrant.",
    "roasted": "Roast the {ingredient} before using.",
    "pressed": "Press the {ingredient} to remove excess moisture.",
    "diced": "Dice the {ingredient} into small pieces.",
    "sliced": "Slice the {ingredient} thinly.",
    "chopped": "Chop the {ingredient} roughly.",
    "minced": "Mince the {ingredient} finely.",
    "grated": "Grate the {ingredient}.",
    "shredded": "Shred the {ingredient}.",
    "ground": "Grind the {ingredient}.",
    "crushed": "Crush the {ingredient}.",
    "peeled": "Peel the {ingredient} before use.",
    "seeded": "Remove seeds from the {ingredient}.",
    "pitted": "Pit the {ingredient} before use.",
    "trimmed": "Trim any excess from the {ingredient}.",
    "julienned": "Cut the {ingredient} into thin matchstick strips.",
    "cooked": "Pre-cook the {ingredient} before adding.",
    # Passive states — ingredient is used as-is, no prep note needed
    "cold": None,
    "warm": None,
    "hot": None,
    "raw": None,
    "frozen": None,
    "canned": None,
    "dried": None,
    "dehydrated": None,
    "marinated": None,
    "seasoned": None,
    "boneless": None,
    "skinless": None,
    "divided": None,
    "optional": None,
    "fresh": None,
    "freshly": None,
    "thinly": None,
    "finely": None,
    "roughly": None,
    "coarsely": None,
    "lightly": None,
    "packed": None,
    "heaping": None,
    "level": None,
}

# Finds every known prep-state word (actionable or passive) in an ingredient string
_PREP_STATE_SEARCH = re.compile(
    r"\b(" + "|".join(re.escape(k) for k in _PREP_INSTRUCTIONS) + r")\b",
    re.IGNORECASE,
)

# Passive states describing the form the product was bought in. When one of
# these comes first ("canned diced tomatoes"), later action words describe the
# product rather than work for the cook, so no prep note is produced.
_PREPACKAGED_STATES = frozenset({
    "frozen", "canned", "dried", "dehydrated", "marinated", "seasoned",
})


def _strip_quantity(ingredient: str) -> str:
    """Remove leading quantity/unit and preparation-state words from a recipe ingredient.

    e.g. "2 tbsp melted butter"   -> "butter"
         "butter, melted"         -> "butter"
         "1/4 cup flour, sifted"  -> "flour"
         "onion, finely diced"    -> "onion"

    Returns the original string unchanged when stripping would leave nothing.
    """
    stripped = _QUANTITY_PREFIX.sub("", ingredient).strip()
    # Strip any remaining leading number (e.g. "3 eggs" -> "eggs")
    stripped = re.sub(r"^\d+\s+", "", stripped)
    # Strip trailing ", prep_state"
    stripped = _TRAILING_PREP.sub("", stripped).strip()
    # Strip prep-state words (may be leading or embedded)
    stripped = _PREP_STATES.sub("", stripped)
    # Collapse the gaps left behind, then drop separators stranded at either
    # end ("onion, finely diced" would otherwise come out as "onion,").
    stripped = re.sub(r"\s{2,}", " ", stripped).strip(" ,;")
    return stripped or ingredient


def _prep_note_for(ingredient: str) -> str | None:
    """Return a human-readable prep instruction for this ingredient string, or None.

    The first *actionable* prep state decides the note. Passive modifiers
    (adverbs such as "finely", or states such as "cold") are skipped, while a
    leading prepackaged state ("frozen", "canned", ...) suppresses the note.

    e.g. "2 tbsp melted butter"   -> "Melt the butter before starting."
         "onion, diced"           -> "Dice the onion into small pieces."
         "finely chopped onion"   -> "Chop the onion roughly."
         "frozen peas"            -> None  (passive state, no action needed)
    """
    for match in _PREP_STATE_SEARCH.finditer(ingredient):
        state = match.group(1).lower()
        if state in _PREPACKAGED_STATES:
            return None
        template = _PREP_INSTRUCTIONS.get(state)
        if template:
            # Use the stripped ingredient name as the subject
            return template.format(ingredient=_strip_quantity(ingredient))
    return None
def _expand_pantry_set(pantry_items: list[str]) -> set[str]:
    """Build the pantry lookup set, including canonical recipe-corpus synonyms.

    Every pantry item is lowercased and trimmed, then added as-is. In addition,
    each key of _PANTRY_LABEL_SYNONYMS that occurs as a substring of the item
    contributes its canonical term, so single-word recipe ingredients
    ("hamburger", "chicken") can be satisfied by product-label entries
    ("burger patties", "rotisserie chicken").
    """
    result: set[str] = set()
    for raw_label in pantry_items:
        label = raw_label.lower().strip()
        result.add(label)
        result.update(
            canonical
            for stem, canonical in _PANTRY_LABEL_SYNONYMS.items()
            if stem in label
        )
    return result
def _ingredient_in_pantry(ingredient: str, pantry_set: set[str]) -> bool:
    """Return True if the recipe ingredient is satisfied by the pantry.

    Checks three layers in order:
    1. Exact match, tried on both the fully stripped name and the name with
       only its quantity removed. Prep stripping deletes words such as
       "ground", "hot" or "frozen", so without the second form a pantry entry
       "ground beef" could never satisfy the recipe ingredient "ground beef".
    2. Synonym lookup: a _PANTRY_LABEL_SYNONYMS key occurs in the ingredient
       and its canonical term is in pantry_set
       (handles "ground beef" matched by "burger patties" via shared canonical).
    3. Token subset: all content tokens of the stripped ingredient appear in
       the pantry (handles "diced onions" when "onions" is in pantry).
    """
    clean = _strip_quantity(ingredient).lower()
    # Quantity/unit removed, prep-state words kept: "1 lb ground beef" -> "ground beef"
    labelled = _QUANTITY_PREFIX.sub("", ingredient).strip()
    labelled = re.sub(r"^\d+\s+", "", labelled).lower()

    if clean in pantry_set or (labelled and labelled in pantry_set):
        return True

    # Check if this recipe ingredient maps to a canonical that's in pantry.
    # Only multi-word keys are tried against the un-stripped text: they are the
    # ones prep stripping can break apart ("ground beef" -> "beef"), whereas a
    # single-word key would start matching prep words themselves
    # ("mince" inside "minced garlic").
    for pattern, canonical in _PANTRY_LABEL_SYNONYMS.items():
        if canonical not in pantry_set:
            continue
        if pattern in clean or (" " in pattern and pattern in labelled):
            return True

    # Every remaining content token must itself be a pantry entry
    tokens = [t for t in clean.split() if t not in _SWAP_STOPWORDS and len(t) > 2]
    return bool(tokens) and all(t in pantry_set for t in tokens)
def _content_tokens(text: str) -> frozenset[str]:
    """Return the lowercased words of *text* that carry ingredient meaning.

    Stopwords (_SWAP_STOPWORDS) and one-character words are discarded.
    """
    kept: list[str] = []
    for word in text.lower().split():
        if len(word) > 1 and word not in _SWAP_STOPWORDS:
            kept.append(word)
    return frozenset(kept)
def _pantry_creative_swap(required: str, pantry_items: set[str]) -> str | None:
    """Return a pantry item that's a plausible creative substitute, or None.

    Requires at least 2 shared content tokens AND at least 50% bidirectional
    overlap so that single-word differences (cream-of-mushroom vs
    cream-of-potato) qualify while single-word ingredients (butter, flour)
    don't accidentally match supersets (peanut butter, bread flour).

    The highest-scoring item wins. Equal scores are resolved by picking the
    lexicographically smallest item, so the result does not depend on set
    iteration order (which varies between processes under string hash
    randomisation).
    """
    req_tokens = _content_tokens(required)
    if len(req_tokens) < 2:
        return None  # single-word ingredients must already be in pantry_set

    required_lower = required.lower()
    best: str | None = None
    best_score = 0.0
    for item in pantry_items:
        if item.lower() == required_lower:
            continue
        pan_tokens = _content_tokens(item)
        if not pan_tokens:
            continue
        overlap = len(req_tokens & pan_tokens)
        if overlap < 2:
            continue
        # Bidirectional: the overlap must be a large share of BOTH token sets
        score = min(overlap / len(req_tokens), overlap / len(pan_tokens))
        if score < 0.5:
            continue
        is_tie_winner = score == best_score and best is not None and item < best
        if score > best_score or is_tie_winner:
            best_score = score
            best = item
    return best
# Method complexity classification patterns
_EASY_METHODS = re.compile(
r"\b(microwave|mix|stir|blend|toast|assemble|heat)\b", re.IGNORECASE
@ -95,7 +435,7 @@ class RecipeEngine:
profiles = self._classifier.classify_batch(req.pantry_items)
gaps = self._classifier.identify_gaps(profiles)
pantry_set = {item.lower().strip() for item in req.pantry_items}
pantry_set = _expand_pantry_set(req.pantry_items)
if req.level >= 3:
from app.services.recipe.llm_recipe import LLMRecipeGenerator
@ -103,7 +443,17 @@ class RecipeEngine:
return gen.generate(req, profiles, gaps)
# Level 1 & 2: deterministic path
rows = self._store.search_recipes_by_ingredients(req.pantry_items, limit=20)
nf = req.nutrition_filters
rows = self._store.search_recipes_by_ingredients(
req.pantry_items,
limit=20,
category=req.category or None,
max_calories=nf.max_calories,
max_sugar_g=nf.max_sugar_g,
max_carbs_g=nf.max_carbs_g,
max_sodium_mg=nf.max_sodium_mg,
excluded_ids=req.excluded_ids or [],
)
suggestions = []
for row in rows:
@ -114,10 +464,31 @@ class RecipeEngine:
except Exception:
ingredient_names = []
# Compute missing ingredients
missing = [n for n in ingredient_names if n.lower() not in pantry_set]
# Compute missing ingredients, detecting pantry coverage first.
# When covered, collect any prep-state annotations (e.g. "melted butter"
# → note "Melt the butter before starting.") to surface separately.
swap_candidates: list[SwapCandidate] = []
missing: list[str] = []
prep_note_set: set[str] = set()
for n in ingredient_names:
if _ingredient_in_pantry(n, pantry_set):
note = _prep_note_for(n)
if note:
prep_note_set.add(note)
continue
swap_item = _pantry_creative_swap(n, pantry_set)
if swap_item:
swap_candidates.append(SwapCandidate(
original_name=n,
substitute_name=swap_item,
constraint_label="pantry_swap",
explanation=f"You have {swap_item} — use it in place of {n}.",
compensation_hints=[],
))
else:
missing.append(n)
# Filter by max_missing
# Filter by max_missing (pantry swaps don't count as missing)
if req.max_missing is not None and len(missing) > req.max_missing:
continue
@ -133,8 +504,7 @@ class RecipeEngine:
if complexity == "involved":
continue
# Build swap candidates for Level 2
swap_candidates: list[SwapCandidate] = []
# Level 2: also add dietary constraint swaps from substitution_pairs
if req.level == 2 and req.constraints:
for ing in ingredient_names:
for constraint in req.constraints:
@ -155,6 +525,22 @@ class RecipeEngine:
except Exception:
coverage_raw = {}
servings = row.get("servings") or None
nutrition = NutritionPanel(
calories=row.get("calories"),
fat_g=row.get("fat_g"),
protein_g=row.get("protein_g"),
carbs_g=row.get("carbs_g"),
fiber_g=row.get("fiber_g"),
sugar_g=row.get("sugar_g"),
sodium_mg=row.get("sodium_mg"),
servings=servings,
estimated=bool(row.get("nutrition_estimated", 0)),
)
has_nutrition = any(
v is not None
for v in (nutrition.calories, nutrition.sugar_g, nutrition.carbs_g)
)
suggestions.append(RecipeSuggestion(
id=row["id"],
title=row["title"],
@ -162,9 +548,20 @@ class RecipeEngine:
element_coverage=coverage_raw,
swap_candidates=swap_candidates,
missing_ingredients=missing,
prep_notes=sorted(prep_note_set),
level=req.level,
nutrition=nutrition if has_nutrition else None,
))
# Prepend assembly-dish templates (burrito, stir fry, omelette, etc.)
# These fire regardless of corpus coverage — any pantry can make a burrito.
assembly = match_assembly_templates(
pantry_items=req.pantry_items,
pantry_set=pantry_set,
excluded_ids=req.excluded_ids or [],
)
suggestions = assembly + suggestions
# Build grocery list — deduplicated union of all missing ingredients
seen: set[str] = set()
grocery_list: list[str] = []

View file

@ -0,0 +1,134 @@
#!/usr/bin/env python3
"""
Backfill texture_profile in ingredient_profiles from existing macro data.
Texture categories and their macro signatures (macros in g/100g, calories per 100g):
  fatty    - fat > 60                             (oils, lard, pure butter)
  creamy   - fat 15-60                            (cream, cheese, fatty meats, nut butter)
  starchy  - carbs > 40, fat < 10                 (flour, oats, rice, bread, potatoes, legumes)
  firm     - protein > 7, fat < 12, carbs < 20    (lean meats, fish, firm tofu)
  fibrous  - fiber > 4, carbs < 40                (brassicas, leafy greens, whole grains)
  tender   - protein 2-15, fat < 10, carbs < 40   (soft veg, eggs, soft tofu, cooked beans)
  liquid   - calories < 25, fat < 1, protein < 3  (broth, juice, dilute sauces)
  neutral  - fallthrough default

Rules are applied in priority order: fatty → creamy → starchy → firm →
fibrous → tender → liquid → neutral. Starchy is checked before firm so that
foods high in both protein and carbs (oats, legumes) classify as starchy.
Run:
python scripts/backfill_texture_profiles.py [path/to/kiwi.db]
Or inside the container:
docker exec kiwi-cloud-api-1 python /app/kiwi/scripts/backfill_texture_profiles.py
"""
from __future__ import annotations
import sqlite3
import sys
from pathlib import Path
# Default DB paths to try
_DEFAULT_PATHS = [
"/devl/kiwi-cloud-data/local-dev/kiwi.db",
"/devl/kiwi-data/kiwi.db",
]
BATCH_SIZE = 5_000
def _classify(fat: float | None, protein: float | None, carbs: float | None,
              fiber: float | None, calories: float | None) -> str:
    """Derive a texture label from per-100g macro values.

    Each argument may be None (missing column value); a missing value is
    treated as 0. Rules are tried in priority order and the first match wins:
    fatty, creamy, starchy, firm, fibrous, tender, liquid, then "neutral".

    A row with no macro data at all is reported as "neutral": with every value
    defaulted to 0 it would otherwise satisfy the "liquid" rule purely because
    the data is absent.
    """
    if all(value is None for value in (fat, protein, carbs, fiber, calories)):
        return "neutral"

    # Cap runaway values — data quality issue in some branded entries
    fat = min(fat or 0.0, 100.0)
    protein = min(protein or 0.0, 100.0)
    carbs = min(carbs or 0.0, 100.0)
    fiber = min(fiber or 0.0, 50.0)
    calories = min(calories or 0.0, 900.0)

    if fat > 60:
        return "fatty"
    if fat > 15:
        return "creamy"
    # Starchy before firm: oats/legumes have high protein AND high carbs — carbs win
    if carbs > 40 and fat < 10:
        return "starchy"
    # Firm: lean proteins with low carbs (meats, fish, hard tofu)
    # Lower protein threshold (>7) catches tofu (9%) and similar plant proteins
    if protein > 7 and fat < 12 and carbs < 20:
        return "firm"
    if fiber > 4 and carbs < 40:
        return "fibrous"
    if 2 < protein <= 15 and fat < 10 and carbs < 40:
        return "tender"
    if calories < 25 and fat < 1 and protein < 3:
        return "liquid"
    return "neutral"
def backfill(db_path: str) -> None:
    """Recompute texture_profile for every row of ingredient_profiles.

    Rows are read in batches of BATCH_SIZE, classified with _classify, and
    written back with one commit per batch. Progress and the final texture
    distribution are printed to stdout.

    Batches are paged by id (keyset pagination) rather than LIMIT/OFFSET:
    without an ORDER BY the row order of successive queries is not guaranteed,
    so OFFSET paging over a table that is being updated can skip or repeat
    rows. The connection is closed even if a batch fails.
    """
    select_sql = (
        "SELECT id, fat_pct, protein_pct, carbs_g_per_100g,"
        " fiber_g_per_100g, calories_per_100g"
        " FROM ingredient_profiles"
    )

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row

        total = conn.execute("SELECT COUNT(*) FROM ingredient_profiles").fetchone()[0]
        print(f"Total rows: {total:,}")

        updated = 0
        last_id = None  # id of the last row handled; None until the first batch
        counts: dict[str, int] = {}

        while True:
            if last_id is None:
                rows = conn.execute(
                    select_sql + " ORDER BY id LIMIT ?",
                    (BATCH_SIZE,),
                ).fetchall()
            else:
                rows = conn.execute(
                    select_sql + " WHERE id > ? ORDER BY id LIMIT ?",
                    (last_id, BATCH_SIZE),
                ).fetchall()
            if not rows:
                break

            batch: list[tuple[str, int]] = []
            for row in rows:
                texture = _classify(
                    row["fat_pct"],
                    row["protein_pct"],
                    row["carbs_g_per_100g"],
                    row["fiber_g_per_100g"],
                    row["calories_per_100g"],
                )
                counts[texture] = counts.get(texture, 0) + 1
                batch.append((texture, row["id"]))

            conn.executemany(
                "UPDATE ingredient_profiles SET texture_profile = ? WHERE id = ?",
                batch,
            )
            conn.commit()
            updated += len(batch)
            last_id = rows[-1]["id"]
            print(f"  {updated:,} / {total:,} updated...", end="\r")

        print(f"\nDone. {updated:,} rows updated.\n")
        print("Texture distribution:")
        # counts is empty when nothing was updated, so the division below is safe
        for texture, count in sorted(counts.items(), key=lambda x: -x[1]):
            pct = count / updated * 100
            print(f"  {texture:10s} {count:8,} ({pct:.1f}%)")
    finally:
        conn.close()
if __name__ == "__main__":
    # An explicit CLI argument wins; otherwise use the first default path that exists.
    cli_args = sys.argv[1:]
    if cli_args:
        path = cli_args[0]
    else:
        path = None
        for candidate in _DEFAULT_PATHS:
            if Path(candidate).exists():
                path = candidate
                break

    if not path:
        print(f"No DB found. Pass path as argument or create one of: {_DEFAULT_PATHS}")
        sys.exit(1)

    print(f"Backfilling texture profiles in: {path}")
    backfill(path)

View file

@ -83,9 +83,30 @@ def build(db_path: Path, usda_fdc_path: Path, usda_branded_path: Path) -> None:
"Fiber, total dietary": "fiber_pct",
"Sodium, Na": "sodium_mg_per_100g",
"Water": "moisture_pct",
"Energy": "calories_per_100g",
}
df = df_fdc.rename(columns={k: v for k, v in fdc_col_map.items() if k in df_fdc.columns})
# Build a sugar lookup from the branded parquet (keyed by normalized name).
# usda_branded has SUGARS, TOTAL (G) for processed/packaged foods.
branded_col_map = {
"FOOD_NAME": "name",
"SUGARS, TOTAL (G)": "sugar_g_per_100g",
}
df_branded_slim = df_branded.rename(
columns={k: v for k, v in branded_col_map.items() if k in df_branded.columns}
)[list(set(branded_col_map.values()) & set(df_branded.rename(columns=branded_col_map).columns))]
sugar_lookup: dict[str, float] = {}
for _, brow in df_branded_slim.iterrows():
bname = normalize_name(str(brow.get("name", "")))
val = brow.get("sugar_g_per_100g")
try:
fval = float(val) # type: ignore[arg-type]
if fval > 0 and bname not in sugar_lookup:
sugar_lookup[bname] = fval
except (TypeError, ValueError):
pass
inserted = 0
for _, row in df.iterrows():
name = normalize_name(str(row.get("name", "")))
@ -98,25 +119,40 @@ def build(db_path: Path, usda_fdc_path: Path, usda_branded_path: Path) -> None:
"moisture_pct": float(row.get("moisture_pct") or 0),
"sodium_mg_per_100g": float(row.get("sodium_mg_per_100g") or 0),
"starch_pct": 0.0,
"carbs_g_per_100g": float(row.get("carb_pct") or 0),
"fiber_g_per_100g": float(row.get("fiber_pct") or 0),
"calories_per_100g": float(row.get("calories_per_100g") or 0),
"sugar_g_per_100g": sugar_lookup.get(name, 0.0),
}
r["binding_score"] = derive_binding_score(r)
r["elements"] = derive_elements(r)
r["is_fermented"] = int(any(k in name for k in _FERMENTED_KEYWORDS))
try:
# Insert new profile or update macro columns on existing one.
conn.execute("""
INSERT OR IGNORE INTO ingredient_profiles
INSERT INTO ingredient_profiles
(name, elements, fat_pct, fat_saturated_pct, moisture_pct,
protein_pct, starch_pct, binding_score, sodium_mg_per_100g,
is_fermented, source)
VALUES (?,?,?,?,?,?,?,?,?,?,?)
is_fermented,
carbs_g_per_100g, fiber_g_per_100g, calories_per_100g, sugar_g_per_100g,
source)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
ON CONFLICT(name) DO UPDATE SET
carbs_g_per_100g = excluded.carbs_g_per_100g,
fiber_g_per_100g = excluded.fiber_g_per_100g,
calories_per_100g = excluded.calories_per_100g,
sugar_g_per_100g = excluded.sugar_g_per_100g
""", (
r["name"], json.dumps(r["elements"]),
r["fat_pct"], 0.0, r["moisture_pct"],
r["protein_pct"], r["starch_pct"], r["binding_score"],
r["sodium_mg_per_100g"], r["is_fermented"], "usda_fdc",
r["sodium_mg_per_100g"], r["is_fermented"],
r["carbs_g_per_100g"], r["fiber_g_per_100g"],
r["calories_per_100g"], r["sugar_g_per_100g"],
"usda_fdc",
))
inserted += conn.execute("SELECT changes()").fetchone()[0]
inserted += 1
except Exception:
continue

View file

@ -28,6 +28,30 @@ _TRAILING_QUALIFIER = re.compile(
_QUOTED = re.compile(r'"([^"]*)"')
def _float_or_none(val: object) -> float | None:
"""Return float > 0, or None for missing / zero values."""
try:
v = float(val) # type: ignore[arg-type]
return v if v > 0 else None
except (TypeError, ValueError):
return None
def _safe_list(val: object) -> list:
"""Convert a value to a list, handling NaN/float/None gracefully."""
if val is None:
return []
try:
import math
if isinstance(val, float) and math.isnan(val):
return []
except Exception:
pass
if isinstance(val, list):
return val
return []
def _parse_r_vector(s: str) -> list[str]:
    """Extract the double-quoted items of an R character vector.

    Example: ``c("a", "b")`` -> ``["a", "b"]``. Only text matched by the
    module-level ``_QUOTED`` pattern is returned, so a string with no
    double-quoted items yields an empty list.
    """
    return [match.group(1) for match in _QUOTED.finditer(s)]
@ -93,14 +117,14 @@ def _row_to_fields(row: pd.Series) -> tuple[str, str, list[str], list[str]]:
if isinstance(raw_parts, str):
parsed = _parse_r_vector(raw_parts)
raw_parts = parsed if parsed else [raw_parts]
raw_ingredients = [str(i) for i in (raw_parts or [])]
raw_ingredients = [str(i) for i in (_safe_list(raw_parts))]
raw_dirs = row.get("RecipeInstructions", [])
if isinstance(raw_dirs, str):
parsed_dirs = _parse_r_vector(raw_dirs)
directions = parsed_dirs if parsed_dirs else [raw_dirs]
else:
directions = [str(d) for d in (raw_dirs or [])]
directions = [str(d) for d in (_safe_list(raw_dirs))]
title = str(row.get("Name", ""))[:500]
external_id = str(row.get("RecipeId", ""))
@ -144,12 +168,18 @@ def build(db_path: Path, recipes_path: Path, batch_size: int = 10000) -> None:
json.dumps(ingredient_names),
json.dumps(directions),
str(row.get("RecipeCategory", "") or ""),
json.dumps(list(row.get("Keywords", []) or [])),
float(row.get("Calories") or 0) or None,
float(row.get("FatContent") or 0) or None,
float(row.get("ProteinContent") or 0) or None,
float(row.get("SodiumContent") or 0) or None,
json.dumps(_safe_list(row.get("Keywords"))),
_float_or_none(row.get("Calories")),
_float_or_none(row.get("FatContent")),
_float_or_none(row.get("ProteinContent")),
_float_or_none(row.get("SodiumContent")),
json.dumps(coverage),
# New macro columns (migration 014)
_float_or_none(row.get("SugarContent")),
_float_or_none(row.get("CarbohydrateContent")),
_float_or_none(row.get("FiberContent")),
_float_or_none(row.get("RecipeServings")),
0, # nutrition_estimated — food.com direct data is authoritative
))
if len(batch) >= batch_size:
@ -157,8 +187,10 @@ def build(db_path: Path, recipes_path: Path, batch_size: int = 10000) -> None:
conn.executemany("""
INSERT OR REPLACE INTO recipes
(external_id, title, ingredients, ingredient_names, directions,
category, keywords, calories, fat_g, protein_g, sodium_mg, element_coverage)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
category, keywords, calories, fat_g, protein_g, sodium_mg,
element_coverage,
sugar_g, carbs_g, fiber_g, servings, nutrition_estimated)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
""", batch)
conn.commit()
inserted += conn.total_changes - before
@ -170,8 +202,10 @@ def build(db_path: Path, recipes_path: Path, batch_size: int = 10000) -> None:
conn.executemany("""
INSERT OR REPLACE INTO recipes
(external_id, title, ingredients, ingredient_names, directions,
category, keywords, calories, fat_g, protein_g, sodium_mg, element_coverage)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
category, keywords, calories, fat_g, protein_g, sodium_mg,
element_coverage,
sugar_g, carbs_g, fiber_g, servings, nutrition_estimated)
VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
""", batch)
conn.commit()
inserted += conn.total_changes - before

View file

@ -0,0 +1,109 @@
"""
Estimate macro nutrition for recipes that have no direct data.
For each recipe where sugar_g / carbs_g / fiber_g / calories are NULL,
look up the matched ingredient_profiles and average their per-100g values,
then scale by a rough 150g-per-ingredient portion assumption.
Mark such rows with nutrition_estimated=1 so the UI can display a disclaimer.
Recipes with food.com direct data (nutrition_estimated=0 and values set) are untouched.
Usage:
conda run -n job-seeker python scripts/pipeline/estimate_recipe_nutrition.py \
--db /path/to/kiwi.db
"""
from __future__ import annotations
import argparse
import json
import sqlite3
from pathlib import Path
# Rough grams per ingredient when no quantity data is available.
_GRAMS_PER_INGREDIENT = 150.0


def estimate(db_path: Path) -> None:
    """Fill in estimated macros for recipes lacking sugar/carbs/fiber data.

    For every recipe whose ``sugar_g``, ``carbs_g`` and ``fiber_g`` are all
    NULL, the ingredient names stored in ``ingredient_names`` (a JSON list)
    are looked up in ``ingredient_profiles``. The per-100g values of the
    matched profiles are summed and scaled by ``_GRAMS_PER_INGREDIENT / 100``
    (i.e. each matched ingredient is assumed to weigh
    ``_GRAMS_PER_INGREDIENT`` grams). Updated rows get
    ``nutrition_estimated = 1``.

    An existing non-NULL ``calories`` value is preserved; the estimate is
    only written when ``calories`` is NULL. Totals that round to zero are
    stored as NULL. Recipes with no matching profile are left untouched.

    Args:
        db_path: Path of the SQLite database file to update in place.
    """
    # COALESCE keeps a calories value that is already present: the selection
    # below only checks sugar/carbs/fiber, so a selected row may still carry
    # directly sourced calories that must not be replaced by a rough estimate.
    update_sql = (
        "UPDATE recipes SET calories=COALESCE(calories, ?), carbs_g=?, fiber_g=?, "
        "sugar_g=?, nutrition_estimated=1 WHERE id=?"
    )
    portion_factor = _GRAMS_PER_INGREDIENT / 100.0
    updated = 0

    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA journal_mode=WAL")

        # Load ingredient_profiles macro data into memory for fast lookup.
        profile_macros: dict[str, dict[str, float]] = {}
        for name, cal, carbs, fiber, sugar in conn.execute(
            "SELECT name, calories_per_100g, carbs_g_per_100g, fiber_g_per_100g, sugar_g_per_100g "
            "FROM ingredient_profiles"
        ):
            if name:
                profile_macros[name] = {
                    "calories": float(cal or 0),
                    "carbs": float(carbs or 0),
                    "fiber": float(fiber or 0),
                    "sugar": float(sugar or 0),
                }

        # Select recipes with no direct macro data.
        rows = conn.execute(
            "SELECT id, ingredient_names FROM recipes "
            "WHERE sugar_g IS NULL AND carbs_g IS NULL AND fiber_g IS NULL"
        ).fetchall()

        def flush(pending: list[tuple]) -> int:
            """Write one batch of estimates and commit; return rows written."""
            conn.executemany(update_sql, pending)
            conn.commit()
            return len(pending)

        batch: list[tuple] = []
        for recipe_id, ingredient_names_json in rows:
            try:
                names = json.loads(ingredient_names_json or "[]")
            except (TypeError, ValueError):  # JSONDecodeError is a ValueError
                names = []
            # Valid JSON that is not a list (e.g. "null") has no usable names.
            if not isinstance(names, list):
                names = []

            matched = [
                profile_macros[n]
                for n in names
                if isinstance(n, str) and n in profile_macros
            ]
            if not matched:
                continue

            # Each matched ingredient contributes portion_factor times its
            # per-100g value, so the recipe total is simply the scaled sum.
            total_cal = sum(m["calories"] for m in matched) * portion_factor
            total_carbs = sum(m["carbs"] for m in matched) * portion_factor
            total_fiber = sum(m["fiber"] for m in matched) * portion_factor
            total_sugar = sum(m["sugar"] for m in matched) * portion_factor

            # "or None" stores a zero total as NULL rather than 0.
            batch.append((
                round(total_cal, 1) or None,
                round(total_carbs, 2) or None,
                round(total_fiber, 2) or None,
                round(total_sugar, 2) or None,
                recipe_id,
            ))

            if len(batch) >= 5000:
                updated += flush(batch)
                print(f" {updated} recipes estimated...")
                batch = []

        if batch:
            updated += flush(batch)
    finally:
        # Release the database handle even when a query fails part-way.
        conn.close()

    print(f"Total: {updated} recipes received estimated nutrition")
if __name__ == "__main__":
    # Command-line entry point: --db is mandatory and is parsed into a Path.
    cli = argparse.ArgumentParser()
    cli.add_argument("--db", required=True, type=Path)
    estimate(cli.parse_args().db)