chore: commit in-progress work -- tag inferrer, imitate endpoint, hall-of-chaos easter egg, migration files, Dockerfile .env defense
- app/services/recipe/tag_inferrer.py: infer tags from recipe ingredient text - app/db/migrations/022_recipe_generic_flag.sql, 029_inferred_tags.sql: schema migrations - app/api/endpoints/imitate.py: recipe imitation endpoint stub - app/api/endpoints/community.py: hall-of-chaos easter egg endpoint - scripts/pipeline/infer_recipe_tags.py, backfill_keywords.py: pipeline scripts - scripts/pipeline/build_recipe_index.py: extended index builder - Dockerfile: explicit .env removal as defense-in-depth - frontend/src/components/FeedbackButton.vue: feedback UX improvements - frontend/src/style.css: minor style tweaks - app/cloud_session.py: cloud session improvements - tests/api/test_community_endpoints.py: additional test coverage
This commit is contained in:
parent
fe18fb48c0
commit
144d1dc6c4
13 changed files with 1108 additions and 7 deletions
|
|
@ -16,6 +16,12 @@ COPY kiwi/environment.yml .
|
|||
# Create the "kiwi" conda environment from the spec copied in the step above.
RUN conda env create -f environment.yml

COPY kiwi/ ./kiwi/

# Remove gitignored config files that may exist locally — defense-in-depth.
# The parent .dockerignore should exclude these, but an explicit rm guarantees
# they never end up in the cloud image regardless of .dockerignore placement.
RUN rm -f /app/kiwi/.env

# Install cf-core into the kiwi env BEFORE installing kiwi (kiwi lists it as a dep)
RUN conda run -n kiwi pip install --no-cache-dir -e /app/circuitforge-core

WORKDIR /app/kiwi
|
||||
|
|
|
|||
|
|
@ -120,6 +120,21 @@ async def local_feed():
|
|||
return [_post_to_dict(p) for p in posts]
|
||||
|
||||
|
||||
@router.get("/hall-of-chaos")
async def hall_of_chaos():
    """Hidden easter egg endpoint -- returns the 10 most chaotic bloopers.

    Responds with ``{"posts": [...], "chaos_level": N}`` where N is simply
    the number of blooper posts returned (0-10).
    """
    community = _get_community_store()
    if community is None:
        # No community store configured: zero chaos, nothing to show.
        return {"posts": [], "chaos_level": 0}

    # Store access is synchronous; run it off the event loop.
    bloopers = await asyncio.to_thread(
        community.list_posts, limit=10, post_type="recipe_blooper"
    )
    serialized = [_post_to_dict(b) for b in bloopers]
    return {"posts": serialized, "chaos_level": len(serialized)}
|
||||
|
||||
|
||||
# Validation constants for community post submissions.
# NOTE(review): presumably enforced by the post-creation endpoint in this
# module — verify against the handlers that reference them.
_VALID_POST_TYPES = {"plan", "recipe_success", "recipe_blooper"}
_MAX_TITLE_LEN = 200
_MAX_TEXT_LEN = 2000
|
||||
|
|
|
|||
185
app/api/endpoints/imitate.py
Normal file
185
app/api/endpoints/imitate.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
"""Kiwi — /api/v1/imitate/samples endpoint for Avocet Imitate tab.
|
||||
|
||||
Returns the actual assembled prompt Kiwi sends to its LLM for recipe generation,
|
||||
including the full pantry context (expiry-first ordering), dietary constraints
|
||||
(from user_settings if present), and the Level 3 format instructions.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from app.cloud_session import get_session, CloudUser
|
||||
from app.db.store import Store
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Plain-text answer skeleton appended to every Level 3 (structured) prompt.
# Includes the open-ended "3. <continue for each step>" line.
_LEVEL3_FORMAT = [
    "",
    "Reply using EXACTLY this plain-text format — no markdown, no bold, no extra commentary:",
    "Title: <name of the dish>",
    "Ingredients: <comma-separated list>",
    "Directions:",
    "1. <first step>",
    "2. <second step>",
    "3. <continue for each step>",
    "Notes: <optional tips>",
]

# Level 4 (wildcard) skeleton — same shape, minus the "continue" line and the
# "no extra commentary" clause.
_LEVEL4_FORMAT = [
    "",
    "Reply using EXACTLY this plain-text format — no markdown, no bold:",
    "Title: <name of the dish>",
    "Ingredients: <comma-separated list>",
    "Directions:",
    "1. <first step>",
    "2. <second step>",
    "Notes: <optional tips>",
]
|
||||
|
||||
|
||||
def _read_user_settings(store: Store) -> dict:
    """Read all key/value pairs from the user_settings table.

    Best-effort: any failure (missing table, unreadable DB) yields an empty
    dict rather than propagating.
    """
    try:
        cursor = store.conn.execute("SELECT key, value FROM user_settings")
        return {row["key"]: row["value"] for row in cursor.fetchall()}
    except Exception:
        # A user DB without the table simply has no settings yet.
        return {}
|
||||
|
||||
|
||||
def _build_recipe_prompt(
    pantry_names: list[str],
    expiring_names: list[str],
    constraints: list[str],
    allergies: list[str],
    level: int = 3,
) -> str:
    """Assemble the recipe generation prompt matching Kiwi's Level 3/4 format.

    Expiring items come first in the ingredient list (deduplicated against
    the rest of the pantry) so the model prioritizes them. ``level`` 4 builds
    the wildcard "surprise me" prompt; any other level builds the structured
    Level 3 prompt.
    """
    # Expiring items first, then remaining pantry items (deduped).
    expiring_lookup = set(expiring_names)
    ordered = list(expiring_names)
    ordered.extend(name for name in pantry_names if name not in expiring_lookup)
    if not ordered:
        ordered = pantry_names
    ingredient_csv = ", ".join(ordered)

    if level == 4:
        prompt = [
            "Surprise me with a creative, unexpected recipe.",
            "Only use ingredients that make culinary sense together. "
            "Do not force flavoured/sweetened items (vanilla yoghurt, flavoured syrups, jam) into savoury dishes.",
            f"Ingredients available: {ingredient_csv}",
        ]
        if constraints:
            prompt.append(f"Constraints: {', '.join(constraints)}")
        if allergies:
            prompt.append(f"Must NOT contain: {', '.join(allergies)}")
        prompt.append("Treat any mystery ingredient as a wildcard — use your imagination.")
        prompt.extend(_LEVEL4_FORMAT)
        return "\n".join(prompt)

    # Level 3 (default): structured prompt requiring exact pantry-item names.
    prompt = [
        "You are a creative chef. Generate a recipe using the ingredients below.",
        "IMPORTANT: When you use a pantry item, list it in Ingredients using its exact name "
        "from the pantry list. Do not add adjectives, quantities, or cooking states "
        "(e.g. use 'butter', not 'unsalted butter' or '2 tbsp butter').",
        "IMPORTANT: Only use pantry items that make culinary sense for the dish. "
        "Do NOT force flavoured/sweetened items (vanilla yoghurt, fruit yoghurt, jam, "
        "dessert sauces, flavoured syrups) into savoury dishes.",
        "IMPORTANT: Do not default to the same ingredient repeatedly across dishes. "
        "If a pantry item does not genuinely improve this specific dish, leave it out.",
        "",
        f"Pantry items: {ingredient_csv}",
    ]
    if expiring_names:
        prompt.append(
            f"Priority — use these soon (expiring): {', '.join(expiring_names)}"
        )
    if constraints:
        prompt.append(f"Dietary constraints: {', '.join(constraints)}")
    if allergies:
        prompt.append(f"IMPORTANT — must NOT contain: {', '.join(allergies)}")
    prompt.extend(_LEVEL3_FORMAT)
    return "\n".join(prompt)
|
||||
|
||||
|
||||
@router.get("/samples")
async def imitate_samples(
    limit: int = 5,
    level: int = 3,
    session: CloudUser = Depends(get_session),
):
    """Return assembled recipe generation prompts for Avocet's Imitate tab.

    Each sample includes:
      system_prompt   empty (Kiwi uses no system context)
      input_text      full Level 3/4 prompt with pantry items, expiring items,
                      dietary constraints, and format instructions
      output_text     empty (no prior LLM output stored per-request)

    level: 3 (structured with element biasing context) or 4 (wildcard creative)
    limit: max number of distinct prompt variants to return (varies by pantry state)
    """
    # Hoisted from mid-function: keep imports at the top of the body so the
    # data-flow below reads straight through.
    import json as _json

    # Clamp to a sane range regardless of what the client requested.
    limit = max(1, min(limit, 10))
    store = Store(session.db)

    # Full pantry for context
    all_items = store.list_inventory()
    pantry_names = [r["product_name"] for r in all_items if r.get("product_name")]

    # Expiring items as priority ingredients
    expiring = store.expiring_soon(days=14)
    expiring_names = [r["product_name"] for r in expiring if r.get("product_name")]

    # Dietary constraints from user_settings
    # (keys: dietary_constraints, dietary_allergies — JSON-encoded lists)
    settings = _read_user_settings(store)
    try:
        constraints = _json.loads(settings.get("dietary_constraints", "[]")) or []
    except Exception:
        # Malformed JSON in settings degrades to "no constraints".
        constraints = []
    try:
        allergies = _json.loads(settings.get("dietary_allergies", "[]")) or []
    except Exception:
        allergies = []

    if not pantry_names:
        # Empty pantry: nothing to build prompts from.
        return {"samples": [], "total": 0, "type": f"recipe_level{level}"}

    # Build prompt variants: one per expiring item as the "anchor" ingredient,
    # plus one general pantry prompt. Cap at limit.
    samples = []
    seen_anchors: set[str] = set()

    # Reserve one slot for the general prompt appended below.
    for item in (expiring[:limit - 1] if expiring else []):
        anchor = item.get("product_name", "")
        if not anchor or anchor in seen_anchors:
            continue
        seen_anchors.add(anchor)

        # Put this item first in the list for the prompt
        ordered_expiring = [anchor] + [n for n in expiring_names if n != anchor]
        prompt = _build_recipe_prompt(pantry_names, ordered_expiring, constraints, allergies, level)

        samples.append({
            "id": item.get("id", 0),
            "anchor_item": anchor,
            "expiring_count": len(expiring_names),
            "pantry_count": len(pantry_names),
            "system_prompt": "",
            "input_text": prompt,
            "output_text": "",
        })

    # One general prompt using all expiring as priority
    if len(samples) < limit:
        prompt = _build_recipe_prompt(pantry_names, expiring_names, constraints, allergies, level)
        samples.append({
            "id": 0,
            "anchor_item": "full pantry",
            "expiring_count": len(expiring_names),
            "pantry_count": len(pantry_names),
            "system_prompt": "",
            "input_text": prompt,
            "output_text": "",
        })

    return {"samples": samples, "total": len(samples), "type": f"recipe_level{level}"}
|
||||
|
|
@ -170,6 +170,13 @@ def _user_db_path(user_id: str, household_id: str | None = None) -> Path:
|
|||
return path
|
||||
|
||||
|
||||
def _anon_db_path() -> Path:
    """Ephemeral DB for unauthenticated guest visitors (Free tier, no persistence)."""
    # Ensure the shared anonymous directory exists before handing out the path.
    anon_dir = CLOUD_DATA_ROOT / "anonymous"
    anon_dir.mkdir(parents=True, exist_ok=True)
    return anon_dir / "kiwi.db"
|
||||
|
||||
|
||||
# ── BYOK detection ────────────────────────────────────────────────────────────
|
||||
|
||||
_LLM_CONFIG_PATH = Path.home() / ".config" / "circuitforge" / "llm.yaml"
|
||||
|
|
@ -225,11 +232,21 @@ def get_session(request: Request) -> CloudUser:
|
|||
or request.headers.get("cookie", "")
|
||||
)
|
||||
if not raw_header:
|
||||
raise HTTPException(status_code=401, detail="Not authenticated")
|
||||
return CloudUser(
|
||||
user_id="anonymous",
|
||||
tier="free",
|
||||
db=_anon_db_path(),
|
||||
has_byok=has_byok,
|
||||
)
|
||||
|
||||
token = _extract_session_token(raw_header) # gitleaks:allow — function name, not a secret
|
||||
if not token:
|
||||
raise HTTPException(status_code=401, detail="Not authenticated")
|
||||
return CloudUser(
|
||||
user_id="anonymous",
|
||||
tier="free",
|
||||
db=_anon_db_path(),
|
||||
has_byok=has_byok,
|
||||
)
|
||||
|
||||
user_id = validate_session_jwt(token)
|
||||
_ensure_provisioned(user_id)
|
||||
|
|
|
|||
5
app/db/migrations/022_recipe_generic_flag.sql
Normal file
5
app/db/migrations/022_recipe_generic_flag.sql
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
-- Migration 022: Add is_generic flag to recipes
-- Generic recipes are catch-all/dump recipes with loose ingredient lists
-- that should not appear in Level 1 (deterministic "use what I have") results.
-- Admins can mark recipes via the recipe editor or a bulk backfill script.
-- NOTE: NOT NULL requires a constant DEFAULT for SQLite's ALTER TABLE ADD COLUMN;
-- 0 = normal recipe, 1 = generic.
ALTER TABLE recipes ADD COLUMN is_generic INTEGER NOT NULL DEFAULT 0;
|
||||
49
app/db/migrations/029_inferred_tags.sql
Normal file
49
app/db/migrations/029_inferred_tags.sql
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
-- Migration 029: Add inferred_tags column and update FTS index to include it.
--
-- inferred_tags holds a JSON array of normalized tag strings derived by
-- scripts/pipeline/infer_recipe_tags.py (e.g. ["cuisine:Italian",
-- "dietary:Low-Carb", "flavor:Umami", "can_be:Gluten-Free"]).
--
-- The FTS5 browser table is rebuilt to index inferred_tags alongside
-- category and keywords so browse domain queries match against all signals.

-- 1. Add inferred_tags column (empty array default; populated by pipeline run)
ALTER TABLE recipes ADD COLUMN inferred_tags TEXT NOT NULL DEFAULT '[]';

-- 2. Drop old FTS table and triggers that only covered category + keywords
DROP TRIGGER IF EXISTS recipes_ai;
DROP TRIGGER IF EXISTS recipes_ad;
DROP TRIGGER IF EXISTS recipes_au;
DROP TABLE IF EXISTS recipe_browser_fts;

-- 3. Recreate FTS5 table: now indexes category, keywords, AND inferred_tags
-- (external-content table: rows are stored in recipes, FTS keeps the index only)
CREATE VIRTUAL TABLE recipe_browser_fts USING fts5(
    category,
    keywords,
    inferred_tags,
    content=recipes,
    content_rowid=id
);

-- 4. Triggers to keep FTS in sync with recipes table changes.
-- For external-content FTS5, deletion is signalled by inserting the special
-- 'delete' command row with the OLD values; an update is delete + re-insert.
CREATE TRIGGER recipes_ai AFTER INSERT ON recipes BEGIN
    INSERT INTO recipe_browser_fts(rowid, category, keywords, inferred_tags)
    VALUES (new.id, new.category, new.keywords, new.inferred_tags);
END;

CREATE TRIGGER recipes_ad AFTER DELETE ON recipes BEGIN
    INSERT INTO recipe_browser_fts(recipe_browser_fts, rowid, category, keywords, inferred_tags)
    VALUES ('delete', old.id, old.category, old.keywords, old.inferred_tags);
END;

CREATE TRIGGER recipes_au AFTER UPDATE ON recipes BEGIN
    INSERT INTO recipe_browser_fts(recipe_browser_fts, rowid, category, keywords, inferred_tags)
    VALUES ('delete', old.id, old.category, old.keywords, old.inferred_tags);
    INSERT INTO recipe_browser_fts(rowid, category, keywords, inferred_tags)
    VALUES (new.id, new.category, new.keywords, new.inferred_tags);
END;

-- 5. Populate FTS from current table state
-- (inferred_tags is '[]' for all rows at this point; run infer_recipe_tags.py
-- to populate, then the FTS will be rebuilt as part of that script.)
INSERT INTO recipe_browser_fts(recipe_browser_fts) VALUES('rebuild');
|
||||
300
app/services/recipe/tag_inferrer.py
Normal file
300
app/services/recipe/tag_inferrer.py
Normal file
|
|
@ -0,0 +1,300 @@
|
|||
"""
|
||||
Recipe tag inference engine.
|
||||
|
||||
Derives normalized tags from a recipe's title, ingredient names, existing corpus
|
||||
tags (category + keywords), enriched ingredient profile data, and optional
|
||||
nutrition data.
|
||||
|
||||
Tags are organized into five namespaces:
|
||||
cuisine:* -- cuisine/region classification
|
||||
dietary:* -- dietary restriction / nutrition profile
|
||||
flavor:* -- flavor profile (spicy, smoky, sweet, etc.)
|
||||
time:* -- effort / time signals
|
||||
meal:* -- meal type
|
||||
can_be:* -- achievable with substitutions (e.g. can_be:Gluten-Free)
|
||||
|
||||
Output is a flat sorted list of strings, e.g.:
|
||||
["can_be:Gluten-Free", "cuisine:Italian", "dietary:Low-Carb",
|
||||
"flavor:Savory", "flavor:Umami", "time:Quick"]
|
||||
|
||||
These populate recipes.inferred_tags and are FTS5-indexed so browse domain
|
||||
queries find recipes the food.com corpus tags alone would miss.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Text-signal tables
# (tag, [case-insensitive substrings to search in combined title+ingredient text])
# ---------------------------------------------------------------------------

# Cuisine markers: distinctive ingredients/dish names per cuisine. A recipe
# can match several cuisines; _match_signals returns every matching tag.
_CUISINE_SIGNALS: list[tuple[str, list[str]]] = [
    ("cuisine:Japanese", ["miso", "dashi", "ramen", "sushi", "teriyaki", "sake", "mirin",
                          "wasabi", "panko", "edamame", "tonkatsu", "yakitori", "ponzu"]),
    ("cuisine:Korean", ["gochujang", "kimchi", "doenjang", "gochugaru",
                        "bulgogi", "bibimbap", "japchae"]),
    ("cuisine:Thai", ["fish sauce", "lemongrass", "galangal", "pad thai", "thai basil",
                      "kaffir lime", "tom yum", "green curry", "red curry", "nam pla"]),
    ("cuisine:Chinese", ["hoisin", "oyster sauce", "five spice", "bok choy", "chow mein",
                         "dumpling", "wonton", "mapo", "char siu", "sichuan"]),
    ("cuisine:Vietnamese", ["pho", "banh mi", "nuoc cham", "rice paper", "vietnamese"]),
    ("cuisine:Indian", ["garam masala", "turmeric", "cardamom", "fenugreek", "paneer",
                        "tikka", "masala", "biryani", "dal", "naan", "tandoori",
                        "curry leaf", "tamarind", "chutney"]),
    ("cuisine:Middle Eastern", ["tahini", "harissa", "za'atar", "sumac", "baharat", "rose water",
                                "pomegranate molasses", "freekeh", "fattoush", "shakshuka"]),
    ("cuisine:Greek", ["feta", "tzatziki", "moussaka", "spanakopita", "orzo",
                       "kalamata", "gyro", "souvlaki", "dolma"]),
    ("cuisine:Mediterranean", ["hummus", "pita", "couscous", "preserved lemon"]),
    ("cuisine:Italian", ["pasta", "pizza", "risotto", "lasagna", "carbonara", "gnocchi",
                         "parmesan", "mozzarella", "ricotta", "prosciutto", "pancetta",
                         "arancini", "osso buco", "tiramisu", "pesto", "bolognese",
                         "cannoli", "polenta", "bruschetta", "focaccia"]),
    ("cuisine:French", ["croissant", "quiche", "crepe", "coq au vin",
                        "ratatouille", "bearnaise", "hollandaise", "bouillabaisse",
                        "herbes de provence", "dijon", "gruyere", "brie", "cassoulet"]),
    ("cuisine:Spanish", ["paella", "chorizo", "gazpacho", "tapas", "patatas bravas",
                         "sofrito", "manchego", "albondigas"]),
    ("cuisine:German", ["sauerkraut", "bratwurst", "schnitzel", "pretzel", "strudel",
                        "spaetzle", "sauerbraten"]),
    ("cuisine:Mexican", ["taco", "burrito", "enchilada", "salsa", "guacamole", "chipotle",
                         "queso", "tamale", "mole", "jalapeno", "tortilla", "carnitas",
                         "chile verde", "posole", "tostada", "quesadilla"]),
    ("cuisine:Latin American", ["plantain", "yuca", "chimichurri", "ceviche", "adobo", "empanada"]),
    ("cuisine:American", ["bbq sauce", "buffalo sauce", "ranch dressing", "coleslaw",
                          "cornbread", "mac and cheese", "brisket", "cheeseburger"]),
    ("cuisine:Southern", ["collard greens", "black-eyed peas", "okra", "grits", "catfish",
                          "hush puppies", "pecan pie"]),
    ("cuisine:Cajun", ["cajun", "creole", "gumbo", "jambalaya", "andouille", "etouffee"]),
    ("cuisine:African", ["injera", "berbere", "jollof", "suya", "egusi", "fufu", "tagine"]),
    ("cuisine:Caribbean", ["jerk", "scotch bonnet", "callaloo", "ackee"]),
]

# Explicit dietary claims spelled out in title/ingredient text.
_DIETARY_SIGNALS: list[tuple[str, list[str]]] = [
    ("dietary:Vegan", ["vegan", "plant-based", "plant based"]),
    ("dietary:Vegetarian", ["vegetarian", "meatless"]),
    ("dietary:Gluten-Free", ["gluten-free", "gluten free", "celiac"]),
    ("dietary:Dairy-Free", ["dairy-free", "dairy free", "lactose free", "non-dairy"]),
    ("dietary:Low-Carb", ["low-carb", "low carb", "keto", "ketogenic", "very low carbs"]),
    ("dietary:High-Protein", ["high protein", "high-protein"]),
    ("dietary:Low-Fat", ["low-fat", "low fat", "fat-free", "reduced fat"]),
    ("dietary:Paleo", ["paleo", "whole30"]),
    ("dietary:Nut-Free", ["nut-free", "nut free", "peanut free"]),
    ("dietary:Egg-Free", ["egg-free", "egg free"]),
    ("dietary:Low-Sodium", ["low sodium", "no salt"]),
    ("dietary:Healthy", ["healthy", "low cholesterol", "heart healthy", "wholesome"]),
]

# Flavor-profile markers: ingredients strongly associated with each profile.
_FLAVOR_SIGNALS: list[tuple[str, list[str]]] = [
    ("flavor:Spicy", ["jalapeno", "habanero", "ghost pepper", "sriracha",
                      "chili flake", "red pepper flake", "cayenne", "hot sauce",
                      "gochujang", "harissa", "scotch bonnet", "szechuan pepper", "spicy"]),
    ("flavor:Smoky", ["smoked", "liquid smoke", "smoked paprika",
                      "bbq sauce", "barbecue", "hickory", "mesquite"]),
    ("flavor:Sweet", ["honey", "maple syrup", "brown sugar", "caramel", "chocolate",
                      "vanilla", "condensed milk", "molasses", "agave"]),
    ("flavor:Savory", ["soy sauce", "fish sauce", "miso", "worcestershire", "anchovy",
                       "parmesan", "blue cheese", "bone broth"]),
    ("flavor:Tangy", ["lemon juice", "lime juice", "vinegar", "balsamic", "buttermilk",
                      "sour cream", "fermented", "pickled", "tamarind", "sumac"]),
    ("flavor:Herby", ["fresh basil", "fresh cilantro", "fresh dill", "fresh mint",
                      "fresh tarragon", "fresh thyme", "herbes de provence"]),
    ("flavor:Rich", ["heavy cream", "creme fraiche", "mascarpone", "double cream",
                     "ghee", "coconut cream", "cream cheese"]),
    ("flavor:Umami", ["mushroom", "nutritional yeast", "tomato paste",
                      "parmesan rind", "bonito", "kombu"]),
]

# Effort/time markers. Patterns include food.com duration keywords
# (e.g. "< 30 mins") as well as free-text hints like "weeknight".
_TIME_SIGNALS: list[tuple[str, list[str]]] = [
    ("time:Quick", ["< 15 mins", "< 30 mins", "weeknight", "easy"]),
    ("time:Under 1 Hour", ["< 60 mins"]),
    ("time:Make-Ahead", ["freezer", "overnight", "refrigerator", "make-ahead", "make ahead"]),
    ("time:Slow Cook", ["slow cooker", "crockpot", "< 4 hours", "braise"]),
]

# food.com corpus tag -> normalized tags
# (keys are matched after lower-casing the corpus keyword/category)
_CORPUS_TAG_MAP: dict[str, list[str]] = {
    "european": ["cuisine:Italian", "cuisine:French", "cuisine:German",
                 "cuisine:Spanish"],
    "asian": ["cuisine:Chinese", "cuisine:Japanese", "cuisine:Thai",
              "cuisine:Korean", "cuisine:Vietnamese"],
    "chinese": ["cuisine:Chinese"],
    "japanese": ["cuisine:Japanese"],
    "thai": ["cuisine:Thai"],
    "vietnamese": ["cuisine:Vietnamese"],
    "indian": ["cuisine:Indian"],
    "greek": ["cuisine:Greek"],
    "mexican": ["cuisine:Mexican"],
    "african": ["cuisine:African"],
    "caribbean": ["cuisine:Caribbean"],
    "vegan": ["dietary:Vegan", "dietary:Vegetarian"],
    "vegetarian": ["dietary:Vegetarian"],
    "healthy": ["dietary:Healthy"],
    "low cholesterol": ["dietary:Healthy"],
    "very low carbs": ["dietary:Low-Carb"],
    "high in...": ["dietary:High-Protein"],
    "lactose free": ["dietary:Dairy-Free"],
    "egg free": ["dietary:Egg-Free"],
    "< 15 mins": ["time:Quick"],
    "< 30 mins": ["time:Quick"],
    "< 60 mins": ["time:Under 1 Hour"],
    "< 4 hours": ["time:Slow Cook"],
    "weeknight": ["time:Quick"],
    "freezer": ["time:Make-Ahead"],
    "dessert": ["meal:Dessert"],
    "breakfast": ["meal:Breakfast"],
    "lunch/snacks": ["meal:Lunch", "meal:Snack"],
    "beverages": ["meal:Beverage"],
    "cookie & brownie": ["meal:Dessert"],
    "breads": ["meal:Bread"],
}

# ingredient_profiles.elements value -> flavor tag
# (empty string means the element carries no flavor implication)
_ELEMENT_TO_FLAVOR: dict[str, str] = {
    "Aroma": "flavor:Herby",
    "Richness": "flavor:Rich",
    "Structure": "",  # no flavor tag
    "Binding": "",
    "Crust": "flavor:Smoky",
    "Lift": "",
    "Emulsion": "flavor:Rich",
    "Acid": "flavor:Tangy",
}
|
||||
|
||||
|
||||
def _build_text(title: str, ingredient_names: list[str]) -> str:
    """Space-joined, lower-cased title + ingredient names for substring matching."""
    pieces = [title, *ingredient_names]
    return " ".join(piece.lower() for piece in pieces)
|
||||
|
||||
|
||||
def _match_signals(text: str, table: list[tuple[str, list[str]]]) -> list[str]:
    """Tags from *table* where any pattern occurs in *text*, in table order."""
    matched = []
    for tag, patterns in table:
        if any(pattern in text for pattern in patterns):
            matched.append(tag)
    return matched
|
||||
|
||||
|
||||
def infer_tags(
    title: str,
    ingredient_names: list[str],
    corpus_keywords: list[str],
    corpus_category: str = "",
    # Enriched ingredient profile signals (from ingredient_profiles cross-ref)
    element_coverage: dict[str, float] | None = None,
    fermented_count: int = 0,
    glutamate_total: float = 0.0,
    ph_min: float | None = None,
    available_sub_constraints: list[str] | None = None,
    # Nutrition data for macro-based tags
    calories: float | None = None,
    protein_g: float | None = None,
    fat_g: float | None = None,
    carbs_g: float | None = None,
    servings: float | None = None,
) -> list[str]:
    """
    Derive normalized tags for a recipe.

    Signals are merged into a set, so a tag contributed by several sources
    appears once; the result is returned sorted for deterministic storage.

    Parameters
    ----------
    title, ingredient_names, corpus_keywords, corpus_category
        : Primary recipe data.
    element_coverage
        : Dict from recipes.element_coverage -- element name to coverage ratio
          (e.g. {"Aroma": 0.6, "Richness": 0.4}). Derived from ingredient_profiles.
    fermented_count
        : Number of fermented ingredients (from ingredient_profiles.is_fermented).
    glutamate_total
        : Sum of glutamate_mg across all profiled ingredients. High values signal umami.
    ph_min
        : Minimum ph_estimate across profiled ingredients. Low values signal acidity.
    available_sub_constraints
        : Substitution constraint labels achievable for this recipe
          (e.g. ["gluten_free", "low_carb"]). From substitution_pairs cross-ref.
          These become can_be:* tags.
    calories, protein_g, fat_g, carbs_g, servings
        : Nutrition data for macro-based dietary tags. Totals are divided by
          servings when servings is known (see step 6).

    Returns
    -------
    Sorted list of unique normalized tag strings.
    """
    tags: set[str] = set()

    # 1. Map corpus tags to normalized vocabulary
    for kw in corpus_keywords:
        for t in _CORPUS_TAG_MAP.get(kw.lower(), []):
            tags.add(t)
    if corpus_category:
        for t in _CORPUS_TAG_MAP.get(corpus_category.lower(), []):
            tags.add(t)

    # 2. Text-signal matching (substring search over lower-cased title+ingredients)
    text = _build_text(title, ingredient_names)
    tags.update(_match_signals(text, _CUISINE_SIGNALS))
    tags.update(_match_signals(text, _DIETARY_SIGNALS))
    tags.update(_match_signals(text, _FLAVOR_SIGNALS))

    # 3. Time signals from corpus keywords + text
    corpus_text = " ".join(kw.lower() for kw in corpus_keywords)
    tags.update(_match_signals(corpus_text, _TIME_SIGNALS))
    tags.update(_match_signals(text, _TIME_SIGNALS))

    # 4. Enriched profile signals (thresholds are heuristics)
    if element_coverage:
        for element, coverage in element_coverage.items():
            if coverage > 0.2:  # >20% of ingredients carry this element
                flavor_tag = _ELEMENT_TO_FLAVOR.get(element, "")
                if flavor_tag:  # "" entries mean the element has no flavor tag
                    tags.add(flavor_tag)

    if glutamate_total > 50:
        tags.add("flavor:Umami")

    if fermented_count > 0:
        tags.add("flavor:Tangy")

    if ph_min is not None and ph_min < 4.5:
        tags.add("flavor:Tangy")

    # 5. Achievable-via-substitution tags
    if available_sub_constraints:
        label_to_tag = {
            "gluten_free": "can_be:Gluten-Free",
            "low_calorie": "can_be:Low-Calorie",
            "low_carb": "can_be:Low-Carb",
            "vegan": "can_be:Vegan",
            "dairy_free": "can_be:Dairy-Free",
            "low_sodium": "can_be:Low-Sodium",
        }
        for label in available_sub_constraints:
            tag = label_to_tag.get(label)
            if tag:  # unknown labels are ignored
                tags.add(tag)

    # 6. Macro-based dietary tags: per-serving thresholds when servings is
    # known; otherwise fall back to the high-protein check on the raw total.
    if servings and servings > 0 and any(
        v is not None for v in (protein_g, fat_g, carbs_g, calories)
    ):
        def _per(v: float | None) -> float | None:
            # Per-serving value; None propagates for missing macros.
            return v / servings if v is not None else None

        prot_s = _per(protein_g)
        fat_s = _per(fat_g)
        carb_s = _per(carbs_g)
        cal_s = _per(calories)

        if prot_s is not None and prot_s >= 20:
            tags.add("dietary:High-Protein")
        if fat_s is not None and fat_s <= 5:
            tags.add("dietary:Low-Fat")
        if carb_s is not None and carb_s <= 10:
            tags.add("dietary:Low-Carb")
        if cal_s is not None and cal_s <= 250:
            tags.add("dietary:Light")
    elif protein_g is not None and protein_g >= 20:
        tags.add("dietary:High-Protein")

    # 7. Vegan implies vegetarian
    if "dietary:Vegan" in tags:
        tags.add("dietary:Vegetarian")

    return sorted(tags)
|
||||
|
|
@ -140,11 +140,13 @@ import { ref, computed, onMounted } from 'vue'
|
|||
|
||||
const props = defineProps<{ currentTab?: string }>()
|
||||
|
||||
const apiBase = (import.meta.env.VITE_API_BASE as string) ?? ''
|
||||
|
||||
// Probe once on mount — hidden until confirmed enabled so button never flashes
|
||||
const enabled = ref(false)
|
||||
onMounted(async () => {
|
||||
try {
|
||||
const res = await fetch('/api/v1/feedback/status')
|
||||
const res = await fetch(`${apiBase}/api/v1/feedback/status`)
|
||||
if (res.ok) {
|
||||
const data = await res.json()
|
||||
enabled.value = data.enabled === true
|
||||
|
|
@ -205,7 +207,7 @@ async function submit() {
|
|||
loading.value = true
|
||||
submitError.value = ''
|
||||
try {
|
||||
const res = await fetch('/api/v1/feedback', {
|
||||
const res = await fetch(`${apiBase}/api/v1/feedback`, {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({
|
||||
|
|
@ -407,6 +409,114 @@ async function submit() {
|
|||
.mt-md { margin-top: var(--spacing-md); }
.mt-xs { margin-top: var(--spacing-xs); }

/* ── Form elements ────────────────────────────────────────────────────── */
.form-group {
  display: flex;
  flex-direction: column;
  gap: var(--spacing-xs);
}

.form-label {
  font-size: var(--font-size-sm);
  font-weight: 600;
  color: var(--color-text-muted);
  text-transform: uppercase;
  letter-spacing: 0.06em;
}

.form-input {
  width: 100%;
  padding: var(--spacing-xs) var(--spacing-sm);
  background: var(--color-bg-secondary);
  border: 1px solid var(--color-border);
  border-radius: var(--radius-md);
  color: var(--color-text-primary);
  font-family: var(--font-body);
  font-size: var(--font-size-sm);
  line-height: 1.5;
  transition: border-color 0.15s;
  /* border-box so width:100% includes padding/border */
  box-sizing: border-box;
}
.form-input:focus {
  outline: none;
  border-color: var(--color-border-focus);
}
.form-input::placeholder { color: var(--color-text-muted); opacity: 0.7; }

/* ── Buttons ──────────────────────────────────────────────────────────── */
.btn {
  display: inline-flex;
  align-items: center;
  justify-content: center;
  gap: var(--spacing-xs);
  padding: var(--spacing-xs) var(--spacing-md);
  border-radius: var(--radius-md);
  font-family: var(--font-body);
  font-size: var(--font-size-sm);
  font-weight: 500;
  cursor: pointer;
  transition: background 0.15s, color 0.15s, border-color 0.15s;
  white-space: nowrap;
}
.btn:disabled { opacity: 0.5; cursor: not-allowed; }

.btn-primary {
  background: var(--color-primary);
  color: #fff;
  border: 1px solid var(--color-primary);
}
.btn-primary:hover:not(:disabled) { filter: brightness(1.1); }

.btn-ghost {
  background: transparent;
  color: var(--color-text-secondary);
  border: 1px solid var(--color-border);
}
.btn-ghost:hover:not(:disabled) {
  background: var(--color-bg-secondary);
  color: var(--color-text-primary);
  border-color: var(--color-border-focus);
}

/* ── Filter chips ─────────────────────────────────────────────────────── */
.filter-chip-row {
  display: flex;
  flex-wrap: wrap;
  gap: var(--spacing-xs);
}

.btn-chip {
  padding: 5px var(--spacing-sm);
  background: var(--color-bg-secondary);
  border: 1px solid var(--color-border);
  /* oversized radius produces a pill shape at any height */
  border-radius: 999px;
  font-family: var(--font-body);
  font-size: var(--font-size-sm);
  font-weight: 500;
  color: var(--color-text-secondary);
  cursor: pointer;
  transition: background 0.15s, color 0.15s, border-color 0.15s;
}
.btn-chip.active,
.btn-chip:hover {
  background: color-mix(in srgb, var(--color-primary) 15%, transparent);
  border-color: var(--color-primary);
  color: var(--color-primary);
}

/* ── Card ─────────────────────────────────────────────────────────────── */
.card {
  background: var(--color-bg-card);
  border: 1px solid var(--color-border);
  border-radius: var(--radius-md);
}

/* ── Text utilities ───────────────────────────────────────────────────── */
.text-muted { color: var(--color-text-muted); }
.text-sm { font-size: var(--font-size-sm); line-height: 1.5; }
.text-xs { font-size: 0.75rem; line-height: 1.5; }
.font-semibold { font-weight: 600; }

/* Transition */
.modal-fade-enter-active, .modal-fade-leave-active { transition: opacity 0.2s ease; }
.modal-fade-enter-from, .modal-fade-leave-to { opacity: 0; }
|
||||
|
|
|
|||
|
|
@ -18,7 +18,8 @@
|
|||
/* Theme Colors - Dark Mode (Default) */
|
||||
--color-text-primary: rgba(255, 248, 235, 0.92);
|
||||
--color-text-secondary: rgba(255, 248, 235, 0.60);
|
||||
--color-text-muted: rgba(255, 248, 235, 0.38);
|
||||
/* Raised from 0.38 → 0.52 for WCAG 1.4.3 AA compliance (~5.5:1 against card bg) */
|
||||
--color-text-muted: rgba(255, 248, 235, 0.52);
|
||||
|
||||
--color-bg-primary: #1e1c1a;
|
||||
--color-bg-secondary: #161412;
|
||||
|
|
@ -40,7 +41,8 @@
|
|||
/* Status Colors */
|
||||
--color-success: #4a8c40;
|
||||
--color-success-dark: #3a7030;
|
||||
--color-success-light: #6aac60;
|
||||
/* Lightened from #6aac60 → #7fc073 for WCAG 1.4.3 AA compliance on dark backgrounds */
|
||||
--color-success-light: #7fc073;
|
||||
--color-success-bg: rgba(74, 140, 64, 0.12);
|
||||
--color-success-border: rgba(74, 140, 64, 0.30);
|
||||
|
||||
|
|
|
|||
118
scripts/backfill_keywords.py
Normal file
118
scripts/backfill_keywords.py
Normal file
|
|
@ -0,0 +1,118 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Backfill keywords column: repair character-split R-vector data.
|
||||
|
||||
The food.com corpus was imported with Keywords stored as a JSON array of
|
||||
individual characters (e.g. ["c","(","\"","I","t","a","l","i","a","n",...])
|
||||
instead of the intended keyword list (e.g. ["Italian","Low-Fat","Easy"]).
|
||||
|
||||
This script detects the broken pattern (all array elements have length 1),
|
||||
rejoins them into the original R-vector string, parses quoted tokens, and
|
||||
writes the corrected JSON back.
|
||||
|
||||
Rows that are already correct (empty array, or multi-char strings) are skipped.
|
||||
FTS5 index is rebuilt after the update so searches reflect the fix.
|
||||
|
||||
Usage:
|
||||
conda run -n cf python scripts/backfill_keywords.py [path/to/kiwi.db]
|
||||
# default: data/kiwi.db
|
||||
|
||||
Estimated time on 3.1M rows: 3-8 minutes (mostly the FTS rebuild at the end).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
_QUOTED = re.compile(r'"([^"]*)"')
|
||||
|
||||
|
||||
def _parse_r_vector(s: str) -> list[str]:
|
||||
return _QUOTED.findall(s)
|
||||
|
||||
|
||||
def _repair(raw_json: str) -> str | None:
|
||||
"""Return corrected JSON string, or None if the row is already clean."""
|
||||
try:
|
||||
val = json.loads(raw_json)
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return None
|
||||
|
||||
if not isinstance(val, list) or not val:
|
||||
return None # empty or non-list — leave as-is
|
||||
|
||||
# Already correct: contains multi-character strings
|
||||
if any(isinstance(e, str) and len(e) > 1 for e in val):
|
||||
return None
|
||||
|
||||
# Broken: all single characters — rejoin and re-parse
|
||||
if all(isinstance(e, str) and len(e) == 1 for e in val):
|
||||
rejoined = "".join(val)
|
||||
keywords = _parse_r_vector(rejoined)
|
||||
return json.dumps(keywords)
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def backfill(db_path: Path, batch_size: int = 5000) -> None:
    """Repair character-split keyword rows in ``recipes``, in batches.

    Pages through the table with keyset pagination (``WHERE id > last ...
    ORDER BY id``) rather than ``LIMIT/OFFSET``: without an ORDER BY,
    SQLite's row order across separate queries is unspecified, so OFFSET
    paging can silently skip or repeat rows between batches.

    Commits after every batch and rebuilds the FTS5 browser index once at
    the end if anything changed.

    Args:
        db_path: Path to the SQLite database.
        batch_size: Rows fetched and updated per transaction.
    """
    conn = sqlite3.connect(db_path)
    conn.execute("PRAGMA journal_mode=WAL")

    total = conn.execute("SELECT count(*) FROM recipes").fetchone()[0]
    print(f"Total recipes: {total:,}")

    fixed = 0
    skipped = 0
    processed = 0
    last_id = None  # keyset cursor: id of the last row seen

    while True:
        if last_id is None:
            rows = conn.execute(
                "SELECT id, keywords FROM recipes ORDER BY id LIMIT ?",
                (batch_size,),
            ).fetchall()
        else:
            rows = conn.execute(
                "SELECT id, keywords FROM recipes WHERE id > ? ORDER BY id LIMIT ?",
                (last_id, batch_size),
            ).fetchall()
        if not rows:
            break
        last_id = rows[-1][0]

        updates: list[tuple[str, int]] = []
        for row_id, raw_json in rows:
            corrected = _repair(raw_json)
            if corrected is not None:
                updates.append((corrected, row_id))
            else:
                skipped += 1

        if updates:
            conn.executemany(
                "UPDATE recipes SET keywords = ? WHERE id = ?", updates
            )
            conn.commit()
            fixed += len(updates)

        processed += len(rows)
        pct = min(100, int((processed / total) * 100)) if total else 100
        print(f"  {pct:>3}%  processed {processed:,}  fixed {fixed:,}  skipped {skipped:,}", end="\r")

    print(f"\nDone. Fixed {fixed:,} rows, skipped {skipped:,} (already correct or empty).")

    if fixed > 0:
        print("Rebuilding FTS5 browser index (recipe_browser_fts)…")
        try:
            conn.execute("INSERT INTO recipe_browser_fts(recipe_browser_fts) VALUES('rebuild')")
            conn.commit()
            print("FTS rebuild complete.")
        except Exception as e:
            # Best-effort: fresh databases may not have the FTS table yet.
            print(f"FTS rebuild skipped (table may not exist yet): {e}")

    conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI: one optional positional argument — the DB path (default data/kiwi.db).
    argv = sys.argv[1:]
    target = Path(argv[0]) if argv else Path("data/kiwi.db")
    if not target.exists():
        print(f"DB not found: {target}")
        sys.exit(1)
    backfill(target)
|
||||
|
|
@ -57,6 +57,34 @@ def _parse_r_vector(s: str) -> list[str]:
|
|||
return _QUOTED.findall(s)
|
||||
|
||||
|
||||
def _parse_keywords(val: object) -> list[str]:
|
||||
"""Parse the food.com Keywords column into a proper list of keyword strings.
|
||||
|
||||
The raw parquet value can arrive in three forms:
|
||||
- None / NaN → []
|
||||
- str: c("Italian", ...) → parse quoted tokens via _parse_r_vector
|
||||
- list of single chars → the R-vector was character-split during dataset
|
||||
export; rejoin then re-parse
|
||||
- list of strings → already correct, use as-is
|
||||
"""
|
||||
import math
|
||||
if val is None:
|
||||
return []
|
||||
if isinstance(val, float) and math.isnan(val):
|
||||
return []
|
||||
if isinstance(val, str):
|
||||
return _parse_r_vector(val)
|
||||
if isinstance(val, list):
|
||||
if not val:
|
||||
return []
|
||||
# Detect character-split R-vector: every element is a single character
|
||||
if all(isinstance(e, str) and len(e) == 1 for e in val):
|
||||
return _parse_r_vector("".join(val))
|
||||
# Already a proper list of keyword strings
|
||||
return [str(e) for e in val if e]
|
||||
return []
|
||||
|
||||
|
||||
def extract_ingredient_names(raw_list: list[str]) -> list[str]:
|
||||
"""Strip quantities and units from ingredient strings -> normalized names."""
|
||||
names = []
|
||||
|
|
@ -168,7 +196,7 @@ def build(db_path: Path, recipes_path: Path, batch_size: int = 10000) -> None:
|
|||
json.dumps(ingredient_names),
|
||||
json.dumps(directions),
|
||||
str(row.get("RecipeCategory", "") or ""),
|
||||
json.dumps(_safe_list(row.get("Keywords"))),
|
||||
json.dumps(_parse_keywords(row.get("Keywords"))),
|
||||
_float_or_none(row.get("Calories")),
|
||||
_float_or_none(row.get("FatContent")),
|
||||
_float_or_none(row.get("ProteinContent")),
|
||||
|
|
|
|||
255
scripts/pipeline/infer_recipe_tags.py
Normal file
255
scripts/pipeline/infer_recipe_tags.py
Normal file
|
|
@ -0,0 +1,255 @@
|
|||
"""
|
||||
Infer and backfill normalized tags for all recipes.
|
||||
|
||||
Reads recipes in batches, cross-references ingredient_profiles and
|
||||
substitution_pairs, runs tag_inferrer on each recipe, and writes the result
|
||||
to recipes.inferred_tags. Also rebuilds recipe_browser_fts after the run.
|
||||
|
||||
This script is idempotent: pass --force to re-derive tags even if
|
||||
inferred_tags is already non-empty.
|
||||
|
||||
Usage:
|
||||
conda run -n cf python scripts/pipeline/infer_recipe_tags.py \\
|
||||
[path/to/kiwi.db] [--batch-size 2000] [--force]
|
||||
|
||||
Estimated time on 3.1M rows: 10-20 minutes (CPU-bound text matching).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Allow importing from the app package when run from the repo root
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
|
||||
|
||||
from app.services.recipe.tag_inferrer import infer_tags
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Substitution constraint label mapping
|
||||
# Keys are what we store in substitution_pairs.constraint_label.
|
||||
# ---------------------------------------------------------------------------
|
||||
_INTERESTING_CONSTRAINTS = {"gluten_free", "low_calorie", "low_carb", "vegan", "dairy_free", "low_sodium"}
|
||||
|
||||
|
||||
def _load_profiles(conn: sqlite3.Connection) -> dict[str, dict]:
|
||||
"""
|
||||
Load ingredient_profiles into a dict keyed by name.
|
||||
Values hold only the fields we need for tag inference.
|
||||
"""
|
||||
profiles: dict[str, dict] = {}
|
||||
rows = conn.execute("""
|
||||
SELECT name, elements, glutamate_mg, is_fermented, ph_estimate
|
||||
FROM ingredient_profiles
|
||||
""").fetchall()
|
||||
for name, elements_json, glutamate_mg, is_fermented, ph_estimate in rows:
|
||||
try:
|
||||
elements: list[str] = json.loads(elements_json) if elements_json else []
|
||||
except Exception:
|
||||
elements = []
|
||||
profiles[name] = {
|
||||
"elements": elements,
|
||||
"glutamate": float(glutamate_mg or 0),
|
||||
"fermented": bool(is_fermented),
|
||||
"ph": float(ph_estimate) if ph_estimate is not None else None,
|
||||
}
|
||||
return profiles
|
||||
|
||||
|
||||
def _load_sub_index(conn: sqlite3.Connection) -> dict[str, set[str]]:
|
||||
"""
|
||||
Build a dict of ingredient_name -> set of available constraint labels.
|
||||
Only loads constraints we care about.
|
||||
"""
|
||||
index: dict[str, set[str]] = {}
|
||||
placeholders = ",".join("?" * len(_INTERESTING_CONSTRAINTS))
|
||||
rows = conn.execute(
|
||||
f"SELECT original_name, constraint_label FROM substitution_pairs "
|
||||
f"WHERE constraint_label IN ({placeholders})",
|
||||
list(_INTERESTING_CONSTRAINTS),
|
||||
).fetchall()
|
||||
for name, label in rows:
|
||||
index.setdefault(name, set()).add(label)
|
||||
return index
|
||||
|
||||
|
||||
def _enrich(
|
||||
ingredient_names: list[str],
|
||||
profile_index: dict[str, dict],
|
||||
sub_index: dict[str, set[str]],
|
||||
) -> dict:
|
||||
"""
|
||||
Cross-reference ingredient_names against our enrichment indices.
|
||||
Returns a dict of enriched signals ready for infer_tags().
|
||||
"""
|
||||
fermented_count = 0
|
||||
glutamate_total = 0.0
|
||||
ph_values: list[float] = []
|
||||
element_totals: dict[str, float] = {}
|
||||
profiled = 0
|
||||
constraint_sets: list[set[str]] = []
|
||||
|
||||
for name in ingredient_names:
|
||||
profile = profile_index.get(name)
|
||||
if profile:
|
||||
profiled += 1
|
||||
glutamate_total += profile["glutamate"]
|
||||
if profile["fermented"]:
|
||||
fermented_count += 1
|
||||
if profile["ph"] is not None:
|
||||
ph_values.append(profile["ph"])
|
||||
for elem in profile["elements"]:
|
||||
element_totals[elem] = element_totals.get(elem, 0.0) + 1.0
|
||||
|
||||
subs = sub_index.get(name)
|
||||
if subs:
|
||||
constraint_sets.append(subs)
|
||||
|
||||
# Element coverage: fraction of profiled ingredients that carry each element
|
||||
element_coverage: dict[str, float] = {}
|
||||
if profiled > 0:
|
||||
element_coverage = {e: round(c / profiled, 3) for e, c in element_totals.items()}
|
||||
|
||||
# Only emit a can_be:* tag if ALL relevant ingredients have the substitution available.
|
||||
# (A recipe is gluten_free-achievable only if every gluten source can be swapped.)
|
||||
# We use a simpler heuristic: if at least one ingredient has the constraint, flag it.
|
||||
# Future improvement: require coverage of all gluten-bearing ingredients.
|
||||
available_constraints: list[str] = []
|
||||
if constraint_sets:
|
||||
union_constraints: set[str] = set()
|
||||
for cs in constraint_sets:
|
||||
union_constraints.update(cs)
|
||||
available_constraints = sorted(union_constraints & _INTERESTING_CONSTRAINTS)
|
||||
|
||||
return {
|
||||
"element_coverage": element_coverage,
|
||||
"fermented_count": fermented_count,
|
||||
"glutamate_total": glutamate_total,
|
||||
"ph_min": min(ph_values) if ph_values else None,
|
||||
"available_sub_constraints": available_constraints,
|
||||
}
|
||||
|
||||
|
||||
def run(db_path: Path, batch_size: int = 2000, force: bool = False) -> None:
    """Infer and backfill recipes.inferred_tags for every eligible recipe.

    Eligible rows are those whose inferred_tags is empty/NULL, or every row
    when *force* is set.

    Pages with keyset pagination (``WHERE id > last ... ORDER BY id``)
    instead of LIMIT/OFFSET. The untagged-only filter SHRINKS as batches are
    committed (updated rows stop matching ``inferred_tags = '[]'``), so an
    advancing OFFSET over that shrinking result set silently skips roughly
    one batch of rows for every batch processed. A keyset cursor is immune:
    it advances by id regardless of which rows still match the filter.

    Rebuilds recipe_browser_fts once at the end if anything was tagged.

    Args:
        db_path: Path to the SQLite database.
        batch_size: Rows fetched and updated per transaction.
        force: Re-derive tags even for rows that already have them.
    """
    conn = sqlite3.connect(db_path)
    conn.execute("PRAGMA journal_mode=WAL")

    total = conn.execute("SELECT count(*) FROM recipes").fetchone()[0]
    print(f"Total recipes: {total:,}")
    print("Loading ingredient profiles...")
    profile_index = _load_profiles(conn)
    print(f"  {len(profile_index):,} profiles loaded")
    print("Loading substitution index...")
    sub_index = _load_sub_index(conn)
    print(f"  {len(sub_index):,} substitutable ingredients indexed")

    updated = 0
    skipped = 0
    processed = 0

    # Parenthesized so the OR cannot escape the surrounding AND conditions.
    tag_filter = "" if force else "AND (inferred_tags = '[]' OR inferred_tags IS NULL)"

    eligible = conn.execute(
        f"SELECT count(*) FROM recipes WHERE 1=1 {tag_filter}"
    ).fetchone()[0]
    print(f"Recipes to process: {eligible:,} ({'all' if force else 'untagged only'})")

    last_id = None  # keyset cursor: id of the last row handled
    while True:
        id_cond = "" if last_id is None else "AND id > ?"
        id_params: tuple = () if last_id is None else (last_id,)
        rows = conn.execute(
            f"""
            SELECT id, title, ingredient_names, category, keywords,
                   element_coverage,
                   calories, fat_g, protein_g, carbs_g, servings
            FROM recipes
            WHERE 1=1 {id_cond} {tag_filter}
            ORDER BY id
            LIMIT ?
            """,
            (*id_params, batch_size),
        ).fetchall()
        if not rows:
            break
        last_id = rows[-1][0]

        updates: list[tuple[str, int]] = []
        for (row_id, title, ingr_json, category, kw_json,
             elem_cov_json, calories, fat_g, protein_g, carbs_g, servings) in rows:
            # Tolerate malformed JSON cells: fall back to empty values rather
            # than aborting the whole run.
            try:
                ingredient_names: list[str] = json.loads(ingr_json) if ingr_json else []
                corpus_keywords: list[str] = json.loads(kw_json) if kw_json else []
                element_coverage: dict[str, float] = (
                    json.loads(elem_cov_json) if elem_cov_json else {}
                )
            except Exception:
                ingredient_names = []
                corpus_keywords = []
                element_coverage = {}

            enriched = _enrich(ingredient_names, profile_index, sub_index)
            # Prefer the pre-computed element_coverage from the recipes table
            # (it was computed over all ingredients at import time, not just
            # the profiled subset). Fall back to what _enrich computed.
            effective_coverage = element_coverage or enriched["element_coverage"]

            tags = infer_tags(
                title=title or "",
                ingredient_names=ingredient_names,
                corpus_keywords=corpus_keywords,
                corpus_category=category or "",
                element_coverage=effective_coverage,
                fermented_count=enriched["fermented_count"],
                glutamate_total=enriched["glutamate_total"],
                ph_min=enriched["ph_min"],
                available_sub_constraints=enriched["available_sub_constraints"],
                calories=calories,
                protein_g=protein_g,
                fat_g=fat_g,
                carbs_g=carbs_g,
                servings=servings,
            )
            updates.append((json.dumps(tags), row_id))

        if updates:
            conn.executemany(
                "UPDATE recipes SET inferred_tags = ? WHERE id = ?", updates
            )
            conn.commit()
            updated += len(updates)
        else:
            skipped += len(rows)

        processed += len(rows)
        pct = min(100, int((processed / eligible) * 100)) if eligible else 100
        print(
            f"  {pct:>3}%  processed {processed:,}  tagged {updated:,}",
            end="\r",
        )

    print(f"\nDone. Tagged {updated:,} recipes, skipped {skipped:,}.")

    if updated > 0:
        print("Rebuilding FTS5 browser index (recipe_browser_fts)...")
        try:
            conn.execute(
                "INSERT INTO recipe_browser_fts(recipe_browser_fts) VALUES('rebuild')"
            )
            conn.commit()
            print("FTS rebuild complete.")
        except Exception as e:
            # Best-effort: fresh databases may not have the FTS table yet.
            print(f"FTS rebuild skipped: {e}")

    conn.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI: optional DB path plus batch-size/force knobs.
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("db", nargs="?", default="data/kiwi.db", type=Path)
    cli.add_argument("--batch-size", type=int, default=2000)
    cli.add_argument(
        "--force",
        action="store_true",
        help="Re-derive tags even if inferred_tags is already set.",
    )
    ns = cli.parse_args()
    if not ns.db.exists():
        print(f"DB not found: {ns.db}")
        sys.exit(1)
    run(ns.db, ns.batch_size, ns.force)
|
||||
|
|
@ -70,3 +70,14 @@ def test_local_feed_returns_json():
|
|||
response = client.get("/api/v1/community/local-feed")
|
||||
assert response.status_code == 200
|
||||
assert isinstance(response.json(), list)
|
||||
|
||||
|
||||
def test_hall_of_chaos_route_exists():
    """GET /community/hall-of-chaos returns 200 and includes chaos_level key."""
    store_stub = MagicMock()
    store_stub.list_posts.return_value = []
    with patch("app.api.endpoints.community._community_store", store_stub):
        resp = client.get("/api/v1/community/hall-of-chaos")
        assert resp.status_code == 200
        payload = resp.json()
        assert "chaos_level" in payload
|
||||
|
|
|
|||
Loading…
Reference in a new issue