From 6f097cd43dbb2abc4f13f16f33bbec59d977bb59 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Mon, 27 Apr 2026 11:38:37 -0700 Subject: [PATCH] fix: wire browse domains to inferred_tag vocabulary, fix can_be leak in dietary - Dinner: replace non-matching text keywords with main:X protein inferred tags (0 -> 815k results) - All meal_type categories: add meal:X structured tag phrases - Dietary: switch to dietary:X-only phrases; bare text keywords matched can_be:X tags (nearly all recipes), inflating counts to 1.3M+ falsely - Cuisine: add cuisine:X structured tag phrases to Italian, Mexican, Asian, Indian, Mediterranean, American, BBQ, European, Latin American - Side Dish: use main:Vegetables + main:Grains as proxy (no meal:Side Dish tag exists) - Dessert: remove 'sweet' keyword (matched flavor:Sweet on all recipes) - New dietary categories: Low-Sodium, Paleo Closes #122. Partial progress on #123. Follow-up: #125 (expand meal: tag inferrer coverage) --- app/services/recipe/browser_domains.py | 123 ++++++++++++++++--------- 1 file changed, 81 insertions(+), 42 deletions(-) diff --git a/app/services/recipe/browser_domains.py b/app/services/recipe/browser_domains.py index 1d0950d..cec01a6 100644 --- a/app/services/recipe/browser_domains.py +++ b/app/services/recipe/browser_domains.py @@ -26,7 +26,7 @@ DOMAINS: dict[str, dict] = { "label": "Cuisine", "categories": { "Italian": { - "keywords": ["italian", "pasta", "pizza", "risotto", "lasagna", "carbonara"], + "keywords": ["cuisine:Italian", "italian", "pasta", "pizza", "risotto", "lasagna", "carbonara"], "subcategories": { "Sicilian": ["sicilian", "sicily", "arancini", "caponata", "involtini", "cannoli"], @@ -43,8 +43,8 @@ DOMAINS: dict[str, dict] = { }, }, "Mexican": { - "keywords": ["mexican", "taco", "enchilada", "burrito", "salsa", - "guacamole", "mole", "tamale"], + "keywords": ["cuisine:Mexican", "mexican", "taco", "enchilada", "burrito", + "salsa", "guacamole", "mole", "tamale"], "subcategories": { "Oaxacan": 
["oaxacan", "oaxaca", "mole negro", "tlayuda", "chapulines", "mezcal", "tasajo", "memelas"], @@ -67,7 +67,9 @@ DOMAINS: dict[str, dict] = { }, }, "Asian": { - "keywords": ["asian", "chinese", "japanese", "thai", "korean", "vietnamese", + "keywords": ["cuisine:Chinese", "cuisine:Japanese", "cuisine:Korean", + "cuisine:Thai", "cuisine:Vietnamese", + "asian", "chinese", "japanese", "thai", "korean", "vietnamese", "stir fry", "stir-fry", "ramen", "sushi", "malaysian", "taiwanese", "singaporean", "burmese", "cambodian", "laotian", "mongolian", "hong kong"], @@ -128,7 +130,7 @@ DOMAINS: dict[str, dict] = { }, }, "Indian": { - "keywords": ["indian", "curry", "lentil", "dal", "tikka", "masala", + "keywords": ["cuisine:Indian", "indian", "curry", "lentil", "dal", "tikka", "masala", "biryani", "naan", "chutney", "pakistani", "sri lankan", "bangladeshi", "nepali"], "subcategories": { @@ -156,7 +158,8 @@ DOMAINS: dict[str, dict] = { }, }, "Mediterranean": { - "keywords": ["mediterranean", "greek", "middle eastern", "turkish", + "keywords": ["cuisine:Mediterranean", "cuisine:Greek", "cuisine:Middle Eastern", + "mediterranean", "greek", "middle eastern", "turkish", "lebanese", "jewish", "palestinian", "yemeni", "egyptian", "syrian", "iraqi", "jordanian"], "subcategories": { @@ -190,7 +193,8 @@ DOMAINS: dict[str, dict] = { }, }, "American": { - "keywords": ["american", "southern", "comfort food", "cajun", "creole", + "keywords": ["cuisine:American", "cuisine:Southern", "cuisine:Cajun", + "american", "southern", "comfort food", "cajun", "creole", "hawaiian", "tex-mex", "soul food"], "subcategories": { "Southern": ["southern", "soul food", "fried chicken", @@ -214,10 +218,8 @@ DOMAINS: dict[str, dict] = { }, }, "BBQ & Smoke": { - # Top-level keywords use broad corpus-friendly terms that appear in - # food.com keyword/category fields (e.g. "BBQ", "Oven BBQ", "Smoker"). - # Subcategory keywords remain specific for drill-down filtering. 
- "keywords": ["bbq", "barbecue", "barbeque", "smoked", "smoky", + # Top-level keywords: cuisine:BBQ inferred tag + broad corpus terms. + "keywords": ["cuisine:BBQ", "bbq", "barbecue", "barbeque", "smoked", "smoky", "smoke", "pit", "smoke ring", "low and slow", "brisket", "pulled pork", "ribs", "spare ribs", "baby back", "baby back ribs", "dry rub", "wet rub", @@ -251,7 +253,8 @@ DOMAINS: dict[str, dict] = { }, }, "European": { - "keywords": ["french", "german", "spanish", "british", "irish", "scottish", + "keywords": ["cuisine:French", "cuisine:German", "cuisine:Spanish", + "french", "german", "spanish", "british", "irish", "scottish", "welsh", "scandinavian", "nordic", "eastern european"], "subcategories": { "French": ["french", "provencal", "beurre", "crepe", @@ -281,7 +284,8 @@ DOMAINS: dict[str, dict] = { }, }, "Latin American": { - "keywords": ["latin american", "peruvian", "argentinian", "colombian", + "keywords": ["cuisine:Latin American", "cuisine:Caribbean", + "latin american", "peruvian", "argentinian", "colombian", "cuban", "caribbean", "brazilian", "venezuelan", "chilean"], "subcategories": { "Peruvian": ["peruvian", "ceviche", "lomo saltado", "anticucho", @@ -425,12 +429,18 @@ DOMAINS: dict[str, dict] = { "meal_type": { "label": "Meal Type", "categories": { + # Keywords use two complementary sources: + # 1. inferred_tag phrases ("meal:X", "main:X") — indexed in recipe_browser_fts.inferred_tags. + # FTS5 tokenises "meal:Breakfast" → ["meal","breakfast"], so the quoted phrase + # "meal:Breakfast" matches exactly that consecutive token pair. + # 2. Corpus keyword/category text — only covers the ~1,200 keyword-tagged recipes. + # Kept as a fallback; not the primary signal. 
"Breakfast": { - "keywords": ["breakfast", "brunch", "eggs", "pancakes", "waffles", - "oatmeal", "muffin"], + "keywords": ["meal:Breakfast", "breakfast", "brunch", "pancakes", + "waffles", "oatmeal", "muffin"], "subcategories": { - "Eggs": ["egg", "omelette", "frittata", "quiche", - "scrambled", "benedict", "shakshuka"], + "Eggs": ["meal:Breakfast", "egg", "omelette", "frittata", + "quiche", "scrambled", "benedict", "shakshuka"], "Pancakes & Waffles": ["pancake", "waffle", "crepe", "french toast"], "Baked Goods": ["muffin", "scone", "biscuit", "quick bread", "coffee cake", "danish"], @@ -439,12 +449,15 @@ DOMAINS: dict[str, dict] = { }, }, "Lunch": { - "keywords": ["lunch", "sandwich", "wrap", "salad", "soup", "light meal"], + # meal:Lunch tag covers explicitly-tagged recipes. + # Coverage is limited — most lunch-style recipes have no distinct meal-type tag. + "keywords": ["meal:Lunch", "lunch", "sandwich", "wrap", "salad", + "soup", "light meal"], "subcategories": { "Sandwiches": ["sandwich", "sub", "hoagie", "panini", "club", "grilled cheese", "blt"], "Salads": ["salad", "grain bowl", "chopped", "caesar", - "niçoise", "cobb"], + "cobb"], "Soups": ["soup", "bisque", "chowder", "gazpacho", "minestrone", "lentil soup"], "Wraps": ["wrap", "burrito bowl", "pita", "lettuce wrap", @@ -452,23 +465,27 @@ DOMAINS: dict[str, dict] = { }, }, "Dinner": { - "keywords": ["dinner", "main dish", "entree", "main course", "supper"], + # Primary: main:X inferred tags (800k+ recipes). + # "meal:Dinner" does not exist in the inferred-tag vocabulary — main-protein + # tags are the best available proxy for main-course dinner recipes. 
+ "keywords": ["main:Chicken", "main:Beef", "main:Pork", "main:Fish", + "main:Pasta", "dinner", "main dish", "entree", + "main course", "supper"], "subcategories": { - "Casseroles": ["casserole", "bake", "gratin", "lasagna", - "sheperd's pie", "pot pie"], + "Chicken": ["main:Chicken"], + "Beef": ["main:Beef"], + "Pork": ["main:Pork"], + "Fish & Seafood": ["main:Fish"], + "Pasta": ["main:Pasta"], + "Casseroles": ["casserole", "bake", "gratin", "pot pie"], "Stews": ["stew", "braise", "slow cooker", "pot roast", - "daube", "ragù"], - "Grilled": ["grilled", "grill", "barbecue", "charred", - "kebab", "skewer"], - "Stir-Fries": ["stir fry", "stir-fry", "wok", "sauté", - "sauteed"], - "Roasts": ["roast", "roasted", "oven", "baked chicken", - "pot roast"], + "daube"], + "Grilled": ["grilled", "grill", "barbecue", "kebab", "skewer"], }, }, "Snack": { - "keywords": ["snack", "appetizer", "finger food", "dip", "bite", - "starter"], + "keywords": ["meal:Snack", "snack", "appetizer", "finger food", + "dip", "bite", "starter"], "subcategories": { "Dips & Spreads": ["dip", "spread", "hummus", "guacamole", "salsa", "pate"], @@ -479,8 +496,9 @@ DOMAINS: dict[str, dict] = { }, }, "Dessert": { - "keywords": ["dessert", "cake", "cookie", "pie", "sweet", "pudding", - "ice cream", "brownie"], + # "sweet" removed — it matches flavor:Sweet inferred tags, causing false positives. + "keywords": ["meal:Dessert", "dessert", "cake", "cookie", "pie", + "pudding", "ice cream", "brownie"], "subcategories": { "Cakes": ["cake", "cupcake", "layer cake", "bundt", "cheesecake", "torte"], @@ -496,20 +514,41 @@ DOMAINS: dict[str, dict] = { "caramel", "toffee"], }, }, - "Beverage": ["drink", "smoothie", "cocktail", "beverage", "juice", "shake"], - "Side Dish": ["side dish", "side", "accompaniment", "garnish"], + "Beverage": ["meal:Beverage", "drink", "smoothie", "cocktail", "beverage", + "juice", "shake", "lemonade"], + "Side Dish": { + # meal:Side Dish not in inferred-tag vocabulary. 
+ # main:Vegetables and main:Grains are the best proxies — will overlap + # with some vegetarian mains, which is acceptable. + "keywords": ["main:Vegetables", "main:Grains", "side dish", "side", + "pilaf", "accompaniment"], + "subcategories": { + "Vegetables": ["main:Vegetables"], + "Grains & Rice": ["main:Grains", "rice", "pilaf", "quinoa"], + "Bread": ["meal:Bread", "bread", "roll", "biscuit"], + }, + }, }, }, "dietary": { "label": "Dietary", + # Primary: dietary:X inferred tags (indexed in recipe_browser_fts.inferred_tags). + # No secondary text-token fallback here, unlike the other domains (see below). + # IMPORTANT: Use ONLY structured dietary:X phrases here. + # Bare text keywords like "vegan", "low-carb" also match can_be:Vegan, + # can_be:Low-Carb etc. — those are "achievable with substitutions", not + # "recipe already is". The structured phrase "dietary:Vegan" (consecutive + # FTS tokens "dietary"+"vegan") does NOT match can_be:Vegan. "categories": { - "Vegetarian": ["vegetarian"], - "Vegan": ["vegan", "plant-based", "plant based"], - "Gluten-Free": ["gluten-free", "gluten free", "celiac"], - "Low-Carb": ["low-carb", "low carb", "keto", "ketogenic"], - "High-Protein": ["high protein", "high-protein"], - "Low-Fat": ["low-fat", "low fat", "light"], - "Dairy-Free": ["dairy-free", "dairy free", "lactose"], + "Vegetarian": ["dietary:Vegetarian"], + "Vegan": ["dietary:Vegan"], + "Gluten-Free": ["dietary:Gluten-Free"], + "Low-Carb": ["dietary:Low-Carb"], + "High-Protein": ["dietary:High-Protein"], + "Low-Fat": ["dietary:Low-Fat"], + "Dairy-Free": ["dietary:Dairy-Free"], + "Low-Sodium": ["dietary:Low-Sodium"], + "Paleo": ["dietary:Paleo"], }, }, "main_ingredient": {