fix: wire browse domains to inferred_tag vocabulary, fix can_be leak in dietary

- Dinner: add main:X inferred tags (Chicken, Beef, Pork, Fish, Pasta) ahead of the existing text keywords, which matched nothing on their own (0 -> 815k results)
- All meal_type categories: add meal:X structured tag phrases
- Dietary: switch to dietary:X-only phrases; bare text keywords also matched
  can_be:X tags (present on nearly all recipes), falsely inflating counts to 1.3M+
- Cuisine: add cuisine:X structured tag phrases to Italian, Mexican, Asian,
  Indian, Mediterranean, American, BBQ, European, Latin American
- Side Dish: use main:Vegetables + main:Grains as proxy (no meal:Side Dish tag exists)
- Dessert: remove 'sweet' keyword (matched flavor:Sweet on all recipes)
- New dietary categories: Low-Sodium, Paleo

Closes #122. Partial progress on #123.
Follow-up: #125 (expand meal: tag inferrer coverage)
This commit is contained in:
pyr0ball 2026-04-27 11:38:37 -07:00
parent 46778d62e3
commit 6f097cd43d

View file

@ -26,7 +26,7 @@ DOMAINS: dict[str, dict] = {
"label": "Cuisine", "label": "Cuisine",
"categories": { "categories": {
"Italian": { "Italian": {
"keywords": ["italian", "pasta", "pizza", "risotto", "lasagna", "carbonara"], "keywords": ["cuisine:Italian", "italian", "pasta", "pizza", "risotto", "lasagna", "carbonara"],
"subcategories": { "subcategories": {
"Sicilian": ["sicilian", "sicily", "arancini", "caponata", "Sicilian": ["sicilian", "sicily", "arancini", "caponata",
"involtini", "cannoli"], "involtini", "cannoli"],
@ -43,8 +43,8 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"Mexican": { "Mexican": {
"keywords": ["mexican", "taco", "enchilada", "burrito", "salsa", "keywords": ["cuisine:Mexican", "mexican", "taco", "enchilada", "burrito",
"guacamole", "mole", "tamale"], "salsa", "guacamole", "mole", "tamale"],
"subcategories": { "subcategories": {
"Oaxacan": ["oaxacan", "oaxaca", "mole negro", "tlayuda", "Oaxacan": ["oaxacan", "oaxaca", "mole negro", "tlayuda",
"chapulines", "mezcal", "tasajo", "memelas"], "chapulines", "mezcal", "tasajo", "memelas"],
@ -67,7 +67,9 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"Asian": { "Asian": {
"keywords": ["asian", "chinese", "japanese", "thai", "korean", "vietnamese", "keywords": ["cuisine:Chinese", "cuisine:Japanese", "cuisine:Korean",
"cuisine:Thai", "cuisine:Vietnamese",
"asian", "chinese", "japanese", "thai", "korean", "vietnamese",
"stir fry", "stir-fry", "ramen", "sushi", "malaysian", "stir fry", "stir-fry", "ramen", "sushi", "malaysian",
"taiwanese", "singaporean", "burmese", "cambodian", "taiwanese", "singaporean", "burmese", "cambodian",
"laotian", "mongolian", "hong kong"], "laotian", "mongolian", "hong kong"],
@ -128,7 +130,7 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"Indian": { "Indian": {
"keywords": ["indian", "curry", "lentil", "dal", "tikka", "masala", "keywords": ["cuisine:Indian", "indian", "curry", "lentil", "dal", "tikka", "masala",
"biryani", "naan", "chutney", "pakistani", "sri lankan", "biryani", "naan", "chutney", "pakistani", "sri lankan",
"bangladeshi", "nepali"], "bangladeshi", "nepali"],
"subcategories": { "subcategories": {
@ -156,7 +158,8 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"Mediterranean": { "Mediterranean": {
"keywords": ["mediterranean", "greek", "middle eastern", "turkish", "keywords": ["cuisine:Mediterranean", "cuisine:Greek", "cuisine:Middle Eastern",
"mediterranean", "greek", "middle eastern", "turkish",
"lebanese", "jewish", "palestinian", "yemeni", "egyptian", "lebanese", "jewish", "palestinian", "yemeni", "egyptian",
"syrian", "iraqi", "jordanian"], "syrian", "iraqi", "jordanian"],
"subcategories": { "subcategories": {
@ -190,7 +193,8 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"American": { "American": {
"keywords": ["american", "southern", "comfort food", "cajun", "creole", "keywords": ["cuisine:American", "cuisine:Southern", "cuisine:Cajun",
"american", "southern", "comfort food", "cajun", "creole",
"hawaiian", "tex-mex", "soul food"], "hawaiian", "tex-mex", "soul food"],
"subcategories": { "subcategories": {
"Southern": ["southern", "soul food", "fried chicken", "Southern": ["southern", "soul food", "fried chicken",
@ -214,10 +218,8 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"BBQ & Smoke": { "BBQ & Smoke": {
# Top-level keywords use broad corpus-friendly terms that appear in # Top-level keywords: cuisine:BBQ inferred tag + broad corpus terms.
# food.com keyword/category fields (e.g. "BBQ", "Oven BBQ", "Smoker"). "keywords": ["cuisine:BBQ", "bbq", "barbecue", "barbeque", "smoked", "smoky",
# Subcategory keywords remain specific for drill-down filtering.
"keywords": ["bbq", "barbecue", "barbeque", "smoked", "smoky",
"smoke", "pit", "smoke ring", "low and slow", "smoke", "pit", "smoke ring", "low and slow",
"brisket", "pulled pork", "ribs", "spare ribs", "brisket", "pulled pork", "ribs", "spare ribs",
"baby back", "baby back ribs", "dry rub", "wet rub", "baby back", "baby back ribs", "dry rub", "wet rub",
@ -251,7 +253,8 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"European": { "European": {
"keywords": ["french", "german", "spanish", "british", "irish", "scottish", "keywords": ["cuisine:French", "cuisine:German", "cuisine:Spanish",
"french", "german", "spanish", "british", "irish", "scottish",
"welsh", "scandinavian", "nordic", "eastern european"], "welsh", "scandinavian", "nordic", "eastern european"],
"subcategories": { "subcategories": {
"French": ["french", "provencal", "beurre", "crepe", "French": ["french", "provencal", "beurre", "crepe",
@ -281,7 +284,8 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"Latin American": { "Latin American": {
"keywords": ["latin american", "peruvian", "argentinian", "colombian", "keywords": ["cuisine:Latin American", "cuisine:Caribbean",
"latin american", "peruvian", "argentinian", "colombian",
"cuban", "caribbean", "brazilian", "venezuelan", "chilean"], "cuban", "caribbean", "brazilian", "venezuelan", "chilean"],
"subcategories": { "subcategories": {
"Peruvian": ["peruvian", "ceviche", "lomo saltado", "anticucho", "Peruvian": ["peruvian", "ceviche", "lomo saltado", "anticucho",
@ -425,12 +429,18 @@ DOMAINS: dict[str, dict] = {
"meal_type": { "meal_type": {
"label": "Meal Type", "label": "Meal Type",
"categories": { "categories": {
# Keywords use two complementary sources:
# 1. inferred_tag phrases ("meal:X", "main:X") — indexed in recipe_browser_fts.inferred_tags.
# FTS5 tokenises "meal:Breakfast" → ["meal","breakfast"], so the quoted phrase
# "meal:Breakfast" matches exactly that consecutive token pair.
# 2. Corpus keyword/category text — only covers the ~1,200 keyword-tagged recipes.
# Kept as a fallback; not the primary signal.
"Breakfast": { "Breakfast": {
"keywords": ["breakfast", "brunch", "eggs", "pancakes", "waffles", "keywords": ["meal:Breakfast", "breakfast", "brunch", "pancakes",
"oatmeal", "muffin"], "waffles", "oatmeal", "muffin"],
"subcategories": { "subcategories": {
"Eggs": ["egg", "omelette", "frittata", "quiche", "Eggs": ["meal:Breakfast", "egg", "omelette", "frittata",
"scrambled", "benedict", "shakshuka"], "quiche", "scrambled", "benedict", "shakshuka"],
"Pancakes & Waffles": ["pancake", "waffle", "crepe", "french toast"], "Pancakes & Waffles": ["pancake", "waffle", "crepe", "french toast"],
"Baked Goods": ["muffin", "scone", "biscuit", "quick bread", "Baked Goods": ["muffin", "scone", "biscuit", "quick bread",
"coffee cake", "danish"], "coffee cake", "danish"],
@ -439,12 +449,15 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"Lunch": { "Lunch": {
"keywords": ["lunch", "sandwich", "wrap", "salad", "soup", "light meal"], # meal:Lunch tag covers explicitly-tagged recipes.
# Coverage is limited — most lunch-style recipes have no distinct meal-type tag.
"keywords": ["meal:Lunch", "lunch", "sandwich", "wrap", "salad",
"soup", "light meal"],
"subcategories": { "subcategories": {
"Sandwiches": ["sandwich", "sub", "hoagie", "panini", "club", "Sandwiches": ["sandwich", "sub", "hoagie", "panini", "club",
"grilled cheese", "blt"], "grilled cheese", "blt"],
"Salads": ["salad", "grain bowl", "chopped", "caesar", "Salads": ["salad", "grain bowl", "chopped", "caesar",
"niçoise", "cobb"], "cobb"],
"Soups": ["soup", "bisque", "chowder", "gazpacho", "Soups": ["soup", "bisque", "chowder", "gazpacho",
"minestrone", "lentil soup"], "minestrone", "lentil soup"],
"Wraps": ["wrap", "burrito bowl", "pita", "lettuce wrap", "Wraps": ["wrap", "burrito bowl", "pita", "lettuce wrap",
@ -452,23 +465,27 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"Dinner": { "Dinner": {
"keywords": ["dinner", "main dish", "entree", "main course", "supper"], # Primary: main:X inferred tags (800k+ recipes).
# "meal:Dinner" does not exist in the inferred-tag vocabulary — main-protein
# tags are the best available proxy for main-course dinner recipes.
"keywords": ["main:Chicken", "main:Beef", "main:Pork", "main:Fish",
"main:Pasta", "dinner", "main dish", "entree",
"main course", "supper"],
"subcategories": { "subcategories": {
"Casseroles": ["casserole", "bake", "gratin", "lasagna", "Chicken": ["main:Chicken"],
"sheperd's pie", "pot pie"], "Beef": ["main:Beef"],
"Pork": ["main:Pork"],
"Fish & Seafood": ["main:Fish"],
"Pasta": ["main:Pasta"],
"Casseroles": ["casserole", "bake", "gratin", "pot pie"],
"Stews": ["stew", "braise", "slow cooker", "pot roast", "Stews": ["stew", "braise", "slow cooker", "pot roast",
"daube", "ragù"], "daube"],
"Grilled": ["grilled", "grill", "barbecue", "charred", "Grilled": ["grilled", "grill", "barbecue", "kebab", "skewer"],
"kebab", "skewer"],
"Stir-Fries": ["stir fry", "stir-fry", "wok", "sauté",
"sauteed"],
"Roasts": ["roast", "roasted", "oven", "baked chicken",
"pot roast"],
}, },
}, },
"Snack": { "Snack": {
"keywords": ["snack", "appetizer", "finger food", "dip", "bite", "keywords": ["meal:Snack", "snack", "appetizer", "finger food",
"starter"], "dip", "bite", "starter"],
"subcategories": { "subcategories": {
"Dips & Spreads": ["dip", "spread", "hummus", "guacamole", "Dips & Spreads": ["dip", "spread", "hummus", "guacamole",
"salsa", "pate"], "salsa", "pate"],
@ -479,8 +496,9 @@ DOMAINS: dict[str, dict] = {
}, },
}, },
"Dessert": { "Dessert": {
"keywords": ["dessert", "cake", "cookie", "pie", "sweet", "pudding", # "sweet" removed — it matches flavor:Sweet inferred tags, causing false positives.
"ice cream", "brownie"], "keywords": ["meal:Dessert", "dessert", "cake", "cookie", "pie",
"pudding", "ice cream", "brownie"],
"subcategories": { "subcategories": {
"Cakes": ["cake", "cupcake", "layer cake", "bundt", "Cakes": ["cake", "cupcake", "layer cake", "bundt",
"cheesecake", "torte"], "cheesecake", "torte"],
@ -496,20 +514,41 @@ DOMAINS: dict[str, dict] = {
"caramel", "toffee"], "caramel", "toffee"],
}, },
}, },
"Beverage": ["drink", "smoothie", "cocktail", "beverage", "juice", "shake"], "Beverage": ["meal:Beverage", "drink", "smoothie", "cocktail", "beverage",
"Side Dish": ["side dish", "side", "accompaniment", "garnish"], "juice", "shake", "lemonade"],
"Side Dish": {
# meal:Side Dish is not in the inferred-tag vocabulary.
# main:Vegetables and main:Grains are the best proxies — will overlap
# with some vegetarian mains, which is acceptable.
"keywords": ["main:Vegetables", "main:Grains", "side dish", "side",
"pilaf", "accompaniment"],
"subcategories": {
"Vegetables": ["main:Vegetables"],
"Grains & Rice": ["main:Grains", "rice", "pilaf", "quinoa"],
"Bread": ["meal:Bread", "bread", "roll", "biscuit"],
},
},
}, },
}, },
"dietary": { "dietary": {
"label": "Dietary", "label": "Dietary",
# Sole signal: dietary:X inferred tags (indexed in recipe_browser_fts.inferred_tags).
# No bare-text fallback keywords are kept in this domain, for the reason below.
# IMPORTANT: Use ONLY structured dietary:X phrases here.
# Bare text keywords like "vegan", "low-carb" also match can_be:Vegan,
# can_be:Low-Carb etc. — those are "achievable with substitutions", not
# "recipe already is". The structured phrase "dietary:Vegan" (consecutive
# FTS tokens "dietary"+"vegan") does NOT match can_be:Vegan.
"categories": { "categories": {
"Vegetarian": ["vegetarian"], "Vegetarian": ["dietary:Vegetarian"],
"Vegan": ["vegan", "plant-based", "plant based"], "Vegan": ["dietary:Vegan"],
"Gluten-Free": ["gluten-free", "gluten free", "celiac"], "Gluten-Free": ["dietary:Gluten-Free"],
"Low-Carb": ["low-carb", "low carb", "keto", "ketogenic"], "Low-Carb": ["dietary:Low-Carb"],
"High-Protein": ["high protein", "high-protein"], "High-Protein": ["dietary:High-Protein"],
"Low-Fat": ["low-fat", "low fat", "light"], "Low-Fat": ["dietary:Low-Fat"],
"Dairy-Free": ["dairy-free", "dairy free", "lactose"], "Dairy-Free": ["dietary:Dairy-Free"],
"Low-Sodium": ["dietary:Low-Sodium"],
"Paleo": ["dietary:Paleo"],
}, },
}, },
"main_ingredient": { "main_ingredient": {