feat(tags): add meal type inference from recipe titles (#125)
Adds a _MEAL_SIGNALS table to tag_inferrer with title-only matching for:

- meal:Breakfast — pancakes, waffles, frittata, oatmeal, granola, etc.
- meal:Dessert — cake, cookie, brownie, pudding, ice cream, tart, etc.
- meal:Snack — dip, chips, popcorn, nachos, energy balls, etc.
- meal:Beverage — smoothie, cocktail, juice, lemonade, etc.
- meal:Lunch — sandwich, wrap, panini, grilled cheese, etc.
- meal:Bread — bread, sourdough, focaccia, dinner roll, etc.

Uses a word-boundary + optional-plural regex (\bWORD(?:s|es)?\b) so that:

- "pancakes" matches the "pancake" pattern but not the "cake" pattern (there is no word boundary before "cake" inside "pancakes")
- "tartare" does not match "tart" (no word boundary after "tart" in "tartare")
- "dipping" does not match "dip" (the extra characters prevent a word boundary)

Title-only matching (not ingredient text) avoids false positives from ingredient names like "cake flour" or "sandwich bread".

Estimated browse impact after backfill (--force on 3.19M recipes):

- Breakfast: 43 → ~70k
- Dessert: 372 → ~350k (real desserts, not flavor:Sweet)
- Snack: 57 → ~60k
- Beverage: 43 → ~36k
- Lunch: 69 → ~26k
This commit is contained in:
parent
6f097cd43d
commit
7fd92d5179
1 changed files with 72 additions and 0 deletions
|
|
@ -22,6 +22,8 @@ queries find recipes the food.com corpus tags alone would miss.
|
|||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Text-signal tables
|
||||
|
|
@ -121,6 +123,50 @@ _TIME_SIGNALS: list[tuple[str, list[str]]] = [
|
|||
("time:Slow Cook", ["slow cooker", "crockpot", "< 4 hours", "braise"]),
|
||||
]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Meal type signals — matched against TITLE ONLY (not ingredient text).
|
||||
# Ingredient names frequently contain words like "cake flour" or "sandwich
|
||||
# bread" which would produce false meal-type tags if matched against the full
|
||||
# title+ingredient string.
|
||||
# ---------------------------------------------------------------------------
|
||||
_MEAL_SIGNALS: list[tuple[str, list[str]]] = [
|
||||
("meal:Breakfast", [
|
||||
"breakfast", "pancake", "waffle", "french toast", "scrambled egg",
|
||||
"frittata", "hash brown", "hash browns", "breakfast burrito",
|
||||
"breakfast sandwich", "breakfast casserole", "overnight oat",
|
||||
"granola", "oatmeal", "muffin", "morning glory", "eggs benedict",
|
||||
"shakshuka", "crepe", "scone",
|
||||
]),
|
||||
("meal:Dessert", [
|
||||
"dessert", "cake", "cookie", "brownie", "cheesecake", "pudding",
|
||||
"fudge", "ice cream", "sorbet", "cupcake", "mousse", "candy",
|
||||
"truffle", "gelato", "donut", "doughnut", "cobbler", "crisp",
|
||||
"crumble", "tiramisu", "eclair", "sundae", "milkshake", "parfait",
|
||||
"biscotti", "macaron", "panna cotta", "baklava", "churro", "tart",
|
||||
"torte", "strudel", "compote", "semifreddo",
|
||||
]),
|
||||
("meal:Snack", [
|
||||
"snack", "appetizer", "dip", "chips", "popcorn", "trail mix",
|
||||
"energy ball", "deviled egg", "cheese ball", "nachos",
|
||||
"pretzel bites", "protein ball", "granola bar",
|
||||
]),
|
||||
("meal:Beverage", [
|
||||
"smoothie", "cocktail", "mocktail", "lemonade", "limeade",
|
||||
"margarita", "sangria", "punch", "milkshake", "milk shake",
|
||||
"juice", "spritzer", "iced tea", "hot chocolate", "chai latte",
|
||||
"mulled wine", "eggnog", "slushie", "frappe", "horchata",
|
||||
"agua fresca", "shrub", "switchel",
|
||||
]),
|
||||
("meal:Lunch", [
|
||||
"lunch", "sandwich", "panini", "grilled cheese", "wrap",
|
||||
"lunchbox", "lunch box",
|
||||
]),
|
||||
("meal:Bread", [
|
||||
"bread", "sourdough", "focaccia", "flatbread", "dinner roll",
|
||||
"loaf", "baguette", "ciabatta", "brioche", "challah", "pita",
|
||||
]),
|
||||
]
|
||||
|
||||
_MAIN_INGREDIENT_SIGNALS: list[tuple[str, list[str]]] = [
|
||||
("main:Chicken", ["chicken", "poultry", "turkey"]),
|
||||
("main:Beef", ["beef", "ground beef", "steak", "brisket", "pot roast"]),
|
||||
|
|
@ -196,6 +242,29 @@ def _match_signals(text: str, table: list[tuple[str, list[str]]]) -> list[str]:
|
|||
return [tag for tag, pats in table if any(p in text for p in pats)]
|
||||
|
||||
|
||||
def _match_title_signals(title: str, table: list[tuple[str, list[str]]]) -> list[str]:
|
||||
"""Match signals against title text only, using word-boundary + optional plural.
|
||||
|
||||
Pattern: `\\bWORD(?:s|es)?\\b`
|
||||
|
||||
This handles:
|
||||
- Plurals: "cookie" matches "cookies", "sandwich" matches "sandwiches"
|
||||
- Substring rejection: "cake" does NOT match "pancake" (no word boundary
|
||||
before 'c' in pan|cake), "tart" does NOT match "tartare" (after "tart"
|
||||
the 'a' is a word char, not a boundary)
|
||||
- Avoids false positives from ingredient text ("cake flour", "sandwich bread")
|
||||
by only matching the recipe title, not the full title+ingredient string.
|
||||
"""
|
||||
t = title.lower()
|
||||
return [
|
||||
tag for tag, pats in table
|
||||
if any(
|
||||
re.search(r"\b" + re.escape(p.strip()) + r"(?:s|es)?\b", t)
|
||||
for p in pats
|
||||
)
|
||||
]
|
||||
|
||||
|
||||
def infer_tags(
|
||||
title: str,
|
||||
ingredient_names: list[str],
|
||||
|
|
@ -258,6 +327,9 @@ def infer_tags(
|
|||
tags.update(_match_signals(text, _FLAVOR_SIGNALS))
|
||||
tags.update(_match_signals(text, _MAIN_INGREDIENT_SIGNALS))
|
||||
|
||||
# Meal type: title-only to avoid "cake flour" → meal:Dessert false positives
|
||||
tags.update(_match_title_signals(title, _MEAL_SIGNALS))
|
||||
|
||||
# 3. Time signals from corpus keywords + text
|
||||
corpus_text = " ".join(kw.lower() for kw in corpus_keywords)
|
||||
tags.update(_match_signals(corpus_text, _TIME_SIGNALS))
|
||||
|
|
|
|||
Loading…
Reference in a new issue