feat(tags): add meal type inference from recipe titles (#125)

Adds _MEAL_SIGNALS table to tag_inferrer with title-only matching for:
  meal:Breakfast — pancakes, waffles, frittata, oatmeal, granola, etc.
  meal:Dessert   — cake, cookie, brownie, pudding, ice cream, tart, etc.
  meal:Snack     — dip, chips, popcorn, nachos, energy balls, etc.
  meal:Beverage  — smoothie, cocktail, juice, lemonade, etc.
  meal:Lunch     — sandwich, wrap, panini, grilled cheese, etc.
  meal:Bread     — bread, sourdough, focaccia, dinner roll, etc.

Uses word-boundary + optional-plural regex (\bWORD(?:s|es)?\b) so:
- "pancakes" matches the "pancake" pattern, but "cake" does not match inside
  "pancake" (no word boundary before the "c")
- "tartare" does not match "tart" (no word boundary after "tart" in "tartare")
- "dipping" does not match "dip" (the trailing "ping" leaves no word boundary)

Title-only matching (not ingredient text) avoids false positives from
ingredient names like "cake flour" or "sandwich bread".

Estimated browse impact after backfill (--force on 3.19M recipes):
  Breakfast: 43 → ~70k
  Dessert:   372 → ~350k  (real desserts, not flavor:Sweet)
  Snack:     57  → ~60k
  Beverage:  43  → ~36k
  Lunch:     69  → ~26k
This commit is contained in:
pyr0ball 2026-04-27 12:24:31 -07:00
parent 6f097cd43d
commit 7fd92d5179

View file

@ -22,6 +22,8 @@ queries find recipes the food.com corpus tags alone would miss.
""" """
from __future__ import annotations from __future__ import annotations
import re
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Text-signal tables # Text-signal tables
@ -121,6 +123,50 @@ _TIME_SIGNALS: list[tuple[str, list[str]]] = [
("time:Slow Cook", ["slow cooker", "crockpot", "< 4 hours", "braise"]), ("time:Slow Cook", ["slow cooker", "crockpot", "< 4 hours", "braise"]),
] ]
# ---------------------------------------------------------------------------
# Meal type signals — matched against TITLE ONLY (not ingredient text).
# Ingredient names frequently contain words like "cake flour" or "sandwich
# bread" which would produce false meal-type tags if matched against the full
# title+ingredient string.
# ---------------------------------------------------------------------------
_MEAL_SIGNALS: list[tuple[str, list[str]]] = [
("meal:Breakfast", [
"breakfast", "pancake", "waffle", "french toast", "scrambled egg",
"frittata", "hash brown", "hash browns", "breakfast burrito",
"breakfast sandwich", "breakfast casserole", "overnight oat",
"granola", "oatmeal", "muffin", "morning glory", "eggs benedict",
"shakshuka", "crepe", "scone",
]),
("meal:Dessert", [
"dessert", "cake", "cookie", "brownie", "cheesecake", "pudding",
"fudge", "ice cream", "sorbet", "cupcake", "mousse", "candy",
"truffle", "gelato", "donut", "doughnut", "cobbler", "crisp",
"crumble", "tiramisu", "eclair", "sundae", "milkshake", "parfait",
"biscotti", "macaron", "panna cotta", "baklava", "churro", "tart",
"torte", "strudel", "compote", "semifreddo",
]),
("meal:Snack", [
"snack", "appetizer", "dip", "chips", "popcorn", "trail mix",
"energy ball", "deviled egg", "cheese ball", "nachos",
"pretzel bites", "protein ball", "granola bar",
]),
("meal:Beverage", [
"smoothie", "cocktail", "mocktail", "lemonade", "limeade",
"margarita", "sangria", "punch", "milkshake", "milk shake",
"juice", "spritzer", "iced tea", "hot chocolate", "chai latte",
"mulled wine", "eggnog", "slushie", "frappe", "horchata",
"agua fresca", "shrub", "switchel",
]),
("meal:Lunch", [
"lunch", "sandwich", "panini", "grilled cheese", "wrap",
"lunchbox", "lunch box",
]),
("meal:Bread", [
"bread", "sourdough", "focaccia", "flatbread", "dinner roll",
"loaf", "baguette", "ciabatta", "brioche", "challah", "pita",
]),
]
_MAIN_INGREDIENT_SIGNALS: list[tuple[str, list[str]]] = [ _MAIN_INGREDIENT_SIGNALS: list[tuple[str, list[str]]] = [
("main:Chicken", ["chicken", "poultry", "turkey"]), ("main:Chicken", ["chicken", "poultry", "turkey"]),
("main:Beef", ["beef", "ground beef", "steak", "brisket", "pot roast"]), ("main:Beef", ["beef", "ground beef", "steak", "brisket", "pot roast"]),
@ -196,6 +242,29 @@ def _match_signals(text: str, table: list[tuple[str, list[str]]]) -> list[str]:
return [tag for tag, pats in table if any(p in text for p in pats)] return [tag for tag, pats in table if any(p in text for p in pats)]
def _match_title_signals(title: str, table: list[tuple[str, list[str]]]) -> list[str]:
"""Match signals against title text only, using word-boundary + optional plural.
Pattern: `\\bWORD(?:s|es)?\\b`
This handles:
- Plurals: "cookie" matches "cookies", "sandwich" matches "sandwiches"
- Substring rejection: "cake" does NOT match "pancake" (no word boundary
before 'c' in pan|cake), "tart" does NOT match "tartare" (after "tart"
the 'a' is a word char, not a boundary)
- Avoids false positives from ingredient text ("cake flour", "sandwich bread")
by only matching the recipe title, not the full title+ingredient string.
"""
t = title.lower()
return [
tag for tag, pats in table
if any(
re.search(r"\b" + re.escape(p.strip()) + r"(?:s|es)?\b", t)
for p in pats
)
]
def infer_tags( def infer_tags(
title: str, title: str,
ingredient_names: list[str], ingredient_names: list[str],
@ -258,6 +327,9 @@ def infer_tags(
tags.update(_match_signals(text, _FLAVOR_SIGNALS)) tags.update(_match_signals(text, _FLAVOR_SIGNALS))
tags.update(_match_signals(text, _MAIN_INGREDIENT_SIGNALS)) tags.update(_match_signals(text, _MAIN_INGREDIENT_SIGNALS))
# Meal type: title-only to avoid "cake flour" → meal:Dessert false positives
tags.update(_match_title_signals(title, _MEAL_SIGNALS))
# 3. Time signals from corpus keywords + text # 3. Time signals from corpus keywords + text
corpus_text = " ".join(kw.lower() for kw in corpus_keywords) corpus_text = " ".join(kw.lower() for kw in corpus_keywords)
tags.update(_match_signals(corpus_text, _TIME_SIGNALS)) tags.update(_match_signals(corpus_text, _TIME_SIGNALS))