chore(pipeline): add fast targeted meal-tag backfill script

backfill_meal_tags.py merges meal: tags from title-only matching
into existing inferred_tags without re-deriving all other signals.

~10x faster than infer_recipe_tags.py --force for meal-tag-only
updates: 3.19M recipes in ~5-10min vs ~2.5h for full re-derivation.
pyr0ball 2026-04-27 13:00:58 -07:00
parent 7fd92d5179
commit d5a4b14400

scripts/pipeline/backfill_meal_tags.py

@@ -0,0 +1,117 @@
"""
Fast targeted backfill for meal: tags only.

Rather than re-deriving ALL inferred_tags via the full infer_tags() pipeline
(which takes ~2.5h for 3.19M recipes), this script:

1. Reads only id + title + inferred_tags (no ingredient profiles are needed;
   meal signals are title-only).
2. Runs _match_title_signals() against the title to get meal tags.
3. For rows that already have inferred_tags: merges in the new meal tags
   (no-op if already present).
4. For rows with no inferred_tags: writes just the meal tags; run the full
   infer_tags() pipeline separately if those rows need a complete tag set.
5. Rebuilds the FTS5 index once at the end (skipped if nothing changed).

Estimated runtime on 3.19M recipes: ~5-10 minutes.

Usage:
    python scripts/pipeline/backfill_meal_tags.py [path/to/kiwi.db]
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
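
# Put the repo root on sys.path so the `app` package resolves when this
# script is run directly from scripts/pipeline/.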
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

from app.services.recipe.tag_inferrer import _MEAL_SIGNALS, _match_title_signals


def run(db_path: Path, batch_size: int = 10_000) -> None:
import sqlite3
conn = sqlite3.connect(db_path)
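    # WAL lets readers coexist with this writer; synchronous=NORMAL trades a
    # little durability for write throughput, acceptable for a re-runnable backfill.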
conn.execute("PRAGMA journal_mode=WAL")
conn.execute("PRAGMA synchronous=NORMAL")
total = conn.execute("SELECT count(*) FROM recipes").fetchone()[0]
print(f"Total recipes: {total:,}")
updated = 0
skipped = 0
offset = 0
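    # OFFSET pagination is stable here because ids never change (only
    # inferred_tags is updated); keyset pagination (WHERE id > last_id)
    # would avoid re-scanning skipped rows on each batch.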
while True:
rows = conn.execute(
"""
SELECT id, title, inferred_tags
FROM recipes
ORDER BY id
LIMIT ? OFFSET ?
""",
(batch_size, offset),
).fetchall()
if not rows:
break
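        # Accumulate (new_tags_json, row_id) pairs; flushed with a single
        # executemany per batch below.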
updates: list[tuple[str, int]] = []
for row_id, title, tags_json in rows:
title = title or ""
meal_tags = _match_title_signals(title, _MEAL_SIGNALS)
if not meal_tags:
skipped += 1
continue
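            # Tolerate NULL or malformed JSON in existing inferred_tags.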
try:
existing: list[str] = json.loads(tags_json) if tags_json else []
except Exception:
existing = []
# Merge: union of existing + new meal tags, sorted
merged = sorted(set(existing) | set(meal_tags))
if merged == existing:
skipped += 1
continue
updates.append((json.dumps(merged), row_id))
if updates:
conn.executemany(
"UPDATE recipes SET inferred_tags = ? WHERE id = ?", updates
)
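        # Commit per batch: bounds WAL growth, and completed batches survive an
        # interrupt (the merge is idempotent, so re-running is safe).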
conn.commit()
updated += len(updates)
offset += len(rows)
pct = min(100, int(offset * 100 / total))
print(f" {pct:>3}% offset {offset:,} merged {updated:,} skipped {skipped:,}",
end="\r")
print(f"\nDone. Merged meal tags into {updated:,} recipes ({skipped:,} unchanged).")
if updated > 0:
print("Rebuilding FTS5 browser index...")
try:
conn.execute(
"INSERT INTO recipe_browser_fts(recipe_browser_fts) VALUES('rebuild')"
)
conn.commit()
print("FTS rebuild complete.")
except Exception as e:
print(f"FTS rebuild skipped: {e}")
    conn.close()


if __name__ == "__main__":
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("db", nargs="?", default="data/kiwi.db", type=Path)
parser.add_argument("--batch-size", type=int, default=10_000)
args = parser.parse_args()
if not args.db.exists():
print(f"DB not found: {args.db}")
sys.exit(1)
run(args.db, args.batch_size)