kiwi/scripts/backfill_keywords.py
pyr0ball 144d1dc6c4 chore: commit in-progress work -- tag inferrer, imitate endpoint, hall-of-chaos easter egg, migration files, Dockerfile .env defense
- app/services/recipe/tag_inferrer.py: infer tags from recipe ingredient text
- app/db/migrations/022_recipe_generic_flag.sql, 029_inferred_tags.sql: schema migrations
- app/api/endpoints/imitate.py: recipe imitation endpoint stub
- app/api/endpoints/community.py: hall-of-chaos easter egg endpoint
- scripts/pipeline/infer_recipe_tags.py, backfill_keywords.py: pipeline scripts
- scripts/pipeline/build_recipe_index.py: extended index builder
- Dockerfile: explicit .env removal as defense-in-depth
- frontend/src/components/FeedbackButton.vue: feedback UX improvements
- frontend/src/style.css: minor style tweaks
- app/cloud_session.py: cloud session improvements
- tests/api/test_community_endpoints.py: additional test coverage
2026-04-14 13:23:15 -07:00

118 lines
3.6 KiB
Python

#!/usr/bin/env python3
"""
Backfill keywords column: repair character-split R-vector data.
The food.com corpus was imported with Keywords stored as a JSON array of
individual characters (e.g. ["c","(","\"","I","t","a","l","i","a","n",...])
instead of the intended keyword list (e.g. ["Italian","Low-Fat","Easy"]).
This script detects the broken pattern (all array elements have length 1),
rejoins them into the original R-vector string, parses quoted tokens, and
writes the corrected JSON back.
Rows that are already correct (empty array, or multi-char strings) are skipped.
FTS5 index is rebuilt after the update so searches reflect the fix.
Usage:
conda run -n cf python scripts/backfill_keywords.py [path/to/kiwi.db]
# default: data/kiwi.db
Estimated time on 3.1M rows: 3-8 minutes (mostly the FTS rebuild at the end).
"""
from __future__ import annotations
import json
import re
import sqlite3
import sys
from pathlib import Path
# Matches one double-quoted token and captures the text between the quotes.
_QUOTED = re.compile(r'"([^"]*)"')


def _parse_r_vector(s: str) -> list[str]:
    """Return the contents of every double-quoted token found in *s*.

    Tokens are returned in the order they appear. Anything outside a pair
    of double quotes (such as a leading ``c(``, commas, or a closing
    parenthesis) is ignored, so a string without any complete quoted token
    yields an empty list.
    """
    return [token.group(1) for token in _QUOTED.finditer(s)]
def _repair(raw_json: str) -> str | None:
"""Return corrected JSON string, or None if the row is already clean."""
try:
val = json.loads(raw_json)
except (json.JSONDecodeError, TypeError):
return None
if not isinstance(val, list) or not val:
return None # empty or non-list — leave as-is
# Already correct: contains multi-character strings
if any(isinstance(e, str) and len(e) > 1 for e in val):
return None
# Broken: all single characters — rejoin and re-parse
if all(isinstance(e, str) and len(e) == 1 for e in val):
rejoined = "".join(val)
keywords = _parse_r_vector(rejoined)
return json.dumps(keywords)
return None
def backfill(db_path: Path, batch_size: int = 5000) -> None:
    """Repair character-split ``keywords`` values in the ``recipes`` table.

    The table is walked in batches of ``batch_size`` rows. Every
    ``keywords`` value is passed through ``_repair``; rows for which it
    returns a replacement are updated, and each batch that changed
    something is committed on its own. When at least one row was fixed,
    the ``recipe_browser_fts`` index is asked to rebuild itself; a failure
    of that rebuild is reported but does not abort the run.

    Args:
        db_path: SQLite database file to update in place.
        batch_size: Maximum number of rows read per batch.

    Raises:
        sqlite3.Error: if the database cannot be read or updated (errors
            from the final FTS rebuild are caught and printed instead).
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        total = conn.execute("SELECT count(*) FROM recipes").fetchone()[0]
        print(f"Total recipes: {total:,}")

        fixed = 0
        skipped = 0
        processed = 0
        last_id = None
        first_batch = True

        while True:
            # Keyset pagination: resume after the last id already seen
            # instead of using OFFSET. This gives a deterministic order and
            # avoids re-scanning all previously visited rows on every batch.
            # It relies on ``id`` identifying a row uniquely, which the
            # UPDATE below already assumes.
            if first_batch:
                rows = conn.execute(
                    "SELECT id, keywords FROM recipes ORDER BY id LIMIT ?",
                    (batch_size,),
                ).fetchall()
                first_batch = False
            else:
                rows = conn.execute(
                    "SELECT id, keywords FROM recipes "
                    "WHERE id > ? ORDER BY id LIMIT ?",
                    (last_id, batch_size),
                ).fetchall()
            if not rows:
                break
            last_id = rows[-1][0]

            updates = []
            for row_id, raw_json in rows:
                corrected = _repair(raw_json)
                if corrected is not None:
                    updates.append((corrected, row_id))
                else:
                    skipped += 1

            if updates:
                conn.executemany(
                    "UPDATE recipes SET keywords = ? WHERE id = ?", updates
                )
                conn.commit()
                fixed += len(updates)

            # Count the rows actually read so the final, partial batch is
            # not over-reported.
            processed += len(rows)
            pct = min(100, processed * 100 // total) if total else 100
            print(f" {pct:>3}% processed {processed:,} fixed {fixed:,} skipped {skipped:,}", end="\r")

        print(f"\nDone. Fixed {fixed:,} rows, skipped {skipped:,} (already correct or empty).")

        if fixed > 0:
            print("Rebuilding FTS5 browser index (recipe_browser_fts)…")
            try:
                conn.execute("INSERT INTO recipe_browser_fts(recipe_browser_fts) VALUES('rebuild')")
                conn.commit()
                print("FTS rebuild complete.")
            except sqlite3.Error as e:
                # Best effort: the keyword fixes are already committed, so a
                # missing or unusable FTS table is reported, not fatal.
                print(f"FTS rebuild skipped (table may not exist yet): {e}")
    finally:
        # Release the database handle even when a statement above fails.
        conn.close()
if __name__ == "__main__":
    # The optional first command-line argument overrides the default
    # database location.
    cli_args = sys.argv[1:]
    db_path = Path(cli_args[0]) if cli_args else Path("data/kiwi.db")

    if db_path.exists():
        backfill(db_path)
    else:
        print(f"DB not found: {db_path}")
        sys.exit(1)