""" Browse counts cache — pre-computes and persists recipe counts for all browse domain keyword sets so category/subcategory page loads never hit the 3.8 GB FTS index at request time. Counts change only when the corpus changes (after a pipeline run). The cache is a small SQLite file separate from both the read-only corpus DB and per-user kiwi.db files, so the container can write it. Refresh triggers: 1. Startup — if cache is missing or older than STALE_DAYS 2. Nightly — asyncio background task started in main.py lifespan 3. Pipeline — infer_recipe_tags.py calls refresh() at end of run The in-memory _COUNT_CACHE in store.py is pre-warmed from this file on startup, so FTS queries are never needed for known keyword sets. """ from __future__ import annotations import logging import sqlite3 from datetime import datetime, timezone from pathlib import Path logger = logging.getLogger(__name__) STALE_DAYS = 7 # --------------------------------------------------------------------------- # Internal helpers # --------------------------------------------------------------------------- def _kw_key(keywords: list[str]) -> str: """Stable string key for a keyword list — sorted and pipe-joined.""" return "|".join(sorted(keywords)) def _fts_match_expr(keywords: list[str]) -> str: phrases = ['"' + kw.replace('"', '""') + '"' for kw in keywords] return " OR ".join(phrases) def _ensure_schema(conn: sqlite3.Connection) -> None: conn.execute(""" CREATE TABLE IF NOT EXISTS browse_counts ( keywords_key TEXT PRIMARY KEY, count INTEGER NOT NULL, computed_at TEXT NOT NULL ) """) conn.execute(""" CREATE TABLE IF NOT EXISTS browse_counts_meta ( key TEXT PRIMARY KEY, value TEXT NOT NULL ) """) conn.commit() # --------------------------------------------------------------------------- # Public API # --------------------------------------------------------------------------- def is_stale(cache_path: Path, max_age_days: int = STALE_DAYS) -> bool: """Return True if the cache is missing, empty, or older than max_age_days.""" if not cache_path.exists(): return True try: conn = sqlite3.connect(cache_path) row = conn.execute( "SELECT value FROM browse_counts_meta WHERE key = 'refreshed_at'" ).fetchone() conn.close() if row is None: return True age = (datetime.now(timezone.utc) - datetime.fromisoformat(row[0])).days return age >= max_age_days except Exception: return True def load_into_memory(cache_path: Path, count_cache: dict, corpus_path: str) -> int: """ Load all rows from the cache file into the in-memory count_cache dict. Uses corpus_path (the current RECIPE_DB_PATH env value) as the cache key, not what was stored in the file — the file may have been built against a different mount path (e.g. pipeline ran on host, container sees a different path). Counts are corpus-content-derived and path-independent. Returns the number of entries loaded. """ if not cache_path.exists(): return 0 try: conn = sqlite3.connect(cache_path) rows = conn.execute("SELECT keywords_key, count FROM browse_counts").fetchall() conn.close() loaded = 0 for kw_key, count in rows: keywords = kw_key.split("|") if kw_key else [] cache_key = (corpus_path, *sorted(keywords)) count_cache[cache_key] = count loaded += 1 logger.info("browse_counts: warmed %d entries from %s", loaded, cache_path) return loaded except Exception as exc: logger.warning("browse_counts: load failed: %s", exc) return 0 def refresh(corpus_path: str, cache_path: Path) -> int: """ Run FTS5 queries for every keyword set in browser_domains.DOMAINS and write results to cache_path. Safe to call from both the host pipeline script and the in-container nightly task. The corpus_path must be reachable and readable from the calling process. Returns the number of keyword sets computed. """ from app.services.recipe.browser_domains import DOMAINS # local import — avoid circular cache_path.parent.mkdir(parents=True, exist_ok=True) cache_conn = sqlite3.connect(cache_path) _ensure_schema(cache_conn) # Collect every unique keyword list across all domains/categories/subcategories. # DOMAINS structure: {domain: {label: str, categories: {cat_name: {keywords, subcategories}}}} seen: dict[str, list[str]] = {} for domain_data in DOMAINS.values(): for cat_data in domain_data.get("categories", {}).values(): if not isinstance(cat_data, dict): continue top_kws = cat_data.get("keywords", []) if top_kws: seen[_kw_key(top_kws)] = top_kws for subcat_kws in cat_data.get("subcategories", {}).values(): if subcat_kws: seen[_kw_key(subcat_kws)] = subcat_kws try: corpus_conn = sqlite3.connect(f"file:{corpus_path}?mode=ro", uri=True) except Exception as exc: logger.error("browse_counts: cannot open corpus %s: %s", corpus_path, exc) cache_conn.close() return 0 now = datetime.now(timezone.utc).isoformat() computed = 0 try: for kw_key, kws in seen.items(): try: row = corpus_conn.execute( "SELECT count(*) FROM recipe_browser_fts WHERE recipe_browser_fts MATCH ?", (_fts_match_expr(kws),), ).fetchone() count = row[0] if row else 0 cache_conn.execute( "INSERT OR REPLACE INTO browse_counts (keywords_key, count, computed_at)" " VALUES (?, ?, ?)", (kw_key, count, now), ) computed += 1 except Exception as exc: logger.warning("browse_counts: query failed key=%r: %s", kw_key[:60], exc) cache_conn.execute( "INSERT OR REPLACE INTO browse_counts_meta (key, value) VALUES ('refreshed_at', ?)", (now,), ) cache_conn.execute( "INSERT OR REPLACE INTO browse_counts_meta (key, value) VALUES ('corpus_path', ?)", (corpus_path,), ) cache_conn.commit() logger.info("browse_counts: wrote %d counts → %s", computed, cache_path) finally: corpus_conn.close() cache_conn.close() return computed