# kiwi/app/services/recipe/browse_counts_cache.py

"""
Browse counts cache — pre-computes and persists recipe counts for all
browse domain keyword sets so category/subcategory page loads never
hit the 3.8 GB FTS index at request time.

Counts change only when the corpus changes (after a pipeline run).
The cache is a small SQLite file separate from both the read-only
corpus DB and per-user kiwi.db files, so the container can write it.

Refresh triggers:
  1. Startup     — if cache is missing or older than STALE_DAYS
  2. Nightly     — asyncio background task started in main.py lifespan
  3. Pipeline    — infer_recipe_tags.py calls refresh() at end of run

The in-memory _COUNT_CACHE in store.py is pre-warmed from this file
on startup, so FTS queries are never needed for known keyword sets.
"""
from __future__ import annotations

import logging
import sqlite3
from datetime import datetime, timezone
from pathlib import Path

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Default maximum cache age in days; used by is_stale() when the caller
# does not pass max_age_days.
STALE_DAYS = 7


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------

def _kw_key(keywords: list[str]) -> str:
    """Build the canonical key for a keyword list.

    The keywords are put in sorted order and joined with ``|``, so the same
    set of keywords yields the same key regardless of input order.
    """
    ordered = list(keywords)
    ordered.sort()
    return "|".join(ordered)


def _fts_match_expr(keywords: list[str]) -> str:
    """Build the MATCH expression for a keyword list.

    Each keyword is wrapped in double quotes, with any embedded double quote
    doubled, and the quoted terms are joined with ``OR``. An empty list
    yields an empty string.
    """
    quoted_terms = []
    for term in keywords:
        escaped = term.replace('"', '""')
        quoted_terms.append(f'"{escaped}"')
    return " OR ".join(quoted_terms)


def _ensure_schema(conn: sqlite3.Connection) -> None:
    """Create the cache tables on *conn* if they do not exist, then commit.

    ``browse_counts`` holds one row per keyword-set key with its count and
    the time it was computed; ``browse_counts_meta`` is a key/value table.
    """
    counts_ddl = """
        CREATE TABLE IF NOT EXISTS browse_counts (
            keywords_key  TEXT PRIMARY KEY,
            count         INTEGER NOT NULL,
            computed_at   TEXT NOT NULL
        )
    """
    meta_ddl = """
        CREATE TABLE IF NOT EXISTS browse_counts_meta (
            key   TEXT PRIMARY KEY,
            value TEXT NOT NULL
        )
    """
    for ddl in (counts_ddl, meta_ddl):
        conn.execute(ddl)
    conn.commit()


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

def is_stale(cache_path: Path, max_age_days: int = STALE_DAYS) -> bool:
    """Return True if the cache needs to be rebuilt.

    The cache is considered stale when the file is missing, when it has no
    ``refreshed_at`` metadata row, or when that timestamp is at least
    ``max_age_days`` whole days old.

    Any failure while reading or parsing the timestamp (unreadable file,
    missing table, malformed or timezone-naive value) is also reported as
    stale rather than raised, so the caller rebuilds instead of crashing.
    """
    if not cache_path.exists():
        return True
    try:
        conn = sqlite3.connect(cache_path)
        try:
            row = conn.execute(
                "SELECT value FROM browse_counts_meta WHERE key = 'refreshed_at'"
            ).fetchone()
        finally:
            # Close even when the query raises (e.g. the meta table does not
            # exist yet) so the connection is not leaked.
            conn.close()
        if row is None:
            return True
        age = (datetime.now(timezone.utc) - datetime.fromisoformat(row[0])).days
        return age >= max_age_days
    except Exception as exc:
        # Deliberate best-effort: an unreadable cache is simply rebuilt.
        logger.debug("browse_counts: staleness check failed, treating as stale: %s", exc)
        return True


def load_into_memory(cache_path: Path, count_cache: dict, corpus_path: str) -> int:
    """
    Load all rows from the cache file into the in-memory count_cache dict.

    Uses corpus_path (the current RECIPE_DB_PATH env value) as the cache key,
    not what was stored in the file — the file may have been built against a
    different mount path (e.g. pipeline ran on host, container sees a different
    path). Counts are corpus-content-derived and path-independent.

    Each entry is stored under the tuple ``(corpus_path, *sorted_keywords)``,
    where the keywords are recovered by splitting the stored key on ``|``.

    Returns the number of entries loaded, or 0 when the file is missing or
    cannot be read (the failure is logged, not raised).
    """
    if not cache_path.exists():
        return 0
    try:
        conn = sqlite3.connect(cache_path)
        try:
            rows = conn.execute("SELECT keywords_key, count FROM browse_counts").fetchall()
        finally:
            # Close even when the query raises (e.g. table missing) so the
            # connection is not leaked.
            conn.close()
        for kw_key, count in rows:
            # An empty stored key means "no keywords", not one empty keyword.
            keywords = kw_key.split("|") if kw_key else []
            count_cache[(corpus_path, *sorted(keywords))] = count
        loaded = len(rows)
        logger.info("browse_counts: warmed %d entries from %s", loaded, cache_path)
        return loaded
    except Exception as exc:
        logger.warning("browse_counts: load failed: %s", exc)
        return 0


def _collect_keyword_sets(domains: dict) -> dict[str, list[str]]:
    """Return every non-empty keyword list found in *domains*, keyed by _kw_key().

    Expected structure:
    {domain: {label: str, categories: {cat_name: {keywords, subcategories}}}}.
    For each category the category's own ``keywords`` and every value of its
    ``subcategories`` mapping are collected. Category entries that are not
    dicts are skipped; lists producing the same key are stored once.
    """
    keyword_sets: dict[str, list[str]] = {}
    for domain_data in domains.values():
        for cat_data in domain_data.get("categories", {}).values():
            if not isinstance(cat_data, dict):
                continue
            top_kws = cat_data.get("keywords", [])
            if top_kws:
                keyword_sets[_kw_key(top_kws)] = top_kws
            for subcat_kws in cat_data.get("subcategories", {}).values():
                if subcat_kws:
                    keyword_sets[_kw_key(subcat_kws)] = subcat_kws
    return keyword_sets


def refresh(corpus_path: str, cache_path: Path) -> int:
    """
    Run FTS5 queries for every keyword set in browser_domains.DOMAINS
    and write results to cache_path.

    Safe to call from both the host pipeline script and the in-container
    nightly task. The corpus_path must be reachable and readable from
    the calling process; it is opened read-only.

    A keyword set whose query or insert fails is logged and skipped. The
    ``refreshed_at`` / ``corpus_path`` metadata rows are written only when at
    least one count was stored (or there was nothing to compute), so a run in
    which every query failed does not mark the cache as freshly refreshed.
    All writes are committed together at the end of the run.

    Returns the number of keyword sets computed (0 if the corpus cannot be
    opened).
    """
    from urllib.parse import quote  # stdlib; local so module-level imports stay untouched

    from app.services.recipe.browser_domains import DOMAINS  # local import — avoid circular

    cache_path.parent.mkdir(parents=True, exist_ok=True)
    cache_conn = sqlite3.connect(cache_path)
    try:
        _ensure_schema(cache_conn)

        # Collect every unique keyword list across all domains/categories/subcategories.
        seen = _collect_keyword_sets(DOMAINS)

        # Percent-encode the path so characters such as "?", "#" or "%" are
        # not interpreted as URI syntax when SQLite parses the file: URI.
        corpus_uri = f"file:{quote(str(corpus_path))}?mode=ro"
        try:
            corpus_conn = sqlite3.connect(corpus_uri, uri=True)
        except Exception as exc:
            logger.error("browse_counts: cannot open corpus %s: %s", corpus_path, exc)
            return 0

        now = datetime.now(timezone.utc).isoformat()
        computed = 0

        try:
            for kw_key, kws in seen.items():
                try:
                    row = corpus_conn.execute(
                        "SELECT count(*) FROM recipe_browser_fts WHERE recipe_browser_fts MATCH ?",
                        (_fts_match_expr(kws),),
                    ).fetchone()
                    count = row[0] if row else 0
                    cache_conn.execute(
                        "INSERT OR REPLACE INTO browse_counts (keywords_key, count, computed_at)"
                        " VALUES (?, ?, ?)",
                        (kw_key, count, now),
                    )
                    computed += 1
                except Exception as exc:
                    # Best-effort per keyword set: one bad query must not
                    # abort the whole refresh.
                    logger.warning("browse_counts: query failed key=%r: %s", kw_key[:60], exc)

            if computed or not seen:
                cache_conn.executemany(
                    "INSERT OR REPLACE INTO browse_counts_meta (key, value) VALUES (?, ?)",
                    [("refreshed_at", now), ("corpus_path", corpus_path)],
                )
                cache_conn.commit()
                logger.info("browse_counts: wrote %d counts → %s", computed, cache_path)
            else:
                # Nothing was stored, so leave the refresh timestamp alone;
                # stamping it would make an unusable cache look fresh.
                logger.warning(
                    "browse_counts: all %d queries failed; not marking cache as refreshed",
                    len(seen),
                )
        finally:
            corpus_conn.close()
    finally:
        # Covers schema/collection failures and the early return above as well.
        cache_conn.close()

    return computed