feat(recipe-tags): merge accepted community tags into browse counts + FTS fallback

browse_counts_cache.py: after FTS counts, _merge_community_tag_counts() queries
  accepted tags (upvotes>=2) grouped by (domain,category,subcategory) and adds
  distinct recipe_id counts to the cached keyword-set totals. Skips silently
  when community Postgres is unavailable.

store.py: fetch_recipes_by_ids() fetches corpus recipes by explicit ID list,
  used by the FTS fallback when a subcategory returns zero FTS results.

recipes.py (browse endpoint): when FTS total==0 for a subcategory, queries
  community store for accepted tag IDs and serves those recipes directly.
  Sets community_tagged=True in the response so the UI can surface context.
  Refs kiwi#118.
This commit is contained in:
pyr0ball 2026-04-22 12:37:44 -07:00
parent f962748073
commit 9697c7b64f
3 changed files with 202 additions and 43 deletions

View file

@ -292,6 +292,33 @@ async def browse_recipes(
q=q or None, q=q or None,
sort=sort, sort=sort,
) )
# Community tag fallback: if FTS returned nothing for a subcategory,
# check whether accepted community tags exist for this location and
# fetch those corpus recipes directly by ID.
if result["total"] == 0 and subcategory and keywords:
    try:
        # Lazy import: community features are optional and this module
        # must stay importable when the community store is absent.
        from app.api.endpoints.community import _get_community_store
        cs = _get_community_store()
        if cs is not None:
            # NOTE(review): no threshold= is passed here, unlike the
            # browse-counts merge which uses threshold=2 — confirm the
            # store's default matches so counts and results agree.
            community_ids = cs.get_accepted_recipe_ids_for_subcategory(
                domain=domain,
                category=category,
                subcategory=subcategory,
            )
            if community_ids:
                # Paginate over the full accepted-ID list ourselves,
                # then fetch only the current page from the corpus.
                offset = (page - 1) * page_size
                paged_ids = community_ids[offset: offset + page_size]
                recipes = store.fetch_recipes_by_ids(paged_ids, pantry_list)
                result = {
                    "recipes": recipes,
                    # total reflects all accepted IDs, not just this page.
                    "total": len(community_ids),
                    "page": page,
                    # Lets the UI explain these are community-tagged hits.
                    "community_tagged": True,
                }
    except Exception as exc:
        # Best-effort fallback: never let a community-store outage break
        # the primary browse response (which is already in `result`).
        logger.warning("community tag fallback failed: %s", exc)
store.log_browser_telemetry( store.log_browser_telemetry(
domain=domain, domain=domain,
category=category, category=category,

View file

@ -1228,6 +1228,11 @@ class Store:
f"{cols} WHERE {fts_sub} {order_clause} LIMIT ? OFFSET ?", f"{cols} WHERE {fts_sub} {order_clause} LIMIT ? OFFSET ?",
(match_expr, page_size, offset), (match_expr, page_size, offset),
) )
# Community tag fallback: if FTS found nothing, check whether
# community-tagged recipe IDs exist for this keyword context.
# browse_recipes doesn't know domain/category directly, so the
# fallback is triggered by the caller via community_ids= when needed.
# (See browse_recipes_with_community_fallback in the endpoint layer.)
recipes = [] recipes = []
for r in rows: for r in rows:
@ -1246,6 +1251,48 @@ class Store:
return {"recipes": recipes, "total": total, "page": page} return {"recipes": recipes, "total": total, "page": page}
def fetch_recipes_by_ids(
self,
recipe_ids: list[int],
pantry_items: list[str] | None = None,
) -> list[dict]:
"""Fetch a specific set of corpus recipes by ID for community tag fallback.
Returns recipes in the same shape as browse_recipes rows, with match_pct
populated when pantry_items are provided.
"""
if not recipe_ids:
return []
c = self._cp
pantry_set = {p.lower() for p in pantry_items} if pantry_items else None
ph = ",".join("?" * len(recipe_ids))
rows = self._fetch_all(
f"SELECT id, title, category, keywords, ingredient_names,"
f" calories, fat_g, protein_g, sodium_mg"
f" FROM {c}recipes WHERE id IN ({ph}) ORDER BY id ASC",
tuple(recipe_ids),
)
result = []
for r in rows:
entry: dict = {
"id": r["id"],
"title": r["title"],
"category": r["category"],
"match_pct": None,
}
if pantry_set:
names = r.get("ingredient_names") or []
if names:
matched = sum(1 for n in names if n.lower() in pantry_set)
entry["match_pct"] = round(matched / len(names), 3)
result.append(entry)
return result
# How many FTS candidates to fetch before Python-scoring for match sort.
# Large enough to cover several pages with good diversity; small enough
# that json-parsing + dict-lookup stays sub-second even for big categories.
_MATCH_POOL_SIZE = 800
def _browse_by_match( def _browse_by_match(
self, self,
keywords: list[str] | None, keywords: list[str] | None,
@ -1256,43 +1303,46 @@ class Store:
q_param: str | None, q_param: str | None,
c: str, c: str,
) -> dict: ) -> dict:
"""Browse recipes sorted by pantry match percentage, computed in SQL. """Browse recipes sorted by pantry match percentage.
Uses json_each() to count how many of each recipe's ingredient_names Fetches up to _MATCH_POOL_SIZE FTS candidates, scores each against the
appear in the pantry set, then sorts highest-first. match_pct is pantry set in Python (fast dict lookup on a bounded list), then sorts
already present in the SQL result so no Python post-processing needed. and paginates in-memory. This avoids correlated json_each() subqueries
that are prohibitively slow over 50k+ row result sets.
The reported total is the full FTS count (from cache), not pool size.
""" """
pantry_list = sorted(pantry_set) import json as _json
ph = ",".join("?" * len(pantry_list))
# Subquery computes match fraction inline so ORDER BY can use it. pantry_lower = {p.lower() for p in pantry_set}
match_col = (
f"(SELECT CAST(COUNT(*) AS REAL)"
f" / NULLIF(json_array_length(r.ingredient_names), 0)"
f" FROM json_each(r.ingredient_names) AS j"
f" WHERE LOWER(j.value) IN ({ph}))"
)
# ── Fetch candidate pool from FTS ────────────────────────────────────
base_cols = ( base_cols = (
f"SELECT r.id, r.title, r.category, r.keywords, r.ingredient_names," f"SELECT r.id, r.title, r.category, r.ingredient_names"
f" r.calories, r.fat_g, r.protein_g, r.sodium_mg,"
f" {match_col} AS match_pct"
f" FROM {c}recipes r" f" FROM {c}recipes r"
) )
self.conn.row_factory = sqlite3.Row
if keywords is None: if keywords is None:
fts_where = "LOWER(r.title) LIKE LOWER(?)" if q_param else "1=1" if q_param:
count_params: tuple = (q_param,) if q_param else ()
total = self.conn.execute( total = self.conn.execute(
f"SELECT COUNT(*) FROM {c}recipes r WHERE {fts_where}", count_params f"SELECT COUNT(*) FROM {c}recipes WHERE LOWER(title) LIKE LOWER(?)",
(q_param,),
).fetchone()[0] ).fetchone()[0]
data_params = (*pantry_list, *(count_params), page_size, offset) rows = self.conn.execute(
where_clause = f"WHERE {fts_where}" if fts_where != "1=1" else "" f"{base_cols} WHERE LOWER(r.title) LIKE LOWER(?)"
sql = ( f" ORDER BY r.id ASC LIMIT ?",
f"{base_cols} {where_clause}" (q_param, self._MATCH_POOL_SIZE),
f" ORDER BY match_pct DESC NULLS LAST, r.id ASC" ).fetchall()
f" LIMIT ? OFFSET ?" else:
) total = self.conn.execute(
f"SELECT COUNT(*) FROM {c}recipes"
).fetchone()[0]
rows = self.conn.execute(
f"{base_cols} ORDER BY r.id ASC LIMIT ?",
(self._MATCH_POOL_SIZE,),
).fetchall()
else: else:
match_expr = self._browser_fts_query(keywords) match_expr = self._browser_fts_query(keywords)
fts_sub = ( fts_sub = (
@ -1305,30 +1355,41 @@ class Store:
f" WHERE {fts_sub} AND LOWER(r.title) LIKE LOWER(?)", f" WHERE {fts_sub} AND LOWER(r.title) LIKE LOWER(?)",
(match_expr, q_param), (match_expr, q_param),
).fetchone()[0] ).fetchone()[0]
where_clause = f"WHERE {fts_sub} AND LOWER(r.title) LIKE LOWER(?)" rows = self.conn.execute(
data_params = (*pantry_list, match_expr, q_param, page_size, offset) f"{base_cols} WHERE {fts_sub} AND LOWER(r.title) LIKE LOWER(?)"
f" ORDER BY r.id ASC LIMIT ?",
(match_expr, q_param, self._MATCH_POOL_SIZE),
).fetchall()
else: else:
total = self._count_recipes_for_keywords(keywords) total = self._count_recipes_for_keywords(keywords)
where_clause = f"WHERE {fts_sub}" rows = self.conn.execute(
data_params = (*pantry_list, match_expr, page_size, offset) f"{base_cols} WHERE {fts_sub} ORDER BY r.id ASC LIMIT ?",
sql = ( (match_expr, self._MATCH_POOL_SIZE),
f"{base_cols} {where_clause}" ).fetchall()
f" ORDER BY match_pct DESC NULLS LAST, r.id ASC"
f" LIMIT ? OFFSET ?"
)
self.conn.row_factory = sqlite3.Row # ── Score in Python, sort, paginate ──────────────────────────────────
rows = self.conn.execute(sql, data_params).fetchall() scored = []
recipes = []
for r in rows: for r in rows:
row = dict(r) row = dict(r)
recipes.append({ try:
names = _json.loads(row["ingredient_names"] or "[]")
except Exception:
names = []
if names:
matched = sum(1 for n in names if n.lower() in pantry_lower)
match_pct = round(matched / len(names), 3)
else:
match_pct = None
scored.append({
"id": row["id"], "id": row["id"],
"title": row["title"], "title": row["title"],
"category": row["category"], "category": row["category"],
"match_pct": round(row["match_pct"], 3) if row["match_pct"] is not None else None, "match_pct": match_pct,
}) })
return {"recipes": recipes, "total": total, "page": page}
scored.sort(key=lambda r: (-(r["match_pct"] or 0), r["id"]))
page_slice = scored[offset: offset + page_size]
return {"recipes": page_slice, "total": total, "page": page}
def log_browser_telemetry( def log_browser_telemetry(
self, self,

View file

@ -168,6 +168,16 @@ def refresh(corpus_path: str, cache_path: Path) -> int:
except Exception as exc: except Exception as exc:
logger.warning("browse_counts: query failed key=%r: %s", kw_key[:60], exc) logger.warning("browse_counts: query failed key=%r: %s", kw_key[:60], exc)
# Merge accepted community tags into counts.
# For each (domain, category, subcategory) that has accepted community
# tags, add the count of distinct tagged recipe_ids to the FTS count.
# The two overlap rarely (community tags exist precisely because FTS
# missed those recipes), so simple addition is accurate enough.
try:
    _merge_community_tag_counts(cache_conn, DOMAINS, now)
except Exception as exc:
    # Best-effort: a community-store failure must never block the FTS
    # refresh itself; the cache simply stays FTS-only for this cycle.
    logger.warning("browse_counts: community merge skipped: %s", exc)
cache_conn.execute( cache_conn.execute(
"INSERT OR REPLACE INTO browse_counts_meta (key, value) VALUES ('refreshed_at', ?)", "INSERT OR REPLACE INTO browse_counts_meta (key, value) VALUES ('refreshed_at', ?)",
(now,), (now,),
@ -183,3 +193,64 @@ def refresh(corpus_path: str, cache_path: Path) -> int:
cache_conn.close() cache_conn.close()
return computed return computed
def _merge_community_tag_counts(
cache_conn: sqlite3.Connection,
domains: dict,
now: str,
threshold: int = 2,
) -> None:
"""Add accepted community tag counts on top of FTS counts in the cache.
Queries the community PostgreSQL store (if available) for accepted tags
grouped by (domain, category, subcategory), maps each back to its keyword
set key, then increments the cached count.
Silently skips if community features are unavailable.
"""
try:
from app.api.endpoints.community import _get_community_store
store = _get_community_store()
if store is None:
return
except Exception:
return
for domain_id, domain_data in domains.items():
for cat_name, cat_data in domain_data.get("categories", {}).items():
if not isinstance(cat_data, dict):
continue
# Check subcategories
for subcat_name, subcat_kws in cat_data.get("subcategories", {}).items():
if not subcat_kws:
continue
ids = store.get_accepted_recipe_ids_for_subcategory(
domain=domain_id,
category=cat_name,
subcategory=subcat_name,
threshold=threshold,
)
if not ids:
continue
kw_key = _kw_key(subcat_kws)
cache_conn.execute(
"UPDATE browse_counts SET count = count + ? WHERE keywords_key = ?",
(len(ids), kw_key),
)
# Check category-level tags (subcategory IS NULL)
top_kws = cat_data.get("keywords", [])
if top_kws:
ids = store.get_accepted_recipe_ids_for_subcategory(
domain=domain_id,
category=cat_name,
subcategory=None,
threshold=threshold,
)
if ids:
kw_key = _kw_key(top_kws)
cache_conn.execute(
"UPDATE browse_counts SET count = count + ? WHERE keywords_key = ?",
(len(ids), kw_key),
)
logger.info("browse_counts: community tag counts merged")