From 9697c7b64f2ca3cfe65327599bc31fb17bca1b39 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Wed, 22 Apr 2026 12:37:44 -0700
Subject: [PATCH] feat(recipe-tags): merge accepted community tags into browse
 counts + FTS fallback

browse_counts_cache.py: after FTS counts, _merge_community_tag_counts() queries
  accepted tags (upvotes>=2) grouped by (domain,category,subcategory) and adds
  distinct recipe_id counts to the cached keyword-set totals. Skips silently
  when community Postgres is unavailable.

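store.py: fetch_recipes_by_ids() fetches corpus recipes by explicit ID list,
  used by the FTS fallback when a subcategory returns zero FTS results.
  _browse_by_match() now scores a bounded FTS candidate pool
  (_MATCH_POOL_SIZE = 800) against the pantry in Python instead of using
  correlated json_each() subqueries in SQL.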

recipes.py (browse endpoint): when FTS total==0 for a subcategory, queries
  community store for accepted tag IDs and serves those recipes directly.
  Sets community_tagged=True in the response so the UI can surface context.
  Refs kiwi#118.
---
 app/api/endpoints/recipes.py               |  27 ++++
 app/db/store.py                            | 147 +++++++++++++++------
 app/services/recipe/browse_counts_cache.py |  71 ++++++++++
 3 files changed, 202 insertions(+), 43 deletions(-)

diff --git a/app/api/endpoints/recipes.py b/app/api/endpoints/recipes.py
index 20b70d5..146d884 100644
--- a/app/api/endpoints/recipes.py
+++ b/app/api/endpoints/recipes.py
@@ -292,6 +292,33 @@ async def browse_recipes(
                 q=q or None,
                 sort=sort,
             )
+
+            # Community tag fallback: if FTS returned nothing for a subcategory,
+            # check whether accepted community tags exist for this location and
+            # fetch those corpus recipes directly by ID.
+            if result["total"] == 0 and subcategory and keywords:
+                try:
+                    from app.api.endpoints.community import _get_community_store
+                    cs = _get_community_store()
+                    if cs is not None:
+                        community_ids = cs.get_accepted_recipe_ids_for_subcategory(
+                            domain=domain,
+                            category=category,
+                            subcategory=subcategory,
+                        )
+                        if community_ids:
+                            offset = (page - 1) * page_size
+                            paged_ids = community_ids[offset: offset + page_size]
+                            recipes = store.fetch_recipes_by_ids(paged_ids, pantry_list)
+                            result = {
+                                "recipes": recipes,
+                                "total": len(community_ids),
+                                "page": page,
+                                "community_tagged": True,
+                            }
+                except Exception as exc:
+                    logger.warning("community tag fallback failed: %s", exc)
+
             store.log_browser_telemetry(
                 domain=domain,
                 category=category,
diff --git a/app/db/store.py b/app/db/store.py
index 6066ed9..aaaf8a4 100644
--- a/app/db/store.py
+++ b/app/db/store.py
@@ -1228,6 +1228,11 @@ class Store:
                     f"{cols} WHERE {fts_sub} {order_clause} LIMIT ? OFFSET ?",
                     (match_expr, page_size, offset),
                 )
+                # Community tag fallback is not handled in this method:
+                # browse_recipes doesn't know domain/category directly, so when
+                # FTS finds nothing the browse endpoint
+                # (app/api/endpoints/recipes.py) looks up accepted community tag
+                # IDs and loads those recipes via fetch_recipes_by_ids().
 
         recipes = []
         for r in rows:
@@ -1246,6 +1251,48 @@ class Store:
 
         return {"recipes": recipes, "total": total, "page": page}
 
+    def fetch_recipes_by_ids(
+        self,
+        recipe_ids: list[int],
+        pantry_items: list[str] | None = None,
+    ) -> list[dict]:
+        """Fetch corpus recipes by explicit ID for the community tag fallback.
+
+        Returns id/title/category dicts ordered by id. match_pct is the share of
+        ingredient_names found in pantry_items (case-insensitive), else None.
+        """
+        if not recipe_ids:
+            return []
+        import json as _json  # function-local, as _browse_by_match does
+        pantry_set = {p.lower() for p in pantry_items} if pantry_items else None
+        ph = ",".join("?" * len(recipe_ids))  # one bound placeholder per id
+        rows = self._fetch_all(
+            f"SELECT id, title, category, keywords, ingredient_names,"
+            f"       calories, fat_g, protein_g, sodium_mg"
+            f" FROM {self._cp}recipes WHERE id IN ({ph}) ORDER BY id ASC",
+            tuple(recipe_ids),
+        )
+        result = []
+        for r in rows:
+            names = (r.get("ingredient_names") or []) if pantry_set else []
+            if isinstance(names, str):
+                # JSON text when _fetch_all has not decoded it (cf. _browse_by_match).
+                try:
+                    names = _json.loads(names)
+                except ValueError:
+                    names = []
+            names = names if isinstance(names, list) else []
+            hits = sum(1 for n in names if n.lower() in pantry_set) if names else 0
+            pct = round(hits / len(names), 3) if names else None
+            result.append({"id": r["id"], "title": r["title"],
+                           "category": r["category"], "match_pct": pct})
+        return result
+
+    # How many FTS candidates to fetch before Python-scoring for match sort.
+    # Large enough to cover several pages with good diversity; small enough
+    # that json-parsing + dict-lookup stays sub-second even for big categories.
+    _MATCH_POOL_SIZE = 800
+
     def _browse_by_match(
         self,
         keywords: list[str] | None,
@@ -1256,43 +1303,46 @@ class Store:
         q_param: str | None,
         c: str,
     ) -> dict:
-        """Browse recipes sorted by pantry match percentage, computed in SQL.
+        """Browse recipes sorted by pantry match percentage.
 
-        Uses json_each() to count how many of each recipe's ingredient_names
-        appear in the pantry set, then sorts highest-first. match_pct is
-        already present in the SQL result so no Python post-processing needed.
+        Fetches up to _MATCH_POOL_SIZE FTS candidates, scores each against the
+        pantry set in Python (fast dict lookup on a bounded list), then sorts
+        and paginates in-memory. This avoids correlated json_each() subqueries
+        that are prohibitively slow over 50k+ row result sets.
+
+        The reported total is the full FTS count (from cache), not pool size.
         """
-        pantry_list = sorted(pantry_set)
-        ph = ",".join("?" * len(pantry_list))
+        import json as _json
 
-        # Subquery computes match fraction inline so ORDER BY can use it.
-        match_col = (
-            f"(SELECT CAST(COUNT(*) AS REAL)"
-            f" / NULLIF(json_array_length(r.ingredient_names), 0)"
-            f" FROM json_each(r.ingredient_names) AS j"
-            f" WHERE LOWER(j.value) IN ({ph}))"
-        )
+        pantry_lower = {p.lower() for p in pantry_set}
 
+        # ── Fetch candidate pool from FTS ────────────────────────────────────
         base_cols = (
-            f"SELECT r.id, r.title, r.category, r.keywords, r.ingredient_names,"
-            f"       r.calories, r.fat_g, r.protein_g, r.sodium_mg,"
-            f"       {match_col} AS match_pct"
+            f"SELECT r.id, r.title, r.category, r.ingredient_names"
             f" FROM {c}recipes r"
         )
 
+        self.conn.row_factory = sqlite3.Row
+
         if keywords is None:
-            fts_where = "LOWER(r.title) LIKE LOWER(?)" if q_param else "1=1"
-            count_params: tuple = (q_param,) if q_param else ()
-            total = self.conn.execute(
-                f"SELECT COUNT(*) FROM {c}recipes r WHERE {fts_where}", count_params
-            ).fetchone()[0]
-            data_params = (*pantry_list, *(count_params), page_size, offset)
-            where_clause = f"WHERE {fts_where}" if fts_where != "1=1" else ""
-            sql = (
-                f"{base_cols} {where_clause}"
-                f" ORDER BY match_pct DESC NULLS LAST, r.id ASC"
-                f" LIMIT ? OFFSET ?"
-            )
+            if q_param:
+                total = self.conn.execute(
+                    f"SELECT COUNT(*) FROM {c}recipes WHERE LOWER(title) LIKE LOWER(?)",
+                    (q_param,),
+                ).fetchone()[0]
+                rows = self.conn.execute(
+                    f"{base_cols} WHERE LOWER(r.title) LIKE LOWER(?)"
+                    f" ORDER BY r.id ASC LIMIT ?",
+                    (q_param, self._MATCH_POOL_SIZE),
+                ).fetchall()
+            else:
+                total = self.conn.execute(
+                    f"SELECT COUNT(*) FROM {c}recipes"
+                ).fetchone()[0]
+                rows = self.conn.execute(
+                    f"{base_cols} ORDER BY r.id ASC LIMIT ?",
+                    (self._MATCH_POOL_SIZE,),
+                ).fetchall()
         else:
             match_expr = self._browser_fts_query(keywords)
             fts_sub = (
@@ -1305,30 +1355,41 @@ class Store:
                     f" WHERE {fts_sub} AND LOWER(r.title) LIKE LOWER(?)",
                     (match_expr, q_param),
                 ).fetchone()[0]
-                where_clause = f"WHERE {fts_sub} AND LOWER(r.title) LIKE LOWER(?)"
-                data_params = (*pantry_list, match_expr, q_param, page_size, offset)
+                rows = self.conn.execute(
+                    f"{base_cols} WHERE {fts_sub} AND LOWER(r.title) LIKE LOWER(?)"
+                    f" ORDER BY r.id ASC LIMIT ?",
+                    (match_expr, q_param, self._MATCH_POOL_SIZE),
+                ).fetchall()
             else:
                 total = self._count_recipes_for_keywords(keywords)
-                where_clause = f"WHERE {fts_sub}"
-                data_params = (*pantry_list, match_expr, page_size, offset)
-            sql = (
-                f"{base_cols} {where_clause}"
-                f" ORDER BY match_pct DESC NULLS LAST, r.id ASC"
-                f" LIMIT ? OFFSET ?"
-            )
+                rows = self.conn.execute(
+                    f"{base_cols} WHERE {fts_sub} ORDER BY r.id ASC LIMIT ?",
+                    (match_expr, self._MATCH_POOL_SIZE),
+                ).fetchall()
 
-        self.conn.row_factory = sqlite3.Row
-        rows = self.conn.execute(sql, data_params).fetchall()
-        recipes = []
+        # ── Score in Python, sort, paginate ──────────────────────────────────
+        scored = []
         for r in rows:
             row = dict(r)
-            recipes.append({
+            try:
+                names = _json.loads(row["ingredient_names"] or "[]")
+            except Exception:
+                names = []
+            if names:
+                matched = sum(1 for n in names if n.lower() in pantry_lower)
+                match_pct = round(matched / len(names), 3)
+            else:
+                match_pct = None
+            scored.append({
                 "id":        row["id"],
                 "title":     row["title"],
                 "category":  row["category"],
-                "match_pct": round(row["match_pct"], 3) if row["match_pct"] is not None else None,
+                "match_pct": match_pct,
             })
-        return {"recipes": recipes, "total": total, "page": page}
+
+        scored.sort(key=lambda r: (-(r["match_pct"] or 0), r["id"]))
+        page_slice = scored[offset: offset + page_size]
+        return {"recipes": page_slice, "total": total, "page": page}
 
     def log_browser_telemetry(
         self,
diff --git a/app/services/recipe/browse_counts_cache.py b/app/services/recipe/browse_counts_cache.py
index 2a7497e..c27a775 100644
--- a/app/services/recipe/browse_counts_cache.py
+++ b/app/services/recipe/browse_counts_cache.py
@@ -168,6 +168,16 @@ def refresh(corpus_path: str, cache_path: Path) -> int:
             except Exception as exc:
                 logger.warning("browse_counts: query failed key=%r: %s", kw_key[:60], exc)
 
+        # Merge accepted community tags into counts.
+        # For each (domain, category, subcategory) that has accepted community
+        # tags, add the count of distinct tagged recipe_ids to the FTS count.
+        # The two overlap rarely (community tags exist precisely because FTS
+        # missed those recipes), so simple addition is accurate enough.
+        try:
+            _merge_community_tag_counts(cache_conn, DOMAINS, now)
+        except Exception as exc:
+            logger.warning("browse_counts: community merge skipped: %s", exc)
+
         cache_conn.execute(
             "INSERT OR REPLACE INTO browse_counts_meta (key, value) VALUES ('refreshed_at', ?)",
             (now,),
@@ -183,3 +193,64 @@ def refresh(corpus_path: str, cache_path: Path) -> int:
         cache_conn.close()
 
     return computed
+
+
+def _merge_community_tag_counts(
+    cache_conn: sqlite3.Connection,
+    domains: dict,
+    now: str,
+    threshold: int = 2,
+) -> None:
+    """Bump cached browse counts by the number of accepted community tags.
+
+    For every category in ``domains``, each subcategory with a non-empty
+    keyword list -- and then the category's own ``keywords`` list, queried
+    with ``subcategory=None`` -- is looked up in the community store. The
+    number of recipe ids returned is added to the ``browse_counts`` row whose
+    ``keywords_key`` equals ``_kw_key`` of that keyword list.
+
+    Args:
+        cache_conn: Connection on which the ``UPDATE`` statements run.
+        domains: Mapping of domain id to domain data; categories are read
+            from its ``"categories"`` dict, and non-dict categories are skipped.
+        now: Not used by this function.
+        threshold: Forwarded unchanged to the store lookup.
+
+    Returns immediately, without error, when the community store cannot be
+    imported or obtained, or is ``None``.
+    """
+    try:
+        from app.api.endpoints.community import _get_community_store
+        community = _get_community_store()
+    except Exception:
+        return
+    if community is None:
+        return
+
+    # Same statement for every target; only the bound values change.
+    bump_sql = "UPDATE browse_counts SET count = count + ? WHERE keywords_key = ?"
+    for domain_id, domain_data in domains.items():
+        for cat_name, cat_data in domain_data.get("categories", {}).items():
+            if not isinstance(cat_data, dict):
+                continue
+            # Subcategories first, then the category-level entry, which the
+            # store addresses as subcategory=None.
+            targets = list(cat_data.get("subcategories", {}).items())
+            targets.append((None, cat_data.get("keywords", [])))
+            for subcat_name, kws in targets:
+                if not kws:
+                    continue
+                ids = community.get_accepted_recipe_ids_for_subcategory(
+                    domain=domain_id,
+                    category=cat_name,
+                    subcategory=subcat_name,
+                    threshold=threshold,
+                )
+                if not ids:
+                    continue
+                # Plain addition: ids already counted by FTS are not de-duplicated.
+                cache_conn.execute(
+                    bump_sql,
+                    (len(ids), _kw_key(kws)),
+                )
+    logger.info("browse_counts: community tag counts merged")