fix(recipe_scan): harden JSON parser for real-world LLM output quirks

- Strip <think>/<thinking> blocks before parsing (Qwen3/DeepSeek-R1 emit these before the actual JSON answer)
- Replace the greedy regex with a brace-balanced _extract_json_object() so trailing prose after the closing } doesn't corrupt the extraction
- Use a non-greedy fence regex to pull JSON from inside ```json blocks
- Pass system= to LLMRouter.complete() with a terse JSON-only instruction so Ollama models receive it as a system message, not buried in the user turn
- Add logger.warning() on parse failure so the raw output is diagnosable
2026-05-17 08:30:55 -07:00 · 2026-05-17 08:30:55 -07:00 · 430600c1af
commit 430600c1af
parent 21a9b85067
1 changed files with 57 additions and 18 deletions
--- a/app/services/recipe/recipe_scanner.py
+++ b/app/services/recipe/recipe_scanner.py
@@ -291,7 +291,10 @@ def _call_vision_backend(
                        raise ValueError("Docuvision returned no text — image may not be a recipe")

                    _progress("structuring", "Parsing recipe structure...")
-                    text = LLMRouter().complete(_build_ocr_extraction_prompt(combined_ocr))
+                    text = LLMRouter().complete(
+                        _build_ocr_extraction_prompt(combined_ocr),
+                        system="You are a recipe data extractor. Return ONLY valid JSON. No markdown, no explanation, no code fences.",
+                    )
                    if text:
                        return text

@@ -329,40 +332,76 @@ def _normalize_ingredient_name(name: str) -> str:
    return name.lower().strip()


+def _extract_json_object(text: str) -> str | None:
+    """Return the first balanced JSON object from text, or None if not found.
+
+    Uses brace-counting rather than a greedy regex so trailing prose and
+    nested objects are handled correctly.
+    """
+    start = text.find("{")
+    if start == -1:
+        return None
+    depth = 0
+    in_string = False
+    escape_next = False
+    for i, ch in enumerate(text[start:], start):
+        if escape_next:
+            escape_next = False
+            continue
+        if ch == "\\" and in_string:
+            escape_next = True
+            continue
+        if ch == '"':
+            in_string = not in_string
+            continue
+        if in_string:
+            continue
+        if ch == "{":
+            depth += 1
+        elif ch == "}":
+            depth -= 1
+            if depth == 0:
+                return text[start : i + 1]
+    return None
+
+
 def _parse_scanner_json(raw_text: str) -> dict:
    """Extract and return the JSON dict from VLM output.

    Handles:
    - Pure JSON
-    - JSON wrapped in ```json ... ``` markdown fences
-    - JSON preceded by a line of prose ("Here is the recipe: {...}")
+    - JSON in ```json ... ``` markdown fences
+    - Qwen3-style <think>...</think> or <thinking>...</thinking> preambles
+    - JSON preceded or followed by prose

    Raises ValueError on not_a_recipe or unparseable output.
    """
    text = raw_text.strip()

-    # Strip markdown fences if present
-    if text.startswith("```"):
-        parts = text.split("```")
-        for part in parts:
-            part = part.strip()
-            if part.startswith("json"):
-                part = part[4:].strip()
-            if part.startswith("{"):
-                text = part
-                break
+    # Strip thinking-token blocks emitted by reasoning models (Qwen3, DeepSeek-R1, etc.)
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL | re.IGNORECASE).strip()
+    text = re.sub(r"<thinking>.*?</thinking>", "", text, flags=re.DOTALL | re.IGNORECASE).strip()

-    # Try direct parse first
+    # Strip markdown fences if present
+    if "```" in text:
+        # Find the content between the first ``` pair
+        fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
+        if fence_match:
+            text = fence_match.group(1).strip()
+
+    # Try direct parse
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
-        # Extract first JSON object embedded in prose
-        match = re.search(r"\{.*\}", text, re.DOTALL)
-        if not match:
+        # Fall back to brace-balanced extraction from anywhere in the output
+        candidate = _extract_json_object(text)
+        if not candidate:
+            logger.warning("Could not parse JSON from LLM output (first 400 chars): %r", text[:400])
            raise ValueError(f"Could not parse JSON from VLM output: {text[:200]!r}")
        try:
-            data = json.loads(match.group(0))
+            data = json.loads(candidate)
        except json.JSONDecodeError as exc:
+            logger.warning("Brace-extracted JSON still invalid: %r", candidate[:400])
            raise ValueError(f"Could not parse JSON from VLM output: {exc}") from exc

    if isinstance(data, dict) and data.get("error") == "not_a_recipe":