fix(recipe_scan): harden JSON parser for real-world LLM output quirks

- Strip <think>/<thinking> blocks before parsing (Qwen3/DeepSeek-R1 emit these before the actual JSON answer)
- Replace the greedy regex with a brace-balanced _extract_json_object() so trailing prose after the closing } doesn't corrupt the extract
- Use a non-greedy fence regex to pull JSON from inside ```json blocks
- Pass system= to LLMRouter.complete() with a terse JSON-only instruction so Ollama models receive it as a system message, not buried in the user turn
- Add logger.warning() on parse failure so the raw output is diagnosable
2026-05-17 08:30:55 -07:00 · 2026-05-17 08:30:55 -07:00 · 430600c1af
commit 430600c1af
parent 21a9b85067
1 changed files with 57 additions and 18 deletions
--- a/app/services/recipe/recipe_scanner.py
+++ b/app/services/recipe/recipe_scanner.py
@@ -291,7 +291,10 @@ def _call_vision_backend(
                        raise ValueError("Docuvision returned no text — image may not be a recipe")
                    _progress("structuring", "Parsing recipe structure...")
-                    text = LLMRouter().complete(_build_ocr_extraction_prompt(combined_ocr))
+                    text = LLMRouter().complete(
                        _build_ocr_extraction_prompt(combined_ocr),
                        system="You are a recipe data extractor. Return ONLY valid JSON. No markdown, no explanation, no code fences.",
                    )
                    if text:
                        return text
@@ -329,40 +332,76 @@ def _normalize_ingredient_name(name: str) -> str:
    return name.lower().strip()
 def _extract_json_object(text: str) -> str | None:
    """Return the first balanced JSON object from text, or None if not found.
    Uses brace-counting rather than a greedy regex so trailing prose and
    nested objects are handled correctly.
    """
    start = text.find("{")
    if start == -1:
        return None
    depth = 0
    in_string = False
    escape_next = False
    for i, ch in enumerate(text[start:], start):
        if escape_next:
            escape_next = False
            continue
        if ch == "\\" and in_string:
            escape_next = True
            continue
        if ch == '"':
            in_string = not in_string
            continue
        if in_string:
            continue
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[start : i + 1]
    return None
 def _parse_scanner_json(raw_text: str) -> dict:
    """Extract and return the JSON dict from VLM output.
    Handles:
    - Pure JSON
-    - JSON wrapped in ```json ... ``` markdown fences
+    - JSON in ```json ... ``` markdown fences
-    - JSON preceded by a line of prose ("Here is the recipe: {...}")
+    - Qwen3-style <think>...</think> or <thinking>...</thinking> preambles
    - JSON preceded or followed by prose
    Raises ValueError on not_a_recipe or unparseable output.
    """
    text = raw_text.strip()
-    # Strip markdown fences if present
+    # Strip thinking-token blocks emitted by reasoning models (Qwen3, DeepSeek-R1, etc.)
-    if text.startswith("```"):
+    text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL | re.IGNORECASE).strip()
-        parts = text.split("```")
+    text = re.sub(r"<thinking>.*?</thinking>", "", text, flags=re.DOTALL | re.IGNORECASE).strip()
        for part in parts:
            part = part.strip()
            if part.startswith("json"):
                part = part[4:].strip()
            if part.startswith("{"):
                text = part
                break
-    # Try direct parse first
+    # Strip markdown fences if present
    if "```" in text:
        # Find the content between the first ``` pair
        fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
        if fence_match:
            text = fence_match.group(1).strip()
    # Try direct parse
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
-        # Extract first JSON object embedded in prose
+        # Fall back to brace-balanced extraction from anywhere in the output
-        match = re.search(r"\{.*\}", text, re.DOTALL)
+        candidate = _extract_json_object(text)
-        if not match:
+        if not candidate:
            logger.warning("Could not parse JSON from LLM output (first 400 chars): %r", text[:400])
            raise ValueError(f"Could not parse JSON from VLM output: {text[:200]!r}")
        try:
-            data = json.loads(match.group(0))
+            data = json.loads(candidate)
        except json.JSONDecodeError as exc:
            logger.warning("Brace-extracted JSON still invalid: %r", candidate[:400])
            raise ValueError(f"Could not parse JSON from VLM output: {exc}") from exc
    if isinstance(data, dict) and data.get("error") == "not_a_recipe":