fix(recipe_scan): harden JSON parser for real-world LLM output quirks
- Strip <think>/<thinking> blocks before parsing (Qwen3/DeepSeek-R1 emit these before the actual JSON answer)
- Replace greedy regex with brace-balanced _extract_json_object() so trailing prose after } doesn't corrupt the extracted JSON
- Use non-greedy fence regex to pull JSON from inside ```json blocks
- Pass system= to LLMRouter.complete() with a terse JSON-only instruction so Ollama models receive it as a system message, not buried in the user turn
- Add logger.warning() on parse failure so raw output is diagnosable
This commit is contained in:
parent
21a9b85067
commit
430600c1af
1 changed files with 57 additions and 18 deletions
|
|
@ -291,7 +291,10 @@ def _call_vision_backend(
|
||||||
raise ValueError("Docuvision returned no text — image may not be a recipe")
|
raise ValueError("Docuvision returned no text — image may not be a recipe")
|
||||||
|
|
||||||
_progress("structuring", "Parsing recipe structure...")
|
_progress("structuring", "Parsing recipe structure...")
|
||||||
text = LLMRouter().complete(_build_ocr_extraction_prompt(combined_ocr))
|
text = LLMRouter().complete(
|
||||||
|
_build_ocr_extraction_prompt(combined_ocr),
|
||||||
|
system="You are a recipe data extractor. Return ONLY valid JSON. No markdown, no explanation, no code fences.",
|
||||||
|
)
|
||||||
if text:
|
if text:
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
|
@ -329,40 +332,76 @@ def _normalize_ingredient_name(name: str) -> str:
|
||||||
return name.lower().strip()
|
return name.lower().strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_json_object(text: str) -> str | None:
|
||||||
|
"""Return the first balanced JSON object from text, or None if not found.
|
||||||
|
|
||||||
|
Uses brace-counting rather than a greedy regex so trailing prose and
|
||||||
|
nested objects are handled correctly.
|
||||||
|
"""
|
||||||
|
start = text.find("{")
|
||||||
|
if start == -1:
|
||||||
|
return None
|
||||||
|
depth = 0
|
||||||
|
in_string = False
|
||||||
|
escape_next = False
|
||||||
|
for i, ch in enumerate(text[start:], start):
|
||||||
|
if escape_next:
|
||||||
|
escape_next = False
|
||||||
|
continue
|
||||||
|
if ch == "\\" and in_string:
|
||||||
|
escape_next = True
|
||||||
|
continue
|
||||||
|
if ch == '"':
|
||||||
|
in_string = not in_string
|
||||||
|
continue
|
||||||
|
if in_string:
|
||||||
|
continue
|
||||||
|
if ch == "{":
|
||||||
|
depth += 1
|
||||||
|
elif ch == "}":
|
||||||
|
depth -= 1
|
||||||
|
if depth == 0:
|
||||||
|
return text[start : i + 1]
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _parse_scanner_json(raw_text: str) -> dict:
|
def _parse_scanner_json(raw_text: str) -> dict:
|
||||||
"""Extract and return the JSON dict from VLM output.
|
"""Extract and return the JSON dict from VLM output.
|
||||||
|
|
||||||
Handles:
|
Handles:
|
||||||
- Pure JSON
|
- Pure JSON
|
||||||
- JSON wrapped in ```json ... ``` markdown fences
|
- JSON in ```json ... ``` markdown fences
|
||||||
- JSON preceded by a line of prose ("Here is the recipe: {...}")
|
- Qwen3-style <think>...</think> or <thinking>...</thinking> preambles
|
||||||
|
- JSON preceded or followed by prose
|
||||||
|
|
||||||
Raises ValueError on not_a_recipe or unparseable output.
|
Raises ValueError on not_a_recipe or unparseable output.
|
||||||
"""
|
"""
|
||||||
text = raw_text.strip()
|
text = raw_text.strip()
|
||||||
|
|
||||||
# Strip markdown fences if present
|
# Strip thinking-token blocks emitted by reasoning models (Qwen3, DeepSeek-R1, etc.)
|
||||||
if text.startswith("```"):
|
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL | re.IGNORECASE).strip()
|
||||||
parts = text.split("```")
|
text = re.sub(r"<thinking>.*?</thinking>", "", text, flags=re.DOTALL | re.IGNORECASE).strip()
|
||||||
for part in parts:
|
|
||||||
part = part.strip()
|
|
||||||
if part.startswith("json"):
|
|
||||||
part = part[4:].strip()
|
|
||||||
if part.startswith("{"):
|
|
||||||
text = part
|
|
||||||
break
|
|
||||||
|
|
||||||
# Try direct parse first
|
# Strip markdown fences if present
|
||||||
|
if "```" in text:
|
||||||
|
# Find the content between the first ``` pair
|
||||||
|
fence_match = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
|
||||||
|
if fence_match:
|
||||||
|
text = fence_match.group(1).strip()
|
||||||
|
|
||||||
|
# Try direct parse
|
||||||
try:
|
try:
|
||||||
data = json.loads(text)
|
data = json.loads(text)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
# Extract first JSON object embedded in prose
|
# Fall back to brace-balanced extraction from anywhere in the output
|
||||||
match = re.search(r"\{.*\}", text, re.DOTALL)
|
candidate = _extract_json_object(text)
|
||||||
if not match:
|
if not candidate:
|
||||||
|
logger.warning("Could not parse JSON from LLM output (first 400 chars): %r", text[:400])
|
||||||
raise ValueError(f"Could not parse JSON from VLM output: {text[:200]!r}")
|
raise ValueError(f"Could not parse JSON from VLM output: {text[:200]!r}")
|
||||||
try:
|
try:
|
||||||
data = json.loads(match.group(0))
|
data = json.loads(candidate)
|
||||||
except json.JSONDecodeError as exc:
|
except json.JSONDecodeError as exc:
|
||||||
|
logger.warning("Brace-extracted JSON still invalid: %r", candidate[:400])
|
||||||
raise ValueError(f"Could not parse JSON from VLM output: {exc}") from exc
|
raise ValueError(f"Could not parse JSON from VLM output: {exc}") from exc
|
||||||
|
|
||||||
if isinstance(data, dict) and data.get("error") == "not_a_recipe":
|
if isinstance(data, dict) and data.get("error") == "not_a_recipe":
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue