fix(hypothesizer): extract first JSON array to handle reasoning model double-output

Reasoning models (e.g. foundation-sec-8b) emit valid JSON and then repeat it inside a markdown fence block, so json.loads() fails on the combined text. extract_first_json_array() scans for the first '[' and walks to its matching ']' with proper string/escape/nesting handling, then returns just that slice. Combined with strip_json_fences(), this handles all observed output patterns: (1) a bare JSON array (standard models); (2) a fenced JSON array (fence-wrapping models); (3) a bare array followed by a fenced repeat (reasoning models).
2026-05-25 21:01:14 -07:00 · 2026-05-25 21:01:14 -07:00 · e851099e5c
commit e851099e5c
parent b19bea8f2a
2 changed files with 45 additions and 2 deletions
--- a/app/services/diagnose/_llm_client.py
+++ b/app/services/diagnose/_llm_client.py
@ -43,6 +43,47 @@ def strip_json_fences(raw: str) -> str:
    return _JSON_FENCE_RE.sub("", raw).strip()


+def extract_first_json_array(raw: str) -> str:
+    """Return the first balanced ``[...]`` slice of *raw*.
+
+    Some reasoning models (e.g. foundation-sec-8b) print a valid JSON array
+    and then print it again inside a markdown fence, so json.loads() rejects
+    the combined text. Scanning starts at the first '[' and stops at the ']'
+    that closes it; brackets inside double-quoted strings are not counted and
+    a backslash inside a string makes the following character literal.
+
+    If *raw* has no '[' or the brackets never balance, *raw* is returned
+    unchanged so the caller's json.loads() fails with its usual error.
+    """
+    begin = raw.find("[")
+    if begin < 0:
+        return raw
+
+    nesting = 0  # open '[' seen outside strings and not yet closed
+    inside_string = False
+    char_is_escaped = False  # previous in-string character was a backslash
+
+    for pos in range(begin, len(raw)):
+        ch = raw[pos]
+        if inside_string:
+            if char_is_escaped:
+                char_is_escaped = False
+            elif ch == "\\":
+                char_is_escaped = True
+            elif ch == '"':
+                inside_string = False
+        elif ch == '"':
+            inside_string = True
+        elif ch == "[":
+            nesting += 1
+        elif ch == "]":
+            nesting -= 1
+            if nesting == 0:
+                return raw[begin : pos + 1]
+
+    return raw  # never balanced; the caller's parse reports the problem
+
+
 def call_llm(
    llm_url: str,
    llm_model: str,
--- a/app/services/diagnose/hypothesizer.py
+++ b/app/services/diagnose/hypothesizer.py
@ -6,7 +6,7 @@ import logging
 from uuid import uuid4

 from app.context.retriever import RetrievedContext
-from app.services.diagnose._llm_client import call_llm, strip_json_fences
+from app.services.diagnose._llm_client import call_llm, extract_first_json_array, strip_json_fences
 from app.services.diagnose.models import (
    ClassifiedTimeline,
    EventCluster,
@ -129,7 +129,9 @@ class RootCauseHypothesizer:
        triple-backtick fences despite being instructed not to.
        """
        try:
-            data = json.loads(strip_json_fences(raw))
+            # extract_first_json_array handles reasoning models that emit valid
+            # JSON then repeat it inside a markdown fence block.
+            data = json.loads(extract_first_json_array(strip_json_fences(raw)))
        except json.JSONDecodeError:
            logger.warning(
                "Hypothesizer: invalid JSON from LLM (truncated): %.120s", raw