From 1c0a747c466c17f5e518da25ec5c274486b122c5 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Wed, 27 May 2026 22:23:36 -0700
Subject: [PATCH] fix(diagnose): add max_tokens to all LLM calls; fix reasoning
 card contrast

Truncation fix: call_llm() in _llm_client.py now accepts max_tokens (default
2048) and passes it in both the cf-orch task payload and the OpenAI-compat
fallback body. Hypothesizer uses max_tokens=1024 (JSON array output);
synthesizer uses the 2048 default (structured 5-section narrative); legacy
summarize() in llm.py sends a hard-coded max_tokens of 1024 on both paths.
Without this, backends use their own default (often 512 tokens), causing
mid-sentence truncation of the diagnosis output.

UI fix: reasoning card changed from bg-accent/5 border-accent/30 (opacity
modifiers on CSS variables don't compose reliably across themes) to the
callout pattern: bg-surface-raised with a solid border-l-4 border-accent.
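Header label changed from text-text-dim to text-accent and from font-medium
to font-semibold for visual anchoring; the decorative icon is now marked
aria-hidden so screen readers skip it.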
Text remains text-text-primary for guaranteed contrast on both light and dark
themes.

Tracks: #56 (technical-level post-processor, filed as follow-on feature)
---
 app/services/diagnose/_llm_client.py  | 7 +++++--
 app/services/diagnose/hypothesizer.py | 1 +
 app/services/llm.py                   | 4 ++--
 web/src/components/QuickCapture.vue   | 6 +++---
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/app/services/diagnose/_llm_client.py b/app/services/diagnose/_llm_client.py
index 7a93196..6c4a507 100644
--- a/app/services/diagnose/_llm_client.py
+++ b/app/services/diagnose/_llm_client.py
@@ -91,6 +91,7 @@ def call_llm(
     messages: list[dict],
     task_name: str = "log_analysis",
     timeout: float = 120.0,
+    max_tokens: int = 2048,
 ) -> str | None:
     """Send messages to the LLM; return raw text or None on failure.
 
@@ -106,6 +107,8 @@ def call_llm(
         messages:    OpenAI-style message list (system + user turns).
         task_name:   cf-orch task name for product-routed inference (default: ``log_analysis``).
         timeout:     Request timeout in seconds (default: 120).
+        max_tokens:  Maximum tokens to generate (default: 2048). Prevents mid-sentence
+                     truncation when the backend default is lower than the output needs.
 
     Returns:
         Raw text content string, or None if both paths fail.
@@ -122,7 +125,7 @@ def call_llm(
             json={
                 "product": "turnstone",
                 "task": task_name,
-                "payload": {"messages": messages, "stream": False},
+                "payload": {"messages": messages, "stream": False, "max_tokens": max_tokens},
             },
             headers=headers,
             timeout=timeout,
@@ -146,7 +149,7 @@ def call_llm(
     try:
         resp = httpx.post(
             f"{llm_url.rstrip('/')}/v1/chat/completions",
-            json={"model": llm_model, "messages": messages, "stream": False},
+            json={"model": llm_model, "messages": messages, "stream": False, "max_tokens": max_tokens},
             headers=headers,
             timeout=timeout,
         )
diff --git a/app/services/diagnose/hypothesizer.py b/app/services/diagnose/hypothesizer.py
index 433d9fb..328e733 100644
--- a/app/services/diagnose/hypothesizer.py
+++ b/app/services/diagnose/hypothesizer.py
@@ -116,6 +116,7 @@ class RootCauseHypothesizer:
             llm_model=llm_model,
             llm_api_key=llm_api_key,
             messages=messages,
+            max_tokens=1024,  # JSON array of 2-4 hypotheses; 1024 is sufficient
         )
         if raw_response is None:
             return []
diff --git a/app/services/llm.py b/app/services/llm.py
index 9a33e10..6e26a16 100644
--- a/app/services/llm.py
+++ b/app/services/llm.py
@@ -73,7 +73,7 @@ def summarize(
             json={
                 "product": "turnstone",
                 "task": "log_analysis",
-                "payload": {"messages": messages, "stream": False},
+                "payload": {"messages": messages, "stream": False, "max_tokens": 1024},
             },
             headers=headers,
             timeout=timeout,
@@ -92,7 +92,7 @@ def summarize(
     try:
         resp = httpx.post(
             f"{llm_url.rstrip('/')}/v1/chat/completions",
-            json={"model": llm_model, "messages": messages, "stream": False},
+            json={"model": llm_model, "messages": messages, "stream": False, "max_tokens": 1024},
             headers=headers,
             timeout=timeout,
         )
diff --git a/web/src/components/QuickCapture.vue b/web/src/components/QuickCapture.vue
index 0d40398..240c975 100644
--- a/web/src/components/QuickCapture.vue
+++ b/web/src/components/QuickCapture.vue
@@ -85,10 +85,10 @@
     <!-- LLM reasoning card -->
     <div
       v-if="reasoning"
-      class="mb-4 rounded border border-accent/30 bg-accent/5 p-4"
+      class="mb-4 rounded-r border-l-4 border-accent bg-surface-raised p-4"
     >
-      <div class="flex items-center gap-2 mb-2 text-xs text-text-dim font-medium uppercase tracking-wide">
-        <span>⚡</span>
+      <div class="flex items-center gap-2 mb-2 text-xs text-accent font-semibold uppercase tracking-wide">
+        <span aria-hidden="true">⚡</span>
         <span>Diagnosis</span>
       </div>
       <p class="text-sm text-text-primary leading-relaxed whitespace-pre-wrap">{{ reasoning }}</p>