fix(diagnose): add max_tokens to all LLM calls; fix reasoning card contrast

Truncation fix: call_llm() in _llm_client.py now accepts max_tokens (default
2048) and passes it in both the cf-orch task payload and the OpenAI-compat
fallback body. Hypothesizer uses max_tokens=1024 (JSON array output);
synthesizer uses 2048 (structured 5-section narrative); legacy summarize
sets max_tokens=1024 directly in both of its request bodies.
Without this, backends use their own default (often 512 tokens), causing
mid-sentence truncation of the diagnosis output.

UI fix: reasoning card changed from bg-accent/5 border-accent/30 to the
callout pattern: bg-surface-raised with a solid border-l-4 border-accent.
Opacity modifiers on CSS variables don't compose reliably across themes.
Header label changed from text-text-dim to text-accent for visual anchoring.
Text remains text-text-primary for guaranteed contrast on both light and dark
themes.

Tracks: #56 (technical-level post-processor, filed as follow-on feature)
This commit is contained in:
pyr0ball 2026-05-27 22:23:36 -07:00
parent 7f49961ec4
commit 73a14bd782
4 changed files with 11 additions and 7 deletions

View file

@ -91,6 +91,7 @@ def call_llm(
messages: list[dict],
task_name: str = "log_analysis",
timeout: float = 120.0,
max_tokens: int = 2048,
) -> str | None:
"""Send messages to the LLM; return raw text or None on failure.
@ -106,6 +107,8 @@ def call_llm(
messages: OpenAI-style message list (system + user turns).
task_name: cf-orch task name for product-routed inference (default: ``log_analysis``).
timeout: Request timeout in seconds (default: 120).
max_tokens: Maximum tokens to generate (default: 2048). Prevents mid-sentence
truncation when the backend default is lower than the output needs.
Returns:
Raw text content string, or None if both paths fail.
@ -122,7 +125,7 @@ def call_llm(
json={
"product": "turnstone",
"task": task_name,
"payload": {"messages": messages, "stream": False},
"payload": {"messages": messages, "stream": False, "max_tokens": max_tokens},
},
headers=headers,
timeout=timeout,
@ -146,7 +149,7 @@ def call_llm(
try:
resp = httpx.post(
f"{llm_url.rstrip('/')}/v1/chat/completions",
json={"model": llm_model, "messages": messages, "stream": False},
json={"model": llm_model, "messages": messages, "stream": False, "max_tokens": max_tokens},
headers=headers,
timeout=timeout,
)

View file

@ -116,6 +116,7 @@ class RootCauseHypothesizer:
llm_model=llm_model,
llm_api_key=llm_api_key,
messages=messages,
max_tokens=1024, # JSON array of 2-4 hypotheses; 1024 is sufficient
)
if raw_response is None:
return []

View file

@ -73,7 +73,7 @@ def summarize(
json={
"product": "turnstone",
"task": "log_analysis",
"payload": {"messages": messages, "stream": False},
"payload": {"messages": messages, "stream": False, "max_tokens": 1024},
},
headers=headers,
timeout=timeout,
@ -92,7 +92,7 @@ def summarize(
try:
resp = httpx.post(
f"{llm_url.rstrip('/')}/v1/chat/completions",
json={"model": llm_model, "messages": messages, "stream": False},
json={"model": llm_model, "messages": messages, "stream": False, "max_tokens": 1024},
headers=headers,
timeout=timeout,
)

View file

@ -85,10 +85,10 @@
<!-- LLM reasoning card -->
<div
v-if="reasoning"
class="mb-4 rounded border border-accent/30 bg-accent/5 p-4"
class="mb-4 rounded-r border-l-4 border-accent bg-surface-raised p-4"
>
<div class="flex items-center gap-2 mb-2 text-xs text-text-dim font-medium uppercase tracking-wide">
<span></span>
<div class="flex items-center gap-2 mb-2 text-xs text-accent font-semibold uppercase tracking-wide">
<span aria-hidden="true"></span>
<span>Diagnosis</span>
</div>
<p class="text-sm text-text-primary leading-relaxed whitespace-pre-wrap">{{ reasoning }}</p>