"""Shared LLM client for the multi-agent diagnose pipeline. Both Stage 3 (RootCauseHypothesizer) and Stage 5 (SummarySynthesizer) send messages to the same LLM backend using the same two-step pattern: 1. Try the cf-orch task endpoint → product-scoped inference routing. 2. Fall back to OpenAI-compat → direct model call by name. Centralising here means changes to auth headers, timeouts, retry logic, or cf-orch payload structure only need to be made once. """ from __future__ import annotations import logging import re import httpx logger = logging.getLogger(__name__) # Regex that strips ```json … ``` or ``` … ``` fences from LLM output. _JSON_FENCE_RE = re.compile( r"^```(?:json)?\s*|\s*```$", re.MULTILINE, ) def extract_content(resp_json: dict) -> str | None: """Pull text content from an OpenAI-compat chat completion response. Returns None when the response has no choices or empty content. """ choices = resp_json.get("choices") or [] if not choices: return None return (choices[0].get("message", {}).get("content") or "").strip() or None def strip_json_fences(raw: str) -> str: """Remove markdown code fences that some LLMs wrap around JSON output. Example: '```json\\n[...]\\n```' → '[...]' """ return _JSON_FENCE_RE.sub("", raw).strip() def extract_first_json_array(raw: str) -> str: """Extract the first complete JSON array from a string. Reasoning models (e.g. foundation-sec-8b) sometimes emit valid JSON and then repeat it inside a markdown fence. Standard json.loads() fails on the combined text. This function scans for the first '[' and walks to its matching ']', handling nested structures. Returns the extracted substring, or the original string if no array found (so the caller's json.loads() fails with the usual error message). """ start = raw.find("[") if start == -1: return raw depth = 0 in_string = False escape_next = False for i, ch in enumerate(raw[start:], start=start): if escape_next: escape_next = False continue if ch == "\\" and in_string: escape_next = True continue if ch == '"': in_string = not in_string continue if in_string: continue if ch == "[": depth += 1 elif ch == "]": depth -= 1 if depth == 0: return raw[start : i + 1] return raw # unbalanced — return as-is so caller sees the error def call_llm( llm_url: str, llm_model: str, llm_api_key: str | None, messages: list[dict], task_name: str = "log_analysis", timeout: float = 120.0, max_tokens: int = 2048, ) -> str | None: """Send messages to the LLM; return raw text or None on failure. Tries the cf-orch task endpoint first (product-routed inference). Falls back to a direct OpenAI-compat ``/v1/chat/completions`` call when: - The task endpoint returns 404 (no assignment for this task). - The task endpoint is unreachable (connection error, timeout, etc.). Args: llm_url: Base URL of the LLM backend (e.g. ``http://:7700``). llm_model: Model identifier used in the OpenAI-compat fallback call. llm_api_key: Optional bearer token for authenticated endpoints. messages: OpenAI-style message list (system + user turns). task_name: cf-orch task name for product-routed inference (default: ``log_analysis``). timeout: Request timeout in seconds (default: 120). max_tokens: Maximum tokens to generate (default: 2048). Prevents mid-sentence truncation when the backend default is lower than the output needs. Returns: Raw text content string, or None if both paths fail. """ headers: dict[str, str] = {} if llm_api_key: headers["Authorization"] = f"Bearer {llm_api_key}" # --- Path 1: cf-orch task endpoint --- task_url = f"{llm_url.rstrip('/')}/api/inference/task" try: resp = httpx.post( task_url, json={ "product": "turnstone", "task": task_name, "payload": {"messages": messages, "stream": False, "max_tokens": max_tokens}, }, headers=headers, timeout=timeout, ) if resp.status_code == 200: return extract_content(resp.json()) if resp.status_code != 404: resp.raise_for_status() logger.debug( "No task assignment for turnstone.%s — falling back to direct model", task_name, ) except Exception as exc: # noqa: BLE001 # Broad catch is intentional: captures network errors, timeouts, and # any backend-specific exceptions so the pipeline can fall back. logger.debug( "Task endpoint unavailable (%s) — falling back to direct model", exc ) # --- Path 2: OpenAI-compat fallback --- try: resp = httpx.post( f"{llm_url.rstrip('/')}/v1/chat/completions", json={"model": llm_model, "messages": messages, "stream": False, "max_tokens": max_tokens}, headers=headers, timeout=timeout, ) resp.raise_for_status() return extract_content(resp.json()) except Exception as exc: # noqa: BLE001 logger.warning("LLM call failed (%s): %s", type(exc).__name__, exc) return None