feat: try cf-orch task endpoint first; fall back to direct model call

POST /api/inference/task with product=turnstone and task=log_analysis routes to the security reasoning model assigned in cf-orch. Falls back to the OpenAI-compat /v1/chat/completions path on 404 (no assignment) or if the task endpoint is absent (local instances, example-node).
2026-05-13 08:20:29 -07:00 · 2026-05-13 08:20:29 -07:00 · 812c934822
commit 812c934822
parent 729b78e40f
1 changed files with 38 additions and 8 deletions
--- a/app/services/llm.py
+++ b/app/services/llm.py
@ -32,6 +32,14 @@ def _build_context(entries: list[SearchResult], max_entries: int = 25) -> str:
    )
 def _extract_content(resp_json: dict) -> str | None:
    """Pull text content from an OpenAI-compat chat completion response."""
    choices = resp_json.get("choices") or []
    if not choices:
        return None
    return (choices[0].get("message", {}).get("content") or "").strip() or None
 def summarize(
    query: str,
    entries: list[SearchResult],
@ -45,22 +53,44 @@ def summarize(
    log_block = _build_context(entries)
    prompt = _PROMPT_TEMPLATE.format(query=query, n=min(len(entries), 25), log_block=log_block)
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
    messages = [{"role": "user", "content": prompt}]
    # Try cf-orch task-based endpoint first (routes to the security reasoning model
    # assigned to turnstone.log_analysis without needing an explicit model name).
    task_url = f"{llm_url.rstrip('/')}/api/inference/task"
    try:
        resp = httpx.post(
-            f"{llm_url.rstrip('/')}/v1/chat/completions",
+            task_url,
            json={
-                "model": llm_model,
+                "product": "turnstone",
-                "messages": [{"role": "user", "content": prompt}],
+                "task": "log_analysis",
-                "stream": False,
+                "payload": {"messages": messages, "stream": False},
            },
            headers=headers,
            timeout=timeout,
        )
        if resp.status_code == 200:
            return _extract_content(resp.json())
        if resp.status_code != 404:
            resp.raise_for_status()
        # 404 means no assignment configured — fall through to direct model call
        logger.debug("No task assignment for turnstone.log_analysis — falling back to direct model")
    except httpx.HTTPStatusError:
        raise
    except Exception as exc:
        logger.debug("Task endpoint unavailable (%s) — falling back to direct model", exc)
    # Fallback: OpenAI-compat endpoint with explicit model name (local instances,
    # example-node, or any cf-orch that doesn't have task assignments loaded).
    try:
        resp = httpx.post(
            f"{llm_url.rstrip('/')}/v1/chat/completions",
            json={"model": llm_model, "messages": messages, "stream": False},
            headers=headers,
            timeout=timeout,
        )
        resp.raise_for_status()
-        choices = resp.json().get("choices") or []
+        return _extract_content(resp.json())
        if not choices:
            return None
        return (choices[0].get("message", {}).get("content") or "").strip() or None
    except Exception as exc:
        logger.warning("LLM summarization failed (%s): %s", type(exc).__name__, exc)
        return None