# turnstone/app/services/nl_source.py

"""Natural-language log source interpretation (LLM path for #53).

BSL-gated feature: the structured form fallback is MIT; the LLM interpretation
requires the LLM service to be configured. The caller always validates the
output against the source schema before writing anything.
"""
from __future__ import annotations

import json
import logging
import re
from typing import Any

import httpx

# Module-level logger, named after this module via logging.getLogger(__name__).
logger = logging.getLogger(__name__)

# System message sent ahead of the operator's description on the LLM path.
# It pins the reply to a single flat JSON object so the response can be parsed
# mechanically; the field notes in parentheses are guidance for the model only.
_SYSTEM_PROMPT = """\
You are a Turnstone log-source configuration assistant.
The operator will describe a log source in plain English.
Respond ONLY with a JSON object matching this schema — no prose, no markdown:

{
  "id":        "short-kebab-case identifier",
  "type":      "file" | "journald" | "docker",
  "path":      "/absolute/path  (file type only)",
  "container": "container-name  (docker type only)",
  "runtime":   "docker" | "podman"  (docker type only, default docker),
  "unit":      "service.service  (journald type only, omit for all-journal)",
  "label":     "Human-readable name for the UI"
}

Rules:
- For well-known apps (nginx, apache, caddy, sonarr, radarr, qbittorrent, plex, jellyfin),
  use the conventional default log path.
- If the operator mentions a Docker/Podman container, use type=docker.
- If the operator mentions journald or a systemd service, use type=journald.
- If uncertain, use type=file with the most likely path.
- The "id" must be lowercase, hyphens only (no spaces, slashes, dots).
- Never include trailing commas or comments in your JSON.
"""

# Well-known sources for common apps, keyed by the keyword looked for in the
# operator's description. _keyword_match() consults this table BEFORE any LLM
# call, walking the entries in insertion order and returning the first hit, so
# earlier entries win when a description mentions several keywords.
# Each value is a partial source template; "label" is filled in by the matcher.
# The "journal", "journald" and "systemd" keywords all map to the same
# journald source (id "journal", no unit).
_KNOWN_APPS: dict[str, dict[str, Any]] = {
    "nginx":        {"id": "nginx-access",  "type": "file", "path": "/var/log/nginx/access.log"},
    "apache":       {"id": "apache",        "type": "file", "path": "/var/log/apache2/access.log"},
    "caddy":        {"id": "caddy",         "type": "file", "path": "/var/log/caddy/access.log"},
    "sonarr":       {"id": "sonarr",        "type": "file", "path": "/var/log/sonarr/sonarr.0.txt"},
    "radarr":       {"id": "radarr",        "type": "file", "path": "/var/log/radarr/radarr.0.txt"},
    "qbittorrent":  {"id": "qbittorrent",   "type": "file", "path": "/var/log/qbittorrent/qbittorrent.log"},
    "plex":         {"id": "plex",          "type": "file", "path": "/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs/Plex Media Server.log"},
    "jellyfin":     {"id": "jellyfin",      "type": "file", "path": "/var/log/jellyfin/jellyfin.log"},
    "syslog":       {"id": "syslog",        "type": "file", "path": "/var/log/syslog"},
    "auth":         {"id": "auth",          "type": "file", "path": "/var/log/auth.log"},
    "fail2ban":     {"id": "fail2ban",      "type": "file", "path": "/var/log/fail2ban.log"},
    "docker":       {"id": "docker-daemon", "type": "file", "path": "/var/log/docker.log"},
    "journal":      {"id": "journal",       "type": "journald"},
    "journald":     {"id": "journal",       "type": "journald"},
    "systemd":      {"id": "journal",       "type": "journald"},
}


def _keyword_match(description: str) -> dict[str, Any] | None:
    """Try a deterministic keyword match before spending an LLM call.

    Each keyword in ``_KNOWN_APPS`` is looked for case-insensitively as a
    whole token: it only counts when it is not embedded in a longer run of
    letters or digits. This keeps "complex" from matching ``plex`` and
    "author" from matching ``auth``, while "nginx-access", "auth.log" and
    "nginx_access" still match. Keywords are tried in table order and the
    first hit wins.

    Returns a fresh copy of the matching template with a default "label"
    added, or None when no keyword is present.
    """
    lowered = description.lower()
    for keyword, template in _KNOWN_APPS.items():
        # Explicit lookarounds instead of \b so "_" also acts as a separator.
        pattern = rf"(?<![a-z0-9]){re.escape(keyword)}(?![a-z0-9])"
        if re.search(pattern, lowered) is None:
            continue
        # Copy so setdefault (and the caller) never mutate the shared template.
        result = dict(template)
        result.setdefault("label", keyword.capitalize() + " log")
        return result
    return None


def _extract_json(text: str) -> dict[str, Any] | None:
    """Pull the first non-empty JSON object out of an LLM response.

    The text is scanned for "{" and a real JSON decode is attempted from each
    one in turn. Unlike a brace-matching regex, this handles nested objects
    and braces inside string values, and it skips a leading brace span that
    is not valid JSON instead of giving up. Any prose or markdown around the
    object is ignored.

    Returns the decoded object, or None when the text contains no non-empty
    JSON object.
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            # raw_decode parses one JSON value and tolerates trailing text.
            candidate, _consumed = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            candidate = None
        # Empty objects carry no source information; keep looking.
        if isinstance(candidate, dict) and candidate:
            return candidate
        start = text.find("{", start + 1)
    return None


def interpret(
    description: str,
    llm_url: str | None,
    llm_model: str | None,
    api_key: str | None = None,
    timeout: float = 30.0,
) -> dict[str, Any] | None:
    """Turn a plain-English source description into a source dict.

    The keyword shortcut is tried first. If it does not hit and both
    ``llm_url`` and ``llm_model`` are set, the description is posted to the
    ``/v1/chat/completions`` endpoint under ``llm_url`` and the JSON object
    in the reply is returned, with "label" defaulting to the first 60
    characters of the description.

    Returns None when no keyword matches and no LLM is configured, when the
    request or response handling raises, or when no JSON can be extracted.
    The result is NOT validated here: the caller must run it through
    discover.validate_source() before writing anything to disk.
    """
    # Step 1: well-known apps are resolved locally, with no network call.
    shortcut = _keyword_match(description)
    if shortcut:
        logger.debug("NL source: keyword match for %r", description)
        return shortcut

    # Step 2: everything else needs a configured LLM endpoint and model.
    if not (llm_url and llm_model):
        logger.debug("NL source: no LLM configured, returning None")
        return None

    request_headers = {"Content-Type": "application/json"}
    if api_key:
        request_headers["Authorization"] = f"Bearer {api_key}"

    payload = {
        "model": llm_model,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": description},
        ],
        "stream": False,
        "max_tokens": 256,
    }

    try:
        endpoint = f"{llm_url.rstrip('/')}/v1/chat/completions"
        response = httpx.post(
            endpoint,
            json=payload,
            headers=request_headers,
            timeout=timeout,
        )
        response.raise_for_status()
        reply_text = response.json()["choices"][0]["message"]["content"]
        source = _extract_json(reply_text)
    except Exception as exc:
        # Best-effort boundary: any transport, HTTP or response-shape error
        # is logged and reported to the caller as "no interpretation".
        logger.warning("NL source: LLM call failed (%s): %s", type(exc).__name__, exc)
        return None

    if not source:
        logger.warning("NL source: could not extract JSON from LLM response")
        return None

    source.setdefault("label", description[:60])
    return source