"""Natural-language log source interpretation (LLM path for #53).

BSL-gated feature: the structured form fallback is MIT; the LLM interpretation
requires the LLM service to be configured.

The caller always validates the output against the source schema before
writing anything.
"""

from __future__ import annotations

import json
import logging
import re
from typing import Any

try:
    import httpx
except ImportError:  # pragma: no cover - depends on the installed extras
    # The keyword shortcut below never touches the network, so keep the module
    # importable without the HTTP client; interpret() reports the missing
    # dependency only when the LLM path is actually requested.
    httpx = None  # type: ignore[assignment]

logger = logging.getLogger(__name__)

# NOTE(review): the "runtime" schema line below carries an unbalanced trailing
# quote. It is left untouched so the model-facing text does not change here;
# confirm the intended wording before editing it.
_SYSTEM_PROMPT = """\
You are a Turnstone log-source configuration assistant.
The operator will describe a log source in plain English.
Respond ONLY with a JSON object matching this schema — no prose, no markdown:

{
  "id": "short-kebab-case identifier",
  "type": "file" | "journald" | "docker",
  "path": "/absolute/path (file type only)",
  "container": "container-name (docker type only)",
  "runtime": "docker" | "podman" (docker type only, default docker)",
  "unit": "service.service (journald type only, omit for all-journal)",
  "label": "Human-readable name for the UI"
}

Rules:
- For well-known apps (nginx, apache, caddy, sonarr, radarr, qbittorrent, plex, jellyfin), use the conventional default log path.
- If the operator mentions a Docker/Podman container, use type=docker.
- If the operator mentions journald or a systemd service, use type=journald.
- If uncertain, use type=file with the most likely path.
- The "id" must be lowercase, hyphens only (no spaces, slashes, dots).
- Never include trailing commas or comments in your JSON.
"""

# Well-known path lookup for common apps — used as a deterministic fallback.
# Dict order is significant: the first keyword found in a description wins.
_KNOWN_APPS: dict[str, dict[str, Any]] = {
    "nginx": {"id": "nginx-access", "type": "file", "path": "/var/log/nginx/access.log"},
    "apache": {"id": "apache", "type": "file", "path": "/var/log/apache2/access.log"},
    "caddy": {"id": "caddy", "type": "file", "path": "/var/log/caddy/access.log"},
    "sonarr": {"id": "sonarr", "type": "file", "path": "/var/log/sonarr/sonarr.0.txt"},
    "radarr": {"id": "radarr", "type": "file", "path": "/var/log/radarr/radarr.0.txt"},
    "qbittorrent": {
        "id": "qbittorrent",
        "type": "file",
        "path": "/var/log/qbittorrent/qbittorrent.log",
    },
    "plex": {
        "id": "plex",
        "type": "file",
        "path": "/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs/Plex Media Server.log",
    },
    "jellyfin": {"id": "jellyfin", "type": "file", "path": "/var/log/jellyfin/jellyfin.log"},
    "syslog": {"id": "syslog", "type": "file", "path": "/var/log/syslog"},
    "auth": {"id": "auth", "type": "file", "path": "/var/log/auth.log"},
    "fail2ban": {"id": "fail2ban", "type": "file", "path": "/var/log/fail2ban.log"},
    "docker": {"id": "docker-daemon", "type": "file", "path": "/var/log/docker.log"},
    "journal": {"id": "journal", "type": "journald"},
    "journald": {"id": "journal", "type": "journald"},
    "systemd": {"id": "journal", "type": "journald"},
}

# Shared decoder used to pull a JSON object out of free-form LLM output.
_JSON_DECODER = json.JSONDecoder()


def _mentions(keyword: str, lowered: str) -> bool:
    """Return True when *keyword* occurs in *lowered* as a standalone token.

    A hit is rejected when the keyword is embedded in a longer alphabetic word,
    so "auth" does not fire on "authentik" and "plex" does not fire on
    "complex". Digits and punctuation are allowed as neighbours, which keeps
    "apache2", "nginx-access" and "nginx_logs" matching.
    """
    pattern = rf"(?<![a-z]){re.escape(keyword)}(?![a-z])"
    return re.search(pattern, lowered) is not None


def _keyword_match(description: str) -> dict[str, Any] | None:
    """Try a simple keyword match before spending an LLM call.

    Keywords are checked in ``_KNOWN_APPS`` order against the lower-cased
    description. The returned dict is a copy of the matching template with a
    default "label" filled in, so callers may mutate it freely.

    Returns None when no known keyword is mentioned.
    """
    # NOTE(review): a description such as "sonarr docker container" still
    # resolves to the file template here, ahead of the LLM's type=docker rule;
    # confirm whether the shortcut should defer in that case.
    lowered = description.lower()
    for keyword, template in _KNOWN_APPS.items():
        if _mentions(keyword, lowered):
            result = dict(template)
            result.setdefault("label", keyword.capitalize() + " log")
            return result
    return None


def _extract_json(text: str) -> dict[str, Any] | None:
    """Pull the first decodable JSON object out of an LLM response.

    Every "{" in *text* is tried as a potential start of an object. A real
    JSON decoder is used rather than a brace-matching regex, so nested objects
    and braces inside string values are handled, and a leading non-JSON
    "{...}" fragment does not hide a valid object that follows it.

    Returns the decoded dict, or None if *text* is not a string or contains no
    valid JSON object.
    """
    if not isinstance(text, str):
        return None
    start = text.find("{")
    while start != -1:
        try:
            candidate, _ = _JSON_DECODER.raw_decode(text, start)
        except json.JSONDecodeError:
            # Not a JSON object at this brace; try the next one.
            start = text.find("{", start + 1)
            continue
        # Decoding that begins at "{" can only produce an object.
        return candidate
    return None


def interpret(
    description: str,
    llm_url: str | None,
    llm_model: str | None,
    api_key: str | None = None,
    timeout: float = 30.0,
) -> dict[str, Any] | None:
    """Interpret a natural-language source description.

    Args:
        description: Operator's plain-English description of the log source.
        llm_url: Base URL of the LLM service; "/v1/chat/completions" is
            appended. When empty or None the LLM path is skipped.
        llm_model: Model name sent in the request. When empty or None the LLM
            path is skipped.
        api_key: Optional bearer token sent in the Authorization header.
        timeout: Timeout, in seconds, passed to the HTTP request.

    Returns:
        A source dict, or None if interpretation fails (blank description, no
        keyword match and no LLM configured, HTTP client unavailable, request
        failure, or no usable JSON in the reply).

    The caller must validate the result with discover.validate_source() before
    writing anything to disk.
    """
    # 0. Nothing to interpret — do not spend an LLM call on blank input.
    if not description or not description.strip():
        logger.debug("NL source: empty description, returning None")
        return None

    # 1. Keyword shortcut — no LLM needed for well-known apps
    kw = _keyword_match(description)
    if kw:
        logger.debug("NL source: keyword match for %r", description)
        return kw

    # 2. LLM path
    if not llm_url or not llm_model:
        logger.debug("NL source: no LLM configured, returning None")
        return None
    if httpx is None:
        logger.warning("NL source: httpx is not installed, cannot call the LLM")
        return None

    messages = [
        {"role": "system", "content": _SYSTEM_PROMPT},
        {"role": "user", "content": description},
    ]
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    # Deliberately broad handler: this is a best-effort boundary whose contract
    # is "return None on any failure" (network error, bad status, unexpected
    # response shape); the failure is logged rather than propagated.
    try:
        resp = httpx.post(
            f"{llm_url.rstrip('/')}/v1/chat/completions",
            json={
                "model": llm_model,
                "messages": messages,
                "stream": False,
                "max_tokens": 256,
            },
            headers=headers,
            timeout=timeout,
        )
        resp.raise_for_status()
        content = resp.json()["choices"][0]["message"]["content"]
        parsed = _extract_json(content)
        if parsed:
            # Fall back to a truncated description when the model omits a label.
            parsed.setdefault("label", description[:60])
            return parsed
        logger.warning("NL source: could not extract JSON from LLM response")
    except Exception as exc:
        logger.warning("NL source: LLM call failed (%s): %s", type(exc).__name__, exc)
    return None