turnstone/app/services/nl_source.py
pyr0ball e841c00949 feat: bundle PII sanitization, onboarding wizard, NL source addition (#51, #52, #53)
Bundle export (#51):
- _redact_text() with 5 compiled regex patterns (IPv4, email, user=, host=, password=)
- build_bundle(sanitize=False) — per-entry redaction at export time
- sent_bundles table tracks every outgoing export (GET and POST /send)
- GET /api/sent-bundles exposes history; SentBundle model added
- BundlesView: Received/Sent tabs, sanitized badge, 5-entry preview, re-download
- IncidentsView: Sanitize PII checkbox next to Send Bundle

Onboarding wizard (#52):
- app/services/discover.py: journald/Docker/file detection (best-effort, safe in containers)
- GET /api/setup/status, /discover, POST /api/setup/write (additive, appends to existing)
- SetupWizard.vue: 3-step Detect → Select → Confirm
  - Step 1 shows grouped summary (journald/file/docker counts)
  - Step 2: collapsible groups with All/None section toggles
    - journald + file: pre-selected; docker: collapsed, none pre-selected
  - Step 3: YAML preview before write
- SourcesView: shows wizard on first run; Add Source button reuses it

NL source addition (#53):
- app/services/nl_source.py: keyword shortcut (13 well-known apps) + LLM fallback
- POST /api/setup/interpret: keyword → LLM → null (graceful fallback)
- NL field in wizard step 2; manual form shown when interpretation fails
- Added sources appear in grouped list immediately
2026-05-29 14:14:28 -07:00

134 lines
5.4 KiB
Python

"""Natural-language log source interpretation (LLM path for #53).
BSL-gated feature: the structured form fallback is MIT; the LLM interpretation
requires the LLM service to be configured. The caller always validates the
output against the source schema before writing anything.
"""
from __future__ import annotations
import json
import logging
import re
from typing import Any
import httpx
logger = logging.getLogger(__name__)
_SYSTEM_PROMPT = """\
You are a Turnstone log-source configuration assistant.
The operator will describe a log source in plain English.
Respond ONLY with a JSON object matching this schema — no prose, no markdown:
{
"id": "short-kebab-case identifier",
"type": "file" | "journald" | "docker",
"path": "/absolute/path (file type only)",
"container": "container-name (docker type only)",
"runtime": "docker" | "podman" (docker type only, default docker)",
"unit": "service.service (journald type only, omit for all-journal)",
"label": "Human-readable name for the UI"
}
Rules:
- For well-known apps (nginx, apache, caddy, sonarr, radarr, qbittorrent, plex, jellyfin),
use the conventional default log path.
- If the operator mentions a Docker/Podman container, use type=docker.
- If the operator mentions journald or a systemd service, use type=journald.
- If uncertain, use type=file with the most likely path.
- The "id" must be lowercase, hyphens only (no spaces, slashes, dots).
- Never include trailing commas or comments in your JSON.
"""
# Well-known path lookup for common apps — used as a deterministic fallback
_KNOWN_APPS: dict[str, dict[str, Any]] = {
"nginx": {"id": "nginx-access", "type": "file", "path": "/var/log/nginx/access.log"},
"apache": {"id": "apache", "type": "file", "path": "/var/log/apache2/access.log"},
"caddy": {"id": "caddy", "type": "file", "path": "/var/log/caddy/access.log"},
"sonarr": {"id": "sonarr", "type": "file", "path": "/var/log/sonarr/sonarr.0.txt"},
"radarr": {"id": "radarr", "type": "file", "path": "/var/log/radarr/radarr.0.txt"},
"qbittorrent": {"id": "qbittorrent", "type": "file", "path": "/var/log/qbittorrent/qbittorrent.log"},
"plex": {"id": "plex", "type": "file", "path": "/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs/Plex Media Server.log"},
"jellyfin": {"id": "jellyfin", "type": "file", "path": "/var/log/jellyfin/jellyfin.log"},
"syslog": {"id": "syslog", "type": "file", "path": "/var/log/syslog"},
"auth": {"id": "auth", "type": "file", "path": "/var/log/auth.log"},
"fail2ban": {"id": "fail2ban", "type": "file", "path": "/var/log/fail2ban.log"},
"docker": {"id": "docker-daemon", "type": "file", "path": "/var/log/docker.log"},
"journal": {"id": "journal", "type": "journald"},
"journald": {"id": "journal", "type": "journald"},
"systemd": {"id": "journal", "type": "journald"},
}
def _keyword_match(description: str) -> dict[str, Any] | None:
"""Try a simple keyword match before spending an LLM call."""
lower = description.lower()
for keyword, template in _KNOWN_APPS.items():
if keyword in lower:
result = dict(template)
result.setdefault("label", keyword.capitalize() + " log")
return result
return None
def _extract_json(text: str) -> dict[str, Any] | None:
"""Pull the first {...} block out of an LLM response."""
match = re.search(r"\{[^{}]+\}", text, re.DOTALL)
if not match:
return None
try:
return json.loads(match.group())
except json.JSONDecodeError:
return None
def interpret(
description: str,
llm_url: str | None,
llm_model: str | None,
api_key: str | None = None,
timeout: float = 30.0,
) -> dict[str, Any] | None:
"""Interpret a natural-language source description.
Returns a source dict or None if interpretation fails.
The caller must validate the result with discover.validate_source()
before writing anything to disk.
"""
# 1. Keyword shortcut — no LLM needed for well-known apps
kw = _keyword_match(description)
if kw:
logger.debug("NL source: keyword match for %r", description)
return kw
# 2. LLM path
if not llm_url or not llm_model:
logger.debug("NL source: no LLM configured, returning None")
return None
messages = [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": description},
]
headers = {"Content-Type": "application/json"}
if api_key:
headers["Authorization"] = f"Bearer {api_key}"
try:
resp = httpx.post(
f"{llm_url.rstrip('/')}/v1/chat/completions",
json={"model": llm_model, "messages": messages, "stream": False, "max_tokens": 256},
headers=headers,
timeout=timeout,
)
resp.raise_for_status()
content = resp.json()["choices"][0]["message"]["content"]
parsed = _extract_json(content)
if parsed:
parsed.setdefault("label", description[:60])
return parsed
logger.warning("NL source: could not extract JSON from LLM response")
except Exception as exc:
logger.warning("NL source: LLM call failed (%s): %s", type(exc).__name__, exc)
return None