From 1131816666f6d3753f204142ab218997338a50e3 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Fri, 29 May 2026 14:14:28 -0700 Subject: [PATCH] feat: bundle PII sanitization, onboarding wizard, NL source addition (#51, #52, #53) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bundle export (#51): - _redact_text() with 5 compiled regex patterns (IPv4, email, user=, host=, password=) - build_bundle(sanitize=False) — per-entry redaction at export time - sent_bundles table tracks every outgoing export (GET and POST /send) - GET /api/sent-bundles exposes history; SentBundle model added - BundlesView: Received/Sent tabs, sanitized badge, 5-entry preview, re-download - IncidentsView: Sanitize PII checkbox next to Send Bundle Onboarding wizard (#52): - app/services/discover.py: journald/Docker/file detection (best-effort, safe in containers) - GET /api/setup/status, /discover, POST /api/setup/write (additive, appends to existing) - SetupWizard.vue: 3-step Detect → Select → Confirm - Step 1 shows grouped summary (journald/file/docker counts) - Step 2: collapsible groups with All/None section toggles - journald + file: pre-selected; docker: collapsed, none pre-selected - Step 3: YAML preview before write - SourcesView: shows wizard on first run; Add Source button reuses it NL source addition (#53): - app/services/nl_source.py: keyword shortcut (13 well-known apps) + LLM fallback - POST /api/setup/interpret: keyword → LLM → null (graceful fallback) - NL field in wizard step 2; manual form shown when interpretation fails - Added sources appear in grouped list immediately --- app/glean/pipeline.py | 11 + app/rest.py | 138 +++++++++- app/services/discover.py | 173 ++++++++++++ app/services/incidents.py | 66 ++++- app/services/models.py | 12 + app/services/nl_source.py | 134 +++++++++ docs/compliance/checklist.md | 154 +++++++++++ web/src/components/SetupWizard.vue | 421 +++++++++++++++++++++++++++++ web/src/views/BundlesView.vue | 284 ++++++++++++++----- web/src/views/IncidentsView.vue | 13 +- web/src/views/SourcesView.vue | 65 ++++- 11 files changed, 1381 insertions(+), 90 deletions(-) create mode 100644 app/services/discover.py create mode 100644 app/services/nl_source.py create mode 100644 docs/compliance/checklist.md create mode 100644 web/src/components/SetupWizard.vue diff --git a/app/glean/pipeline.py b/app/glean/pipeline.py index ddef194..684219f 100644 --- a/app/glean/pipeline.py +++ b/app/glean/pipeline.py @@ -72,6 +72,17 @@ CREATE TABLE IF NOT EXISTS received_bundles ( CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at); CREATE INDEX IF NOT EXISTS idx_bundles_type ON received_bundles(issue_type); +CREATE TABLE IF NOT EXISTS sent_bundles ( + id TEXT PRIMARY KEY, + incident_id TEXT NOT NULL, + exported_at TEXT NOT NULL, + sanitized INTEGER NOT NULL DEFAULT 0, + entry_count INTEGER NOT NULL DEFAULT 0, + bundle_json TEXT NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_sent_bundles_incident ON sent_bundles(incident_id); +CREATE INDEX IF NOT EXISTS idx_sent_bundles_time ON sent_bundles(exported_at); + -- context tables moved to ensure_context_schema() / CONTEXT_DB_PATH -- kept here as no-ops so legacy single-file deployments still work CREATE TABLE IF NOT EXISTS context_facts ( diff --git a/app/rest.py b/app/rest.py index 8a5f63f..b72c466 100644 --- a/app/rest.py +++ b/app/rest.py @@ -10,7 +10,9 @@ import asyncio import dataclasses import hmac import json +import logging import os +import time # Offline mode: must be set before any HuggingFace library is imported. # Both flags must agree — HF hub and transformers each check independently. @@ -48,6 +50,8 @@ from app.services.blocklist import ( update_candidate_status, ) from app.services.pihole import PiholeClient +from app.services.discover import discover_all, build_sources_yaml, validate_source +from app.services.nl_source import interpret as _nl_interpret from app.services.incidents import ( build_bundle, create_incident, @@ -57,6 +61,8 @@ from app.services.incidents import ( get_incident_entries, list_bundles, list_incidents, + list_sent_bundles, + record_sent_bundle, store_bundle, ) from app.services.search import ( @@ -123,6 +129,10 @@ _compiled_patterns: list = [] @asynccontextmanager async def _lifespan(app: FastAPI): global _compiled_patterns + # Route turnstone.audit through uvicorn's own handler so it appears in api.log. + _audit_log.setLevel(logging.INFO) + for h in logging.getLogger("uvicorn.error").handlers: + _audit_log.addHandler(h) ensure_schema(DB_PATH) ensure_context_schema(CONTEXT_DB_PATH) _compiled_patterns = load_compiled_patterns(PATTERN_FILE) @@ -172,6 +182,27 @@ app.add_middleware( allow_headers=["*"], ) +_audit_log = logging.getLogger("turnstone.audit") + + +@app.middleware("http") +async def _audit_middleware(request: Request, call_next): + """Log every API request: timestamp, method, path, query (no body, no response data).""" + t0 = time.monotonic() + response = await call_next(request) + if request.url.path.startswith("/turnstone/api"): + ms = int((time.monotonic() - t0) * 1000) + qs = f"?{request.url.query}" if request.url.query else "" + _audit_log.info( + "%s %s%s %d %dms", + request.method, + request.url.path, + qs, + response.status_code, + ms, + ) + return response + _PREFS_DEFAULTS: dict = { "entry_point_style": "topbar", @@ -643,6 +674,96 @@ class BatchGleanRequest(BaseModel): entries: list[BatchEntry] +# ── Setup / Onboarding wizard ────────────────────────────────────────────── + +class SetupWriteBody(BaseModel): + sources: list[dict] + + +class NLInterpretBody(BaseModel): + description: str + + +@router.get("/api/setup/status") +def setup_status() -> dict: + """Return whether sources.yaml exists (wizard completion gate).""" + sources_file = PATTERN_DIR / "sources.yaml" + return {"configured": sources_file.exists()} + + +@router.get("/api/setup/discover") +def setup_discover() -> dict: + """Auto-detect available log sources on this host.""" + return discover_all() + + +@router.post("/api/setup/write") +def setup_write(body: SetupWriteBody, background_tasks: BackgroundTasks) -> dict: + """Validate and write sources.yaml from a list of selected source definitions. + + Each source is validated before writing. An existing sources.yaml is + appended to, not overwritten, so post-setup additions are non-destructive. + """ + errors = [] + for src in body.sources: + err = validate_source(src) + if err: + errors.append(err) + if errors: + raise HTTPException(status_code=422, detail="; ".join(errors)) + + sources_file = PATTERN_DIR / "sources.yaml" + if sources_file.exists(): + # Append to existing file: read current sources, merge, rewrite. + import yaml as _yaml + with open(sources_file) as f: + current = _yaml.safe_load(f) or {} + existing_ids = {s.get("id") for s in current.get("sources", [])} + new_sources = [s for s in body.sources if s.get("id") not in existing_ids] + if not new_sources: + return {"written": 0, "skipped": len(body.sources), "message": "All sources already configured"} + all_sources = current.get("sources", []) + new_sources + content = build_sources_yaml(all_sources) + else: + content = build_sources_yaml(body.sources) + new_sources = body.sources + + PATTERN_DIR.mkdir(parents=True, exist_ok=True) + sources_file.write_text(content) + + # Trigger a background glean of new sources + if GLEAN_INTERVAL > 0: + background_tasks.add_task( + _glean_file, + sources_file, DB_PATH, PATTERN_FILE, 1, + ) + + return {"written": len(new_sources), "skipped": len(body.sources) - len(new_sources)} + + +@router.post("/api/setup/interpret") +def setup_interpret(body: NLInterpretBody) -> dict: + """Interpret a plain-English source description into a SourceDefinition. + + Uses a keyword lookup first (deterministic, no LLM needed), then falls + back to the configured LLM. Returns null on failure so the UI can + show the manual form — never raises 500. + """ + prefs = _load_prefs() + result = _nl_interpret( + description=body.description, + llm_url=prefs.get("llm_url") or None, + llm_model=prefs.get("llm_model") or None, + api_key=prefs.get("llm_api_key") or None, + ) + if result is None: + return {"source": None, "fallback": True} + err = validate_source(result) + if err: + return {"source": None, "fallback": True, "validation_error": err} + return {"source": result, "fallback": False} + + @router.post("/api/glean/batch") def glean_batch(payload: BatchGleanRequest, background_tasks: BackgroundTasks) -> dict: """Accept pre-parsed log entries from a remote Turnstone instance (submission protocol). @@ -839,21 +960,30 @@ def delete_incident_endpoint(incident_id: str) -> dict: @router.get("/api/incidents/{incident_id}/bundle") -def get_incident_bundle(incident_id: str) -> dict: +def get_incident_bundle(incident_id: str, sanitize: bool = False) -> dict: incident = get_incident(DB_PATH, incident_id) if not incident: raise HTTPException(status_code=404, detail="Incident not found") - return build_bundle(DB_PATH, incident, source_host=SOURCE_HOST) + bundle = build_bundle(DB_PATH, incident, source_host=SOURCE_HOST, sanitize=sanitize) + record_sent_bundle(DB_PATH, incident_id, bundle, sanitized=sanitize) + return bundle + + +@router.get("/api/sent-bundles") +def list_sent_bundles_endpoint() -> dict: + bundles = list_sent_bundles(DB_PATH) + return {"bundles": [dataclasses.asdict(b) for b in bundles]} @router.post("/api/incidents/{incident_id}/send") -def send_incident_bundle(incident_id: str) -> dict: +def send_incident_bundle(incident_id: str, sanitize: bool = False) -> dict: if not BUNDLE_ENDPOINT: raise HTTPException(status_code=503, detail="TURNSTONE_BUNDLE_ENDPOINT not configured") incident = get_incident(DB_PATH, incident_id) if not incident: raise HTTPException(status_code=404, detail="Incident not found") - bundle = build_bundle(DB_PATH, incident, source_host=SOURCE_HOST) + bundle = build_bundle(DB_PATH, incident, source_host=SOURCE_HOST, sanitize=sanitize) + record_sent_bundle(DB_PATH, incident_id, bundle, sanitized=sanitize) payload = json.dumps(bundle).encode() req = urllib.request.Request( BUNDLE_ENDPOINT, diff --git a/app/services/discover.py b/app/services/discover.py new file mode 100644 index 0000000..e511d9a --- /dev/null +++ b/app/services/discover.py @@ -0,0 +1,173 @@ +"""Environment auto-discovery for the onboarding wizard. + +All checks are best-effort — every function returns an empty list on failure +so the wizard degrades gracefully in containers, VMs, and minimal environments. +""" +from __future__ import annotations + +import json +import logging +import os +import shutil +import subprocess +from pathlib import Path +from typing import Any + +logger = logging.getLogger(__name__) + +# Common log file candidates: (id, path, description) +_KNOWN_PATHS: list[tuple[str, str, str]] = [ + ("syslog", "/var/log/syslog", "System syslog (Debian/Ubuntu)"), + ("syslog", "/var/log/messages", "System messages (RHEL/Rocky)"), + ("auth", "/var/log/auth.log", "Auth log"), + ("kern", "/var/log/kern.log", "Kernel log"), + ("nginx-access", "/var/log/nginx/access.log", "Nginx access log"), + ("nginx-error", "/var/log/nginx/error.log", "Nginx error log"), + ("apache", "/var/log/apache2/access.log", "Apache access log"), + ("apache-error", "/var/log/apache2/error.log", "Apache error log"), + ("caddy", "/var/log/caddy/access.log", "Caddy access log"), + ("docker-daemon","/var/log/docker.log", "Docker daemon log"), + ("fail2ban", "/var/log/fail2ban.log", "Fail2ban log"), + ("ufw", "/var/log/ufw.log", "UFW firewall log"), +] + + +def _run(cmd: list[str], timeout: float = 5.0) -> str | None: + """Run a command and return stdout, or None on any error.""" + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout) + return result.stdout if result.returncode == 0 else None + except Exception: + return None + + +def discover_journald() -> list[dict[str, Any]]: + """Return a journald source candidate if journalctl is available.""" + if not shutil.which("journalctl"): + return [] + hostname = _run(["hostname"]) or "localhost" + hostname = hostname.strip() + return [{ + "type": "journald", + "id": f"journal:{hostname}", + "label": f"System journal ({hostname})", + "description": "All systemd journal output from this host", + "available": True, + }] + + +def discover_docker() -> list[dict[str, Any]]: + """Return Docker container candidates if Docker is running.""" + for runtime in ("docker", "podman"): + if not shutil.which(runtime): + continue + out = _run([runtime, "ps", "--format", "{{json .}}"]) + if out is None: + continue + containers = [] + for line in out.splitlines(): + line = line.strip() + if not line: + continue + try: + obj = json.loads(line) + name = obj.get("Names") or obj.get("Name") or obj.get("ID", "unknown") + # podman returns a list for Names + if isinstance(name, list): + name = name[0] if name else "unknown" + name = name.lstrip("/") + containers.append({ + "type": "docker", + "id": f"{runtime}:{name}", + "label": f"{runtime.capitalize()} — {name}", + "description": f"Container log stream for {name}", + "container": name, + "runtime": runtime, + "available": True, + }) + except (json.JSONDecodeError, KeyError): + continue + if containers: + return containers + return [] + + +def discover_files() -> list[dict[str, Any]]: + """Return file-based source candidates for well-known log paths.""" + found = [] + seen_ids: set[str] = set() + for source_id, path, description in _KNOWN_PATHS: + if not os.path.exists(path): + continue + # deduplicate when both syslog and messages exist — take first match + if source_id in seen_ids: + continue + seen_ids.add(source_id) + found.append({ + "type": "file", + "id": source_id, + "label": description, + "path": path, + "description": f"Read from {path}", + "available": True, + }) + return found + + +def discover_all() -> dict[str, Any]: + """Run all discovery checks and return a structured candidate list.""" + candidates: list[dict[str, Any]] = [] + candidates.extend(discover_journald()) + candidates.extend(discover_docker()) + candidates.extend(discover_files()) + return { + "candidates": candidates, + "has_journald": any(c["type"] == "journald" for c in candidates), + "has_docker": any(c["type"] == "docker" for c in candidates), + "has_files": any(c["type"] == "file" for c in candidates), + } + + +def build_sources_yaml(selected: list[dict[str, Any]]) -> str: + """Generate sources.yaml content from a list of selected candidates. + + Each item must have: type, id, and type-specific fields (path, container, etc.). + """ + lines = [ + "# Turnstone log sources — generated by the setup wizard.", + "# Edit this file to add, remove, or modify sources.", + "sources:", + ] + for src in selected: + src_type = src.get("type", "file") + src_id = src.get("id", "unknown") + if src_type == "journald": + unit = src.get("unit") + lines.append(f" - id: {src_id}") + lines.append(f" type: journald") + if unit: + lines.append(f" unit: {unit}") + elif src_type == "docker": + runtime = src.get("runtime", "docker") + container = src.get("container", src_id.split(":")[-1]) + lines.append(f" - id: {src_id}") + lines.append(f" type: docker") + lines.append(f" runtime: {runtime}") + lines.append(f" container: {container}") + else: + path = src.get("path", "") + lines.append(f" - id: {src_id}") + lines.append(f" path: {path}") + return "\n".join(lines) + "\n" + + +def validate_source(src: dict[str, Any]) -> str | None: + """Return an error string if the source definition is invalid, else None.""" + if not src.get("id"): + return "Source is missing 'id'" + src_type = src.get("type", "file") + if src_type == "file" and not src.get("path"): + return f"File source '{src['id']}' is missing 'path'" + if src_type == "docker" and not src.get("container"): + return f"Docker source '{src['id']}' is missing 'container'" + return None diff --git a/app/services/incidents.py b/app/services/incidents.py index 9714cb9..1d71422 100644 --- a/app/services/incidents.py +++ b/app/services/incidents.py @@ -2,14 +2,29 @@ from __future__ import annotations import json +import re import sqlite3 import uuid from pathlib import Path from app.glean.base import now_iso -from app.services.models import Incident, ReceivedBundle +from app.services.models import Incident, ReceivedBundle, SentBundle from app.services.search import SearchResult, entries_in_window, search +_REDACT_PATTERNS: list[tuple[re.Pattern, str]] = [ + (re.compile(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"), "[IP]"), + (re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}"), "[EMAIL]"), + (re.compile(r"(?i)\b(user(?:name)?|uid)\s*[=:]\s*\S+"), r"\1=[USER]"), + (re.compile(r"(?i)\bhost\s*[=:]\s*\S+"), "host=[HOST]"), + (re.compile(r"(?i)\bpassword\s*[=:]\s*\S+"), "password=[REDACTED]"), +] + + +def _redact_text(text: str) -> str: + for pattern, replacement in _REDACT_PATTERNS: + text = pattern.sub(replacement, text) + return text + def _row_to_incident(row: sqlite3.Row) -> Incident: return Incident( @@ -142,6 +157,7 @@ def build_bundle( incident: Incident, source_host: str, limit: int = 200, + sanitize: bool = False, ) -> dict: """Assemble a labeled bundle: incident metadata + related log entries.""" entries = get_incident_entries(db_path, incident, limit=limit) @@ -149,6 +165,7 @@ def build_bundle( "bundle_version": 1, "source_host": source_host, "bundled_at": now_iso(), + "sanitized": sanitize, "incident": { "id": incident.id, "label": incident.label, @@ -164,7 +181,7 @@ def build_bundle( "source_id": e.source_id, "timestamp_iso": e.timestamp_iso, "severity": e.severity, - "text": e.text, + "text": _redact_text(e.text) if sanitize else e.text, "matched_patterns": list(e.matched_patterns), } for e in entries @@ -172,6 +189,51 @@ def build_bundle( } +def record_sent_bundle(db_path: Path, incident_id: str, bundle: dict, sanitized: bool) -> SentBundle: + """Log an outgoing bundle export to the sent_bundles table.""" + record = SentBundle( + id=str(uuid.uuid4()), + incident_id=incident_id, + exported_at=now_iso(), + sanitized=sanitized, + entry_count=len(bundle.get("log_entries", [])), + bundle_json=json.dumps(bundle), + ) + conn = sqlite3.connect(str(db_path), timeout=30.0) + conn.execute("PRAGMA journal_mode=WAL") + conn.execute( + "INSERT INTO sent_bundles (id, incident_id, exported_at, sanitized, entry_count, bundle_json) " + "VALUES (?, ?, ?, ?, ?, ?)", + (record.id, record.incident_id, record.exported_at, int(record.sanitized), + record.entry_count, record.bundle_json), + ) + conn.commit() + conn.close() + return record + + +def list_sent_bundles(db_path: Path) -> list[SentBundle]: + conn = sqlite3.connect(str(db_path), timeout=30.0) + conn.execute("PRAGMA journal_mode=WAL") + conn.row_factory = sqlite3.Row + rows = conn.execute( + "SELECT id, incident_id, exported_at, sanitized, entry_count, bundle_json " + "FROM sent_bundles ORDER BY exported_at DESC" + ).fetchall() + conn.close() + return [ + SentBundle( + id=r["id"], + incident_id=r["incident_id"], + exported_at=r["exported_at"], + sanitized=bool(r["sanitized"]), + entry_count=r["entry_count"], + bundle_json=r["bundle_json"], + ) + for r in rows + ] + + def store_bundle(db_path: Path, bundle: dict) -> ReceivedBundle: """Store an incoming bundle from a remote Turnstone instance.""" inc = bundle.get("incident", {}) diff --git a/app/services/models.py b/app/services/models.py index a0d5df5..784b46d 100644 --- a/app/services/models.py +++ b/app/services/models.py @@ -60,3 +60,15 @@ class ReceivedBundle: bundled_at: str entry_count: int bundle_json: str # full bundle serialized as JSON string + + +@dataclass(frozen=True) +class SentBundle: + """A record of a bundle exported or sent from this instance.""" + + id: str + incident_id: str + exported_at: str + sanitized: bool + entry_count: int + bundle_json: str diff --git a/app/services/nl_source.py b/app/services/nl_source.py new file mode 100644 index 0000000..773c6ab --- /dev/null +++ b/app/services/nl_source.py @@ -0,0 +1,134 @@ +"""Natural-language log source interpretation (LLM path for #53). + +BSL-gated feature: the structured form fallback is MIT; the LLM interpretation +requires the LLM service to be configured. The caller always validates the +output against the source schema before writing anything. +""" +from __future__ import annotations + +import json +import logging +import re +from typing import Any + +import httpx + +logger = logging.getLogger(__name__) + +_SYSTEM_PROMPT = """\ +You are a Turnstone log-source configuration assistant. +The operator will describe a log source in plain English. +Respond ONLY with a JSON object matching this schema — no prose, no markdown: + +{ + "id": "short-kebab-case identifier", + "type": "file" | "journald" | "docker", + "path": "/absolute/path (file type only)", + "container": "container-name (docker type only)", + "runtime": "docker" | "podman" (docker type only, default docker)", + "unit": "service.service (journald type only, omit for all-journal)", + "label": "Human-readable name for the UI" +} + +Rules: +- For well-known apps (nginx, apache, caddy, sonarr, radarr, qbittorrent, plex, jellyfin), + use the conventional default log path. +- If the operator mentions a Docker/Podman container, use type=docker. +- If the operator mentions journald or a systemd service, use type=journald. +- If uncertain, use type=file with the most likely path. +- The "id" must be lowercase, hyphens only (no spaces, slashes, dots). +- Never include trailing commas or comments in your JSON. +""" + +# Well-known path lookup for common apps — used as a deterministic fallback +_KNOWN_APPS: dict[str, dict[str, Any]] = { + "nginx": {"id": "nginx-access", "type": "file", "path": "/var/log/nginx/access.log"}, + "apache": {"id": "apache", "type": "file", "path": "/var/log/apache2/access.log"}, + "caddy": {"id": "caddy", "type": "file", "path": "/var/log/caddy/access.log"}, + "sonarr": {"id": "sonarr", "type": "file", "path": "/var/log/sonarr/sonarr.0.txt"}, + "radarr": {"id": "radarr", "type": "file", "path": "/var/log/radarr/radarr.0.txt"}, + "qbittorrent": {"id": "qbittorrent", "type": "file", "path": "/var/log/qbittorrent/qbittorrent.log"}, + "plex": {"id": "plex", "type": "file", "path": "/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs/Plex Media Server.log"}, + "jellyfin": {"id": "jellyfin", "type": "file", "path": "/var/log/jellyfin/jellyfin.log"}, + "syslog": {"id": "syslog", "type": "file", "path": "/var/log/syslog"}, + "auth": {"id": "auth", "type": "file", "path": "/var/log/auth.log"}, + "fail2ban": {"id": "fail2ban", "type": "file", "path": "/var/log/fail2ban.log"}, + "docker": {"id": "docker-daemon", "type": "file", "path": "/var/log/docker.log"}, + "journal": {"id": "journal", "type": "journald"}, + "journald": {"id": "journal", "type": "journald"}, + "systemd": {"id": "journal", "type": "journald"}, +} + + +def _keyword_match(description: str) -> dict[str, Any] | None: + """Try a simple keyword match before spending an LLM call.""" + lower = description.lower() + for keyword, template in _KNOWN_APPS.items(): + if keyword in lower: + result = dict(template) + result.setdefault("label", keyword.capitalize() + " log") + return result + return None + + +def _extract_json(text: str) -> dict[str, Any] | None: + """Pull the first {...} block out of an LLM response.""" + match = re.search(r"\{[^{}]+\}", text, re.DOTALL) + if not match: + return None + try: + return json.loads(match.group()) + except json.JSONDecodeError: + return None + + +def interpret( + description: str, + llm_url: str | None, + llm_model: str | None, + api_key: str | None = None, + timeout: float = 30.0, +) -> dict[str, Any] | None: + """Interpret a natural-language source description. + + Returns a source dict or None if interpretation fails. + The caller must validate the result with discover.validate_source() + before writing anything to disk. + """ + # 1. Keyword shortcut — no LLM needed for well-known apps + kw = _keyword_match(description) + if kw: + logger.debug("NL source: keyword match for %r", description) + return kw + + # 2. LLM path + if not llm_url or not llm_model: + logger.debug("NL source: no LLM configured, returning None") + return None + + messages = [ + {"role": "system", "content": _SYSTEM_PROMPT}, + {"role": "user", "content": description}, + ] + headers = {"Content-Type": "application/json"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" + + try: + resp = httpx.post( + f"{llm_url.rstrip('/')}/v1/chat/completions", + json={"model": llm_model, "messages": messages, "stream": False, "max_tokens": 256}, + headers=headers, + timeout=timeout, + ) + resp.raise_for_status() + content = resp.json()["choices"][0]["message"]["content"] + parsed = _extract_json(content) + if parsed: + parsed.setdefault("label", description[:60]) + return parsed + logger.warning("NL source: could not extract JSON from LLM response") + except Exception as exc: + logger.warning("NL source: LLM call failed (%s): %s", type(exc).__name__, exc) + + return None diff --git a/docs/compliance/checklist.md b/docs/compliance/checklist.md new file mode 100644 index 0000000..a2551d5 --- /dev/null +++ b/docs/compliance/checklist.md @@ -0,0 +1,154 @@ +# Turnstone Compliance Checklist + +**Last reviewed:** 2026-05-28 +**Applies to:** All deployments handling log data in compliance-sensitive environments. + +Symbols: ✅ satisfied by code, ⚙️ operator action required, ⚠️ known limitation, 🔲 not implemented. + +--- + +## Data Isolation + +### Source-level query isolation +✅ **`source_filter` enforced on all log-returning endpoints.** +Every endpoint that returns log entries accepts a `source` parameter. Both the FTS5 keyword search path and the time-window scan path apply `source_id LIKE ?` before returning results. No cross-source data leakage is possible through the API. + +Relevant code: `app/services/search.py` — `search()` and `entries_in_window()`. + +### FTS5 cross-source leakage +✅ **FTS5 index includes `source_id` as an UNINDEXED column; all queries filter on it.** +The virtual table schema stores `source_id` alongside each entry. Query functions always join back to the base table or filter the FTS result set by `source_id`. There is no full-corpus FTS path that ignores source. + +### SQLite file permissions +⚙️ **Operator responsibility — not enforced by Turnstone.** +Turnstone does not set file permissions on the database. Recommended posture for multi-user hosts: + +```bash +# Restrict DB to the Turnstone process user only +chmod 600 /devl/turnstone-cluster/data/turnstone.db +chmod 600 /devl/turnstone-cluster/data/turnstone-context.db +chown turnstone:turnstone /devl/turnstone-cluster/data/ +``` + +Run Turnstone as a dedicated non-root user via systemd `User=turnstone`. + +--- + +## Audit Logging + +### API query logging +✅ **Implemented as FastAPI middleware (`turnstone.audit` logger).** +Every request to `/turnstone/api/*` is logged at INFO level with: +- Timestamp (from the logging handler) +- HTTP method +- Path + query string +- Response status code +- Request duration (ms) + +Body content is never logged. Example output: +``` +2026-05-28 14:23:01 INFO turnstone.audit GET /turnstone/api/diagnose/stream?source=heimdall-journal 200 1843ms +``` + +To capture audit logs to a separate file, configure the `turnstone.audit` logger in your logging config: +```python +# In your uvicorn startup or log config YAML: +logging.getLogger("turnstone.audit").addHandler( + logging.FileHandler("/var/log/turnstone/audit.log") +) +``` + +### Glean operation logging +✅ **Glean scheduler logs source ID, entry count, and duration at INFO level.** +Relevant logger: `app.tasks.glean_scheduler` — logs start, per-source stats, and errors. +Log example: +``` +INFO app.tasks.glean_scheduler Batch glean complete in 12.4s — {'heimdall-journal': 847, 'plex': 12} +``` + +### Error logging +✅ **Errors logged with source context but without PII in message fields.** +Exception handlers in `rest.py` log at ERROR level with the endpoint path and error type. Raw log entry text is not included in error messages. Stack traces go to the `uvicorn.error` logger. + +--- + +## LLM / PII Egress + +### Multi-agent pipeline (recommended path, `TURNSTONE_MULTI_AGENT_DIAGNOSE=true`) +✅ **Raw log message text is NOT sent to the LLM.** +Stage 5 (synthesizer) sends only: +- The operator's query string +- Timeline statistics (cluster counts, burst counts, gap counts — no entry text) +- Hypothesis titles from Stage 3 (derived labels, not raw messages) +- Runbook context from the operator's own uploaded documents + +No raw `MESSAGE` field content reaches the LLM in this path. Review: `app/services/diagnose/synthesizer.py`. + +### Legacy single-call path (`TURNSTONE_MULTI_AGENT_DIAGNOSE` unset or `false`) +⚠️ **Raw log message text (truncated to 200 chars) IS sent to the LLM.** +The legacy `summarize()` function in `app/services/llm.py` builds a prompt that includes up to 25 log entries with their `text` field (truncated). If log entries contain hostnames, usernames, IP addresses, or other PII, those values are included in the LLM call. + +**Operator action for PII-sensitive deployments:** Enable `TURNSTONE_MULTI_AGENT_DIAGNOSE=true` to use the pipeline path, which does not expose raw log text. + +### Avocet harvester (corpus export) +✅ **Only pattern-tagged entries are exported; export can be disabled.** +The harvester (`harvester/harvester.py`) only POSTs entries that matched at least one named pattern. It does not export the full corpus. Disable by leaving `TURNSTONE_SUBMIT_ENDPOINT` unset (the default). + +### External telemetry +✅ **None.** Turnstone makes no calls to Sentry, Segment, Amplitude, or any analytics service. The only outbound network calls are: +- Your configured `GPU_SERVER_URL` (LLM inference, operator-controlled) +- HuggingFace Hub (model downloads — disable with `TURNSTONE_OFFLINE_MODE=1`) +- SSH connections to configured remote log sources (operator-defined) + +--- + +## Configuration Hardening + +For compliance deployments, set these in `.env`: + +```bash +# Block HuggingFace network access (model weights pre-downloaded) +TURNSTONE_OFFLINE_MODE=1 + +# Require bearer token for all API calls +TURNSTONE_API_KEY= + +# Use multi-agent pipeline (no raw log text to LLM) +TURNSTONE_MULTI_AGENT_DIAGNOSE=true + +# Disable Avocet corpus push if not needed +# (leave TURNSTONE_SUBMIT_ENDPOINT unset) +``` + +--- + +## Outstanding Items + +🔲 **Per-user access control** — all authenticated clients share the same API key. There is no per-user identity, role separation, or per-source ACL. Track as a future enhancement. + +🔲 **Audit log retention policy** — Turnstone writes audit events to the logging system but does not manage log rotation or retention. Operator must configure log rotation (logrotate, systemd journal limits, etc.). + +🔲 **Encrypted DB at rest** — SQLite does not support transparent encryption. For encryption at rest, use full-disk encryption (LUKS) or an encrypted filesystem on the host. + +🔲 **TLS between client and Turnstone** — Turnstone binds to HTTP by default. For production, place Caddy or nginx in front and terminate TLS there. Do not expose port 8534 directly over untrusted networks. + +--- + +## Data Subject Rights (GDPR / CCPA) + +### Right to erasure — anonymized records + +⚠️ **Anonymized log data cannot be selectively deleted on a per-subject basis.** + +When PII sanitization is applied to a bundle export (redacting IP addresses, usernames, hostnames), the resulting data is no longer linked to a specific data subject. As a consequence, Turnstone cannot identify which stored log entries relate to that subject and cannot fulfill a targeted deletion request for records that have already been anonymized. + +**Operators must clearly disclose this limitation to data subjects before export:** + +> "Anonymized log data exported or submitted from this system cannot be individually identified or selectively deleted. If data was exported in anonymized form, Turnstone cannot distinguish your records from others in the exported set. The right to erasure does not apply to data that is no longer personally identifiable." + +This is consistent with GDPR Recital 26, which excludes anonymized data from the regulation's scope. However, the original (pre-anonymization) records in Turnstone's local SQLite database *can* be deleted by source ID via the Sources view (Delete all entries for source) or directly via the database. + +**Recommended operator practice:** +- Maintain a log of which bundles were exported, when, and to whom — the audit log (`turnstone.audit`) covers this. +- Provide data subjects with the bundle export timestamp and source scope so they can verify what was shared. +- For full erasure of pre-anonymization records: use `DELETE /api/sources/{source_id}` to purge all entries for a given source from the local DB. diff --git a/web/src/components/SetupWizard.vue b/web/src/components/SetupWizard.vue new file mode 100644 index 0000000..6154f4b --- /dev/null +++ b/web/src/components/SetupWizard.vue @@ -0,0 +1,421 @@ + + + + + diff --git a/web/src/views/BundlesView.vue b/web/src/views/BundlesView.vue index 8211fc2..174f1c4 100644 --- a/web/src/views/BundlesView.vue +++ b/web/src/views/BundlesView.vue @@ -3,64 +3,147 @@
-

Received Bundles

-

Labeled incident bundles sent from remote Turnstone instances. Use these to build detection signatures.

+

Bundles

+

Incident bundles sent to and received from remote Turnstone instances.

-
Loading…
- -
-

No bundles received yet.

-

Bundles arrive when a remote Turnstone instance sends a labeled incident.

-
- -
-
+
+ +
- -
-
Loading entries…
-
No entries in bundle.
-
-
-

{{ expandedEntries.length }} log entries

- +
+
- Export JSON - + {{ shortTs(entry.timestamp_iso) }} + {{ entry.severity || '?' }} + {{ lastPart(entry.source_id) }} + | + {{ entry.text.slice(0, 200) }} +
-
+
+
+
+ + +
+
Loading…
+ +
+

No bundles sent yet.

+

Bundles you export or send to a remote instance appear here for review.

+
+ +
+
+
+ + {{ sentIncidentLabel(s) }} + + {{ sentIncidentType(s) }} + - {{ shortTs(entry.timestamp_iso) }} - {{ entry.severity || '?' }} - {{ lastPart(entry.source_id) }} - | - {{ entry.text.slice(0, 200) }} + {{ s.sanitized ? 'sanitized' : 'raw' }} + + {{ s.entry_count }} entries + + + {{ selectedSent?.id === s.id ? '▲' : '▼' }} +
+ +
+
+
+

{{ sentExpandedEntries.length }} log entries (first 5 shown)

+

PII patterns redacted

+

Not sanitized — contains raw log text

+
+
+ {{ shortTs(entry.timestamp_iso) }} + {{ entry.severity || '?' }} + {{ lastPart(entry.source_id) }} + | + {{ entry.text.slice(0, 200) }} +
@@ -87,6 +170,15 @@ interface BundleSummary { bundle_json: string } +interface SentBundleSummary { + id: string + incident_id: string + exported_at: string + sanitized: boolean + entry_count: number + bundle_json: string +} + interface LogEntry { entry_id: string source_id: string @@ -96,51 +188,99 @@ interface LogEntry { matched_patterns: string[] } -const bundles = ref([]) -const loading = ref(true) -const selected = ref(null) -const expandedEntries = ref([]) -const expandLoading = ref(false) +type TabKey = 'received' | 'sent' +const tabs: { key: TabKey; label: string }[] = [ + { key: 'received', label: 'Received' }, + { key: 'sent', label: 'Sent' }, +] +const activeTab = ref('received') + +// Received +const bundles = ref([]) +const loading = ref(true) +const selectedReceived = ref(null) +const expandedEntries = ref([]) +const expandLoading = ref(false) + +// Sent +const sentBundles = ref([]) +const sentLoading = ref(true) +const selectedSent = ref(null) +const sentExpandedEntries = ref([]) onMounted(async () => { - try { - const res = await fetch(`${BASE}/api/bundles`) - if (res.ok) bundles.value = (await res.json()).bundles - } finally { - loading.value = false - } + const [recRes, sentRes] = await Promise.all([ + fetch(`${BASE}/api/bundles`), + fetch(`${BASE}/api/sent-bundles`), + ]) + if (recRes.ok) bundles.value = (await recRes.json()).bundles + if (sentRes.ok) sentBundles.value = (await sentRes.json()).bundles + loading.value = false + sentLoading.value = false }) -async function toggleBundle(b: BundleSummary) { - if (selected.value?.id === b.id) { - selected.value = null +async function toggleReceived(b: BundleSummary) { + if (selectedReceived.value?.id === b.id) { + selectedReceived.value = null expandedEntries.value = [] return } - selected.value = b + selectedReceived.value = b expandedEntries.value = [] expandLoading.value = true try { - // bundle_json is stored inline — parse it directly, no round-trip needed const parsed = JSON.parse(b.bundle_json) expandedEntries.value = parsed.log_entries ?? [] - } catch { - expandLoading.value = false } finally { expandLoading.value = false } } +function toggleSent(s: SentBundleSummary) { + if (selectedSent.value?.id === s.id) { + selectedSent.value = null + sentExpandedEntries.value = [] + return + } + selectedSent.value = s + try { + const parsed = JSON.parse(s.bundle_json) + sentExpandedEntries.value = (parsed.log_entries ?? []).slice(0, 5) + } catch { + sentExpandedEntries.value = [] + } +} + function exportBundle(b: BundleSummary) { const blob = new Blob([b.bundle_json], { type: 'application/json' }) - const url = URL.createObjectURL(blob) - const a = document.createElement('a') - a.href = url + const url = URL.createObjectURL(blob) + const a = document.createElement('a') + a.href = url a.download = `bundle-${b.issue_type || 'untyped'}-${b.id.slice(0, 8)}.json` a.click() URL.revokeObjectURL(url) } +function redownloadSent(s: SentBundleSummary) { + const parsed = JSON.parse(s.bundle_json) + const label = parsed.incident?.issue_type || 'bundle' + const blob = new Blob([s.bundle_json], { type: 'application/json' }) + const url = URL.createObjectURL(blob) + const a = document.createElement('a') + a.href = url + a.download = `sent-${label}-${s.id.slice(0, 8)}.json` + a.click() + URL.revokeObjectURL(url) +} + +function sentIncidentLabel(s: SentBundleSummary): string { + try { return JSON.parse(s.bundle_json).incident?.label ?? s.incident_id.slice(0, 8) } catch { return s.incident_id.slice(0, 8) } +} + +function sentIncidentType(s: SentBundleSummary): string { + try { return JSON.parse(s.bundle_json).incident?.issue_type || 'untyped' } catch { return 'untyped' } +} + function severityStyle(sev: string): Record { const k = sev?.toLowerCase() ?? 'low' const known = ['low', 'medium', 'high', 'critical'] diff --git a/web/src/views/IncidentsView.vue b/web/src/views/IncidentsView.vue index fd4bc8b..a8e021c 100644 --- a/web/src/views/IncidentsView.vue +++ b/web/src/views/IncidentsView.vue @@ -74,6 +74,10 @@ {{ selected.issue_type }}
+
- +
+ + +
+
+ + +
+ +
+ + +
+
@@ -17,14 +41,14 @@ {{ actionMsg }}
-
Loading…
+
Loading…
-
+

No log sources found.

-

Run the glean pipeline: python scripts/glean_corpus.py

+

Use Add Source above or edit sources.yaml directly.

-
+
@@ -121,6 +145,7 @@