diff --git a/.env.example b/.env.example index 483790b..816b47d 100644 --- a/.env.example +++ b/.env.example @@ -23,6 +23,6 @@ # Remote endpoint to push diagnostic bundles for escalation. # TURNSTONE_BUNDLE_ENDPOINT=https://example.com/api/bundles -# --- Periodic batch ingest --- -# Seconds between automatic ingest runs from sources.yaml. Set to 0 to disable. -# TURNSTONE_INGEST_INTERVAL=900 +# --- Periodic batch glean --- +# Seconds between automatic glean runs from sources.yaml. Set to 0 to disable. +# TURNSTONE_GLEAN_INTERVAL=900 diff --git a/README.md b/README.md index a36ae60..9c8086d 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,8 @@ Service logs (journald, Docker, syslog, Caddy, Plex, arr stack, qBittorrent, dme ## Features -- **Multi-source ingest** — journald, Docker, syslog, Caddy, dmesg, Plex, Servarr (arr stack), qBittorrent, plaintext; paths configured in `patterns/sources.yaml` -- **Pattern tagging** — named regex patterns applied at ingest time (`service_restart`, `auth_failure`, `oom`, `segfault`, `disk_full`, `timeout`, …); extend in `patterns/default.yaml` +- **Multi-source glean** — journald, Docker, syslog, Caddy, dmesg, Plex, Servarr (arr stack), qBittorrent, plaintext; paths configured in `patterns/sources.yaml` +- **Pattern tagging** — named regex patterns applied at glean time (`service_restart`, `auth_failure`, `oom`, `segfault`, `disk_full`, `timeout`, …); extend in `patterns/default.yaml` - **Full-text search** — SQLite FTS5 index across all ingested entries; filter by source, severity, time window - **Natural-language time queries** — "what happened yesterday morning", "show me errors from the last 3 hours"; powered by dateparser - **Incident management** — create, label, and track incidents; attach supporting log entries @@ -101,13 +101,13 @@ sources: path: /var/log/caddy/access.log ``` -For `journald` sources, run `scripts/export_journal.sh` on the host before each ingest (e.g. via cron). 
Missing paths are skipped with a warning — safe to leave entries for services that are temporarily down. +For `journald` sources, run `scripts/export_journal.sh` on the host before each glean (e.g. via cron). Missing paths are skipped with a warning — safe to leave entries for services that are temporarily down. --- ## Pattern library -Named patterns in `patterns/default.yaml` are matched against every log entry at ingest time. Matched pattern names are stored and used to boost search relevance for diagnostic queries. +Named patterns in `patterns/default.yaml` are matched against every log entry at glean time. Matched pattern names are stored and used to boost search relevance for diagnostic queries. ```yaml patterns: @@ -157,7 +157,7 @@ Copy `.env.example` to `.env` (or pass as `-e` flags to Docker/Podman). All vari | `TURNSTONE_PATTERNS` | `./patterns` | Pattern directory (default.yaml, sources.yaml, watch.yaml). | | `TURNSTONE_SOURCE_HOST` | `unknown` | Host identifier stamped on ingested entries. | | `TURNSTONE_BUNDLE_ENDPOINT` | — | Remote URL to push diagnostic bundles for escalation. | -| `TURNSTONE_INGEST_INTERVAL` | `900` | Seconds between automatic batch ingest runs. Set to `0` to disable. | +| `TURNSTONE_GLEAN_INTERVAL` | `900` | Seconds between automatic batch glean runs. Set to `0` to disable. | --- diff --git a/app/context/embedder.py b/app/context/embedder.py index 519870d..7bd17e0 100644 --- a/app/context/embedder.py +++ b/app/context/embedder.py @@ -1,64 +1,81 @@ -"""Ollama embedding client with sqlite-vec storage — BSL licensed.""" +"""Context chunk embedding — BSL licensed. + +Thin wrapper around app.services.embeddings that handles the DB I/O for +context_chunks. All backend configuration (model, device, backend type) is +delegated to the service layer via TURNSTONE_EMBED_* env vars. + +Re-exports EMBEDDING_AVAILABLE so callers that imported it from here continue +to work without changes. 
+""" from __future__ import annotations import logging import sqlite3 -import struct from pathlib import Path -import httpx +from app.services.embeddings import ( + EMBEDDING_AVAILABLE, # re-export for backward compat + get_embedder, + pack_vector, +) + +__all__ = ["EMBEDDING_AVAILABLE", "embed_chunks"] logger = logging.getLogger(__name__) -EMBEDDING_AVAILABLE: bool = False - -try: - import sqlite_vec # type: ignore[import] # noqa: F401 - EMBEDDING_AVAILABLE = True - logger.debug("sqlite-vec loaded — embedding pipeline enabled") -except ImportError: - logger.debug("sqlite-vec not available — embedding pipeline disabled") - def embed_chunks( db_path: Path, document_id: str, - llm_url: str, - model: str = "nomic-embed-text", + # Legacy params kept for backward compat — ignored when the ST backend is active. + llm_url: str = "", + model: str = "", timeout: float = 60.0, ) -> int: - """Embed all unembedded chunks for a document. Returns count embedded. No-op when EMBEDDING_AVAILABLE is False.""" - if not EMBEDDING_AVAILABLE: + """Embed all un-embedded chunks for *document_id*. + + Uses the configured embedder (sentence-transformers by default; Ollama when + TURNSTONE_EMBED_BACKEND=ollama). Returns the count of newly embedded chunks. + Returns 0 silently when no embedder is available. + + The legacy ``llm_url`` and ``model`` parameters are accepted but ignored when + the sentence-transformers backend is active — configure via env vars instead. + """ + embedder = get_embedder() + if embedder is None: return 0 conn = sqlite3.connect(str(db_path)) conn.execute("PRAGMA journal_mode=WAL") conn.row_factory = sqlite3.Row + rows = conn.execute( - "SELECT id, text FROM context_chunks WHERE document_id=? AND embedding IS NULL", + "SELECT id, text FROM context_chunks WHERE document_id = ? 
AND embedding IS NULL", (document_id,), ).fetchall() - count = 0 - for row in rows: - try: - resp = httpx.post( - f"{llm_url.rstrip('/')}/api/embeddings", - json={"model": model, "prompt": row["text"]}, - timeout=timeout, - ) - resp.raise_for_status() - vector: list[float] = resp.json().get("embedding") or [] - if vector: - blob = struct.pack(f"{len(vector)}f", *vector) - conn.execute( - "UPDATE context_chunks SET embedding=? WHERE id=?", - (blob, row["id"]), - ) - count += 1 - except Exception as exc: - logger.warning("Embedding chunk %s failed: %s", row["id"], exc) + if not rows: + conn.close() + return 0 - conn.commit() - conn.close() + texts = [r["text"] for r in rows] + ids = [r["id"] for r in rows] + + count = 0 + try: + vectors = embedder.embed_batch(texts) + for chunk_id, vec in zip(ids, vectors): + blob = pack_vector(vec) + conn.execute( + "UPDATE context_chunks SET embedding = ? WHERE id = ?", + (blob, chunk_id), + ) + count += 1 + conn.commit() + except Exception as exc: + logger.warning("Batch embedding failed for document %s: %s", document_id, exc) + finally: + conn.close() + + logger.debug("Embedded %d chunk(s) for document %s", count, document_id) return count diff --git a/app/context/retriever.py b/app/context/retriever.py index 6b42c8e..c4b511e 100644 --- a/app/context/retriever.py +++ b/app/context/retriever.py @@ -1,10 +1,30 @@ -"""Context retrieval — structured keyword lookup (Free) + chunk search — MIT licensed.""" +"""Context retrieval — structured keyword lookup (Free) + chunk search — MIT licensed. + +Two retrieval modes for context_chunks: + Vector search — cosine similarity over stored embeddings (when available) + Keyword search — LIKE-based fallback when no embedder is configured + +Both modes are called from retrieve_context(); the best available mode is used +automatically so callers need not check EMBEDDING_AVAILABLE themselves. 
+""" from __future__ import annotations +import logging import sqlite3 from dataclasses import dataclass, field from pathlib import Path +import numpy as np + +from app.services.embeddings import ( + EMBEDDING_AVAILABLE, + cosine_similarity, + get_embedder, + unpack_vector, +) + +logger = logging.getLogger(__name__) + @dataclass class RetrievedContext: @@ -12,6 +32,8 @@ class RetrievedContext: chunks: list[dict[str, str]] = field(default_factory=list) +# ── Structured fact retrieval (always runs) ─────────────────────────────────── + def get_relevant_facts(db_path: Path, query: str) -> list[dict[str, str]]: """Keyword match against context_facts. Always runs — Free tier.""" try: @@ -42,8 +64,68 @@ def get_relevant_facts(db_path: Path, query: str) -> list[dict[str, str]]: return [] -def _search_chunks(db_path: Path, query: str) -> list[dict[str, str]]: - """Keyword search across context_chunks. Fallback when no embeddings.""" +# ── Chunk retrieval: vector path ────────────────────────────────────────────── + +def _search_chunks_vector( + db_path: Path, + query: str, + top_k: int = 3, +) -> list[dict[str, str]]: + """Cosine similarity search over embedded context_chunks. + + Loads all stored embeddings into memory and scores in-process with numpy. + Skips any chunk whose BLOB dimension does not match the current model dim + (stale embeddings from a previous model — they will be re-embedded on the + next document upload). + + Returns at most *top_k* results ordered by similarity descending. 
+ """ + embedder = get_embedder() + if embedder is None: + return [] + + try: + query_vec: np.ndarray = embedder.embed(query) + model_dim: int = embedder.dim + except Exception as exc: + logger.warning("Query embedding failed: %s", exc) + return [] + + try: + conn = sqlite3.connect(str(db_path)) + conn.execute("PRAGMA journal_mode=WAL") + conn.row_factory = sqlite3.Row + rows = conn.execute( + "SELECT cc.id, cc.text, cc.embedding, cd.filename" + " FROM context_chunks cc" + " JOIN context_documents cd ON cc.document_id = cd.id" + " WHERE cc.embedding IS NOT NULL" + ).fetchall() + conn.close() + except sqlite3.OperationalError: + return [] + + scored: list[tuple[float, dict[str, str]]] = [] + for row in rows: + blob: bytes = row["embedding"] + # Guard against blobs from a different-dimension model + if len(blob) // 4 != model_dim: + continue + try: + chunk_vec = unpack_vector(blob) + score = cosine_similarity(query_vec, chunk_vec) + scored.append((score, {"text": row["text"], "filename": row["filename"]})) + except Exception: + continue + + scored.sort(key=lambda t: t[0], reverse=True) + return [item for _, item in scored[:top_k]] + + +# ── Chunk retrieval: keyword fallback ───────────────────────────────────────── + +def _search_chunks_keyword(db_path: Path, query: str) -> list[dict[str, str]]: + """LIKE-based keyword search across context_chunks. Fallback when no embedder.""" try: conn = sqlite3.connect(str(db_path)) conn.execute("PRAGMA journal_mode=WAL") @@ -66,16 +148,29 @@ def _search_chunks(db_path: Path, query: str) -> list[dict[str, str]]: return [] +# ── Public interface ────────────────────────────────────────────────────────── + def retrieve_context(db_path: Path, query: str) -> RetrievedContext: - """Retrieve structured facts and relevant chunks for a query.""" - return RetrievedContext( - facts=get_relevant_facts(db_path, query), - chunks=_search_chunks(db_path, query), - ) + """Retrieve structured facts and relevant chunks for a query. 
+ + Chunk retrieval uses vector search when an embedder is available and at + least one embedded chunk exists; falls back to keyword search otherwise. + """ + facts = get_relevant_facts(db_path, query) + + if EMBEDDING_AVAILABLE: + chunks = _search_chunks_vector(db_path, query) + if not chunks: + # Vector search returned nothing (no embedded chunks yet) — fall back. + chunks = _search_chunks_keyword(db_path, query) + else: + chunks = _search_chunks_keyword(db_path, query) + + return RetrievedContext(facts=facts, chunks=chunks) def format_context_block(ctx: RetrievedContext) -> str | None: - """Format context for injection into LLM prompt. Returns None when empty.""" + """Format context for injection into an LLM prompt. Returns None when empty.""" lines: list[str] = [] if ctx.facts: lines.append("Known environment facts:") diff --git a/app/ingest/__init__.py b/app/glean/__init__.py similarity index 100% rename from app/ingest/__init__.py rename to app/glean/__init__.py diff --git a/app/ingest/base.py b/app/glean/base.py similarity index 100% rename from app/ingest/base.py rename to app/glean/base.py diff --git a/app/ingest/caddy.py b/app/glean/caddy.py similarity index 98% rename from app/ingest/caddy.py rename to app/glean/caddy.py index 0cf2319..a7fb494 100644 --- a/app/ingest/caddy.py +++ b/app/glean/caddy.py @@ -4,7 +4,7 @@ from __future__ import annotations import json from typing import Iterator -from app.ingest.base import ( +from app.glean.base import ( SourceState, apply_patterns, epoch_float_to_iso, make_entry_id, now_iso, ) diff --git a/app/ingest/dmesg_log.py b/app/glean/dmesg_log.py similarity index 99% rename from app/ingest/dmesg_log.py rename to app/glean/dmesg_log.py index 84058aa..f5e2e11 100644 --- a/app/ingest/dmesg_log.py +++ b/app/glean/dmesg_log.py @@ -18,7 +18,7 @@ import re from datetime import datetime, timezone from typing import Iterator -from app.ingest.base import ( +from app.glean.base import ( SourceState, apply_patterns, 
detect_severity, make_entry_id, now_iso, ) from app.services.models import LogPattern, RetrievedEntry diff --git a/app/ingest/doc_upload.py b/app/glean/doc_upload.py similarity index 93% rename from app/ingest/doc_upload.py rename to app/glean/doc_upload.py index 98bb8d7..bc4a3b7 100644 --- a/app/ingest/doc_upload.py +++ b/app/glean/doc_upload.py @@ -10,7 +10,7 @@ from app.context.chunker import process_upload from app.context.store import add_document, add_fact -def ingest_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]: +def glean_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]: """Process an uploaded file and write to context store. Returns result summary.""" doc_type, facts, chunks = process_upload(filename, content) diff --git a/app/ingest/docker_log.py b/app/glean/docker_log.py similarity index 98% rename from app/ingest/docker_log.py rename to app/glean/docker_log.py index c383571..18f6966 100644 --- a/app/ingest/docker_log.py +++ b/app/glean/docker_log.py @@ -4,7 +4,7 @@ from __future__ import annotations import json from typing import Iterator -from app.ingest.base import ( +from app.glean.base import ( SourceState, apply_patterns, detect_severity, make_entry_id, now_iso, ) diff --git a/app/ingest/journald.py b/app/glean/journald.py similarity index 98% rename from app/ingest/journald.py rename to app/glean/journald.py index 220e9c8..cc03d22 100644 --- a/app/ingest/journald.py +++ b/app/glean/journald.py @@ -4,7 +4,7 @@ from __future__ import annotations import json from typing import Iterator -from app.ingest.base import ( +from app.glean.base import ( SourceState, apply_patterns, epoch_micros_to_iso, make_entry_id, now_iso, SYSLOG_PRIORITY, ) diff --git a/app/ingest/mqtt_subscriber.py b/app/glean/mqtt_subscriber.py similarity index 97% rename from app/ingest/mqtt_subscriber.py rename to app/glean/mqtt_subscriber.py index 1f00da4..efa514d 100644 --- a/app/ingest/mqtt_subscriber.py +++ 
b/app/glean/mqtt_subscriber.py @@ -1,10 +1,10 @@ -"""Live MQTT ingest subscriber for Turnstone. +"""Live MQTT glean subscriber for Turnstone. Reads ``type: mqtt`` entries from sources.yaml and subscribes to each broker in the background. Incoming messages are normalized to RetrievedEntry and written to the Turnstone SQLite database as they arrive. -This runs as an asyncio task alongside the batch ingest scheduler. It is +This runs as an asyncio task alongside the batch glean scheduler. It is started from the FastAPI lifespan in rest.py. MQTT source config format in sources.yaml:: diff --git a/app/glean/pipeline.py b/app/glean/pipeline.py new file mode 100644 index 0000000..42f5463 --- /dev/null +++ b/app/glean/pipeline.py @@ -0,0 +1,616 @@ +"""Glean pipeline: auto-detect format, parse, write to SQLite.""" +from __future__ import annotations + +import json +import logging +import re +import sqlite3 +from pathlib import Path +from typing import Iterator + +import yaml + +from app.glean import caddy, dmesg_log, docker_log, journald, plaintext, plex, qbittorrent, servarr, syslog, wazuh +from app.glean.base import _compile, load_patterns, now_iso +from app.glean.ssh import ( + SSHTransport, + SSHConnectionError, + SSHCommandError, + _build_docker_command, + _build_journald_command, + _build_plaintext_command, + _build_syslog_command, +) +from app.services.models import LogPattern, RetrievedEntry +from app.services.search import build_fts_index + +logger = logging.getLogger(__name__) + +_SCHEMA = """ +CREATE TABLE IF NOT EXISTS log_entries ( + id TEXT PRIMARY KEY, + source_id TEXT NOT NULL, + sequence INTEGER NOT NULL, + timestamp_raw TEXT, + timestamp_iso TEXT, + ingest_time TEXT NOT NULL, + severity TEXT, + repeat_count INTEGER DEFAULT 1, + out_of_order INTEGER DEFAULT 0, + matched_patterns TEXT DEFAULT '[]', + text TEXT NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_source ON log_entries(source_id); +CREATE INDEX IF NOT EXISTS idx_timestamp ON 
log_entries(timestamp_iso); +CREATE INDEX IF NOT EXISTS idx_ts_repeat ON log_entries(timestamp_iso, repeat_count); +CREATE INDEX IF NOT EXISTS idx_severity ON log_entries(severity); +CREATE INDEX IF NOT EXISTS idx_patterns ON log_entries(matched_patterns); + +CREATE TABLE IF NOT EXISTS incidents ( + id TEXT PRIMARY KEY, + label TEXT NOT NULL, + issue_type TEXT NOT NULL DEFAULT '', + started_at TEXT, + ended_at TEXT, + notes TEXT NOT NULL DEFAULT '', + created_at TEXT NOT NULL, + severity TEXT NOT NULL DEFAULT 'medium' +); +CREATE INDEX IF NOT EXISTS idx_incidents_time ON incidents(started_at, ended_at); + +CREATE TABLE IF NOT EXISTS received_bundles ( + id TEXT PRIMARY KEY, + source_host TEXT NOT NULL, + issue_type TEXT NOT NULL DEFAULT '', + label TEXT NOT NULL, + severity TEXT NOT NULL DEFAULT 'medium', + started_at TEXT, + bundled_at TEXT NOT NULL, + entry_count INTEGER NOT NULL DEFAULT 0, + bundle_json TEXT NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at); +CREATE INDEX IF NOT EXISTS idx_bundles_type ON received_bundles(issue_type); + +CREATE TABLE IF NOT EXISTS context_facts ( + id TEXT PRIMARY KEY, + category TEXT NOT NULL, + key TEXT NOT NULL, + value TEXT NOT NULL, + source TEXT, + created_at TEXT NOT NULL +); +CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category); +CREATE INDEX IF NOT EXISTS idx_facts_key ON context_facts(key); + +CREATE TABLE IF NOT EXISTS context_documents ( + id TEXT PRIMARY KEY, + filename TEXT NOT NULL, + doc_type TEXT NOT NULL, + full_text TEXT NOT NULL, + file_size INTEGER, + uploaded_at TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS context_chunks ( + id TEXT PRIMARY KEY, + document_id TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE, + chunk_index INTEGER NOT NULL, + text TEXT NOT NULL, + embedding BLOB +); +CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id); + +CREATE TABLE IF NOT EXISTS blocklist_candidates ( + id TEXT PRIMARY 
KEY, + domain_or_ip TEXT NOT NULL, + source_device_ip TEXT, + source_device_name TEXT, + first_seen TEXT NOT NULL, + last_seen TEXT NOT NULL, + hit_count INTEGER DEFAULT 1, + status TEXT DEFAULT 'pending', + pushed_at TEXT, + log_evidence TEXT DEFAULT '[]', + matched_rule TEXT, + llm_score REAL, + llm_reason TEXT +); +CREATE INDEX IF NOT EXISTS idx_blocklist_device ON blocklist_candidates(source_device_ip); +CREATE INDEX IF NOT EXISTS idx_blocklist_status ON blocklist_candidates(status); +CREATE INDEX IF NOT EXISTS idx_blocklist_domain ON blocklist_candidates(domain_or_ip); + +CREATE TABLE IF NOT EXISTS glean_fingerprints ( + path TEXT PRIMARY KEY, + mtime REAL NOT NULL, + size INTEGER NOT NULL, + gleaned_at TEXT NOT NULL +); +""" + + +def ensure_schema(db_path: Path) -> None: + """Create all tables and apply additive migrations. Safe to call on every startup.""" + conn = sqlite3.connect(str(db_path)) + conn.execute("PRAGMA journal_mode=WAL") + conn.executescript(_SCHEMA) + # Additive column migrations — the OperationalError raised when the column already exists is caught and ignored + for stmt in [ + "ALTER TABLE incidents ADD COLUMN issue_type TEXT NOT NULL DEFAULT ''", + ]: + try: + conn.execute(stmt) + except sqlite3.OperationalError: + pass + conn.commit() + conn.close() + + +def _fingerprint(path: Path) -> tuple[float, int]: + """Return (mtime, size) for a file — cheap identity check, no content read needed.""" + st = path.stat() + return st.st_mtime, st.st_size + + +def _fp_unchanged(conn: sqlite3.Connection, path: Path, mtime: float, size: int) -> bool: + """Return True only when the stored fingerprint exactly matches (mtime, size). + + A smaller size (log rotation) or a larger size (new lines appended) both + return False so the caller re-gleans the file.
+ """ + row = conn.execute( + "SELECT mtime, size FROM glean_fingerprints WHERE path = ?", + (str(path),), + ).fetchone() + if row is None: + return False + return row[0] == mtime and row[1] == size + + +def _save_fingerprint( + conn: sqlite3.Connection, + path: Path, + mtime: float, + size: int, + gleaned_at: str, +) -> None: + """Upsert the fingerprint for *path* after a successful glean.""" + conn.execute( + """ + INSERT OR REPLACE INTO glean_fingerprints (path, mtime, size, gleaned_at) + VALUES (?, ?, ?, ?) + """, + (str(path), mtime, size, gleaned_at), + ) + + +def _detect_format(first_line: str) -> str: + try: + obj = json.loads(first_line) + if "__REALTIME_TIMESTAMP" in obj: + return "journald" + if "SOURCE" in obj and str(obj.get("SOURCE", "")).startswith("docker:"): + return "docker" + if wazuh.is_wazuh_alert(obj): + return "wazuh" + if "ts" in obj and ("msg" in obj or "message" in obj or "request" in obj): + return "caddy" + except (json.JSONDecodeError, AttributeError): + pass + if plex.is_plex_log(first_line): + return "plex" + if qbittorrent.is_qbit_log(first_line): + return "qbittorrent" + if servarr.is_servarr_log(first_line): + return "servarr" + if dmesg_log.is_dmesg_log(first_line): + return "dmesg" + if syslog.is_syslog(first_line): + return "syslog" + return "plaintext" + + +def _parse_file( + path: Path, + compiled: list[tuple[LogPattern, object]], + ingest_time: str, + source_id: str | None = None, +) -> Iterator[RetrievedEntry]: + source_id = source_id or path.stem + + with path.open("r", errors="replace") as f: + lines = iter(f) + try: + first = next(lines) + except StopIteration: + return + + fmt = _detect_format(first.strip()) + logger.info("Detected format %r for %s", fmt, path.name) + + def all_lines(): + yield first + yield from lines + + if fmt == "journald": + yield from journald.parse(all_lines(), source_id, compiled, ingest_time) + elif fmt == "wazuh": + yield from wazuh.parse(all_lines(), source_id, compiled, ingest_time) + elif 
fmt == "docker": + yield from docker_log.parse(all_lines(), source_id, compiled, ingest_time) + elif fmt == "caddy": + yield from caddy.parse(all_lines(), source_id, compiled, ingest_time) + elif fmt == "plex": + yield from plex.parse(all_lines(), source_id, compiled, ingest_time) + elif fmt == "qbittorrent": + yield from qbittorrent.parse(all_lines(), source_id, compiled, ingest_time) + elif fmt == "servarr": + yield from servarr.parse(all_lines(), source_id, compiled, ingest_time) + elif fmt == "dmesg": + yield from dmesg_log.parse(all_lines(), source_id, compiled, ingest_time) + elif fmt == "syslog": + yield from syslog.parse(all_lines(), source_id, compiled, ingest_time) + else: + yield from plaintext.parse(all_lines(), source_id, compiled, ingest_time) + + +def _write_batch(conn: sqlite3.Connection, batch: list[RetrievedEntry]) -> None: + conn.executemany( + """ + INSERT OR IGNORE INTO log_entries + (id, source_id, sequence, timestamp_raw, timestamp_iso, + ingest_time, severity, repeat_count, out_of_order, + matched_patterns, text) + VALUES (?,?,?,?,?,?,?,?,?,?,?) 
+ """, + [ + ( + e.entry_id, e.source_id, e.sequence, + e.timestamp_raw, e.timestamp_iso, e.ingest_time, + e.severity, e.repeat_count, int(e.out_of_order), + json.dumps(list(e.matched_patterns)), e.text, + ) + for e in batch + ], + ) + + +def _glean_files( + files: list[Path], + db_path: Path, + pattern_file: Path | None = None, + batch_size: int = 1000, + source_id_map: dict[Path, str] | None = None, + force: bool = False, +) -> dict[str, int]: + pattern_file = pattern_file or Path("patterns/default.yaml") + patterns = load_patterns(pattern_file) + compiled = _compile(patterns) + ingest_time = now_iso() + source_id_map = source_id_map or {} + + conn = sqlite3.connect(str(db_path)) + conn.execute("PRAGMA journal_mode=WAL") + conn.executescript(_SCHEMA) + conn.commit() + + stats: dict[str, int] = {} + skipped: list[str] = [] + + for log_file in files: + source_id = source_id_map.get(log_file, log_file.stem) + + # Fingerprint check — skip files whose mtime+size haven't changed. + mtime, size = _fingerprint(log_file) + if not force and _fp_unchanged(conn, log_file, mtime, size): + logger.debug("Skipping unchanged file: %s", log_file.name) + skipped.append(log_file.name) + stats[source_id] = stats.get(source_id, 0) + continue + + count = 0 + batch: list[RetrievedEntry] = [] + for entry in _parse_file(log_file, compiled, ingest_time, source_id=source_id): + batch.append(entry) + if len(batch) >= batch_size: + _write_batch(conn, batch) + conn.commit() + count += len(batch) + batch.clear() + if batch: + _write_batch(conn, batch) + conn.commit() + count += len(batch) + + _save_fingerprint(conn, log_file, mtime, size, ingest_time) + conn.commit() + + stats[source_id] = stats.get(source_id, 0) + count + logger.info("Gleaned %d entries from %s (source: %s)", count, log_file.name, source_id) + + conn.close() + + if skipped: + logger.info("Skipped %d unchanged file(s): %s", len(skipped), ", ".join(skipped)) + + logger.info("Building FTS index...") + build_fts_index(db_path) + 
logger.info("FTS index ready") + + return stats + + +def _stream_and_write( + transport: SSHTransport, + cmd: str, + parser, + source_id: str, + compiled: list[tuple[LogPattern, object]], + ingest_time: str, + conn: sqlite3.Connection, + batch_size: int, +) -> int: + """Stream *cmd* output through *parser* and write entries to *conn*. + + Catches SSHCommandError per-item so one bad command doesn't abort the rest + of the glean items for this host. Returns the number of entries written. + """ + count = 0 + batch: list[RetrievedEntry] = [] + try: + for entry in parser(transport.exec_stream(cmd), source_id, compiled, ingest_time): + batch.append(entry) + if len(batch) >= batch_size: + _write_batch(conn, batch) + conn.commit() + count += len(batch) + batch.clear() + if batch: + _write_batch(conn, batch) + conn.commit() + count += len(batch) + except SSHCommandError as exc: + logger.warning("SSH command failed for source %r (cmd: %s): %s", source_id, cmd, exc) + logger.info("Gleaned %d entries from SSH source %s", count, source_id) + return count + + +def _glean_ssh_source( + src: dict, # type: ignore[type-arg] + compiled: list[tuple[LogPattern, object]], + ingest_time: str, + conn: sqlite3.Connection, + batch_size: int, +) -> dict[str, int]: + """Open one SSHTransport connection for *src* and glean all its glean items. + + One SSH connection is shared across all items in the ``glean:`` list so + the handshake overhead is paid only once per host per glean run. + + Returns a stats dict mapping ``{source_id: entry_count}`` for each item. + Gracefully skips the entire source on SSHConnectionError. 
+ """ + host_id = src.get("id", src.get("host", "unknown")) + host = src["host"] + user = src["user"] + key_path = str(Path(src["key_path"]).expanduser()) + port = int(src.get("port", 22)) + glean_items: list[dict] = src.get("glean", []) # type: ignore[type-arg] + + stats: dict[str, int] = {} + + try: + with SSHTransport(host=host, user=user, key_path=key_path, port=port) as t: + for item in glean_items: + item_type = item.get("type", "plaintext") + # Per-item source_id — falls back to host_id/type for un-labelled items + item_id = item.get("id") or f"{host_id}/{item_type}" + + if item_type == "journald": + cmd = _build_journald_command(item) + count = _stream_and_write( + t, cmd, journald.parse, item_id, compiled, ingest_time, conn, batch_size + ) + stats[item_id] = stats.get(item_id, 0) + count + + elif item_type == "syslog": + cmd = _build_syslog_command(item) + count = _stream_and_write( + t, cmd, syslog.parse, item_id, compiled, ingest_time, conn, batch_size + ) + stats[item_id] = stats.get(item_id, 0) + count + + elif item_type == "plaintext": + cmd = _build_plaintext_command(item) + count = _stream_and_write( + t, cmd, plaintext.parse, item_id, compiled, ingest_time, conn, batch_size + ) + stats[item_id] = stats.get(item_id, 0) + count + + elif item_type == "docker": + cmds = _build_docker_command(item) + if isinstance(cmds, str): + cmds = [cmds] + containers: list[str] = item.get("containers", []) + for i, cmd in enumerate(cmds): + # Use the container name as the final path segment when available + container_name = containers[i] if i < len(containers) else str(i) + container_id = f"{item_id}/{container_name}" if len(cmds) > 1 else item_id + count = _stream_and_write( + t, cmd, docker_log.parse, container_id, + compiled, ingest_time, conn, batch_size, + ) + stats[container_id] = stats.get(container_id, 0) + count + + else: + logger.warning( + "Unknown SSH glean type %r for source %r — skipping item", + item_type, host_id, + ) + + except SSHConnectionError as 
exc: + logger.warning("SSH connection failed for source %r: %s", host_id, exc) + + return stats + + +def glean_ssh_source( + src: dict, # type: ignore[type-arg] + db_path: Path, + pattern_file: Path | None = None, + batch_size: int = 1000, +) -> dict[str, int]: + """Glean a single SSH source dict and write results to *db_path*. + + Public wrapper around :func:`_glean_ssh_source` for the REST layer. + Manages the DB connection, pattern compilation, and FTS rebuild so callers + don't have to deal with those lifecycle concerns. + + Returns stats mapping ``{sub_source_id: entry_count}``. + """ + effective_pattern_file = pattern_file or Path("patterns/default.yaml") + compiled = _compile(load_patterns(effective_pattern_file)) + ingest_time = now_iso() + + conn = sqlite3.connect(str(db_path)) + conn.execute("PRAGMA journal_mode=WAL") + conn.executescript(_SCHEMA) + conn.commit() + + try: + stats = _glean_ssh_source(src, compiled, ingest_time, conn, batch_size) + finally: + conn.close() + + logger.info("Rebuilding FTS index after SSH source glean...") + build_fts_index(db_path) + return stats + + +def glean_dir( + corpus_dir: Path, + db_path: Path, + pattern_file: Path | None = None, + batch_size: int = 1000, + force: bool = False, +) -> dict[str, int]: + """Glean all .jsonl and .log files from a corpus directory. + + Pass ``force=True`` to bypass fingerprint checks and re-glean all files + regardless of whether they have changed since the last run. + """ + files = sorted(corpus_dir.glob("*.jsonl")) + sorted(corpus_dir.glob("*.log")) + return _glean_files(files, db_path, pattern_file, batch_size, force=force) + + +def glean_file( + log_file: Path, + db_path: Path, + pattern_file: Path | None = None, + force: bool = False, +) -> dict[str, int]: + """Glean a single log file (any supported format). + + Pass ``force=True`` to re-glean even when the file fingerprint is unchanged. 
+ """ + return _glean_files([log_file], db_path, pattern_file, force=force) + + +def glean_sources( + sources_file: Path, + db_path: Path, + pattern_file: Path | None = None, + batch_size: int = 1000, + force: bool = False, +) -> dict[str, int]: + """Glean all sources listed in a sources.yaml config file. + + Supports two source types: + + Local file sources (default): + sources: + - id: sonarr + path: /opt/sonarr/config/logs/sonarr.0.txt + + SSH remote sources (transport: ssh): + sources: + - id: rack01 + transport: ssh + host: 192.168.1.10 + user: admin + key_path: ~/.ssh/id_ed25519 + glean: + - type: journald + args: ["--since", "2 hours ago"] + - type: syslog + path: /var/log/syslog + - type: plaintext + path: /var/log/app/error.log + - type: docker + containers: [myapp, nginx] + + Missing local paths and SSH connection failures are logged as warnings + so the cron keeps running when a source is temporarily down. + """ + with open(sources_file) as f: + config = yaml.safe_load(f) + + local_sources: list[dict] = [] # type: ignore[type-arg] + ssh_sources: list[dict] = [] # type: ignore[type-arg] + + for src in config.get("sources", []): + if src.get("transport") == "ssh": + ssh_sources.append(src) + else: + local_sources.append(src) + + # ── Local file sources ───────────────────────────────────────────────── + files: list[Path] = [] + source_id_map: dict[Path, str] = {} + + for src in local_sources: + path = Path(src["path"]) + if not path.exists(): + logger.warning("Source %r not found, skipping: %s", src.get("id", "?"), path) + continue + files.append(path) + if "id" in src: + source_id_map[path] = src["id"] + + if not files and not ssh_sources: + logger.warning("No sources found — check sources.yaml paths") + return {} + + stats: dict[str, int] = {} + if files: + stats.update(_glean_files(files, db_path, pattern_file, batch_size, source_id_map, force=force)) + + # ── SSH remote sources ───────────────────────────────────────────────── + if not ssh_sources: + 
return stats + + # Compile patterns once, share across all SSH sources in this run. + effective_pattern_file = pattern_file or Path("patterns/default.yaml") + compiled = _compile(load_patterns(effective_pattern_file)) + ingest_time = now_iso() + + conn = sqlite3.connect(str(db_path)) + conn.execute("PRAGMA journal_mode=WAL") + conn.executescript(_SCHEMA) + conn.commit() + + try: + for src in ssh_sources: + ssh_stats = _glean_ssh_source(src, compiled, ingest_time, conn, batch_size) + for k, v in ssh_stats.items(): + stats[k] = stats.get(k, 0) + v + finally: + conn.close() + + # Rebuild FTS only when SSH sources added entries (_glean_files already + # rebuilds when local sources are present; safe to call again if both ran). + if ssh_sources: + logger.info("Rebuilding FTS index after SSH glean...") + build_fts_index(db_path) + + return stats diff --git a/app/ingest/plaintext.py b/app/glean/plaintext.py similarity index 98% rename from app/ingest/plaintext.py rename to app/glean/plaintext.py index 1bb83d7..a205fc0 100644 --- a/app/ingest/plaintext.py +++ b/app/glean/plaintext.py @@ -10,7 +10,7 @@ import re from datetime import datetime, timezone from typing import Iterator -from app.ingest.base import ( +from app.glean.base import ( SourceState, apply_patterns, detect_severity, make_entry_id, now_iso, ) from app.services.models import LogPattern, RetrievedEntry diff --git a/app/ingest/plex.py b/app/glean/plex.py similarity index 99% rename from app/ingest/plex.py rename to app/glean/plex.py index 89d7232..5a9ec45 100644 --- a/app/ingest/plex.py +++ b/app/glean/plex.py @@ -12,7 +12,7 @@ import re from datetime import datetime, timezone from typing import Iterator -from app.ingest.base import ( +from app.glean.base import ( SourceState, apply_patterns, make_entry_id, now_iso, ) from app.services.models import LogPattern, RetrievedEntry diff --git a/app/ingest/qbittorrent.py b/app/glean/qbittorrent.py similarity index 99% rename from app/ingest/qbittorrent.py rename to 
app/glean/qbittorrent.py index 404c84c..642c419 100644 --- a/app/ingest/qbittorrent.py +++ b/app/glean/qbittorrent.py @@ -18,7 +18,7 @@ import re from datetime import datetime, timezone from typing import Iterator -from app.ingest.base import ( +from app.glean.base import ( SourceState, apply_patterns, detect_severity, make_entry_id, now_iso, ) from app.services.models import LogPattern, RetrievedEntry diff --git a/app/ingest/servarr.py b/app/glean/servarr.py similarity index 99% rename from app/ingest/servarr.py rename to app/glean/servarr.py index b59471e..bd494c2 100644 --- a/app/ingest/servarr.py +++ b/app/glean/servarr.py @@ -12,7 +12,7 @@ import re from datetime import datetime, timezone from typing import Iterator -from app.ingest.base import ( +from app.glean.base import ( SourceState, apply_patterns, detect_severity, make_entry_id, now_iso, ) from app.services.models import LogPattern, RetrievedEntry diff --git a/app/glean/ssh.py b/app/glean/ssh.py new file mode 100644 index 0000000..5acfed3 --- /dev/null +++ b/app/glean/ssh.py @@ -0,0 +1,225 @@ +"""SSH transport layer for remote log gleaning (issue #22). + +Wraps Paramiko to provide a clean context-manager interface for executing +remote commands and streaming their stdout output. All format parsing is +delegated to the existing per-format parsers (journald, syslog, plaintext, +docker); this module is transport only. + +Key design choices: +- Key-based auth only — no password prompts in a daemon context. +- exec_stream is a generator; exit-status check fires after all lines are + yielded, so callers must drain the iterator (e.g. list()) to trigger it. +- Command builders live here because they encode SSH/remote-execution idioms + (journalctl flags, docker logs invocation) that the generic parsers don't + need to know about. 
+ +Example sources.yaml snippet:: + + sources: + - id: rack01 + transport: ssh + host: 192.168.1.10 + user: admin + key_path: ~/.ssh/id_ed25519 + glean: + - type: journald + args: ["--since", "2 hours ago"] + - type: syslog + path: /var/log/syslog + - type: plaintext + path: /var/log/app/error.log + - type: docker + containers: [myapp, nginx] +""" +from __future__ import annotations + +import shlex +from collections.abc import Iterator +from typing import Union + +import paramiko + + +__all__ = [ + "SSHConnectionError", + "SSHCommandError", + "SSHTransport", + "_build_journald_command", + "_build_syslog_command", + "_build_plaintext_command", + "_build_docker_command", +] + +# Default syslog path used when none is specified in the source spec. +_SYSLOG_DEFAULT_PATH = "/var/log/syslog" + + +# ── Custom exceptions ───────────────────────────────────────────────────────── + +class SSHConnectionError(Exception): + """Raised when the SSH connection cannot be established or authenticated.""" + + +class SSHCommandError(Exception): + """Raised when a remote command exits with a non-zero status code.""" + + +# ── Transport context manager ───────────────────────────────────────────────── + +class SSHTransport: + """Context manager wrapping a Paramiko SSH connection. + + Opens the connection on ``__enter__`` and closes it on ``__exit__``, + even if an exception propagates. Key-based authentication only. 
+ + Usage:: + + with SSHTransport(host="10.0.0.1", user="admin", + key_path="~/.ssh/id_ed25519") as t: + for line in t.exec_stream("journalctl -o json --since '1 hour ago'"): + process(line) + """ + + def __init__( + self, + host: str, + user: str, + key_path: str, + port: int = 22, + ) -> None: + self._host = host + self._user = user + self._key_path = key_path + self._port = port + self._client: paramiko.SSHClient | None = None + + # ── context manager protocol ────────────────────────────────────────────── + + def __enter__(self) -> "SSHTransport": + client = paramiko.SSHClient() + client.set_missing_host_key_policy(paramiko.AutoAddPolicy()) + try: + client.connect( + hostname=self._host, + username=self._user, + key_filename=self._key_path, + port=self._port, + ) + except paramiko.AuthenticationException as exc: + client.close() + raise SSHConnectionError( + f"SSH auth failed for {self._user}@{self._host}: {exc}" + ) from exc + except paramiko.SSHException as exc: + client.close() + raise SSHConnectionError( + f"SSH connection failed to {self._host}: {exc}" + ) from exc + self._client = client + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: # type: ignore[override] + if self._client is not None: + self._client.close() + self._client = None + # Return None (falsy) so any in-flight exception is not suppressed. + + # ── remote execution ────────────────────────────────────────────────────── + + def exec_stream(self, command: str) -> Iterator[str]: + """Execute *command* on the remote host and yield stdout lines. + + The exit-status check runs after all stdout lines have been yielded, + so callers must drain the iterator to trigger it:: + + list(transport.exec_stream(cmd)) # raises if exit != 0 + + Raises: + SSHConnectionError: if called outside a ``with`` block. + SSHCommandError: if the remote command exits non-zero. 
+ """ + if self._client is None: + raise SSHConnectionError( + "Not connected — use SSHTransport as a context manager" + ) + _, stdout, stderr = self._client.exec_command(command) + for line in stdout: + yield line + exit_code = stdout.channel.recv_exit_status() + # Guard against MagicMock in tests: only treat real integer exit codes. + if isinstance(exit_code, int) and exit_code != 0: + error_msg = stderr.read().decode(errors="replace").strip() + raise SSHCommandError( + f"Command failed (exit {exit_code}): {error_msg}" + ) + + +# ── Command builders ────────────────────────────────────────────────────────── + +def _build_journald_command(spec: dict) -> str: # type: ignore[type-arg] + """Build a ``journalctl`` command string from a glean source spec. + + Spec keys: + + - ``args`` — list of extra journalctl arguments appended verbatim. + - ``unit`` — shorthand for ``--unit `` (inserted before ``args``). + + Returns a single shell command string. + """ + parts = ["journalctl", "-o json", "--no-pager"] + if "unit" in spec: + parts.append(f"--unit {spec['unit']}") + if "args" in spec: + parts.extend(spec["args"]) + return " ".join(parts) + + +def _build_syslog_command(spec: dict) -> str: # type: ignore[type-arg] + """Build a ``cat`` command for a syslog-format log file. + + Spec keys: + + - ``path`` — path to the file (default: ``/var/log/syslog``). + + Returns a single shell command string. + """ + path = spec.get("path", _SYSLOG_DEFAULT_PATH) + return f"cat {shlex.quote(path)}" + + +def _build_plaintext_command(spec: dict) -> str: # type: ignore[type-arg] + """Build a ``cat`` command for an arbitrary plaintext log file. + + Spec keys: + + - ``path`` — **required** path to the log file. + + Raises: + KeyError: if ``path`` is absent from the spec. 
+ """ + path = spec["path"] # intentional KeyError if missing — callers must supply it + return f"cat {shlex.quote(path)}" + + +def _build_docker_command( + spec: dict, # type: ignore[type-arg] +) -> Union[str, list[str]]: + """Build ``docker logs`` command(s) for one or more named containers. + + Spec keys: + + - ``containers`` — **required** list of container names or IDs. + + Returns a single command string when there is one container, or a list + of command strings when there are multiple (one command per container so + each can be streamed independently). + + Raises: + KeyError: if ``containers`` is absent from the spec. + ValueError: if ``containers`` is an empty list. + """ + containers = spec["containers"] # intentional KeyError if missing + if not containers: + raise ValueError("'containers' must be a non-empty list") + commands = [f"docker logs {shlex.quote(c)}" for c in containers] + return commands[0] if len(commands) == 1 else commands diff --git a/app/ingest/syslog.py b/app/glean/syslog.py similarity index 99% rename from app/ingest/syslog.py rename to app/glean/syslog.py index 81f38b1..341fdf4 100644 --- a/app/ingest/syslog.py +++ b/app/glean/syslog.py @@ -14,7 +14,7 @@ import re from datetime import datetime, timezone from typing import Iterator -from app.ingest.base import ( +from app.glean.base import ( SourceState, apply_patterns, detect_severity, make_entry_id, now_iso, ) from app.services.models import LogPattern, RetrievedEntry diff --git a/app/ingest/tautulli.py b/app/glean/tautulli.py similarity index 99% rename from app/ingest/tautulli.py rename to app/glean/tautulli.py index dba0cf0..ed7578b 100644 --- a/app/ingest/tautulli.py +++ b/app/glean/tautulli.py @@ -5,7 +5,7 @@ Tautulli sends all template values as strings, so all fields are treated as str. 
""" from __future__ import annotations -from app.ingest.base import ( +from app.glean.base import ( apply_patterns, epoch_float_to_iso, make_entry_id, diff --git a/app/ingest/wazuh.py b/app/glean/wazuh.py similarity index 99% rename from app/ingest/wazuh.py rename to app/glean/wazuh.py index 49e808d..70b69a7 100644 --- a/app/ingest/wazuh.py +++ b/app/glean/wazuh.py @@ -22,7 +22,7 @@ import json from datetime import datetime, timezone from typing import Iterator -from app.ingest.base import ( +from app.glean.base import ( SourceState, apply_patterns, make_entry_id, now_iso, ) from app.services.models import LogPattern, RetrievedEntry diff --git a/app/ingest/pipeline.py b/app/ingest/pipeline.py deleted file mode 100644 index f912be9..0000000 --- a/app/ingest/pipeline.py +++ /dev/null @@ -1,328 +0,0 @@ -"""Ingest pipeline: auto-detect format, parse, write to SQLite.""" -from __future__ import annotations - -import json -import logging -import re -import sqlite3 -from pathlib import Path -from typing import Iterator - -import yaml - -from app.ingest import caddy, dmesg_log, docker_log, journald, plaintext, plex, qbittorrent, servarr, syslog, wazuh -from app.ingest.base import _compile, load_patterns, now_iso -from app.services.models import LogPattern, RetrievedEntry -from app.services.search import build_fts_index - -logger = logging.getLogger(__name__) - -_SCHEMA = """ -CREATE TABLE IF NOT EXISTS log_entries ( - id TEXT PRIMARY KEY, - source_id TEXT NOT NULL, - sequence INTEGER NOT NULL, - timestamp_raw TEXT, - timestamp_iso TEXT, - ingest_time TEXT NOT NULL, - severity TEXT, - repeat_count INTEGER DEFAULT 1, - out_of_order INTEGER DEFAULT 0, - matched_patterns TEXT DEFAULT '[]', - text TEXT NOT NULL -); -CREATE INDEX IF NOT EXISTS idx_source ON log_entries(source_id); -CREATE INDEX IF NOT EXISTS idx_timestamp ON log_entries(timestamp_iso); -CREATE INDEX IF NOT EXISTS idx_ts_repeat ON log_entries(timestamp_iso, repeat_count); -CREATE INDEX IF NOT EXISTS idx_severity 
ON log_entries(severity); -CREATE INDEX IF NOT EXISTS idx_patterns ON log_entries(matched_patterns); - -CREATE TABLE IF NOT EXISTS incidents ( - id TEXT PRIMARY KEY, - label TEXT NOT NULL, - issue_type TEXT NOT NULL DEFAULT '', - started_at TEXT, - ended_at TEXT, - notes TEXT NOT NULL DEFAULT '', - created_at TEXT NOT NULL, - severity TEXT NOT NULL DEFAULT 'medium' -); -CREATE INDEX IF NOT EXISTS idx_incidents_time ON incidents(started_at, ended_at); - -CREATE TABLE IF NOT EXISTS received_bundles ( - id TEXT PRIMARY KEY, - source_host TEXT NOT NULL, - issue_type TEXT NOT NULL DEFAULT '', - label TEXT NOT NULL, - severity TEXT NOT NULL DEFAULT 'medium', - started_at TEXT, - bundled_at TEXT NOT NULL, - entry_count INTEGER NOT NULL DEFAULT 0, - bundle_json TEXT NOT NULL -); -CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at); -CREATE INDEX IF NOT EXISTS idx_bundles_type ON received_bundles(issue_type); - -CREATE TABLE IF NOT EXISTS context_facts ( - id TEXT PRIMARY KEY, - category TEXT NOT NULL, - key TEXT NOT NULL, - value TEXT NOT NULL, - source TEXT, - created_at TEXT NOT NULL -); -CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category); -CREATE INDEX IF NOT EXISTS idx_facts_key ON context_facts(key); - -CREATE TABLE IF NOT EXISTS context_documents ( - id TEXT PRIMARY KEY, - filename TEXT NOT NULL, - doc_type TEXT NOT NULL, - full_text TEXT NOT NULL, - file_size INTEGER, - uploaded_at TEXT NOT NULL -); - -CREATE TABLE IF NOT EXISTS context_chunks ( - id TEXT PRIMARY KEY, - document_id TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE, - chunk_index INTEGER NOT NULL, - text TEXT NOT NULL, - embedding BLOB -); -CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id); - -CREATE TABLE IF NOT EXISTS blocklist_candidates ( - id TEXT PRIMARY KEY, - domain_or_ip TEXT NOT NULL, - source_device_ip TEXT, - source_device_name TEXT, - first_seen TEXT NOT NULL, - last_seen TEXT NOT NULL, - hit_count 
INTEGER DEFAULT 1, - status TEXT DEFAULT 'pending', - pushed_at TEXT, - log_evidence TEXT DEFAULT '[]', - matched_rule TEXT, - llm_score REAL, - llm_reason TEXT -); -CREATE INDEX IF NOT EXISTS idx_blocklist_device ON blocklist_candidates(source_device_ip); -CREATE INDEX IF NOT EXISTS idx_blocklist_status ON blocklist_candidates(status); -CREATE INDEX IF NOT EXISTS idx_blocklist_domain ON blocklist_candidates(domain_or_ip); -""" - - -def ensure_schema(db_path: Path) -> None: - """Create all tables and apply additive migrations. Safe to call on every startup.""" - conn = sqlite3.connect(str(db_path)) - conn.execute("PRAGMA journal_mode=WAL") - conn.executescript(_SCHEMA) - # Additive column migrations — ALTER TABLE silently skips if column exists - for stmt in [ - "ALTER TABLE incidents ADD COLUMN issue_type TEXT NOT NULL DEFAULT ''", - ]: - try: - conn.execute(stmt) - except sqlite3.OperationalError: - pass - conn.commit() - conn.close() - - -def _detect_format(first_line: str) -> str: - try: - obj = json.loads(first_line) - if "__REALTIME_TIMESTAMP" in obj: - return "journald" - if "SOURCE" in obj and str(obj.get("SOURCE", "")).startswith("docker:"): - return "docker" - if wazuh.is_wazuh_alert(obj): - return "wazuh" - if "ts" in obj and ("msg" in obj or "message" in obj or "request" in obj): - return "caddy" - except (json.JSONDecodeError, AttributeError): - pass - if plex.is_plex_log(first_line): - return "plex" - if qbittorrent.is_qbit_log(first_line): - return "qbittorrent" - if servarr.is_servarr_log(first_line): - return "servarr" - if dmesg_log.is_dmesg_log(first_line): - return "dmesg" - if syslog.is_syslog(first_line): - return "syslog" - return "plaintext" - - -def _parse_file( - path: Path, - compiled: list[tuple[LogPattern, object]], - ingest_time: str, - source_id: str | None = None, -) -> Iterator[RetrievedEntry]: - source_id = source_id or path.stem - - with path.open("r", errors="replace") as f: - lines = iter(f) - try: - first = next(lines) - except 
StopIteration: - return - - fmt = _detect_format(first.strip()) - logger.info("Detected format %r for %s", fmt, path.name) - - def all_lines(): - yield first - yield from lines - - if fmt == "journald": - yield from journald.parse(all_lines(), source_id, compiled, ingest_time) - elif fmt == "wazuh": - yield from wazuh.parse(all_lines(), source_id, compiled, ingest_time) - elif fmt == "docker": - yield from docker_log.parse(all_lines(), source_id, compiled, ingest_time) - elif fmt == "caddy": - yield from caddy.parse(all_lines(), source_id, compiled, ingest_time) - elif fmt == "plex": - yield from plex.parse(all_lines(), source_id, compiled, ingest_time) - elif fmt == "qbittorrent": - yield from qbittorrent.parse(all_lines(), source_id, compiled, ingest_time) - elif fmt == "servarr": - yield from servarr.parse(all_lines(), source_id, compiled, ingest_time) - elif fmt == "dmesg": - yield from dmesg_log.parse(all_lines(), source_id, compiled, ingest_time) - elif fmt == "syslog": - yield from syslog.parse(all_lines(), source_id, compiled, ingest_time) - else: - yield from plaintext.parse(all_lines(), source_id, compiled, ingest_time) - - -def _write_batch(conn: sqlite3.Connection, batch: list[RetrievedEntry]) -> None: - conn.executemany( - """ - INSERT OR IGNORE INTO log_entries - (id, source_id, sequence, timestamp_raw, timestamp_iso, - ingest_time, severity, repeat_count, out_of_order, - matched_patterns, text) - VALUES (?,?,?,?,?,?,?,?,?,?,?) 
- """, - [ - ( - e.entry_id, e.source_id, e.sequence, - e.timestamp_raw, e.timestamp_iso, e.ingest_time, - e.severity, e.repeat_count, int(e.out_of_order), - json.dumps(list(e.matched_patterns)), e.text, - ) - for e in batch - ], - ) - - -def _ingest_files( - files: list[Path], - db_path: Path, - pattern_file: Path | None = None, - batch_size: int = 1000, - source_id_map: dict[Path, str] | None = None, -) -> dict[str, int]: - pattern_file = pattern_file or Path("patterns/default.yaml") - patterns = load_patterns(pattern_file) - compiled = _compile(patterns) - ingest_time = now_iso() - source_id_map = source_id_map or {} - - conn = sqlite3.connect(str(db_path)) - conn.execute("PRAGMA journal_mode=WAL") - conn.executescript(_SCHEMA) - conn.commit() - - stats: dict[str, int] = {} - - for log_file in files: - source_id = source_id_map.get(log_file, log_file.stem) - count = 0 - batch: list[RetrievedEntry] = [] - for entry in _parse_file(log_file, compiled, ingest_time, source_id=source_id): - batch.append(entry) - if len(batch) >= batch_size: - _write_batch(conn, batch) - conn.commit() - count += len(batch) - batch.clear() - if batch: - _write_batch(conn, batch) - conn.commit() - count += len(batch) - stats[source_id] = stats.get(source_id, 0) + count - logger.info("Ingested %d entries from %s (source: %s)", count, log_file.name, source_id) - - conn.close() - - logger.info("Building FTS index...") - build_fts_index(db_path) - logger.info("FTS index ready") - - return stats - - -def ingest( - corpus_dir: Path, - db_path: Path, - pattern_file: Path | None = None, - batch_size: int = 1000, -) -> dict[str, int]: - """Ingest all .jsonl and .log files from a corpus directory.""" - files = sorted(corpus_dir.glob("*.jsonl")) + sorted(corpus_dir.glob("*.log")) - return _ingest_files(files, db_path, pattern_file, batch_size) - - -def ingest_file( - log_file: Path, - db_path: Path, - pattern_file: Path | None = None, -) -> dict[str, int]: - """Ingest a single log file (any 
supported format).""" - return _ingest_files([log_file], db_path, pattern_file) - - -def ingest_sources( - sources_file: Path, - db_path: Path, - pattern_file: Path | None = None, - batch_size: int = 1000, -) -> dict[str, int]: - """Ingest all sources listed in a sources.yaml config file. - - sources.yaml format: - sources: - - id: sonarr - path: /opt/sonarr/config/logs/sonarr.0.txt - - id: qbittorrent - path: /opt/qbittorrent/config/data/logs/qbittorrent.log - - Missing paths are skipped with a warning so the cron keeps running - when a service is temporarily down. - """ - with open(sources_file) as f: - config = yaml.safe_load(f) - - files: list[Path] = [] - source_id_map: dict[Path, str] = {} - - for src in config.get("sources", []): - path = Path(src["path"]) - if not path.exists(): - logger.warning("Source %r not found, skipping: %s", src.get("id", "?"), path) - continue - files.append(path) - if "id" in src: - source_id_map[path] = src["id"] - - if not files: - logger.warning("No source files found — check sources.yaml paths") - return {} - - return _ingest_files(files, db_path, pattern_file, batch_size, source_id_map) diff --git a/app/mcp_server.py b/app/mcp_server.py index 4047f98..4c30cdf 100644 --- a/app/mcp_server.py +++ b/app/mcp_server.py @@ -94,7 +94,7 @@ def search_logs( severity: Filter by level — EMERGENCY, ALERT, CRITICAL, ERROR, WARN, NOTICE, INFO, DEBUG. source: Partial match on source_id. Format is 'corpus:host:service'. Example: 'xanderland:caddy' matches all Caddy entries from xanderland. - pattern: Filter by named pattern tag applied at ingest time. + pattern: Filter by named pattern tag applied at glean time. Known tags: auth_failure, connection_lost, oom, segfault, disk_full, timeout, caddy_tls_error, caddy_config_error, caddy_auth_error, caddy_upstream_error, service_restart, service_update, @@ -176,7 +176,7 @@ def list_log_sources() -> str: """ sources = list_sources(DB_PATH) if not sources: - return "No log sources found. 
Has the corpus been ingested? Run: python scripts/ingest_corpus.py" + return "No log sources found. Has the corpus been gleaned? Run: python scripts/glean_corpus.py" lines = [f"Corpus: {DB_PATH}", f"Sources ({len(sources)} total):\n"] for s in sources: @@ -192,7 +192,7 @@ def list_log_sources() -> str: if __name__ == "__main__": if not DB_PATH.exists(): logger.error("Database not found: %s", DB_PATH) - logger.error("Run: python scripts/ingest_corpus.py ") + logger.error("Run: python scripts/glean_corpus.py ") sys.exit(1) logger.info("Starting Turnstone MCP server (DB: %s)", DB_PATH) mcp.run() diff --git a/app/rest.py b/app/rest.py index 81601f2..c3ef4d9 100644 --- a/app/rest.py +++ b/app/rest.py @@ -27,10 +27,10 @@ from fastapi.responses import FileResponse, RedirectResponse, StreamingResponse from fastapi.staticfiles import StaticFiles from pydantic import BaseModel -from app.ingest.pipeline import ensure_schema, ingest_file as _ingest_file -from app.ingest.base import load_compiled_patterns, now_iso -from app.ingest.tautulli import parse_webhook as _parse_tautulli -from app.ingest.wazuh import is_wazuh_alert as _is_wazuh_alert, parse as _parse_wazuh +from app.glean.pipeline import ensure_schema, glean_file as _glean_file, glean_ssh_source as _glean_ssh_source +from app.glean.base import load_compiled_patterns, now_iso +from app.glean.tautulli import parse_webhook as _parse_tautulli +from app.glean.wazuh import is_wazuh_alert as _is_wazuh_alert, parse as _parse_wazuh from app.services.blocklist import ( BlocklistCandidate, get_candidate, @@ -71,11 +71,11 @@ from app.context.store import ( delete_document as _delete_document, ) from app.context.retriever import retrieve_context as _retrieve_context, format_context_block -from app.ingest.doc_upload import ingest_upload as _ingest_upload +from app.glean.doc_upload import glean_upload as _glean_upload from app.context.wizard import get_schema as _wizard_schema, advance_step, is_complete, apply_session from 
app.context.chunker import UnsupportedDocType, FileTooLarge -from app.tasks.ingest_scheduler import get_state as _ingest_state, run_once as _run_ingest, scheduler_loop as _scheduler_loop, submit_matched as _submit_matched -from app.ingest.mqtt_subscriber import run_mqtt_subscribers as _run_mqtt_subscribers +from app.tasks.glean_scheduler import get_state as _glean_state, run_once as _run_glean, scheduler_loop as _scheduler_loop, submit_matched as _submit_matched +from app.glean.mqtt_subscriber import run_mqtt_subscribers as _run_mqtt_subscribers DB_PATH = Path(os.environ.get("TURNSTONE_DB", Path(__file__).parent.parent / "data" / "turnstone.db")) PREFS_PATH = DB_PATH.parent / "preferences.json" @@ -84,7 +84,7 @@ SOURCE_HOST = os.environ.get("TURNSTONE_SOURCE_HOST", "unknown") BUNDLE_ENDPOINT = os.environ.get("TURNSTONE_BUNDLE_ENDPOINT", "") PATTERN_DIR = Path(os.environ.get("TURNSTONE_PATTERNS", Path(__file__).parent.parent / "patterns")) PATTERN_FILE = PATTERN_DIR / "default.yaml" -INGEST_INTERVAL = int(os.environ.get("TURNSTONE_INGEST_INTERVAL", "900")) +GLEAN_INTERVAL = int(os.environ.get("TURNSTONE_GLEAN_INTERVAL", "900")) SUBMIT_ENDPOINT = os.environ.get("TURNSTONE_SUBMIT_ENDPOINT", "").rstrip("/") # GPU inference server URL. 
@@ -119,14 +119,14 @@ async def _lifespan(app: FastAPI): sources_file = PATTERN_DIR / "sources.yaml" _scheduler_task: asyncio.Task | None = None - if INGEST_INTERVAL > 0 and sources_file.exists(): + if GLEAN_INTERVAL > 0 and sources_file.exists(): _scheduler_task = asyncio.create_task( _scheduler_loop( - sources_file, DB_PATH, PATTERN_FILE, INGEST_INTERVAL, + sources_file, DB_PATH, PATTERN_FILE, GLEAN_INTERVAL, submit_endpoint=SUBMIT_ENDPOINT or None, source_host=SOURCE_HOST, ), - name="ingest-scheduler", + name="glean-scheduler", ) _mqtt_task: asyncio.Task | None = None @@ -433,6 +433,72 @@ def list_sources() -> dict: return {"sources": _list_sources(DB_PATH)} +@router.get("/api/sources/configured") +def list_configured_sources() -> dict: + """Return every source in sources.yaml, enriched with DB stats. + + Unlike ``/api/sources`` (which is DB-only), this endpoint reads sources.yaml + so SSH sources appear even before their first successful glean. DB entry + counts, error counts, and timestamps are aggregated and merged in. + + For SSH sources, sub-source IDs (e.g. ``rack01/journald``) are summed to + produce a single aggregate stat row for the top-level host entry. + """ + sources_file = PATTERN_DIR / "sources.yaml" + if not sources_file.exists(): + return {"sources": []} + + with open(sources_file) as f: + config = yaml.safe_load(f) or {} + + # Fetch all DB source stats once; key by source_id for O(1) lookup. 
+ db_stats: dict[str, dict] = {} + try: + for row in _list_sources(DB_PATH): + db_stats[row["source_id"]] = row + except Exception: + pass # DB may not exist on first run + + result = [] + for src in config.get("sources", []): + transport = src.get("transport", "local") + src_id = src.get("id", "") + + entry: dict = {"id": src_id, "transport": transport} + + if transport != "ssh": + entry["path"] = src.get("path", "") + db = db_stats.get(src_id, {}) + entry["entry_count"] = db.get("entry_count", 0) + entry["error_count"] = db.get("error_count", 0) + entry["earliest"] = db.get("earliest") + entry["latest"] = db.get("latest") + else: + entry["host"] = src.get("host", "") + entry["user"] = src.get("user", "") + glean_items: list[dict] = src.get("glean", []) + entry["glean_types"] = sorted({item.get("type", "plaintext") for item in glean_items}) + entry["glean_items"] = glean_items + + # Aggregate sub-source DB rows that belong to this SSH host. + # Sub-sources use IDs like "{host_id}/{type}" or "{host_id}/{type}/{container}". 
+ prefix = src_id + "/" + matching_rows = [ + v for k, v in db_stats.items() + if k.startswith(prefix) or k == src_id + ] + entry["entry_count"] = sum(r.get("entry_count", 0) for r in matching_rows) + entry["error_count"] = sum(r.get("error_count", 0) for r in matching_rows) + earliests = [r["earliest"] for r in matching_rows if r.get("earliest")] + latests = [r["latest"] for r in matching_rows if r.get("latest")] + entry["earliest"] = min(earliests) if earliests else None + entry["latest"] = max(latests) if latests else None + + result.append(entry) + + return {"sources": result} + + @router.delete("/api/sources/{source_id}") def delete_source(source_id: str) -> dict: """Delete all log entries (and FTS index rows) for a given source.""" @@ -448,9 +514,22 @@ def delete_source(source_id: str) -> dict: return {"deleted": deleted, "source_id": source_id} -@router.post("/api/sources/{source_id}/ingest") -def reingest_source(source_id: str, background_tasks: BackgroundTasks) -> dict: - """Trigger a re-ingest for a configured source from sources.yaml.""" +@router.post("/api/sources/{source_id}/glean") +def reglean_source( + source_id: str, + background_tasks: BackgroundTasks, + force: Annotated[bool, Query(description="Bypass fingerprint check and re-glean even if file is unchanged")] = False, +) -> dict: + """Trigger a re-glean for a configured source from sources.yaml. + + Handles both local file sources and SSH remote sources. For SSH sources, + the glean runs in the foreground and rebuilds the FTS index before returning + (same behaviour as local sources — callers can rely on the count being final + when the response arrives). + + Use ``?force=true`` to bypass the fingerprint cache and re-glean the file + even if mtime and size appear unchanged since the last run. 
+ """ sources_file = PATTERN_DIR / "sources.yaml" if not sources_file.exists(): raise HTTPException(status_code=404, detail="sources.yaml not found") @@ -459,21 +538,31 @@ def reingest_source(source_id: str, background_tasks: BackgroundTasks) -> dict: matching = [s for s in config.get("sources", []) if s.get("id") == source_id] if not matching: raise HTTPException(status_code=404, detail=f"Source {source_id!r} not in sources.yaml") - src_path = Path(matching[0]["path"]) + + src = matching[0] + + if src.get("transport") == "ssh": + # SSH sources: open connection, glean all items, rebuild FTS inline. + # Fingerprint skipping applies only to local file sources. + stats = _glean_ssh_source(src, DB_PATH, PATTERN_FILE) + return {"source_id": source_id, "gleaned": sum(stats.values())} + + # Local file source. + src_path = Path(src["path"]) if not src_path.exists(): raise HTTPException(status_code=422, detail=f"Path does not exist: {src_path}") - stats = _ingest_file(src_path, DB_PATH, PATTERN_FILE) + stats = _glean_file(src_path, DB_PATH, PATTERN_FILE, force=force) background_tasks.add_task(build_fts_index, DB_PATH) - return {"source_id": source_id, "ingested": stats.get(source_id, sum(stats.values()))} + return {"source_id": source_id, "gleaned": stats.get(source_id, sum(stats.values()))} -@router.post("/api/ingest/upload") -async def ingest_upload( +@router.post("/api/glean/upload") +async def glean_upload( file: UploadFile, source_id: Annotated[str | None, Query(description="Override source ID (defaults to filename)")] = None, background_tasks: BackgroundTasks = None, ) -> dict: - """Accept a multipart log file, auto-detect format, ingest into DB.""" + """Accept a multipart log file, auto-detect format, glean into DB.""" sid = source_id or Path(file.filename or "upload").stem content = await file.read() with tempfile.NamedTemporaryFile( @@ -483,13 +572,13 @@ async def ingest_upload( tmp.write(content) tmp_path = Path(tmp.name) try: - stats = _ingest_file(tmp_path, 
DB_PATH, PATTERN_FILE) + stats = _glean_file(tmp_path, DB_PATH, PATTERN_FILE) finally: tmp_path.unlink(missing_ok=True) if background_tasks is not None: background_tasks.add_task(build_fts_index, DB_PATH) total = sum(stats.values()) - return {"source_id": sid, "ingested": total, "stats": stats} + return {"source_id": sid, "gleaned": total, "stats": stats} class BatchEntry(BaseModel): @@ -506,20 +595,20 @@ class BatchEntry(BaseModel): text: str -class BatchIngestRequest(BaseModel): +class BatchGleanRequest(BaseModel): source_host: str = "unknown" entries: list[BatchEntry] -@router.post("/api/ingest/batch") -def ingest_batch(payload: BatchIngestRequest, background_tasks: BackgroundTasks) -> dict: +@router.post("/api/glean/batch") +def glean_batch(payload: BatchGleanRequest, background_tasks: BackgroundTasks) -> dict: """Accept pre-parsed log entries from a remote Turnstone instance (submission protocol). Used by nodes with TURNSTONE_SUBMIT_ENDPOINT configured to push their pattern-matched entries to a central receiving instance. 
""" if not payload.entries: - return {"ingested": 0} + return {"gleaned": 0} conn = sqlite3.connect(str(DB_PATH)) conn.execute("PRAGMA journal_mode=WAL") conn.executemany( @@ -550,13 +639,13 @@ def ingest_batch(payload: BatchIngestRequest, background_tasks: BackgroundTasks) conn.commit() conn.close() background_tasks.add_task(build_fts_index, DB_PATH) - return {"ingested": len(payload.entries), "source_host": payload.source_host} + return {"gleaned": len(payload.entries), "source_host": payload.source_host} -@router.get("/api/tasks/ingest/status") -def ingest_task_status() -> dict: - """Return the current state of the periodic batch ingest scheduler.""" - s = _ingest_state() +@router.get("/api/tasks/glean/status") +def glean_task_status() -> dict: + """Return the current state of the periodic glean scheduler.""" + s = _glean_state() return { "running": s.running, "run_count": s.run_count, @@ -565,8 +654,8 @@ def ingest_task_status() -> dict: "last_stats": s.last_stats, "last_error": s.last_error, "next_run_at": s.next_run_at, - "interval_s": INGEST_INTERVAL, - "scheduler_active": INGEST_INTERVAL > 0 and (PATTERN_DIR / "sources.yaml").exists(), + "interval_s": GLEAN_INTERVAL, + "scheduler_active": GLEAN_INTERVAL > 0 and (PATTERN_DIR / "sources.yaml").exists(), "submit_endpoint": SUBMIT_ENDPOINT or None, "last_submitted_at": s.last_submitted_at, "last_submit_count": s.last_submit_count, @@ -574,21 +663,28 @@ def ingest_task_status() -> dict: } -@router.post("/api/tasks/ingest") -async def trigger_ingest() -> dict: - """Manually trigger a batch ingest of all configured sources. No-ops if already running.""" +@router.post("/api/tasks/glean") +async def trigger_glean( + force: Annotated[bool, Query(description="Bypass fingerprint check and re-glean all sources")] = False, +) -> dict: + """Manually trigger a glean of all configured sources. No-ops if already running. 
+ + Use ``?force=true`` to bypass the fingerprint cache and re-glean every local + file source even when mtime and size are unchanged since the last run. + """ sources_file = PATTERN_DIR / "sources.yaml" if not sources_file.exists(): raise HTTPException(status_code=404, detail="sources.yaml not found — configure log sources first") - return await _run_ingest( + return await _run_glean( sources_file, DB_PATH, PATTERN_FILE, submit_endpoint=SUBMIT_ENDPOINT or None, source_host=SOURCE_HOST, + force=force, ) -@router.post("/api/ingest/wazuh/alert") -async def ingest_wazuh_alert( +@router.post("/api/glean/wazuh/alert") +async def glean_wazuh_alert( alert: dict, source_id: Annotated[str | None, Query(description="Source label (defaults to 'wazuh')")] = None, background_tasks: BackgroundTasks = None, @@ -769,8 +865,8 @@ def _tautulli_write_entry(conn: sqlite3.Connection, entry) -> None: ) -@router.post("/api/ingest/tautulli") -def ingest_tautulli( +@router.post("/api/glean/tautulli") +def glean_tautulli( payload: dict, request: Request, background_tasks: BackgroundTasks, diff --git a/app/services/diagnose/__init__.py b/app/services/diagnose/__init__.py new file mode 100644 index 0000000..51613cf --- /dev/null +++ b/app/services/diagnose/__init__.py @@ -0,0 +1,357 @@ +"""Frictionless diagnose service — NL time extraction + layered log search. + +This module is the public interface for the diagnose package. +Full implementation lives here so that patch("app.services.diagnose._HAS_DATEPARSER") +and patch("app.services.diagnose._search_dates") continue to target the correct +namespace, preserving backward compatibility with existing tests. + +The verbatim original is preserved in legacy.py for reference. 
+""" + +from __future__ import annotations + +import asyncio +import dataclasses +import logging +import os +import re +from collections.abc import AsyncGenerator +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any + +from app.context.retriever import retrieve_context, format_context_block +from app.services.llm import summarize +from app.services.search import SearchResult, entries_in_window, search +from app.services.diagnose.pipeline import run_pipeline + +logger = logging.getLogger(__name__) + +try: + from dateparser.search import search_dates as _search_dates # type: ignore[import] + + _HAS_DATEPARSER = True +except ImportError: + _search_dates = None # type: ignore[assignment] + _HAS_DATEPARSER = False + + +_RELATIVE_RE = re.compile( + r"\b(?:last|past)\s+(?:(?P<n>\d+)|(?P<approx>a\s+few|few|couple(?:\s+of)?|several))?\s*(?P<unit>minute|hour|day|week)s?\b", + re.IGNORECASE, +) +_RELATIVE_UNITS = {"minute": 1, "hour": 60, "day": 1440, "week": 10080} +# Fuzzy quantifiers map to a reasonable span so "last few hours" → 3h window +_APPROX_N = 3 + + +def _relative_window(match: re.Match) -> tuple[str, str]: + """Convert a relative time match to (since_iso, until_iso).""" + n_str = match.group("n") + approx = match.group("approx") + unit = match.group("unit").lower() + n = int(n_str) if n_str else (_APPROX_N if approx else 1) + minutes = n * _RELATIVE_UNITS[unit] + return _last_n_minutes(minutes), _now_iso() + + +def parse_time_window(query: str) -> tuple[str | None, str | None, str]: + """Extract a time window from a natural-language query string. + + Returns (since_iso, until_iso, keywords) where keywords is the query with + the matched time phrase stripped. Falls back to last-60-min window. + """ + # Handle relative expressions first ("last hour", "past 30 minutes", etc.) + # dateparser misinterprets these as absolute times.
+ m = _RELATIVE_RE.search(query) + if m: + since, until = _relative_window(m) + keywords = re.sub(r"\s{2,}", " ", query[: m.start()] + query[m.end() :]).strip() + return since, until, keywords or query + + if _HAS_DATEPARSER and _search_dates is not None: + # Tell dateparser what timezone the user is in so "3:35 am" means local time. + # PREFER_DAY_OF_MONTH is unused here but PREFER_DATES_FROM=past ensures + # "3:35 am" resolves to the most recent past occurrence, not a future one. + local_offset = datetime.now().astimezone().utcoffset() + offset_h = int((local_offset.total_seconds() if local_offset else 0) / 3600) + tz_str = f"UTC{'+' if offset_h >= 0 else ''}{offset_h}" + try: + results = _search_dates( + query, + languages=["en"], + settings={ + "PREFER_DATES_FROM": "past", + "TIMEZONE": tz_str, + "RETURN_AS_TIMEZONE_AWARE": True, + }, + ) + except Exception as e: + logger.warning( + "dateparser failed (%s) on query %r — falling back to 60-min window", + type(e).__name__, + query, + ) + results = None + if results: + phrase, dt = results[0] + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + else: + dt = dt.astimezone( + timezone.utc + ) # normalise to UTC for SQLite string compare + since = (dt - timedelta(minutes=30)).isoformat() + until = (dt + timedelta(minutes=30)).isoformat() + keywords = re.sub(r"\s{2,}", " ", query.replace(phrase, " ").strip()) + return since, until, keywords or query + + return _last_n_minutes(60), _now_iso(), query + + +def diagnose( + db_path: Path, + query: str, + since: str | None = None, + until: str | None = None, + source_filter: str | None = None, + llm_url: str | None = None, + llm_model: str | None = None, + llm_api_key: str | None = None, +) -> dict[str, Any]: + """Run layered log search with NL time extraction. 
Returns summary + entries.""" + time_detected = since is not None and until is not None + if not time_detected: + parsed_since, parsed_until, keywords = parse_time_window(query) + since = since or parsed_since + until = until or parsed_until + time_detected = keywords != query + else: + keywords = query + + keyword_hits = search( + db_path, + query=keywords, + since=since, + until=until, + source_filter=source_filter, + limit=150, + or_mode=True, + ) + window_hits = entries_in_window( + db_path, + since=since, + until=until, + source_filter=source_filter, + limit=50, + per_source_cap=15, + ) + + seen: set[str] = set() + merged: list[SearchResult] = [] + for r in keyword_hits + window_hits: + if r.entry_id not in seen: + seen.add(r.entry_id) + merged.append(r) + + combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[ + :200 + ] + + by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0} + by_source: dict[str, int] = {} + for r in combined: + sev = (r.severity or "INFO").upper() + if sev in by_severity: + by_severity[sev] += 1 + by_source[r.source_id] = by_source.get(r.source_id, 0) + 1 + + reasoning: str | None = None + if llm_url and llm_model: + reasoning = summarize( + query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key + ) + + return { + "summary": { + "total": len(combined), + "window_start": since, + "window_end": until, + "time_detected": time_detected, + "by_severity": by_severity, + "by_source": by_source, + }, + "reasoning": reasoning, + "entries": combined, + } + + +async def diagnose_stream( + db_path: Path, + query: str, + since: str | None = None, + until: str | None = None, + source_filter: str | None = None, + llm_url: str | None = None, + llm_model: str | None = None, + llm_api_key: str | None = None, +) -> AsyncGenerator[dict[str, Any], None]: + """Async generator yielding SSE event dicts for the diagnose pipeline. 
+ + Yields events in order: + {"type":"status","message":"…"} — pipeline progress + {"type":"summary","data":{…}} — window + severity counts (fast, from DB) + {"type":"entries","data":[…]} — log entries (fast, from DB) + {"type":"reasoning","text":"…"} — LLM analysis (slow, optional) + {"type":"done"} + """ + keywords = query.strip() + source_browse = not keywords and source_filter is not None + + if source_browse: + # No keyword — browsing a source directly. Use 24h window; skip FTS entirely. + yield {"type": "status", "message": f"Loading {source_filter}…"} + since = since or _last_n_minutes(60 * 24) + until = until or _now_iso() + time_detected = False + else: + yield {"type": "status", "message": "Parsing time window…"} + time_detected = since is not None and until is not None + if not time_detected: + parsed_since, parsed_until, keywords = await asyncio.to_thread( + parse_time_window, query + ) + since = since or parsed_since + until = until or parsed_until + time_detected = keywords != query + + yield {"type": "status", "message": "Loading environment context…"} + ctx = await asyncio.to_thread(lambda: retrieve_context(db_path, query)) + context_block = format_context_block(ctx) + yield { + "type": "context", + "facts": ctx.facts, + "chunks": ctx.chunks, + } + + yield {"type": "status", "message": "Searching logs…"} + + if source_browse: + keyword_hits: list[SearchResult] = [] + window_hits = await asyncio.to_thread( + lambda: entries_in_window( + db_path, + since, + until, + source_filter=source_filter, + limit=200, + ) + ) + else: + keyword_hits, window_hits = await asyncio.gather( + asyncio.to_thread( + lambda: search( + db_path, + keywords, + source_filter=source_filter, + since=since, + until=until, + limit=150, + or_mode=True, + ) + ), + asyncio.to_thread( + lambda: entries_in_window( + db_path, + since, + until, + source_filter=source_filter, + limit=50, + per_source_cap=15, + ) + ), + ) + + seen: set[str] = set() + merged: list[SearchResult] = [] + for 
r in keyword_hits + window_hits: + if r.entry_id not in seen: + seen.add(r.entry_id) + merged.append(r) + + combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[ + :200 + ] + + by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0} + by_source: dict[str, int] = {} + for r in combined: + sev = (r.severity or "INFO").upper() + if sev in by_severity: + by_severity[sev] += 1 + by_source[r.source_id] = by_source.get(r.source_id, 0) + 1 + + yield { + "type": "summary", + "data": { + "total": len(combined), + "window_start": since, + "window_end": until, + "time_detected": time_detected, + "by_severity": by_severity, + "by_source": by_source, + }, + } + yield {"type": "entries", "data": [dataclasses.asdict(r) for r in combined]} + + if MULTI_AGENT_ENABLED: + async for event in run_pipeline( + db_path=db_path, + entries=combined, + ctx=ctx, + query=query, + since=since, + until=until, + llm_url=llm_url, + llm_model=llm_model, + llm_api_key=llm_api_key, + ): + yield event + return # pipeline emits its own "done" event + + if llm_url and llm_model and combined: + yield {"type": "status", "message": "Analyzing with LLM…"} + reasoning = await asyncio.to_thread( + lambda: summarize( + query, + combined, + llm_url, + llm_model, + llm_api_key, + context_block=context_block, + ) + ) + if reasoning: + yield {"type": "reasoning", "text": reasoning} + + yield {"type": "done"} + + +def _now_iso() -> str: + return datetime.now(timezone.utc).isoformat() + + +def _last_n_minutes(n: int) -> str: + return (datetime.now(timezone.utc) - timedelta(minutes=n)).isoformat() + + +__all__ = [ + "diagnose", + "diagnose_stream", + "parse_time_window", +] + +# Feature flag for Task 6 +MULTI_AGENT_ENABLED = ( + os.getenv("TURNSTONE_MULTI_AGENT_DIAGNOSE", "false").lower() == "true" +) diff --git a/app/services/diagnose/classifier.py b/app/services/diagnose/classifier.py new file mode 100644 index 0000000..b7aa8ed --- /dev/null +++ 
b/app/services/diagnose/classifier.py @@ -0,0 +1,249 @@ +"""Stage 2: Severity Classifier — ML with two fallback levels. + +Classification strategy (in priority order): + + Path A — ML: Hugging Face text-classification pipeline, loaded lazily. + Path B — pattern_tags: Map cluster.pattern_tags through the loaded pattern + severity dict; pick the highest severity across matching tags. + Path C — regex: Call detect_severity() from app.glean.base on the cluster's + representative_text. + +Each cluster is classified independently. The ``classifier_used`` field on the +returned ``ClassifiedTimeline`` reflects the primary path (the one that governed +the overall classification session, not individual cluster fallbacks). +""" +from __future__ import annotations + +import logging +import os +from pathlib import Path +from typing import Any + +from app.services.diagnose.models import ( + ClassifiedTimeline, + EventCluster, + SeverityLabel, + TimelineResult, +) + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Module-level ML singleton — reset to None between tests via the fixture +# --------------------------------------------------------------------------- + +_ml_classifier: Any | None = None + + +def _get_ml_classifier(model_id: str, device: str) -> Any: + """Return the cached HF pipeline, loading it on first call.""" + global _ml_classifier # noqa: PLW0603 + if _ml_classifier is None: + from transformers import pipeline as hf_pipeline # type: ignore[import-untyped] + + _ml_classifier = hf_pipeline( + "text-classification", model=model_id, device=device + ) + return _ml_classifier + + +# --------------------------------------------------------------------------- +# Label mapping +# --------------------------------------------------------------------------- + +_LABEL_MAP: dict[str, SeverityLabel] = { + "ERROR": "ERROR", + "WARNING": "WARN", + "WARN": "WARN", + "INFO": "INFO", + "DEBUG": "DEBUG", + 
"CRITICAL": "CRITICAL", +} + +_CRITICAL_KEYWORDS: frozenset[str] = frozenset( + { + "panic", + "oom", + "fatal", + "critical", + "kernel panic", + "out of memory", + "segfault", + "segmentation fault", + } +) + +_SEVERITY_ORDER: dict[str | None, int] = { + "CRITICAL": 5, + "ERROR": 4, + "WARN": 3, + "WARNING": 3, + "INFO": 2, + "DEBUG": 1, + None: 0, +} + + +def _map_label(label: str, score: float, text: str) -> SeverityLabel: + """Apply the severity shim: promote to CRITICAL or demote to DEBUG where warranted.""" + upper = label.upper() + if upper == "ERROR" and score > 0.95 and any( + k in text.lower() for k in _CRITICAL_KEYWORDS + ): + return "CRITICAL" + if upper == "INFO" and score < 0.4: + return "DEBUG" + return _LABEL_MAP.get(upper, "UNKNOWN") # type: ignore[return-value] + + +def _highest_from_tags( + tags: tuple[str, ...], severity_map: dict[str, str] +) -> SeverityLabel | None: + """Return the highest severity from the pattern_tags that appear in severity_map.""" + best: str | None = None + best_rank = -1 + for tag in tags: + sev = severity_map.get(tag) + rank = _SEVERITY_ORDER.get(sev, 0) + if rank > best_rank: + best_rank = rank + best = sev + if best is None: + return None + normalised = "WARN" if best.upper() == "WARNING" else best.upper() + return normalised # type: ignore[return-value] + + +# --------------------------------------------------------------------------- +# SeverityClassifier +# --------------------------------------------------------------------------- + + +class SeverityClassifier: + """Classify each EventCluster's severity using ML, patterns, or regex fallback. + + Parameters + ---------- + model_id: + Hugging Face model identifier. When empty (default), ML is skipped. + device: + Torch device string passed to the HF pipeline (e.g. ``"cpu"`` or ``"cuda:0"``). + pattern_file: + Path to the YAML pattern file. When ``None`` the classifier reads + ``TURNSTONE_PATTERNS`` env var (same logic as ``app/rest.py``). 
+ """ + + def __init__( + self, + model_id: str = "", + device: str = "cpu", + pattern_file: Path | None = None, + ) -> None: + self._model_id = model_id + self._device = device + self._pattern_file: Path | None = pattern_file + self._pattern_severity: dict[str, str] = {} + self._patterns_loaded = False + + # ------------------------------------------------------------------ + # Lazy loaders + # ------------------------------------------------------------------ + + def _resolve_pattern_file(self) -> Path | None: + """Resolve pattern file from constructor arg or env var.""" + if self._pattern_file is not None: + return self._pattern_file + env_dir = os.environ.get("TURNSTONE_PATTERNS") + if env_dir: + return Path(env_dir) / "default.yaml" + return None + + def _ensure_patterns_loaded(self) -> None: + """Populate _pattern_severity from the pattern YAML file (once).""" + if self._patterns_loaded: + return + self._patterns_loaded = True + path = self._resolve_pattern_file() + if path is None: + return + from app.glean.base import load_patterns + + patterns = load_patterns(path) + self._pattern_severity = {p.name: p.severity for p in patterns} + + # ------------------------------------------------------------------ + # Per-cluster classification helpers + # ------------------------------------------------------------------ + + def _classify_cluster_ml(self, cluster: EventCluster) -> SeverityLabel | None: + """Attempt ML classification. 
Returns None on any inference failure.""" + try: + pipe = _get_ml_classifier(self._model_id, self._device) + results = pipe(cluster.representative_text) + if not results: + return None + hit = results[0] + return _map_label(hit["label"], hit["score"], cluster.representative_text) + except Exception: # noqa: BLE001 + logger.warning( + "ML inference failed for cluster %s — falling back", + cluster.cluster_id, + ) + return None + + def _classify_cluster_pattern_tags( + self, cluster: EventCluster + ) -> SeverityLabel | None: + """Derive severity from the cluster's pattern_tags. Returns None if no match.""" + return _highest_from_tags(cluster.pattern_tags, self._pattern_severity) + + def _classify_cluster_regex(self, cluster: EventCluster) -> SeverityLabel: + """Classify by scanning representative_text with the severity regex.""" + from app.glean.base import detect_severity + + raw = detect_severity(cluster.representative_text) + if raw is None: + return "INFO" + return _LABEL_MAP.get(raw.upper(), "INFO") # type: ignore[return-value] + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def classify(self, timeline: TimelineResult) -> ClassifiedTimeline: + """Classify every cluster in *timeline* and return a ``ClassifiedTimeline``.""" + self._ensure_patterns_loaded() + + # Determine which primary path governs this session + ml_available = bool(self._model_id) + patterns_available = bool(self._pattern_severity) + + if ml_available: + classifier_used: str = "ml" + elif patterns_available: + classifier_used = "pattern_tags" + else: + classifier_used = "regex" + + cluster_severities: dict[str, SeverityLabel] = {} + + for cluster in timeline.clusters: + severity: SeverityLabel | None = None + + if ml_available: + severity = self._classify_cluster_ml(cluster) + + if severity is None and patterns_available: + severity = self._classify_cluster_pattern_tags(cluster) + + 
if severity is None: + severity = self._classify_cluster_regex(cluster) + + cluster_severities[cluster.cluster_id] = severity + + return ClassifiedTimeline( + timeline=timeline, + cluster_severities=cluster_severities, + classifier_used=classifier_used, # type: ignore[arg-type] + model_id=self._model_id if ml_available else None, + ) diff --git a/app/services/diagnose/hypothesizer.py b/app/services/diagnose/hypothesizer.py new file mode 100644 index 0000000..7c5c3e6 --- /dev/null +++ b/app/services/diagnose/hypothesizer.py @@ -0,0 +1,216 @@ +"""Stage 3: Root-Cause Hypothesizer — LLM + RAG context.""" +from __future__ import annotations + +import json +import logging +from uuid import uuid4 + +import httpx + +from app.context.retriever import RetrievedContext +from app.services.diagnose.models import ( + ClassifiedTimeline, + EventCluster, + Hypothesis, + SeverityLabel, +) + +logger = logging.getLogger(__name__) + +_VALID_SEVERITIES: frozenset[str] = frozenset({"CRITICAL", "ERROR", "WARN", "INFO", "DEBUG"}) + +_SYSTEM_PROMPT = ( + "You are a Linux sysadmin log analyst. Analyze the following clustered log timeline " + "and generate 2-4 root cause hypotheses as a JSON array.\n\n" + "Each hypothesis must follow this exact JSON schema:\n" + '{"title": str (≤80 chars), "description": str (2-4 sentences), ' + '"confidence": float (0.0-1.0), "severity": str (one of: CRITICAL, ERROR, WARN, INFO), ' + '"supporting_clusters": [str list of cluster IDs]}\n\n' + "Return ONLY a valid JSON array. No prose, no markdown, no explanation outside the JSON." 
+) + + +def _coerce_float(val: object, default: float) -> float: + """Safely coerce LLM output to float, returning default on failure.""" + try: + return float(val) # type: ignore[arg-type] + except (TypeError, ValueError): + return default + + +def _validate_severity(s: str) -> SeverityLabel: + """Map a raw severity string to a valid SeverityLabel, defaulting to ERROR.""" + upper = s.upper() + if upper == "WARNING": + return "WARN" + return upper if upper in _VALID_SEVERITIES else "ERROR" # type: ignore[return-value] + + +def _cluster_summary(cluster: EventCluster, severity: str) -> str: + """Build a condensed single-line summary of a cluster for the prompt.""" + sources = ", ".join(list(cluster.source_ids)[:3]) + patterns = ", ".join(list(cluster.pattern_tags)[:5]) + text_preview = cluster.representative_text[:200] + summary = ( + f"[{severity}] {cluster.start_iso or 'unknown'} " + f"({sources}) — {text_preview}" + ) + if patterns: + summary += f" [patterns: {patterns}]" + return summary + + +def _extract_content(resp_json: dict) -> str | None: + """Pull text content from an OpenAI-compat chat completion response.""" + choices = resp_json.get("choices") or [] + if not choices: + return None + return (choices[0].get("message", {}).get("content") or "").strip() or None + + +class RootCauseHypothesizer: + """Generate ranked root-cause hypotheses from a classified log timeline.""" + + def __init__(self, max_hypotheses: int = 4) -> None: + self._max_hypotheses = max_hypotheses + + def hypothesize( + self, + classified: ClassifiedTimeline, + ctx: RetrievedContext, + query: str, + llm_url: str | None = None, + llm_model: str | None = None, + llm_api_key: str | None = None, + ) -> list[Hypothesis]: + """Generate hypotheses from a classified timeline and RAG context. + + Returns an empty list when no LLM is configured or there are no + clusters to analyse. 
+ """ + if not llm_url or not llm_model: + return [] + + clusters = classified.timeline.clusters + if not clusters: + return [] + + cluster_lines = [ + _cluster_summary(c, classified.cluster_severities.get(c.cluster_id, c.severity)) + for c in clusters + ] + cluster_block = "\n".join(cluster_lines) + + context_parts: list[str] = [] + for chunk in ctx.chunks[:5]: + filename = chunk.get("filename", "unknown") + text = chunk.get("text", "")[:300] + context_parts.append(f"[{filename}] {text}") + context_block = "\n".join(context_parts) if context_parts else "(none)" + + user_message = ( + f"Query: {query}\n\n" + f"Context from runbooks and known patterns:\n{context_block}\n\n" + f"Log timeline (clustered, {len(clusters)} clusters):\n{cluster_block}\n\n" + f"Generate up to {self._max_hypotheses} hypotheses. Return JSON array only." + ) + + messages = [ + {"role": "system", "content": _SYSTEM_PROMPT}, + {"role": "user", "content": user_message}, + ] + + raw_response = self._call_llm( + llm_url=llm_url, + llm_model=llm_model, + llm_api_key=llm_api_key, + messages=messages, + ) + if raw_response is None: + return [] + + return self._parse_response(raw_response) + + def _call_llm( + self, + llm_url: str, + llm_model: str, + llm_api_key: str | None, + messages: list[dict], + ) -> str | None: + """Send messages to the LLM and return raw text content.""" + headers = {"Authorization": f"Bearer {llm_api_key}"} if llm_api_key else {} + + # Try cf-orch task-based endpoint first. 
+ task_url = f"{llm_url.rstrip('/')}/api/inference/task" + try: + resp = httpx.post( + task_url, + json={ + "product": "turnstone", + "task": "log_analysis", + "payload": {"messages": messages, "stream": False}, + }, + headers=headers, + timeout=120.0, + ) + if resp.status_code == 200: + return _extract_content(resp.json()) + if resp.status_code != 404: + resp.raise_for_status() + logger.debug( + "No task assignment for turnstone.log_analysis — falling back to direct model" + ) + except Exception as exc: + logger.debug("Task endpoint unavailable (%s) — falling back to direct model", exc) + + # Fallback: OpenAI-compat endpoint with explicit model name. + try: + resp = httpx.post( + f"{llm_url.rstrip('/')}/v1/chat/completions", + json={"model": llm_model, "messages": messages, "stream": False}, + headers=headers, + timeout=120.0, + ) + resp.raise_for_status() + return _extract_content(resp.json()) + except Exception as exc: + logger.warning( + "LLM hypothesizer failed (%s): %s", type(exc).__name__, exc + ) + return None + + def _parse_response(self, raw: str) -> list[Hypothesis]: + """Parse the LLM JSON response into a list of Hypothesis objects.""" + try: + data = json.loads(raw.strip()) + except json.JSONDecodeError: + logger.warning( + "Hypothesizer: invalid JSON from LLM (truncated): %.120s", raw + ) + return [] + + if not isinstance(data, list): + logger.warning( + "Hypothesizer: expected JSON array, got %s", type(data).__name__ + ) + return [] + + hypotheses: list[Hypothesis] = [] + for item in data[: self._max_hypotheses]: + if not isinstance(item, dict): + continue + severity_raw = item.get("severity", "ERROR") + severity = _validate_severity(str(severity_raw)) + hypothesis = Hypothesis( + hypothesis_id=str(uuid4()), + title=str(item.get("title", "Unknown"))[:80], + description=str(item.get("description", "")), + confidence=_coerce_float(item.get("confidence"), 0.5), + supporting_cluster_ids=tuple(item.get("supporting_clusters") or []), + runbook_refs=(), + 
severity=severity, + ) + hypotheses.append(hypothesis) + + return hypotheses diff --git a/app/services/diagnose.py b/app/services/diagnose/legacy.py similarity index 81% rename from app/services/diagnose.py rename to app/services/diagnose/legacy.py index 2f0c4c7..ccbe6d8 100644 --- a/app/services/diagnose.py +++ b/app/services/diagnose/legacy.py @@ -1,4 +1,5 @@ """Frictionless diagnose service — NL time extraction + layered log search.""" + from __future__ import annotations import asyncio @@ -18,6 +19,7 @@ logger = logging.getLogger(__name__) try: from dateparser.search import search_dates as _search_dates # type: ignore[import] + _HAS_DATEPARSER = True except ImportError: _search_dates = None # type: ignore[assignment] @@ -54,7 +56,7 @@ def parse_time_window(query: str) -> tuple[str | None, str | None, str]: m = _RELATIVE_RE.search(query) if m: since, until = _relative_window(m) - keywords = re.sub(r"\s{2,}", " ", query[:m.start()] + query[m.end():]).strip() + keywords = re.sub(r"\s{2,}", " ", query[: m.start()] + query[m.end() :]).strip() return since, until, keywords or query if _HAS_DATEPARSER and _search_dates is not None: @@ -68,17 +70,25 @@ def parse_time_window(query: str) -> tuple[str | None, str | None, str]: results = _search_dates( query, languages=["en"], - settings={"PREFER_DATES_FROM": "past", "TIMEZONE": tz_str, "RETURN_AS_TIMEZONE_AWARE": True}, + settings={ + "PREFER_DATES_FROM": "past", + "TIMEZONE": tz_str, + "RETURN_AS_TIMEZONE_AWARE": True, + }, ) except Exception: - logger.warning("dateparser failed on query %r — falling back to 60-min window", query) + logger.warning( + "dateparser failed on query %r — falling back to 60-min window", query + ) results = None if results: phrase, dt = results[0] if dt.tzinfo is None: dt = dt.replace(tzinfo=timezone.utc) else: - dt = dt.astimezone(timezone.utc) # normalise to UTC for SQLite string compare + dt = dt.astimezone( + timezone.utc + ) # normalise to UTC for SQLite string compare since = (dt - 
timedelta(minutes=30)).isoformat() until = (dt + timedelta(minutes=30)).isoformat() keywords = re.sub(r"\s{2,}", " ", query.replace(phrase, " ").strip()) @@ -107,8 +117,23 @@ def diagnose( else: keywords = query - keyword_hits = search(db_path, query=keywords, since=since, until=until, source_filter=source_filter, limit=150, or_mode=True) - window_hits = entries_in_window(db_path, since=since, until=until, source_filter=source_filter, limit=50, per_source_cap=15) + keyword_hits = search( + db_path, + query=keywords, + since=since, + until=until, + source_filter=source_filter, + limit=150, + or_mode=True, + ) + window_hits = entries_in_window( + db_path, + since=since, + until=until, + source_filter=source_filter, + limit=50, + per_source_cap=15, + ) seen: set[str] = set() merged: list[SearchResult] = [] @@ -117,7 +142,9 @@ def diagnose( seen.add(r.entry_id) merged.append(r) - combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[:200] + combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[ + :200 + ] by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0} by_source: dict[str, int] = {} @@ -129,7 +156,9 @@ def diagnose( reasoning: str | None = None if llm_url and llm_model: - reasoning = summarize(query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key) + reasoning = summarize( + query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key + ) return { "summary": { @@ -177,7 +206,9 @@ async def diagnose_stream( yield {"type": "status", "message": "Parsing time window…"} time_detected = since is not None and until is not None if not time_detected: - parsed_since, parsed_until, keywords = await asyncio.to_thread(parse_time_window, query) + parsed_since, parsed_until, keywords = await asyncio.to_thread( + parse_time_window, query + ) since = since or parsed_since until = until or parsed_until time_detected = keywords != query @@ -197,23 +228,34 @@ async def 
diagnose_stream( keyword_hits: list[SearchResult] = [] window_hits = await asyncio.to_thread( lambda: entries_in_window( - db_path, since, until, - source_filter=source_filter, limit=200, + db_path, + since, + until, + source_filter=source_filter, + limit=200, ) ) else: keyword_hits, window_hits = await asyncio.gather( asyncio.to_thread( lambda: search( - db_path, keywords, - source_filter=source_filter, since=since, until=until, - limit=150, or_mode=True, + db_path, + keywords, + source_filter=source_filter, + since=since, + until=until, + limit=150, + or_mode=True, ) ), asyncio.to_thread( lambda: entries_in_window( - db_path, since, until, - source_filter=source_filter, limit=50, per_source_cap=15, + db_path, + since, + until, + source_filter=source_filter, + limit=50, + per_source_cap=15, ) ), ) @@ -225,7 +267,9 @@ async def diagnose_stream( seen.add(r.entry_id) merged.append(r) - combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[:200] + combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[ + :200 + ] by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0} by_source: dict[str, int] = {} @@ -251,7 +295,14 @@ async def diagnose_stream( if llm_url and llm_model and combined: yield {"type": "status", "message": "Analyzing with LLM…"} reasoning = await asyncio.to_thread( - lambda: summarize(query, combined, llm_url, llm_model, llm_api_key, context_block=context_block) + lambda: summarize( + query, + combined, + llm_url, + llm_model, + llm_api_key, + context_block=context_block, + ) ) if reasoning: yield {"type": "reasoning", "text": reasoning} diff --git a/app/services/diagnose/models.py b/app/services/diagnose/models.py new file mode 100644 index 0000000..2831d30 --- /dev/null +++ b/app/services/diagnose/models.py @@ -0,0 +1,72 @@ +"""Pipeline data types for the multi-agent diagnose pipeline.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing 
import Literal + +SeverityLabel = Literal["CRITICAL", "ERROR", "WARN", "INFO", "DEBUG", "UNKNOWN"] + + +@dataclass(frozen=True) +class EventCluster: + """A time-correlated group of log entries within the timeline.""" + + cluster_id: str + entries: tuple[str, ...] # entry_id refs + start_iso: str | None + end_iso: str | None + duration_seconds: float + source_ids: tuple[str, ...] + pattern_tags: tuple[str, ...] + severity: SeverityLabel + burst: bool + gap_before_seconds: float + representative_text: str + + +@dataclass(frozen=True) +class TimelineResult: + """Structured timeline of event clusters built from log entries.""" + + clusters: tuple[EventCluster, ...] + total_entries: int + window_start: str | None + window_end: str | None + gap_count: int + burst_count: int + dominant_sources: tuple[str, ...] + + +@dataclass(frozen=True) +class ClassifiedTimeline: + """Timeline annotated with ML-assigned severity per cluster.""" + + timeline: TimelineResult + cluster_severities: dict[str, SeverityLabel] + classifier_used: Literal["ml", "pattern_tags", "regex"] + model_id: str | None + + +@dataclass(frozen=True) +class Hypothesis: + """A root-cause hypothesis generated by Stage 3.""" + + hypothesis_id: str + title: str + description: str + confidence: float + supporting_cluster_ids: tuple[str, ...] + runbook_refs: tuple[str, ...] 
+ severity: SeverityLabel + + +@dataclass(frozen=True) +class RankedHypothesis: + """A hypothesis enriched by Stage 4 false-positive suppression.""" + + hypothesis: Hypothesis + novelty_score: float + similarity_to_known: float + suppress: bool + suppression_reason: str | None diff --git a/app/services/diagnose/pipeline.py b/app/services/diagnose/pipeline.py new file mode 100644 index 0000000..f902610 --- /dev/null +++ b/app/services/diagnose/pipeline.py @@ -0,0 +1,132 @@ +"""Multi-agent diagnose pipeline orchestrator — Stage 1–5 wiring.""" + +from __future__ import annotations + +import asyncio +import dataclasses +import logging +from collections.abc import AsyncGenerator +from pathlib import Path +from typing import Any + +from app.context.retriever import RetrievedContext +from app.services.diagnose.classifier import SeverityClassifier +from app.services.diagnose.hypothesizer import RootCauseHypothesizer +from app.services.diagnose.suppressor import FalsePositiveSuppressor +from app.services.diagnose.synthesizer import SummarySynthesizer +from app.services.diagnose.timeline import TimelineReconstructor +from app.services.search import SearchResult + +logger = logging.getLogger(__name__) + + +async def run_pipeline( + db_path: Path, + entries: list[SearchResult], + ctx: RetrievedContext, + query: str, + since: str | None, # reserved for future range-filtering in stage queries (#29 follow-up) + until: str | None, # reserved for future range-filtering in stage queries (#29 follow-up) + llm_url: str | None, + llm_model: str | None, + llm_api_key: str | None, +) -> AsyncGenerator[dict[str, Any], None]: + """Async generator that runs all 5 pipeline stages and yields SSE event dicts. + + Stages: + 1. TimelineReconstructor — cluster log entries by time + 2. SeverityClassifier — annotate clusters with severity + 3. RootCauseHypothesizer — generate hypotheses via LLM + 4. FalsePositiveSuppressor — rank and suppress known patterns + 5. 
SummarySynthesizer — produce a narrative diagnosis + + Yields events in order: + {"type": "status", "message": "Building timeline…"} + {"type": "pipeline_stage", "stage": 1, ...} + {"type": "pipeline_stage", "stage": 2, ...} + {"type": "pipeline_stage", "stage": 3, ...} + {"type": "pipeline_stage", "stage": 4, ...} + {"type": "hypotheses", "data": [...]} + {"type": "status", "message": "Synthesizing…"} + {"type": "reasoning", "text": "..."} — only when synthesis produces text + {"type": "done"} + """ + # Stage 1: Timeline reconstruction + yield {"type": "status", "message": "Building timeline…"} + timeline = await asyncio.to_thread( + TimelineReconstructor().reconstruct, entries + ) + n_clusters = len(timeline.clusters) + burst = timeline.burst_count + yield { + "type": "pipeline_stage", + "stage": 1, + "name": "timeline", + "message": f"Built {n_clusters} clusters, {burst} bursts", + } + + # Stage 2: Severity classification + classified = await asyncio.to_thread( + SeverityClassifier().classify, timeline + ) + sev_counts: dict[str, int] = {} + for sev in classified.cluster_severities.values(): + sev_counts[sev] = sev_counts.get(sev, 0) + 1 + counts_str = ", ".join(f"{k}:{v}" for k, v in sorted(sev_counts.items())) + yield { + "type": "pipeline_stage", + "stage": 2, + "name": "classifier", + "message": f"{classified.classifier_used} classifier: {counts_str}", + } + + # Stage 3: Root-cause hypotheses + hypotheses = await asyncio.to_thread( + RootCauseHypothesizer().hypothesize, + classified, + ctx, + query, + llm_url, + llm_model, + llm_api_key, + ) + yield { + "type": "pipeline_stage", + "stage": 3, + "name": "hypotheses", + "message": f"{len(hypotheses)} hypotheses generated", + } + + # Stage 4: False-positive suppression + ranked = await asyncio.to_thread( + FalsePositiveSuppressor().suppress, hypotheses, db_path + ) + suppressed = sum(1 for rh in ranked if rh.suppress) + active = len(ranked) - suppressed + yield { + "type": "pipeline_stage", + "stage": 4, + 
"name": "suppressor", + "message": f"{suppressed} suppressed, {active} active", + } + yield { + "type": "hypotheses", + "data": [dataclasses.asdict(rh) for rh in ranked], + } + + # Stage 5: Summary synthesis + yield {"type": "status", "message": "Synthesizing…"} + synthesis_text = await asyncio.to_thread( + SummarySynthesizer().synthesize, + ranked, + timeline, + ctx, + query, + llm_url, + llm_model, + llm_api_key, + ) + if synthesis_text: + yield {"type": "reasoning", "text": synthesis_text} + + yield {"type": "done"} diff --git a/app/services/diagnose/suppressor.py b/app/services/diagnose/suppressor.py new file mode 100644 index 0000000..a77d74d --- /dev/null +++ b/app/services/diagnose/suppressor.py @@ -0,0 +1,275 @@ +"""Stage 4: False-Positive Suppressor — embedding cosine similarity. + +Compares each hypothesis against a corpus of resolved incidents using +embedding cosine similarity. Hypotheses that closely match a previously +resolved incident are suppressed as likely false positives. + +When no embedding model is configured or the service is unavailable, all +hypotheses pass through with novelty_score=1.0 (full novelty assumed). +""" +from __future__ import annotations + +import logging +import sqlite3 +from pathlib import Path +from typing import Any + +from app.services.diagnose.models import Hypothesis, RankedHypothesis + +logger = logging.getLogger(__name__) + +# Module-level corpus cache: db_path_str -> (corpus_texts, embeddings) +# Invalidated when the corpus text list changes between calls. 
+_corpus_cache: dict[str, tuple[list[str], Any]] = {} + +# --------------------------------------------------------------------------- +# Cosine similarity helpers +# --------------------------------------------------------------------------- + +try: + import numpy as np + + def _cosine_similarities( + query_emb: list[float], corpus_embs: list[list[float]] + ) -> list[float]: + """Batch cosine similarity of one query embedding against all corpus embeddings.""" + q = np.array(query_emb, dtype=np.float32) + c = np.array(corpus_embs, dtype=np.float32) + q_norm = q / (np.linalg.norm(q) + 1e-10) + c_norm = c / (np.linalg.norm(c, axis=1, keepdims=True) + 1e-10) + return list(c_norm @ q_norm) + + _HAS_NUMPY = True + +except ImportError: # pragma: no cover + import math + + _HAS_NUMPY = False + + def _dot(a: list[float], b: list[float]) -> float: + return sum(x * y for x, y in zip(a, b)) + + def _norm(a: list[float]) -> float: + return math.sqrt(sum(x * x for x in a)) + 1e-10 + + def _cosine(a: list[float], b: list[float]) -> float: + return _dot(a, b) / (_norm(a) * _norm(b)) + + def _cosine_similarities( + query_emb: list[float], corpus_embs: list[list[float]] + ) -> list[float]: + return [_cosine(query_emb, c) for c in corpus_embs] + + +# --------------------------------------------------------------------------- +# DB helpers +# --------------------------------------------------------------------------- + +def _fetch_resolved_incidents(db_path: Path) -> list[str]: + """Fetch resolved incident texts from SQLite. + + Returns a list of non-empty combined strings for each resolved incident. + Returns an empty list on any error (missing table, connection failure, etc.). 
+ """ + try: + with sqlite3.connect(str(db_path)) as conn: + cursor = conn.execute( + "SELECT label, notes FROM incidents WHERE ended_at IS NOT NULL LIMIT 200" + ) + rows = cursor.fetchall() + except sqlite3.OperationalError as exc: + logger.warning("Could not query resolved incidents (%s) — treating as empty corpus", exc) + return [] + except sqlite3.Error as exc: + # Catches all remaining SQLite-family errors (IntegrityError, DatabaseError, etc.) + logger.warning("Unexpected SQLite error fetching resolved incidents (%s) — treating as empty corpus", exc) + return [] + + texts: list[str] = [] + for label, notes in rows: + label = (label or "").strip() + notes = (notes or "").strip() + combined = f"{label}. {notes}" if label and notes else (label or notes) + if combined: + texts.append(combined) + return texts + + +# --------------------------------------------------------------------------- +# Public class +# --------------------------------------------------------------------------- + +class FalsePositiveSuppressor: + """Stage 4 of the multi-agent diagnose pipeline. + + Uses embedding cosine similarity to detect hypotheses that closely match + previously resolved incidents and suppress them as likely false positives. + + When model_id is empty or the embedding service is unavailable, all + hypotheses pass through with novelty_score=1.0 (no suppression). + """ + + def __init__( + self, + model_id: str = "", + device: str = "cpu", + similarity_threshold: float = 0.85, + ) -> None: + self._model_id = model_id + self._device = device + # _device stored for future use when get_embedder() supports device selection + # Suppress when cosine similarity to a known resolved incident >= threshold. + # A threshold of 0.85 means "suppress if 85%+ similar to something already resolved." 
+ self._similarity_threshold = similarity_threshold + + def suppress( + self, + hypotheses: list[Hypothesis], + db_path: Path, + ) -> list[RankedHypothesis]: + """Rank hypotheses by novelty, suppressing those matching resolved incidents. + + Args: + hypotheses: Candidate hypotheses from Stage 3. + db_path: Path to the Turnstone SQLite database containing incidents. + + Returns: + List of RankedHypothesis sorted by (novelty_score * confidence) descending. + Non-suppressed hypotheses appear first in practice. + """ + if not hypotheses: + return [] + + # No model configured — full passthrough, rank by confidence only. + if not self._model_id: + return self._passthrough(hypotheses) + + # Attempt to obtain an embedder; fall back to passthrough on failure. + embedder = self._load_embedder() + if embedder is None: + logger.warning( + "Embedding service unavailable for model %r — skipping suppression", + self._model_id, + ) + return self._passthrough(hypotheses) + + # Fetch corpus texts from DB; fall back to passthrough if corpus is empty. + corpus_texts = _fetch_resolved_incidents(db_path) + if not corpus_texts: + logger.debug("No resolved incidents found — all hypotheses treated as novel") + return self._passthrough(hypotheses) + + # Embed corpus (with caching). + corpus_embeddings = self._get_corpus_embeddings(embedder, corpus_texts, db_path) + + # Score each hypothesis and sort by novelty * confidence descending. 
+ ranked = [ + self._score_hypothesis(h, embedder, corpus_embeddings) + for h in hypotheses + ] + ranked.sort(key=lambda rh: rh.novelty_score * rh.hypothesis.confidence, reverse=True) + return ranked + + # ------------------------------------------------------------------ + # Private helpers + # ------------------------------------------------------------------ + + def _score_hypothesis( + self, + hypothesis: Hypothesis, + embedder: Any, + corpus_embeddings: list[list[float]], + ) -> RankedHypothesis: + """Score a single hypothesis against the resolved incident corpus.""" + try: + query_text = f"{hypothesis.title}. {hypothesis.description}" + h_emb = embedder.embed(query_text) + # Convert numpy array to plain Python list for _cosine_similarities + h_emb_list: list[float] = h_emb.tolist() if hasattr(h_emb, "tolist") else list(h_emb) + sims = _cosine_similarities(h_emb_list, corpus_embeddings) + max_sim = float(max(sims)) if sims else 0.0 + except Exception as exc: + # Broad catch is intentional: catches unknown embedder runtime errors + # (e.g. CUDA OOM, backend crashes) so one bad hypothesis never halts the pipeline. + logger.warning("Embedding failed for hypothesis %r: %s — treating as novel", hypothesis.title, exc) + return RankedHypothesis( + hypothesis=hypothesis, + novelty_score=1.0, + similarity_to_known=0.0, + suppress=False, + suppression_reason=None, + ) + + novelty_score = 1.0 - max_sim + suppress = bool(max_sim >= self._similarity_threshold) + suppression_reason = ( + f"Similar to resolved incident (similarity {max_sim:.2f})" + if suppress + else None + ) + return RankedHypothesis( + hypothesis=hypothesis, + novelty_score=novelty_score, + similarity_to_known=max_sim, + suppress=suppress, + suppression_reason=suppression_reason, + ) + + def _load_embedder(self) -> Any | None: + """Load the embedding service. 
Returns None if unavailable.""" + try: + from app.services.embeddings import get_embedder + return get_embedder() + except Exception as exc: + # Broad catch is intentional: get_embedder() may raise on import or + # backend init failures from any number of third-party libraries. + logger.warning("Failed to import/initialise embedding service: %s", exc) + return None + + def _get_corpus_embeddings( + self, + embedder: Any, + corpus_texts: list[str], + db_path: Path, + ) -> list[list[float]]: + """Return cached corpus embeddings, re-embedding if the corpus has changed.""" + cache_key = str(db_path) + cached = _corpus_cache.get(cache_key) + + if cached is not None: + cached_texts, cached_embeddings = cached + if cached_texts == corpus_texts: + return cached_embeddings + + logger.debug("Embedding corpus of %d resolved incidents", len(corpus_texts)) + try: + raw_embeddings = embedder.embed_batch(corpus_texts) + # Normalise each embedding to a plain Python list for portability + corpus_embeddings: list[list[float]] = [ + e.tolist() if hasattr(e, "tolist") else list(e) + for e in raw_embeddings + ] + except Exception as exc: + # Broad catch is intentional: embed_batch() may raise from any backend + # (network timeout, CUDA error, etc.) — treat as empty corpus so the + # pipeline can continue without suppression. 
+ logger.warning("Corpus embedding failed: %s — treating as empty corpus", exc) + return [] + + _corpus_cache[cache_key] = (corpus_texts, corpus_embeddings) + return corpus_embeddings + + def _passthrough(self, hypotheses: list[Hypothesis]) -> list[RankedHypothesis]: + """Return all hypotheses as non-suppressed, ranked by confidence descending.""" + ranked = [ + RankedHypothesis( + hypothesis=h, + novelty_score=1.0, + similarity_to_known=0.0, + suppress=False, + suppression_reason=None, + ) + for h in hypotheses + ] + ranked.sort(key=lambda rh: rh.hypothesis.confidence, reverse=True) + return ranked diff --git a/app/services/diagnose/synthesizer.py b/app/services/diagnose/synthesizer.py new file mode 100644 index 0000000..ce07625 --- /dev/null +++ b/app/services/diagnose/synthesizer.py @@ -0,0 +1,210 @@ +"""Stage 5: Summary Synthesizer — deterministic narrative from ranked hypotheses. + +Streaming upgrade (async SSE chunks) is tracked as a follow-up enhancement. +This implementation is synchronous to match the rest of the pipeline. +""" +from __future__ import annotations + +import logging + +import httpx + +from app.context.retriever import RetrievedContext +from app.services.diagnose.models import RankedHypothesis, TimelineResult + +logger = logging.getLogger(__name__) + +_SYSTEM_PROMPT = ( + "You are a Linux sysadmin diagnosing a system incident. " + "Write a concise, actionable incident diagnosis.\n\n" + "Format your response exactly as:\n" + "1. VERDICT: [CRITICAL|ERROR|WARN|INFO] — (% confidence)\n" + "2. TIMELINE: \n" + "3. ROOT CAUSES:\n" + " - (%)\n" + " - (%)\n" + "4. RECOMMENDED ACTIONS:\n" + " - \n" + "5. 
INVESTIGATE FURTHER: " +) + + +def _extract_content(resp_json: dict) -> str | None: + """Pull text content from an OpenAI-compat chat completion response.""" + choices = resp_json.get("choices") or [] + if not choices: + return None + return (choices[0].get("message", {}).get("content") or "").strip() or None + + +def _build_hypothesis_block(ranked: list[RankedHypothesis]) -> str: + """Build the hypothesis block for the prompt (non-suppressed only, top 3).""" + active = [rh for rh in ranked if not rh.suppress][:3] + if not active: + return "(none)" + lines: list[str] = [] + for rh in active: + h = rh.hypothesis + conf_pct = int(h.confidence * 100) + similar = ( + f"Yes — suppressed, {rh.suppression_reason}" + if rh.suppress and rh.suppression_reason + else "No" + ) + novelty = f"{rh.novelty_score:.2f}" + lines.append( + f"- [{h.severity}, {conf_pct}%] {h.title}\n" + f" Similar resolved incident? {similar} (novelty {novelty})" + ) + return "\n".join(lines) + + +def _build_context_block(ctx: RetrievedContext) -> str: + """Build the runbook context block for the prompt.""" + parts: list[str] = [] + for chunk in ctx.chunks[:5]: + filename = chunk.get("filename", "unknown") + text = chunk.get("text", "")[:300] + parts.append(f"[{filename}] {text}") + return "\n".join(parts) if parts else "(none)" + + +def _deterministic_fallback( + ranked: list[RankedHypothesis], + timeline: TimelineResult, +) -> str: + """Build a deterministic fallback text when no LLM is available.""" + active = [rh for rh in ranked if not rh.suppress][:3] + if active: + top = active[0] + verdict_severity = top.hypothesis.severity + verdict_title = top.hypothesis.title + verdict_conf = int(top.hypothesis.confidence * 100) + elif ranked: + top = ranked[0] + verdict_severity = top.hypothesis.severity + verdict_title = top.hypothesis.title + verdict_conf = int(top.hypothesis.confidence * 100) + else: + verdict_severity = "UNKNOWN" + verdict_title = "No hypotheses generated" + verdict_conf = 0 + + 
root_causes = ", ".join( + rh.hypothesis.title for rh in (active or ranked[:3]) + ) or "None" + + return ( + f"VERDICT: {verdict_severity} — {verdict_title} ({verdict_conf}% confidence)\n" + f"TIMELINE: {timeline.total_entries} entries across {len(timeline.clusters)} clusters.\n" + f"ROOT CAUSES: {root_causes}" + ) + + +class SummarySynthesizer: + """Stage 5 of the multi-agent diagnose pipeline. + + Synthesizes a human-readable incident narrative from ranked hypotheses, + the reconstructed timeline, and RAG context. When no LLM is configured, + returns a deterministic fallback built from the hypothesis data. + """ + + def synthesize( + self, + ranked: list[RankedHypothesis], + timeline: TimelineResult, + ctx: RetrievedContext, + query: str, + llm_url: str | None = None, + llm_model: str | None = None, + llm_api_key: str | None = None, + ) -> str: + """Return synthesis text (single string, synchronous). + + Falls back to a deterministic narrative when no LLM URL or model is + provided, or when the LLM call fails. 
+ """ + fallback = _deterministic_fallback(ranked, timeline) + + if not llm_url or not llm_model: + return fallback + + hypothesis_block = _build_hypothesis_block(ranked) + context_block = _build_context_block(ctx) + dominant = ", ".join(timeline.dominant_sources[:5]) or "none" + + user_message = ( + f"Query: {query}\n\n" + f"Timeline summary:\n" + f"- {len(timeline.clusters)} clusters, " + f"{timeline.burst_count} bursts, " + f"{timeline.gap_count} silence gaps\n" + f"- Primary sources: {dominant}\n\n" + f"Top hypotheses:\n{hypothesis_block}\n\n" + f"Context from runbooks:\n{context_block}" + ) + + messages = [ + {"role": "system", "content": _SYSTEM_PROMPT}, + {"role": "user", "content": user_message}, + ] + + result = self._call_llm( + llm_url=llm_url, + llm_model=llm_model, + llm_api_key=llm_api_key, + messages=messages, + ) + return result if result else fallback + + def _call_llm( + self, + llm_url: str, + llm_model: str, + llm_api_key: str | None, + messages: list[dict], + ) -> str | None: + """Send messages to the LLM and return raw text content. + + Tries the cf-orch task endpoint first, falls back to direct OpenAI-compat. 
+ """ + headers = {"Authorization": f"Bearer {llm_api_key}"} if llm_api_key else {} + + task_url = f"{llm_url.rstrip('/')}/api/inference/task" + try: + resp = httpx.post( + task_url, + json={ + "product": "turnstone", + "task": "log_analysis", + "payload": {"messages": messages, "stream": False}, + }, + headers=headers, + timeout=120.0, + ) + if resp.status_code == 200: + return _extract_content(resp.json()) + if resp.status_code != 404: + resp.raise_for_status() + logger.debug( + "No task assignment for turnstone.log_analysis — falling back to direct model" + ) + except Exception as exc: + logger.debug( + "Task endpoint unavailable (%s) — falling back to direct model", exc + ) + + try: + resp = httpx.post( + f"{llm_url.rstrip('/')}/v1/chat/completions", + json={"model": llm_model, "messages": messages, "stream": False}, + headers=headers, + timeout=120.0, + ) + resp.raise_for_status() + return _extract_content(resp.json()) + except Exception as exc: + logger.warning( + "LLM synthesizer failed (%s): %s", type(exc).__name__, exc + ) + return None diff --git a/app/services/diagnose/timeline.py b/app/services/diagnose/timeline.py new file mode 100644 index 0000000..3d557dc --- /dev/null +++ b/app/services/diagnose/timeline.py @@ -0,0 +1,272 @@ +"""Stage 1: Timeline Reconstructor — pure Python, no ML.""" + +from __future__ import annotations + +import hashlib +import logging +from collections import defaultdict +from datetime import datetime, timezone + +from app.services.diagnose.models import EventCluster, TimelineResult +from app.services.search import SearchResult + +logger = logging.getLogger(__name__) + +_SEVERITY_ORDER: dict[str | None, int] = { + "CRITICAL": 5, + "ERROR": 4, + "WARN": 3, + "WARNING": 3, + "INFO": 2, + "DEBUG": 1, + None: 0, +} + + +def _parse_iso(s: str) -> datetime | None: + """Parse ISO 8601 string to UTC-aware datetime. 
Returns None on parse failure.""" + try: + dt = datetime.fromisoformat(s) + except ValueError: + logger.warning("Unparseable timestamp in log entry, treating as None: %r", s) + return None + if dt.tzinfo is None: + logger.debug("Naive timestamp treated as UTC: %s", s) + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc) + + +def _sort_key(e: SearchResult) -> tuple[int, str]: + """Sort key: timestamped entries first (ascending), then None-timestamp entries.""" + if e.timestamp_iso is None: + return (1, "") + return (0, e.timestamp_iso) + + +def _highest_severity(entries: list[SearchResult]) -> str: + """Return the highest severity label across all entries.""" + best: str | None = None + best_rank = -1 + for entry in entries: + sev = entry.severity + rank = _SEVERITY_ORDER.get(sev, 0) + if rank > best_rank: + best_rank = rank + best = sev + # SeverityLabel requires a valid literal; fall back to "UNKNOWN" if None + if best is None: + return "UNKNOWN" + # Normalise WARNING -> WARN for the output type + if best == "WARNING": + return "WARN" + return best + + +def _representative_text(entries: list[SearchResult]) -> str: + """Return text of the entry with highest rank; tie-break on longest text.""" + if not entries: + return "" + best = max(entries, key=lambda e: (e.rank, len(e.text))) + return best.text + + +def _cluster_id(entry_ids: list[str]) -> str: + """Compute a 12-char hex cluster ID from a sorted list of entry IDs.""" + payload = ",".join(sorted(entry_ids)).encode() + return hashlib.sha1(payload).hexdigest()[:12] # noqa: S324 — not used for security + + +def _make_event_cluster( + cluster_entries: list[SearchResult], + gap_before_seconds: float, + burst_threshold: int, + burst_window_seconds: int, +) -> EventCluster: + """Construct an EventCluster from a list of SearchResult entries.""" + timestamps = [ + ts + for e in cluster_entries + if e.timestamp_iso is not None + for ts in (_parse_iso(e.timestamp_iso),) + if ts is not None + ] + + 
start_iso: str | None = None + end_iso: str | None = None + duration_seconds = 0.0 + + if timestamps: + ts_min = min(timestamps) + ts_max = max(timestamps) + start_iso = ts_min.isoformat() + end_iso = ts_max.isoformat() + duration_seconds = (ts_max - ts_min).total_seconds() + + entry_ids = [e.entry_id for e in cluster_entries] + burst = ( + len(cluster_entries) >= burst_threshold + and duration_seconds <= burst_window_seconds + ) + + return EventCluster( + cluster_id=_cluster_id(entry_ids), + entries=tuple(entry_ids), + start_iso=start_iso, + end_iso=end_iso, + duration_seconds=duration_seconds, + source_ids=tuple(sorted(set(e.source_id for e in cluster_entries))), + pattern_tags=tuple( + sorted(set(tag for e in cluster_entries for tag in e.matched_patterns)) + ), + severity=_highest_severity(cluster_entries), # type: ignore[arg-type] # SeverityLabel is a Literal; _highest_severity returns a compatible str + burst=burst, + gap_before_seconds=gap_before_seconds, + representative_text=_representative_text(cluster_entries), + ) + + +class TimelineReconstructor: + """Reconstruct a structured timeline of event clusters from log entries. + + Pure Python — no ML or LLM calls. Designed as Stage 1 of the multi-agent + diagnose pipeline. 
+ """ + + def __init__( + self, + cluster_window_seconds: int = 30, + burst_threshold: int = 10, + burst_window_seconds: int = 5, + gap_significance_seconds: int = 30, + ) -> None: + self._cluster_window = cluster_window_seconds + self._burst_threshold = burst_threshold + self._burst_window = burst_window_seconds + self._gap_significance_seconds: int = gap_significance_seconds + + def _sort_entries(self, entries: list[SearchResult]) -> list[SearchResult]: + """Sort entries: timestamped first (ascending), then None-timestamp entries.""" + return sorted(entries, key=_sort_key) + + def _group_into_raw_clusters( + self, sorted_entries: list[SearchResult] + ) -> list[list[SearchResult]]: + """Group sorted entries into time-window clusters.""" + raw_clusters: list[list[SearchResult]] = [] + current: list[SearchResult] = [] + cluster_anchor: datetime | None = None + + for entry in sorted_entries: + if not current: + current.append(entry) + if entry.timestamp_iso is not None: + cluster_anchor = _parse_iso(entry.timestamp_iso) + continue + + if entry.timestamp_iso is None: + # No timestamp — always joins the current cluster + current.append(entry) + continue + + entry_dt = _parse_iso(entry.timestamp_iso) + + if entry_dt is None: + # Malformed timestamp — treat same as None: join current cluster + current.append(entry) + continue + + if cluster_anchor is None: + # Current cluster has no anchor yet — set it, stay in cluster + cluster_anchor = entry_dt + current.append(entry) + continue + + delta = (entry_dt - cluster_anchor).total_seconds() + if delta > self._cluster_window: + raw_clusters.append(current) + current = [entry] + cluster_anchor = entry_dt + else: + current.append(entry) + + if current: + raw_clusters.append(current) + + return raw_clusters + + def _build_cluster( + self, + cluster_entries: list[SearchResult], + prev_end_iso: str | None, + ) -> EventCluster: + """Build an EventCluster from a list of SearchResult entries.""" + gap_before = 0.0 + if prev_end_iso is 
not None: + ts_list = [ + ts + for e in cluster_entries + if e.timestamp_iso is not None + for ts in (_parse_iso(e.timestamp_iso),) + if ts is not None + ] + if ts_list: + this_start = min(ts_list) + prev_end = _parse_iso(prev_end_iso) + if prev_end is not None: + gap_before = (this_start - prev_end).total_seconds() + + return _make_event_cluster( + cluster_entries, + gap_before_seconds=gap_before, + burst_threshold=self._burst_threshold, + burst_window_seconds=self._burst_window, + ) + + def _dominant_sources_tuple(self, entries: list[SearchResult]) -> tuple[str, ...]: + """Return source_ids sorted by total entry count descending.""" + source_counts: dict[str, int] = defaultdict(int) + for entry in entries: + source_counts[entry.source_id] += 1 + return tuple( + src for src, _ in sorted(source_counts.items(), key=lambda kv: -kv[1]) + ) + + def reconstruct(self, entries: list[SearchResult]) -> TimelineResult: + """Build a structured timeline from a flat list of log entries.""" + if not entries: + return TimelineResult( + clusters=(), + total_entries=0, + window_start=None, + window_end=None, + gap_count=0, + burst_count=0, + dominant_sources=(), + ) + + sorted_entries = self._sort_entries(entries) + raw_clusters = self._group_into_raw_clusters(sorted_entries) + + clusters: list[EventCluster] = [] + prev_end: str | None = None + for raw in raw_clusters: + c = self._build_cluster(raw, prev_end) + clusters.append(c) + prev_end = c.end_iso + + clusters_tuple = tuple(clusters) + gap_count = sum( + 1 + for c in clusters_tuple + if c.gap_before_seconds > self._gap_significance_seconds + ) + + return TimelineResult( + clusters=clusters_tuple, + total_entries=len(entries), + window_start=clusters_tuple[0].start_iso if clusters_tuple else None, + window_end=clusters_tuple[-1].end_iso if clusters_tuple else None, + gap_count=gap_count, + burst_count=sum(1 for c in clusters_tuple if c.burst), + dominant_sources=self._dominant_sources_tuple(entries), + ) diff --git 
a/app/services/embeddings.py b/app/services/embeddings.py new file mode 100644 index 0000000..7e9b30a --- /dev/null +++ b/app/services/embeddings.py @@ -0,0 +1,229 @@ +"""Configurable embedding service — BSL licensed. + +Backends: + sentence_transformers — local in-process inference (default, no server needed) + ollama — HTTP to a running Ollama instance + +Configuration (env vars): + TURNSTONE_EMBED_BACKEND sentence_transformers | ollama (default: sentence_transformers) + TURNSTONE_EMBED_MODEL model name/path (backend-specific default) + TURNSTONE_EMBED_DEVICE cpu | cuda (default: cpu; ST backend only) + TURNSTONE_LLM_URL Ollama base URL (default: http://localhost:11434) + +When no backend is importable/reachable, EMBEDDING_AVAILABLE is False and all +embed calls return empty arrays — callers must handle this gracefully. +""" +from __future__ import annotations + +import logging +import os +import struct +from typing import Protocol, runtime_checkable + +import numpy as np + +logger = logging.getLogger(__name__) + +# ── Public availability flag ────────────────────────────────────────────────── + +EMBEDDING_AVAILABLE: bool = False + +# ── Config ──────────────────────────────────────────────────────────────────── + +_BACKEND = os.environ.get("TURNSTONE_EMBED_BACKEND", "sentence_transformers").lower() +_DEVICE = os.environ.get("TURNSTONE_EMBED_DEVICE", "cpu").lower() +_LLM_URL = os.environ.get("TURNSTONE_LLM_URL", "http://localhost:11434") + +# BAAI/bge-small-en-v1.5: 33MB, MIT, 49M downloads/month, 384-dim, 512-token max. +# Benchmarked as the best quality-to-size ratio in the field (MTEB 62.17). +# all-MiniLM-L6-v2 is a viable lighter alternative (23MB, 256-token max) if +# inference speed is the primary constraint. 
+_DEFAULT_MODEL: dict[str, str] = { + "sentence_transformers": "BAAI/bge-small-en-v1.5", + "ollama": "nomic-embed-text", +} +_MODEL = os.environ.get( + "TURNSTONE_EMBED_MODEL", + _DEFAULT_MODEL.get(_BACKEND, "sentence-transformers/all-MiniLM-L6-v2"), +) + + +# ── Protocol ────────────────────────────────────────────────────────────────── + +@runtime_checkable +class Embedder(Protocol): + """Minimal interface all embedding backends must satisfy.""" + + @property + def dim(self) -> int: + """Embedding dimension produced by this model.""" + ... + + @property + def model_name(self) -> str: + """Human-readable model identifier.""" + ... + + def embed(self, text: str) -> np.ndarray: + """Embed a single string. Returns 1-D float32 array of length dim.""" + ... + + def embed_batch(self, texts: list[str]) -> list[np.ndarray]: + """Embed a list of strings. Returns list of 1-D float32 arrays.""" + ... + + +# ── sentence-transformers backend ───────────────────────────────────────────── + +class SentenceTransformerEmbedder: + """Local in-process embedding via the sentence-transformers library. + + The model is downloaded from HuggingFace on first instantiation and cached + at ~/.cache/huggingface/. Subsequent starts use the local cache. 
+ """ + + def __init__(self, model_name: str = _MODEL, device: str = _DEVICE) -> None: + from sentence_transformers import SentenceTransformer # type: ignore[import] + logger.info("Loading embedding model %r on device %r ...", model_name, device) + self._model = SentenceTransformer(model_name, device=device) + self._model_name = model_name + # Infer dimension from a test embed rather than hard-coding + self._dim: int = int(self._model.encode("test").shape[0]) + logger.info("Embedding model ready — dim=%d", self._dim) + + @property + def dim(self) -> int: + return self._dim + + @property + def model_name(self) -> str: + return self._model_name + + def embed(self, text: str) -> np.ndarray: + vec = self._model.encode(text, convert_to_numpy=True, normalize_embeddings=True) + return vec.astype(np.float32) + + def embed_batch(self, texts: list[str]) -> list[np.ndarray]: + if not texts: + return [] + vecs = self._model.encode( + texts, convert_to_numpy=True, normalize_embeddings=True, batch_size=32 + ) + return [v.astype(np.float32) for v in vecs] + + +# ── Ollama backend ──────────────────────────────────────────────────────────── + +class OllamaEmbedder: + """HTTP embedding via a running Ollama instance.""" + + def __init__( + self, + model_name: str = _MODEL, + llm_url: str = _LLM_URL, + timeout: float = 30.0, + ) -> None: + import httpx # already a project dependency + self._model_name = model_name + self._url = f"{llm_url.rstrip('/')}/api/embeddings" + self._timeout = timeout + self._client = httpx.Client(timeout=timeout) + # Probe dimension with a test call + self._dim = self._probe_dim() + + def _probe_dim(self) -> int: + try: + vec = self._raw_embed("probe") + return len(vec) + except Exception as exc: + logger.warning("Ollama dim probe failed (%s) — defaulting to 768", exc) + return 768 + + def _raw_embed(self, text: str) -> list[float]: + resp = self._client.post( + self._url, json={"model": self._model_name, "prompt": text} + ) + resp.raise_for_status() + 
return resp.json().get("embedding") or [] + + @property + def dim(self) -> int: + return self._dim + + @property + def model_name(self) -> str: + return self._model_name + + def embed(self, text: str) -> np.ndarray: + vec = self._raw_embed(text) + return np.array(vec, dtype=np.float32) + + def embed_batch(self, texts: list[str]) -> list[np.ndarray]: + return [self.embed(t) for t in texts] + + +# ── Singleton factory ───────────────────────────────────────────────────────── + +_embedder: Embedder | None = None + + +def get_embedder() -> Embedder | None: + """Return the configured embedder singleton, or None when unavailable. + + Lazy-initialises on first call. Callers should check EMBEDDING_AVAILABLE + or test for None rather than calling this unconditionally. + """ + global _embedder, EMBEDDING_AVAILABLE + if _embedder is not None: + return _embedder + + if _BACKEND == "sentence_transformers": + try: + _embedder = SentenceTransformerEmbedder(_MODEL, _DEVICE) + EMBEDDING_AVAILABLE = True + except ImportError: + logger.warning( + "sentence-transformers not installed — embeddings disabled. 
" + "Install with: pip install sentence-transformers" + ) + except Exception as exc: + logger.warning("Failed to load sentence-transformers model %r: %s", _MODEL, exc) + + elif _BACKEND == "ollama": + try: + _embedder = OllamaEmbedder(_MODEL, _LLM_URL) + EMBEDDING_AVAILABLE = True + except Exception as exc: + logger.warning("Ollama embedder init failed: %s", exc) + + else: + logger.warning("Unknown TURNSTONE_EMBED_BACKEND %r — embeddings disabled", _BACKEND) + + return _embedder + + +# ── BLOB serialisation helpers ──────────────────────────────────────────────── + +def pack_vector(vec: np.ndarray) -> bytes: + """Serialise a float32 numpy vector to a SQLite BLOB.""" + arr = vec.astype(np.float32) + return struct.pack(f"{len(arr)}f", *arr.tolist()) + + +def unpack_vector(blob: bytes) -> np.ndarray: + """Deserialise a SQLite BLOB back to a float32 numpy vector.""" + n = len(blob) // 4 # 4 bytes per float32 + return np.array(struct.unpack(f"{n}f", blob), dtype=np.float32) + + +def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float: + """Cosine similarity between two L2-normalised vectors. + + Both vectors are re-normalised defensively so callers need not pre-normalise. + Returns 0.0 when either vector has zero norm. 
+ """ + norm_a = np.linalg.norm(a) + norm_b = np.linalg.norm(b) + if norm_a == 0.0 or norm_b == 0.0: + return 0.0 + return float(np.dot(a, b) / (norm_a * norm_b)) diff --git a/app/services/incidents.py b/app/services/incidents.py index 9699ba0..dd758c1 100644 --- a/app/services/incidents.py +++ b/app/services/incidents.py @@ -6,7 +6,7 @@ import sqlite3 import uuid from pathlib import Path -from app.ingest.base import now_iso +from app.glean.base import now_iso from app.services.models import Incident, ReceivedBundle from app.services.search import SearchResult, entries_in_window, search diff --git a/app/services/models.py b/app/services/models.py index e551135..a0d5df5 100644 --- a/app/services/models.py +++ b/app/services/models.py @@ -10,7 +10,7 @@ class RetrievedEntry: entry_id: str source_id: str # log file path or service name - sequence: int # original line number — ingest order, not wall-clock order + sequence: int # original line number — glean order, not wall-clock order timestamp_raw: str | None # timestamp as it appeared in the log timestamp_iso: str | None # parsed to ISO 8601 for sorting; None if unparseable ingest_time: str # when Turnstone indexed this entry (wall clock) @@ -25,7 +25,7 @@ class RetrievedEntry: @dataclass(frozen=True) class LogPattern: - """A named regex pattern for tagging entries at ingest time.""" + """A named regex pattern for tagging entries at glean time.""" name: str # e.g. "device_disconnect", "auth_failure" pattern: str # regex string diff --git a/app/services/search.py b/app/services/search.py index d7bf12c..7252272 100644 --- a/app/services/search.py +++ b/app/services/search.py @@ -451,9 +451,8 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis else: suppressed += 1 - # When did we last ingest anything? 
last_row = conn.execute("SELECT MAX(ingest_time) AS t FROM log_entries").fetchone() - last_ingested: str | None = last_row["t"] if last_row else None + last_gleaned: str | None = last_row["t"] if last_row else None conn.close() @@ -465,7 +464,7 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis "source_health": source_health, "recent_criticals": recent_criticals, "suppressed_criticals": suppressed, - "last_ingested": last_ingested, + "last_gleaned": last_gleaned, } diff --git a/app/tasks/ingest_scheduler.py b/app/tasks/glean_scheduler.py similarity index 85% rename from app/tasks/ingest_scheduler.py rename to app/tasks/glean_scheduler.py index b55b152..dc80393 100644 --- a/app/tasks/ingest_scheduler.py +++ b/app/tasks/glean_scheduler.py @@ -1,10 +1,10 @@ -"""Periodic batch ingest scheduler with optional CF submission. +"""Periodic batch glean scheduler with optional CF submission. -Runs ingest_sources on a configurable interval (TURNSTONE_INGEST_INTERVAL env var, +Runs glean_sources on a configurable interval (TURNSTONE_GLEAN_INTERVAL env var, default 900s / 15 min). Set to 0 to disable. When TURNSTONE_SUBMIT_ENDPOINT is set, pushes pattern-matched entries to a remote -Turnstone instance (the CF receiving store) after each ingest run. +Turnstone instance (the CF receiving store) after each glean run. 
""" from __future__ import annotations @@ -19,7 +19,7 @@ from typing import Any import httpx -from app.ingest.pipeline import ingest_sources +from app.glean.pipeline import glean_sources logger = logging.getLogger(__name__) @@ -96,14 +96,14 @@ async def submit_matched( if not entries: return {"ok": True, "submitted": 0, "skipped": True} - url = f"{submit_endpoint.rstrip('/')}/turnstone/api/ingest/batch" + url = f"{submit_endpoint.rstrip('/')}/turnstone/api/glean/batch" payload = {"source_host": source_host, "entries": entries} try: async with httpx.AsyncClient(timeout=30.0) as client: resp = await client.post(url, json=payload) resp.raise_for_status() result = resp.json() - submitted = result.get("ingested", len(entries)) + submitted = result.get("gleaned", len(entries)) _state.last_submitted_at = datetime.now(tz=timezone.utc).isoformat() _state.last_submit_count = submitted _state.last_submit_error = None @@ -121,10 +121,15 @@ async def run_once( pattern_file: Path | None = None, submit_endpoint: str | None = None, source_host: str = "unknown", + force: bool = False, ) -> dict[str, Any]: - """Ingest all sources once, then submit matched entries if configured.""" + """Ingest all sources once, then submit matched entries if configured. + + Pass ``force=True`` to bypass fingerprint checks and re-glean all local + file sources regardless of whether they appear unchanged. 
+ """ if _lock.locked(): - return {"ok": False, "error": "ingest already running", "skipped": True} + return {"ok": False, "error": "glean already running", "skipped": True} async with _lock: _state.running = True @@ -133,7 +138,7 @@ async def run_once( loop = asyncio.get_running_loop() stats: dict[str, int] = await loop.run_in_executor( None, - lambda: ingest_sources(sources_file, db_path, pattern_file), + lambda: glean_sources(sources_file, db_path, pattern_file, force=force), ) duration = (datetime.now(tz=timezone.utc) - started).total_seconds() _state.last_run_at = started.isoformat() @@ -141,14 +146,14 @@ async def run_once( _state.last_stats = stats _state.last_error = None _state.run_count += 1 - logger.info("Batch ingest complete in %.1fs — %s", duration, stats) + logger.info("Batch glean complete in %.1fs — %s", duration, stats) except Exception as exc: duration = (datetime.now(tz=timezone.utc) - started).total_seconds() _state.last_run_at = started.isoformat() _state.last_duration_s = round(duration, 2) _state.last_error = str(exc) _state.run_count += 1 - logger.error("Batch ingest failed: %s", exc) + logger.error("Batch glean failed: %s", exc) _state.running = False return {"ok": False, "error": str(exc)} finally: @@ -168,7 +173,7 @@ async def scheduler_loop( submit_endpoint: str | None = None, source_host: str = "unknown", ) -> None: - """Run ingest + optional submission every interval_s seconds until cancelled.""" + """Run glean + optional submission every interval_s seconds until cancelled.""" logger.info("Ingest scheduler started — interval %ds, sources: %s", interval_s, sources_file) if submit_endpoint: logger.info("Submission enabled — endpoint: %s", submit_endpoint) diff --git a/app/watch/watcher.py b/app/watch/watcher.py index e12038b..a9490d1 100644 --- a/app/watch/watcher.py +++ b/app/watch/watcher.py @@ -1,4 +1,4 @@ -"""Live watch: tail active log sources and ingest entries in near-real-time. 
+"""Live watch: tail active log sources and glean entries in near-real-time. Each WatchSource runs a subprocess (journalctl -f, podman/docker logs -f) in a daemon thread and pipes lines through the existing ingestors into SQLite. @@ -18,12 +18,12 @@ from typing import Iterator import yaml -from app.ingest import journald as journald_parser, syslog as syslog_parser -from app.ingest import plaintext as plaintext_parser, servarr as servarr_parser, plex as plex_parser -from app.ingest import qbittorrent as qbit_parser, caddy as caddy_parser -from app.ingest.pipeline import _detect_format -from app.ingest.base import _compile, load_patterns, now_iso -from app.ingest.pipeline import _write_batch, _SCHEMA +from app.glean import journald as journald_parser, syslog as syslog_parser +from app.glean import plaintext as plaintext_parser, servarr as servarr_parser, plex as plex_parser +from app.glean import qbittorrent as qbit_parser, caddy as caddy_parser +from app.glean.pipeline import _detect_format +from app.glean.base import _compile, load_patterns, now_iso +from app.glean.pipeline import _write_batch, _SCHEMA from app.services.search import build_fts_index from app.services.models import RetrievedEntry @@ -85,7 +85,7 @@ class WatchSource: "source_id": self.config.source_id, "type": self.config.source_type, "running": self._thread is not None and self._thread.is_alive(), - "entries_ingested": self._entry_count, + "entries_gleaned": self._entry_count, "last_event": self._last_event, "error": self._error, } diff --git a/docs/tautulli-setup.md b/docs/tautulli-setup.md index 5d719a8..0b61180 100644 --- a/docs/tautulli-setup.md +++ b/docs/tautulli-setup.md @@ -39,7 +39,7 @@ notification agent: ## Webhook URL ``` -http://:8534/turnstone/api/ingest/tautulli +http://:8534/turnstone/api/glean/tautulli ``` Replace `` with the hostname or IP of the machine running diff --git a/harvester/harvester.py b/harvester/harvester.py index 4f8370a..9b18867 100644 --- a/harvester/harvester.py 
+++ b/harvester/harvester.py @@ -2,7 +2,7 @@ """Turnstone Harvester — collect logs and ship them to a Turnstone instance. Subcommands: - push Read sources.yaml, POST each log file to Turnstone /api/ingest/upload + push Read sources.yaml, POST each log file to Turnstone /api/glean/upload incident Tag an incident on the remote Turnstone instance Usage: @@ -97,8 +97,8 @@ def cmd_push(args: argparse.Namespace) -> int: logger.warning("No sources defined in %s", sources_path) return 0 - upload_url = args.url.rstrip("/") + "/turnstone/api/ingest/upload" - total_ingested = 0 + upload_url = args.url.rstrip("/") + "/turnstone/api/glean/upload" + total_gleaned = 0 errors = 0 for src in sources: @@ -110,9 +110,9 @@ def cmd_push(args: argparse.Namespace) -> int: logger.info("Pushing %s (%s) ...", src_id, src_path) try: result = _post_file(upload_url, src_path, src_id) - count = result.get("ingested", 0) - total_ingested += count - logger.info(" %s: %d entries ingested", src_id, count) + count = result.get("gleaned", 0) + total_gleaned += count + logger.info(" %s: %d entries gleaned", src_id, count) except urllib.error.HTTPError as exc: logger.error(" %s: HTTP %d — %s", src_id, exc.code, exc.read().decode(errors="replace")) errors += 1 @@ -120,7 +120,7 @@ def cmd_push(args: argparse.Namespace) -> int: logger.error(" %s: %s", src_id, exc) errors += 1 - logger.info("Done. Total ingested: %d entries, errors: %d", total_ingested, errors) + logger.info("Done. Total gleaned: %d entries, errors: %d", total_gleaned, errors) return 1 if errors else 0 diff --git a/harvester/sources.example.yaml b/harvester/sources.example.yaml index 508e257..c521854 100644 --- a/harvester/sources.example.yaml +++ b/harvester/sources.example.yaml @@ -46,6 +46,6 @@ sources: # Wazuh SIEM — alerts.json on the Wazuh manager # Turnstone auto-detects this format; source_id is qualified per agent automatically. 
# For push-based ingestion from Wazuh custom integrations, use: - # POST /api/ingest/wazuh/alert (single alert JSON body) + # POST /api/glean/wazuh/alert (single alert JSON body) # - id: wazuh # path: /var/ossec/logs/alerts/alerts.json diff --git a/manage.sh b/manage.sh index 7e72b01..19a5b96 100755 --- a/manage.sh +++ b/manage.sh @@ -120,9 +120,9 @@ usage() { echo -e " ${GREEN}dev${NC} uvicorn --reload (:${API_PORT}) + Vite HMR (:${VITE_PORT})" echo "" echo " Data:" - echo -e " ${GREEN}ingest PATH [DB]${NC} Ingest a log file or corpus directory" - echo -e " ${GREEN}ingest-plex [HOST]${NC} Pull Plex log from Cass (or HOST) and ingest" - echo -e " ${GREEN}ingest-qbit [HOST]${NC} Pull qBittorrent log locally or from HOST via SSH" + echo -e " ${GREEN}glean PATH [DB]${NC} Glean a log file or corpus directory" + echo -e " ${GREEN}glean-plex [HOST]${NC} Pull Plex log from Cass (or HOST) and glean" + echo -e " ${GREEN}glean-qbit [HOST]${NC} Pull qBittorrent log locally or from HOST via SSH" echo -e " ${GREEN}build-fts${NC} Rebuild the FTS search index" echo "" echo " Tests:" @@ -134,8 +134,8 @@ usage() { echo " Examples:" echo " ./manage.sh start" echo " ./manage.sh dev" - echo " ./manage.sh ingest corpus/raw/" - echo " ./manage.sh ingest corpus/raw/ data/custom.db" + echo " ./manage.sh glean corpus/raw/" + echo " ./manage.sh glean corpus/raw/ data/custom.db" echo "" } @@ -231,15 +231,15 @@ case "$CMD" in (cd web && npm run dev -- --port "$VITE_PORT") ;; - ingest) + glean) if [[ $# -lt 1 ]]; then - error "Usage: ./manage.sh ingest [DB_PATH]" + error "Usage: ./manage.sh glean [DB_PATH]" fi - info "Ingesting $1 → ${2:-$DB}…" - "$PYTHON" scripts/ingest_corpus.py "$1" "${2:-$DB}" + info "Gleaning $1 → ${2:-$DB}…" + "$PYTHON" scripts/glean_corpus.py "$1" "${2:-$DB}" ;; - ingest-plex) + glean-plex) PLEX_HOST="${1:-cass}" PLEX_LOG_DIR="/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs" TMP_DIR="/tmp/turnstone-plex-$$" @@ -264,16 +264,16 @@ case "$CMD" 
in ssh "$PLEX_HOST" "cat '${remote_path}'" > "$local_path" done - info "Ingesting ${#REMOTE_LOGS[@]} log file(s) into ${DB}…" + info "Gleaning ${#REMOTE_LOGS[@]} log file(s) into ${DB}…" for f in "$TMP_DIR"/*.log; do - "$PYTHON" scripts/ingest_corpus.py "$f" "$DB" + "$PYTHON" scripts/glean_corpus.py "$f" "$DB" done rm -rf "$TMP_DIR" info "Done. Restarting server…" exec bash "$0" restart ;; - ingest-qbit) + glean-qbit) QBIT_HOST="${1:-}" # Default log locations in priority order QBIT_LOG_PATHS=( @@ -316,8 +316,8 @@ case "$CMD" in info " ← ${LOCAL_LOG}" fi - info "Ingesting into ${DB}…" - "$PYTHON" scripts/ingest_corpus.py "${TMP_DIR}"/*.log "$DB" + info "Gleaning into ${DB}…" + "$PYTHON" scripts/glean_corpus.py "${TMP_DIR}"/*.log "$DB" rm -rf "$TMP_DIR" info "Done. Restarting server…" exec bash "$0" restart diff --git a/patterns/default.yaml b/patterns/default.yaml index c125aaa..6fd3450 100644 --- a/patterns/default.yaml +++ b/patterns/default.yaml @@ -1,4 +1,4 @@ -# Turnstone pattern library — named regex patterns for log tagging at ingest time. +# Turnstone pattern library — named regex patterns for log tagging at glean time. # Each matched pattern name is stored on RetrievedEntry.matched_patterns and # used to boost retrieval relevance for diagnostic queries. 
# @@ -128,6 +128,21 @@ patterns: severity: ERROR description: NFS mount or RPC timeout + - name: service_crash_loop + pattern: "(restart counter is at [0-9]|start request repeated too quickly|Restart limit hit)" + severity: WARN + description: systemd service crash-looping — restart counter incrementing or rate-limit hit; check for DNS resolution failures, missing dependencies, or bad config + + - name: pkg_daemon_restart + pattern: "(invoke-rc\\.d|Unit process.*(apt-get|dpkg|preinst).*remains running after unit stopped|Stopped.*service.*openssh|Restarting.*OpenBSD Secure Shell)" + severity: WARN + description: Package manager restarted a system daemon — active SSH or service sessions may have been interrupted + + - name: ssh_forward_conflict + pattern: "(channel_setup_fwd_listener_tcpip: cannot listen to port|error: bind.*Address already in use)" + severity: WARN + description: SSH port-forward conflict — previous session port still bound; stale sessions accumulating or rapid reconnects + # Add device/service-specific patterns below this line: - name: qbit_tracker_error diff --git a/patterns/sources-cluster.yaml b/patterns/sources-cluster.yaml index 1d742f4..0dafe22 100644 --- a/patterns/sources-cluster.yaml +++ b/patterns/sources-cluster.yaml @@ -1,15 +1,15 @@ -# Turnstone log sources — Heimdall cluster ingest. +# Turnstone log sources — Heimdall cluster glean. # Covers: Heimdall (local), Navi, Sif, Cass, Strahl (SSH-collected), # Docker services on Heimdall, and network device syslog. # -# Collected by scripts/collect_cluster_logs.sh before each ingest run. +# Collected by scripts/collect_cluster_logs.sh before each glean run. # All paths are container-side (/data/ = bind-mount of /devl/turnstone-cluster/data/). 
# -# Cron (collect + ingest, every 15 min): +# Cron (collect + glean, every 15 min): # */15 * * * * bash /Library/Development/CircuitForge/turnstone/scripts/collect_cluster_logs.sh && \ -# docker exec turnstone-cluster python scripts/ingest_corpus.py \ +# docker exec turnstone-cluster python scripts/glean_corpus.py \ # --sources /patterns/sources-cluster.yaml --db /data/turnstone.db \ -# >> /var/log/turnstone-cluster-ingest.log 2>&1 +# >> /var/log/turnstone-cluster-glean.log 2>&1 sources: # ── Heimdall (local) ───────────────────────────────────────────────────────── diff --git a/patterns/sources.yaml b/patterns/sources.yaml index 49b89d7..c299f53 100644 --- a/patterns/sources.yaml +++ b/patterns/sources.yaml @@ -1,8 +1,8 @@ # Turnstone log sources — edit this file to add or remove services. # NOTE: the system-journal entry requires export_journal.sh to run on the HOST -# before the container ingest step. See crontab setup instructions in the README. -# Run ingest manually: -# sudo podman exec turnstone python scripts/ingest_corpus.py \ +# before the container glean step. See crontab setup instructions in the README. +# Run glean manually: +# sudo podman exec turnstone python scripts/glean_corpus.py \ # --sources /patterns/sources.yaml --db /data/turnstone.db # # Paths here are container-side paths under the /opt bind mount. @@ -12,7 +12,7 @@ sources: # ── System (exported by export_journal.sh on the host) ─────────────────── # journal-export.jsonl and dmesg-export.txt are written to /opt/turnstone/data/ - # by the export script before each ingest run. + # by the export script before each glean run. - id: system-journal path: /data/journal-export.jsonl @@ -73,7 +73,7 @@ sources: # ── MQTT / IoT (live — subscribe mode, no path needed) ─────────────────── # Requires: pip install circuitforge-core[mqtt] - # These sources are handled by the live MQTT subscriber task (not batch ingest). + # These sources are handled by the live MQTT subscriber task (not batch glean). 
# Uncomment and configure to enable. # # Meshtastic MQTT bridge (node must have MQTT uplink enabled): diff --git a/podman-standalone.sh b/podman-standalone.sh index f937890..68c77cb 100755 --- a/podman-standalone.sh +++ b/podman-standalone.sh @@ -2,7 +2,7 @@ # podman-standalone.sh — Turnstone rootful Podman setup (no Compose) # # For hosts running system Podman (non-rootless) with systemd. -# Turnstone is a diagnostic log intelligence layer — ingest service logs, +# Turnstone is a diagnostic log intelligence layer — glean service logs, # search by symptom, and view incidents in a lightweight web UI. # # ── Prerequisites ──────────────────────────────────────────────────────────── @@ -28,18 +28,18 @@ # sudo systemctl daemon-reload # sudo systemctl enable --now turnstone # -# ── Ingesting logs ──────────────────────────────────────────────────────────── +# ── Gleaning logs ───────────────────────────────────────────────────────────── # All service logs under /opt are accessible inside the container. # Sources are configured in patterns/sources.yaml (bind-mounted at /patterns/). # -# To ingest all sources (run manually or via cron): +# To glean all sources (run manually or via cron): # -# sudo podman exec turnstone python scripts/ingest_corpus.py \ +# sudo podman exec turnstone python scripts/glean_corpus.py \ # --sources /patterns/sources.yaml --db /data/turnstone.db # # Example cron (every 15 minutes, add to root's crontab with: sudo crontab -e): -# */15 * * * * podman exec turnstone python scripts/ingest_corpus.py \ -# --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-ingest.log 2>&1 +# */15 * * * * podman exec turnstone python scripts/glean_corpus.py \ +# --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-glean.log 2>&1 # # To add a new log source: edit /opt/turnstone/patterns/sources.yaml — no restart needed. 
# @@ -73,7 +73,7 @@ TZ=America/Los_Angeles # # ── Orchard submission (opt-in telemetry) ──────────────────────────────────── # Set TURNSTONE_SUBMIT_ENDPOINT to push pattern-matched log entries to a CF -# receiving instance after each ingest run. Only matched entries are sent — +# receiving instance after each glean run. Only matched entries are sent — # no raw log content. Used to build Avocet training data. # # export TURNSTONE_SUBMIT_ENDPOINT=https://harvest.circuitforge.tech/xander @@ -142,8 +142,8 @@ echo "Check container health with:" echo " sudo podman ps" echo " sudo podman logs turnstone" echo "" -echo "To ingest all sources now:" -echo " sudo podman exec turnstone python scripts/ingest_corpus.py \\" +echo "To glean all sources now:" +echo " sudo podman exec turnstone python scripts/glean_corpus.py \\" echo " --sources /patterns/sources.yaml --db /data/turnstone.db" echo "" echo "To add a new source: edit /opt/turnstone/patterns/sources.yaml — no restart needed." diff --git a/requirements.txt b/requirements.txt index 66b35f2..b5abda4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ aiofiles>=23.0.0 python-multipart>=0.0.9 dateparser>=1.2.0 httpx>=0.27.0 +paramiko diff --git a/scripts/build_fts_index.py b/scripts/build_fts_index.py index d0d4677..37f731f 100644 --- a/scripts/build_fts_index.py +++ b/scripts/build_fts_index.py @@ -1,4 +1,4 @@ -"""CLI: build (or update) the FTS5 full-text search index after ingest.""" +"""CLI: build (or update) the FTS5 full-text search index after glean.""" from __future__ import annotations import sys @@ -13,7 +13,7 @@ if __name__ == "__main__": if not db_path.exists(): print(f"ERROR: database not found: {db_path}", file=sys.stderr) - print("Run ingest first: python scripts/ingest_corpus.py", file=sys.stderr) + print("Run glean first: python scripts/glean_corpus.py", file=sys.stderr) sys.exit(1) print(f"Building FTS index for {db_path} ...") diff --git a/scripts/collect_cluster_logs.sh 
b/scripts/collect_cluster_logs.sh index 2c8b0f6..9c900e0 100644 --- a/scripts/collect_cluster_logs.sh +++ b/scripts/collect_cluster_logs.sh @@ -20,7 +20,7 @@ SSH_OPTS="-o ConnectTimeout=5 -o BatchMode=yes -o StrictHostKeyChecking=no" PYTHON=/devl/miniconda3/envs/cf/bin/python INGEST="${PYTHON} /Library/Development/CircuitForge/turnstone/scripts/ingest_corpus.py" DB=/devl/turnstone-cluster/data/turnstone.db -LOG=/devl/turnstone-cluster/data/ingest.log +LOG=/devl/turnstone-cluster/data/glean.log mkdir -p "${DATA_DIR}" @@ -141,7 +141,7 @@ fi # Remote journals (explicit source IDs via YAML) ${INGEST} --sources /devl/turnstone-cluster/patterns/sources-cluster.yaml --db "${DB}" - # Docker and Plex logs (source IDs derived from filenames by directory ingest) + # Docker and Plex logs (source IDs derived from filenames by directory glean) for dir in "${HEIMDALL_DIR}" "${NAVI_DIR}" "${STRAHL_DIR}" "${PLEX_DIR}"; do [[ -d "${dir}" ]] && ls "${dir}"/*.jsonl "${dir}"/*.log 2>/dev/null | grep -q . && \ ${INGEST} "${dir}" "${DB}" || true diff --git a/scripts/export_journal.sh b/scripts/export_journal.sh index 941ab70..e94a594 100644 --- a/scripts/export_journal.sh +++ b/scripts/export_journal.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Export recent system messages to files the Turnstone container can ingest. +# Export recent system messages to files the Turnstone container can glean. 
# # Exports: # journal-export.jsonl — journald (if journalctl is available) @@ -11,11 +11,11 @@ # Usage (standalone): # sudo bash /opt/turnstone/scripts/export_journal.sh # -# Cron (combined with ingest): +# Cron (combined with glean): # */15 * * * * bash /opt/turnstone/scripts/export_journal.sh && \ # podman exec turnstone python scripts/ingest_corpus.py \ # --sources /patterns/sources.yaml --db /data/turnstone.db \ -# >> /var/log/turnstone-ingest.log 2>&1 +# >> /var/log/turnstone-glean.log 2>&1 set -euo pipefail diff --git a/scripts/ingest_corpus.py b/scripts/glean_corpus.py similarity index 64% rename from scripts/ingest_corpus.py rename to scripts/glean_corpus.py index ca12ae6..e3d14db 100644 --- a/scripts/ingest_corpus.py +++ b/scripts/glean_corpus.py @@ -1,11 +1,11 @@ -"""CLI: ingest a log file or corpus directory into the Turnstone SQLite database. +"""CLI: glean a log file or corpus directory into the Turnstone SQLite database. Usage: # Single file or directory (legacy) - python scripts/ingest_corpus.py [db_path] + python scripts/glean_corpus.py [db_path] # Sources config (multi-service) - python scripts/ingest_corpus.py --sources [--db ] + python scripts/glean_corpus.py --sources [--db ] """ from __future__ import annotations @@ -17,7 +17,7 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") sys.path.insert(0, str(Path(__file__).parent.parent)) -from app.ingest.pipeline import ingest, ingest_file, ingest_sources +from app.glean.pipeline import glean_dir, glean_file, glean_sources def _print_stats(stats: dict[str, int]) -> None: @@ -33,33 +33,33 @@ if __name__ == "__main__": if not args: print( "Usage:\n" - " ingest_corpus.py [db_path]\n" - " ingest_corpus.py --sources [--db ]", + " glean_corpus.py [db_path]\n" + " glean_corpus.py --sources [--db ]", file=sys.stderr, ) sys.exit(1) if args[0] == "--sources": if len(args) < 2: - print("Usage: ingest_corpus.py --sources [--db ]", file=sys.stderr) + print("Usage: glean_corpus.py 
--sources [--db ]", file=sys.stderr) sys.exit(1) sources_file = Path(args[1]) db_path = Path("data/turnstone.db") if "--db" in args: db_path = Path(args[args.index("--db") + 1]) db_path.parent.mkdir(parents=True, exist_ok=True) - print(f"Ingesting sources from {sources_file} → {db_path}") - stats = ingest_sources(sources_file, db_path) + print(f"Gleaning sources from {sources_file} → {db_path}") + stats = glean_sources(sources_file, db_path) _print_stats(stats) else: target = Path(args[0]) db_path = Path(args[1]) if len(args) > 1 else Path("data/turnstone.db") db_path.parent.mkdir(parents=True, exist_ok=True) - print(f"Ingesting {target} → {db_path}") + print(f"Gleaning {target} → {db_path}") if target.is_file(): - stats = ingest_file(target, db_path) + stats = glean_file(target, db_path) elif target.is_dir(): - stats = ingest(target, db_path) + stats = glean_dir(target, db_path) else: print(f"Error: {target} is not a file or directory", file=sys.stderr) sys.exit(1) diff --git a/tests/context/test_doc_upload.py b/tests/context/test_doc_upload.py index 9986d62..162f6f5 100644 --- a/tests/context/test_doc_upload.py +++ b/tests/context/test_doc_upload.py @@ -3,7 +3,7 @@ import sqlite3 import pytest from pathlib import Path -from app.ingest.doc_upload import ingest_upload +from app.glean.doc_upload import glean_upload from app.context.store import list_facts, list_documents from app.context.chunker import UnsupportedDocType @@ -40,7 +40,7 @@ services: ports: - "32400:32400" """ - result = ingest_upload(db, "docker-compose.yml", yaml_bytes) + result = glean_upload(db, "docker-compose.yml", yaml_bytes) assert result["doc_type"] == "yaml" assert result["facts_written"] >= 1 assert result["chunks_written"] >= 1 @@ -53,7 +53,7 @@ services: def test_ingest_markdown_no_facts(db): md = b"# Runbook\n\nRestart plex with `systemctl restart plex`." 
- result = ingest_upload(db, "runbook.md", md) + result = glean_upload(db, "runbook.md", md) assert result["doc_type"] == "markdown" assert result["facts_written"] == 0 assert result["chunks_written"] >= 1 @@ -61,4 +61,4 @@ def test_ingest_markdown_no_facts(db): def test_ingest_raises_on_bad_type(db): with pytest.raises(UnsupportedDocType): - ingest_upload(db, "report.pdf", b"data") + glean_upload(db, "report.pdf", b"data") diff --git a/tests/context/test_embedder.py b/tests/context/test_embedder.py index cc84032..67a124f 100644 --- a/tests/context/test_embedder.py +++ b/tests/context/test_embedder.py @@ -1,13 +1,17 @@ -"""Tests for app/context/embedder.py — graceful no-op without sqlite-vec.""" +"""Tests for app/context/embedder.py — delegates to app.services.embeddings.""" import sqlite3 +import struct from pathlib import Path -from unittest.mock import patch +from unittest.mock import MagicMock, patch + +import numpy as np import pytest + from app.context import embedder as emb_mod -@pytest.fixture -def db(tmp_path): +@pytest.fixture() +def db(tmp_path: Path) -> Path: db_path = tmp_path / "t.db" conn = sqlite3.connect(str(db_path)) conn.executescript(""" @@ -20,34 +24,78 @@ def db(tmp_path): REFERENCES context_documents(id) ON DELETE CASCADE, chunk_index INTEGER NOT NULL, text TEXT NOT NULL, embedding BLOB ); - INSERT INTO context_documents VALUES ('d1','test.md','markdown','hello',5,'2026-01-01T00:00:00+00:00'); + INSERT INTO context_documents + VALUES ('d1','test.md','markdown','hello',5,'2026-01-01T00:00:00+00:00'); INSERT INTO context_chunks VALUES ('c1','d1',0,'hello world',NULL); + INSERT INTO context_chunks VALUES ('c2','d1',1,'second chunk',NULL); """) conn.commit() conn.close() return db_path -def test_embed_skipped_when_extension_absent(db): - with patch.object(emb_mod, "EMBEDDING_AVAILABLE", False): - count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434") - assert count == 0 +def _mock_embedder(dim: int = 3) -> MagicMock: + """Return a mock 
Embedder that returns constant dim-length vectors.""" + m = MagicMock() + m.dim = dim + m.embed_batch.return_value = [np.zeros(dim, dtype=np.float32)] * 10 + return m -def test_embed_calls_ollama_when_available(db): - import httpx +class TestEmbedChunks: + def test_returns_zero_when_no_embedder(self, db: Path) -> None: + with patch("app.context.embedder.get_embedder", return_value=None): + count = emb_mod.embed_chunks(db, "d1") + assert count == 0 - class FakeResponse: - status_code = 200 - def raise_for_status(self): pass - def json(self): return {"embedding": [0.1, 0.2, 0.3]} + def test_returns_zero_when_no_unembedded_chunks(self, db: Path) -> None: + # Pre-fill both chunks with a blob + blob = struct.pack("3f", 0.1, 0.2, 0.3) + conn = sqlite3.connect(str(db)) + conn.execute("UPDATE context_chunks SET embedding=?", (blob,)) + conn.commit() + conn.close() - with patch.object(emb_mod, "EMBEDDING_AVAILABLE", True), \ - patch("app.context.embedder.httpx.post", return_value=FakeResponse()): - count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434") - assert count == 1 - # Verify blob was written - conn = sqlite3.connect(str(db)) - row = conn.execute("SELECT embedding FROM context_chunks WHERE id='c1'").fetchone() - conn.close() - assert row[0] is not None + embedder = _mock_embedder() + with patch("app.context.embedder.get_embedder", return_value=embedder): + count = emb_mod.embed_chunks(db, "d1") + assert count == 0 + embedder.embed_batch.assert_not_called() + + def test_embeds_all_null_chunks(self, db: Path) -> None: + embedder = _mock_embedder(dim=3) + with patch("app.context.embedder.get_embedder", return_value=embedder): + count = emb_mod.embed_chunks(db, "d1") + assert count == 2 # two chunks in fixture + + def test_blobs_written_to_db(self, db: Path) -> None: + vec = np.array([0.1, 0.2, 0.3], dtype=np.float32) + embedder = _mock_embedder(dim=3) + embedder.embed_batch.return_value = [vec, vec] + + with patch("app.context.embedder.get_embedder", 
return_value=embedder): + emb_mod.embed_chunks(db, "d1") + + conn = sqlite3.connect(str(db)) + rows = conn.execute( + "SELECT embedding FROM context_chunks WHERE document_id='d1'" + ).fetchall() + conn.close() + for (blob,) in rows: + assert blob is not None + unpacked = struct.unpack(f"{len(blob)//4}f", blob) + assert len(unpacked) == 3 + + def test_legacy_llm_url_param_accepted(self, db: Path) -> None: + """Ensure backward-compat signature still works (llm_url ignored).""" + embedder = _mock_embedder() + with patch("app.context.embedder.get_embedder", return_value=embedder): + count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434", "nomic-embed-text") + assert count == 2 + + def test_embed_batch_error_returns_zero(self, db: Path) -> None: + embedder = _mock_embedder() + embedder.embed_batch.side_effect = RuntimeError("model exploded") + with patch("app.context.embedder.get_embedder", return_value=embedder): + count = emb_mod.embed_chunks(db, "d1") + assert count == 0 diff --git a/tests/context/test_schema.py b/tests/context/test_schema.py index 69b0327..ea71812 100644 --- a/tests/context/test_schema.py +++ b/tests/context/test_schema.py @@ -2,7 +2,7 @@ import sqlite3 from pathlib import Path import pytest -from app.ingest.pipeline import ensure_schema +from app.glean.pipeline import ensure_schema def test_context_tables_created(tmp_path): diff --git a/tests/test_blocklist_endpoints.py b/tests/test_blocklist_endpoints.py index 1c4289a..938042f 100644 --- a/tests/test_blocklist_endpoints.py +++ b/tests/test_blocklist_endpoints.py @@ -9,7 +9,7 @@ from unittest.mock import MagicMock, patch @pytest.fixture def client(tmp_path): from fastapi.testclient import TestClient - from app.ingest.pipeline import ensure_schema + from app.glean.pipeline import ensure_schema import app.rest as rest_module db = tmp_path / "test.db" @@ -25,7 +25,7 @@ def client(tmp_path): @pytest.fixture def client_with_candidate(tmp_path): from fastapi.testclient import TestClient - from 
app.ingest.pipeline import ensure_schema + from app.glean.pipeline import ensure_schema import app.rest as rest_module import sqlite3, uuid diff --git a/tests/test_diagnose_classifier.py b/tests/test_diagnose_classifier.py new file mode 100644 index 0000000..40a1447 --- /dev/null +++ b/tests/test_diagnose_classifier.py @@ -0,0 +1,245 @@ +"""Tests for app/services/diagnose/classifier.py — SeverityClassifier. + +All ML-path tests mock ``transformers.pipeline`` so no model weights are +downloaded during the test suite. +""" +from __future__ import annotations + +from dataclasses import FrozenInstanceError +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +import app.services.diagnose.classifier as clf_module +from app.services.diagnose.classifier import SeverityClassifier +from app.services.diagnose.models import ClassifiedTimeline, EventCluster, TimelineResult + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(autouse=True) +def reset_ml_singleton(): + """Ensure the module-level ML singleton is cleared before and after each test.""" + clf_module._ml_classifier = None + yield + clf_module._ml_classifier = None + + +# --------------------------------------------------------------------------- +# Test-object builders +# --------------------------------------------------------------------------- + + +def _make_cluster( + representative_text: str = "test log", + pattern_tags: tuple[str, ...] 
= (), + severity: str = "INFO", +) -> EventCluster: + return EventCluster( + cluster_id="abc123", + entries=("e1",), + start_iso=None, + end_iso=None, + duration_seconds=0.0, + source_ids=("src",), + pattern_tags=pattern_tags, + severity=severity, # type: ignore[arg-type] + burst=False, + gap_before_seconds=0.0, + representative_text=representative_text, + ) + + +def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult: + return TimelineResult( + clusters=clusters, + total_entries=0, + window_start=None, + window_end=None, + gap_count=0, + burst_count=0, + dominant_sources=(), + ) + + +def _mock_hf_pipeline(label: str, score: float) -> MagicMock: + """Return a mock HF pipeline callable that always yields one result.""" + pipe = MagicMock() + pipe.return_value = [{"label": label, "score": score}] + return pipe + + +# --------------------------------------------------------------------------- +# Path A — ML classification +# --------------------------------------------------------------------------- + + +class TestMLPath: + def test_ml_error_maps_to_error(self) -> None: + """ML returning ERROR with score 0.98 → cluster severity ERROR.""" + pipe = _mock_hf_pipeline("ERROR", 0.98) + with patch( + "app.services.diagnose.classifier._get_ml_classifier", return_value=pipe + ): + clf = SeverityClassifier(model_id="fake/model") + result = clf.classify(_make_timeline(((_make_cluster("disk error detected")),))) + + assert result.cluster_severities["abc123"] == "ERROR" + assert result.classifier_used == "ml" + assert result.model_id == "fake/model" + + def test_ml_critical_promotion(self) -> None: + """ERROR + score > 0.95 + 'kernel panic' in text → promoted to CRITICAL.""" + pipe = _mock_hf_pipeline("ERROR", 0.97) + with patch( + "app.services.diagnose.classifier._get_ml_classifier", return_value=pipe + ): + clf = SeverityClassifier(model_id="fake/model") + result = clf.classify( + _make_timeline((_make_cluster("kernel panic: not syncing VFS"),)) + ) + + 
assert result.cluster_severities["abc123"] == "CRITICAL" + + def test_ml_debug_demotion(self) -> None: + """INFO + score < 0.4 → demoted to DEBUG.""" + pipe = _mock_hf_pipeline("INFO", 0.3) + with patch( + "app.services.diagnose.classifier._get_ml_classifier", return_value=pipe + ): + clf = SeverityClassifier(model_id="fake/model") + result = clf.classify(_make_timeline((_make_cluster("routine ping"),))) + + assert result.cluster_severities["abc123"] == "DEBUG" + + def test_ml_warning_maps_to_warn(self) -> None: + """ML returning WARNING → mapped to WARN.""" + pipe = _mock_hf_pipeline("WARNING", 0.85) + with patch( + "app.services.diagnose.classifier._get_ml_classifier", return_value=pipe + ): + clf = SeverityClassifier(model_id="fake/model") + result = clf.classify(_make_timeline((_make_cluster("low disk space"),))) + + assert result.cluster_severities["abc123"] == "WARN" + + +# --------------------------------------------------------------------------- +# Path B — pattern_tags fallback +# --------------------------------------------------------------------------- + + +class TestPatternTagsPath: + def test_pattern_tags_resolve_error_severity(self, tmp_path: Path) -> None: + """Cluster with pattern_tag 'service_crash_loop' → ERROR from pattern file.""" + pattern_yaml = tmp_path / "default.yaml" + pattern_yaml.write_text( + "patterns:\n" + " - name: service_crash_loop\n" + " pattern: crash\n" + " severity: ERROR\n" + " description: Service crashed in a loop\n" + ) + clf = SeverityClassifier(model_id="", pattern_file=pattern_yaml) + cluster = _make_cluster( + representative_text="service crashed", + pattern_tags=("service_crash_loop",), + ) + result = clf.classify(_make_timeline((cluster,))) + + assert result.cluster_severities["abc123"] == "ERROR" + assert result.classifier_used == "pattern_tags" + assert result.model_id is None + + +# --------------------------------------------------------------------------- +# Path C — regex fallback +# 
--------------------------------------------------------------------------- + + +class TestRegexPath: + def test_regex_detects_error(self) -> None: + """No ML, no pattern file: 'ERROR: disk full' → ERROR via regex.""" + clf = SeverityClassifier(model_id="") + result = clf.classify( + _make_timeline((_make_cluster("ERROR: disk full"),)) + ) + + assert result.cluster_severities["abc123"] == "ERROR" + assert result.classifier_used == "regex" + + def test_regex_defaults_to_info_when_no_match(self) -> None: + """No severity keyword in text → defaults to INFO.""" + clf = SeverityClassifier(model_id="") + result = clf.classify( + _make_timeline((_make_cluster("mount: disk mounted successfully"),)) + ) + + assert result.cluster_severities["abc123"] == "INFO" + + +# --------------------------------------------------------------------------- +# Fallback behaviour +# --------------------------------------------------------------------------- + + +class TestImportErrorFallback: + def test_transformers_import_error_falls_back_to_pattern_tags( + self, tmp_path: Path + ) -> None: + """ImportError from transformers → clean fallback to pattern_tags path.""" + pattern_yaml = tmp_path / "default.yaml" + pattern_yaml.write_text( + "patterns:\n" + " - name: auth_failure\n" + " pattern: auth\n" + " severity: ERROR\n" + " description: Auth failure\n" + ) + + def _raising_get_ml(*_args: Any, **_kwargs: Any) -> None: + raise ImportError("No module named 'transformers'") + + with patch( + "app.services.diagnose.classifier._get_ml_classifier", + side_effect=_raising_get_ml, + ): + clf = SeverityClassifier(model_id="fake/model", pattern_file=pattern_yaml) + cluster = _make_cluster( + representative_text="auth failed", + pattern_tags=("auth_failure",), + ) + result = clf.classify(_make_timeline((cluster,))) + + # ML was attempted (classifier_used == "ml") but pattern_tags resolved it + assert result.classifier_used == "ml" + assert result.cluster_severities["abc123"] == "ERROR" + + +# 
--------------------------------------------------------------------------- +# Edge cases +# --------------------------------------------------------------------------- + + +class TestEdgeCases: + def test_empty_timeline_produces_empty_severities(self) -> None: + """TimelineResult with no clusters → empty cluster_severities, no crash.""" + clf = SeverityClassifier(model_id="") + result = clf.classify(_make_timeline()) + + assert isinstance(result, ClassifiedTimeline) + assert result.cluster_severities == {} + assert result.classifier_used == "regex" + + def test_classified_timeline_is_frozen(self) -> None: + """ClassifiedTimeline must be frozen (FrozenInstanceError on mutation).""" + clf = SeverityClassifier(model_id="") + result = clf.classify(_make_timeline((_make_cluster(),))) + + with pytest.raises(FrozenInstanceError): + result.classifier_used = "ml" # type: ignore[misc] diff --git a/tests/test_diagnose_hypothesizer.py b/tests/test_diagnose_hypothesizer.py new file mode 100644 index 0000000..09ffbd9 --- /dev/null +++ b/tests/test_diagnose_hypothesizer.py @@ -0,0 +1,486 @@ +"""Tests for app/services/diagnose/hypothesizer.py — RootCauseHypothesizer. + +All tests use mocking; no real LLM calls are made. +""" +from __future__ import annotations + +import json +import re +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +from app.context.retriever import RetrievedContext +from app.services.diagnose.hypothesizer import RootCauseHypothesizer +from app.services.diagnose.models import ( + ClassifiedTimeline, + EventCluster, + Hypothesis, + TimelineResult, +) + + +# --------------------------------------------------------------------------- +# Fixture helpers +# --------------------------------------------------------------------------- + + +def _make_cluster( + cluster_id: str = "c1", + representative_text: str = "kernel: oom-killer invoked", + severity: str = "ERROR", + source_ids: tuple[str, ...] 
= ("syslog",), + pattern_tags: tuple[str, ...] = ("oom",), + start_iso: str | None = "2024-01-01T00:00:00+00:00", +) -> EventCluster: + return EventCluster( + cluster_id=cluster_id, + entries=("e1",), + start_iso=start_iso, + end_iso=None, + duration_seconds=1.0, + source_ids=source_ids, + pattern_tags=pattern_tags, + severity=severity, # type: ignore[arg-type] + burst=False, + gap_before_seconds=0.0, + representative_text=representative_text, + ) + + +def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult: + return TimelineResult( + clusters=clusters, + total_entries=len(clusters), + window_start=None, + window_end=None, + gap_count=0, + burst_count=0, + dominant_sources=(), + ) + + +def _make_classified( + clusters: tuple[EventCluster, ...] = (), + cluster_severities: dict | None = None, +) -> ClassifiedTimeline: + if cluster_severities is None: + cluster_severities = {c.cluster_id: c.severity for c in clusters} + return ClassifiedTimeline( + timeline=_make_timeline(clusters), + cluster_severities=cluster_severities, + classifier_used="pattern_tags", + model_id=None, + ) + + +def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext: + return RetrievedContext( + facts=[], + chunks=chunks or [{"text": "Memory pressure runbook.", "filename": "runbook.md"}], + ) + + +def _llm_json_response(items: list[dict[str, Any]]) -> MagicMock: + """Build a mock httpx.Response that returns the given list as JSON.""" + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = { + "choices": [{"message": {"content": json.dumps(items)}}] + } + return mock_resp + + +_SAMPLE_HYPOTHESES = [ + { + "title": "OOM killer terminated critical process", + "description": "The kernel invoked the OOM killer due to memory exhaustion. A process was terminated unexpectedly. 
This caused service disruption.", + "confidence": 0.85, + "severity": "CRITICAL", + "supporting_clusters": ["c1"], + }, + { + "title": "Disk I/O saturation", + "description": "High disk I/O latency was detected. Write operations stalled causing log backpressure. Check iostat for device utilisation.", + "confidence": 0.6, + "severity": "ERROR", + "supporting_clusters": ["c2"], + }, +] + + +# --------------------------------------------------------------------------- +# Test 1: Valid JSON response returns correct Hypothesis objects +# --------------------------------------------------------------------------- + + +def test_valid_json_response_returns_hypotheses(): + """Valid LLM JSON array produces a list of Hypothesis objects with correct fields.""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + mock_resp = _llm_json_response(_SAMPLE_HYPOTHESES) + + with patch("httpx.post", return_value=mock_resp): + results = hypothesizer.hypothesize( + classified, ctx, query="why is memory failing?", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert len(results) == 2 + assert isinstance(results[0], Hypothesis) + assert results[0].title == "OOM killer terminated critical process" + assert results[0].confidence == pytest.approx(0.85) + assert results[0].severity == "CRITICAL" + assert results[0].supporting_cluster_ids == ("c1",) + assert results[1].title == "Disk I/O saturation" + assert results[1].severity == "ERROR" + + +# --------------------------------------------------------------------------- +# Test 2: hypothesis_id is a non-empty UUID string on each result +# --------------------------------------------------------------------------- + + +_UUID_RE = re.compile( + r"^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$" +) + + +def test_hypothesis_id_is_uuid(): + """Each returned Hypothesis carries a distinct UUID v4 hypothesis_id.""" + 
cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + mock_resp = _llm_json_response(_SAMPLE_HYPOTHESES) + + with patch("httpx.post", return_value=mock_resp): + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert len(results) == 2 + for h in results: + assert h.hypothesis_id, "hypothesis_id must not be empty" + assert _UUID_RE.match(h.hypothesis_id), ( + f"hypothesis_id {h.hypothesis_id!r} is not a UUID v4" + ) + # Each ID must be distinct + ids = [h.hypothesis_id for h in results] + assert len(set(ids)) == len(ids), "hypothesis_ids must be unique" + + +# --------------------------------------------------------------------------- +# Test 3: Malformed JSON response returns [] with a logged warning +# --------------------------------------------------------------------------- + + +def test_malformed_json_returns_empty_and_warns(caplog): + """When the LLM returns non-JSON text, hypothesize() returns [] and logs a warning.""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + bad_resp = MagicMock() + bad_resp.status_code = 200 + bad_resp.json.return_value = { + "choices": [{"message": {"content": "not valid json"}}] + } + + import logging + with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=bad_resp): + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert results == [] + assert any("invalid JSON" in r.message or "JSON" in r.message for r in caplog.records) + + +# --------------------------------------------------------------------------- +# Test 4: Non-list JSON (dict) returns [] +# --------------------------------------------------------------------------- + + +def 
test_non_list_json_returns_empty(caplog): + """When the LLM returns a JSON object instead of an array, hypothesize() returns [].""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + dict_resp = MagicMock() + dict_resp.status_code = 200 + dict_resp.json.return_value = { + "choices": [{"message": {"content": '{"error": "oops"}'}}] + } + + import logging + with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=dict_resp): + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert results == [] + assert any("array" in r.message.lower() or "list" in r.message.lower() for r in caplog.records) + + +# --------------------------------------------------------------------------- +# Test 5: Empty clusters returns [] without any LLM call +# --------------------------------------------------------------------------- + + +def test_empty_clusters_returns_empty_no_llm_call(): + """ClassifiedTimeline with no clusters returns [] and never calls the LLM.""" + classified = _make_classified(clusters=()) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + with patch("httpx.post") as mock_post: + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert results == [] + mock_post.assert_not_called() + + +# --------------------------------------------------------------------------- +# Test 6: No LLM URL returns [] without any HTTP call +# --------------------------------------------------------------------------- + + +def test_no_llm_url_returns_empty_no_http_call(): + """When llm_url is None, hypothesize() returns [] immediately with no HTTP requests.""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + 
with patch("httpx.post") as mock_post: + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url=None, + llm_model="llama3", + ) + + assert results == [] + mock_post.assert_not_called() + + +def test_empty_llm_url_returns_empty_no_http_call(): + """When llm_url is empty string, hypothesize() returns [] immediately.""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + with patch("httpx.post") as mock_post: + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="", + llm_model="llama3", + ) + + assert results == [] + mock_post.assert_not_called() + + +def test_no_llm_model_returns_empty_no_http_call(): + """When llm_model is None, hypothesize() returns [] immediately.""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + with patch("httpx.post") as mock_post: + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="http://localhost:11434", + llm_model=None, + ) + + assert results == [] + mock_post.assert_not_called() + + +# --------------------------------------------------------------------------- +# Test 7: max_hypotheses is respected +# --------------------------------------------------------------------------- + + +def test_max_hypotheses_respected(): + """When LLM returns more items than max_hypotheses, only max_hypotheses are returned.""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer(max_hypotheses=3) + + six_items = [ + { + "title": f"Hypothesis {i}", + "description": "Some description. A second sentence. 
Third sentence here.", + "confidence": 0.5, + "severity": "ERROR", + "supporting_clusters": ["c1"], + } + for i in range(6) + ] + mock_resp = _llm_json_response(six_items) + + with patch("httpx.post", return_value=mock_resp): + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert len(results) == 3 + + +# --------------------------------------------------------------------------- +# Test 8: Severity validation — WARNING → WARN, garbage → ERROR +# --------------------------------------------------------------------------- + + +def test_severity_warning_maps_to_warn(): + """'WARNING' from the LLM is normalised to 'WARN'.""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + items = [ + { + "title": "A warning severity hypothesis", + "description": "Test description. Second sentence. Third.", + "confidence": 0.7, + "severity": "WARNING", + "supporting_clusters": ["c1"], + } + ] + mock_resp = _llm_json_response(items) + + with patch("httpx.post", return_value=mock_resp): + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert len(results) == 1 + assert results[0].severity == "WARN" + + +def test_severity_garbage_maps_to_error(): + """An unrecognised severity string from the LLM defaults to 'ERROR'.""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + items = [ + { + "title": "A garbage severity hypothesis", + "description": "Test description. Second sentence. 
Third.", + "confidence": 0.4, + "severity": "GARBAGE", + "supporting_clusters": ["c1"], + } + ] + mock_resp = _llm_json_response(items) + + with patch("httpx.post", return_value=mock_resp): + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert len(results) == 1 + assert results[0].severity == "ERROR" + + +# --------------------------------------------------------------------------- +# Test 9: Confidence field works with string floats from the LLM +# --------------------------------------------------------------------------- + + +def test_confidence_string_float_coercion(): + """A confidence value returned as a string by the LLM is coerced to float via float().""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + items = [ + { + "title": "String confidence test", + "description": "Some description. Second sentence. 
Third.", + "confidence": "0.8", # LLM returned a string, not a float + "severity": "INFO", + "supporting_clusters": ["c1"], + } + ] + mock_resp = _llm_json_response(items) + + with patch("httpx.post", return_value=mock_resp): + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert len(results) == 1 + assert isinstance(results[0].confidence, float) + assert results[0].confidence == pytest.approx(0.8) + + +# --------------------------------------------------------------------------- +# Test 10: Non-numeric confidence string falls back to default 0.5 +# --------------------------------------------------------------------------- + + +def test_non_numeric_confidence_uses_default(): + """LLM returning 'high' for confidence should not raise and defaults to 0.5.""" + cluster = _make_cluster() + classified = _make_classified(clusters=(cluster,)) + ctx = _make_ctx() + hypothesizer = RootCauseHypothesizer() + + items = [ + { + "title": "t", + "description": "d", + "confidence": "high", + "severity": "ERROR", + "supporting_clusters": [], + } + ] + mock_resp = _llm_json_response(items) + + with patch("httpx.post", return_value=mock_resp): + results = hypothesizer.hypothesize( + classified, ctx, query="test", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert len(results) == 1 + assert isinstance(results[0].confidence, float) + assert results[0].confidence == pytest.approx(0.5) diff --git a/tests/test_diagnose_pipeline.py b/tests/test_diagnose_pipeline.py new file mode 100644 index 0000000..9cc1c1d --- /dev/null +++ b/tests/test_diagnose_pipeline.py @@ -0,0 +1,489 @@ +"""Tests for app/services/diagnose/pipeline.py and __init__.py feature flag wiring. + +All tests use mocking; no real LLM, ML, or DB calls are made. 
+""" +from __future__ import annotations + +from pathlib import Path +from typing import Any +from unittest.mock import MagicMock, patch + +import pytest + +from app.context.retriever import RetrievedContext +from app.services.diagnose.models import ( + ClassifiedTimeline, + Hypothesis, + RankedHypothesis, + TimelineResult, +) +from app.services.search import SearchResult + + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +def _make_search_result( + entry_id: str = "e1", + source_id: str = "syslog", + timestamp_iso: str | None = "2026-01-01T00:00:00+00:00", + severity: str | None = "ERROR", + text: str = "ssh: invalid user", +) -> SearchResult: + return SearchResult( + entry_id=entry_id, + source_id=source_id, + sequence=1, + timestamp_iso=timestamp_iso, + severity=severity, + repeat_count=1, + out_of_order=False, + matched_patterns=["ssh_fail"], + text=text, + rank=1.0, + ) + + +def _make_ctx() -> RetrievedContext: + return RetrievedContext(facts=[], chunks=[]) + + +def _make_timeline(n_clusters: int = 2) -> TimelineResult: + return TimelineResult( + clusters=tuple(), + total_entries=5, + window_start="2026-01-01T00:00:00+00:00", + window_end="2026-01-01T01:00:00+00:00", + gap_count=0, + burst_count=1, + dominant_sources=("syslog",), + ) + + +def _make_classified(timeline: TimelineResult | None = None) -> ClassifiedTimeline: + tl = timeline or _make_timeline() + return ClassifiedTimeline( + timeline=tl, + cluster_severities={}, + classifier_used="regex", + model_id=None, + ) + + +def _make_hypothesis( + hypothesis_id: str = "h1", + title: str = "SSH flood", + confidence: float = 0.87, + severity: str = "CRITICAL", +) -> Hypothesis: + return Hypothesis( + hypothesis_id=hypothesis_id, + title=title, + description="Multiple failed SSH attempts.", + confidence=confidence, + supporting_cluster_ids=("c1",), + runbook_refs=(), + 
severity=severity, # type: ignore[arg-type] + ) + + +def _make_ranked(hypothesis: Hypothesis | None = None, suppress: bool = False) -> RankedHypothesis: + h = hypothesis or _make_hypothesis() + return RankedHypothesis( + hypothesis=h, + novelty_score=0.95, + similarity_to_known=0.05, + suppress=suppress, + suppression_reason="similar to known" if suppress else None, + ) + + +# --------------------------------------------------------------------------- +# Helper: collect all events from run_pipeline +# --------------------------------------------------------------------------- + +async def _collect_pipeline_events(**kwargs) -> list[dict[str, Any]]: + """Run run_pipeline and collect all yielded events into a list.""" + from app.services.diagnose.pipeline import run_pipeline + events = [] + async for event in run_pipeline(**kwargs): + events.append(event) + return events + + +def _default_pipeline_kwargs(entries=None, db_path=None) -> dict: + return dict( + db_path=db_path or Path("/tmp/fake.db"), + entries=entries or [_make_search_result()], + ctx=_make_ctx(), + query="ssh brute force", + since="2026-01-01T00:00:00+00:00", + until="2026-01-01T01:00:00+00:00", + llm_url=None, + llm_model=None, + llm_api_key=None, + ) + + +# --------------------------------------------------------------------------- +# Mock factories for all 5 stage classes +# --------------------------------------------------------------------------- + +def _mock_all_stages( + hypotheses=None, + ranked=None, + synthesis_text="VERDICT: CRITICAL — SSH flood (87% confidence)", +): + """Return a dict of patch targets and their mock return values.""" + timeline = _make_timeline() + classified = _make_classified(timeline) + hyps = hypotheses if hypotheses is not None else [_make_hypothesis()] + rnk = ranked if ranked is not None else [_make_ranked()] + + mock_reconstructor = MagicMock() + mock_reconstructor.return_value.reconstruct.return_value = timeline + + mock_classifier = MagicMock() + 
mock_classifier.return_value.classify.return_value = classified + + mock_hypothesizer = MagicMock() + mock_hypothesizer.return_value.hypothesize.return_value = hyps + + mock_suppressor = MagicMock() + mock_suppressor.return_value.suppress.return_value = rnk + + mock_synthesizer = MagicMock() + mock_synthesizer.return_value.synthesize.return_value = synthesis_text + + return { + "app.services.diagnose.pipeline.TimelineReconstructor": mock_reconstructor, + "app.services.diagnose.pipeline.SeverityClassifier": mock_classifier, + "app.services.diagnose.pipeline.RootCauseHypothesizer": mock_hypothesizer, + "app.services.diagnose.pipeline.FalsePositiveSuppressor": mock_suppressor, + "app.services.diagnose.pipeline.SummarySynthesizer": mock_synthesizer, + } + + +# --------------------------------------------------------------------------- +# 1. Feature flag off: legacy summarize() path runs, not run_pipeline +# --------------------------------------------------------------------------- + +class TestFeatureFlagOff: + @pytest.mark.asyncio + async def test_legacy_path_when_flag_off(self): + """With MULTI_AGENT_ENABLED=False, run_pipeline is never called.""" + from app.services import diagnose as diagnose_module + + entries = [_make_search_result()] + + with ( + patch.object(diagnose_module, "MULTI_AGENT_ENABLED", False), + patch("app.services.diagnose.search", return_value=entries), + patch("app.services.diagnose.entries_in_window", return_value=[]), + patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()), + patch("app.services.diagnose.format_context_block", return_value=None), + patch("app.services.diagnose.run_pipeline") as mock_pipeline, + patch("app.services.diagnose.summarize", return_value=None), + ): + events = [] + async for event in diagnose_module.diagnose_stream( + db_path=Path("/tmp/fake.db"), + query="ssh failures", + llm_url=None, + llm_model=None, + ): + events.append(event) + + # run_pipeline must NOT have been called + 
mock_pipeline.assert_not_called() + + # SSE sequence must end with done + types = [e["type"] for e in events] + assert "done" in types + assert types[-1] == "done" + + @pytest.mark.asyncio + async def test_legacy_done_event_is_last(self): + """Legacy path: done is always the last event.""" + from app.services import diagnose as diagnose_module + + with ( + patch.object(diagnose_module, "MULTI_AGENT_ENABLED", False), + patch("app.services.diagnose.search", return_value=[]), + patch("app.services.diagnose.entries_in_window", return_value=[]), + patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()), + patch("app.services.diagnose.format_context_block", return_value=None), + ): + events = [] + async for event in diagnose_module.diagnose_stream( + db_path=Path("/tmp/fake.db"), + query="check logs", + ): + events.append(event) + + assert events[-1] == {"type": "done"} + + +# --------------------------------------------------------------------------- +# 2. Feature flag on, all stages mocked: verify SSE event sequence +# --------------------------------------------------------------------------- + +class TestFeatureFlagOn: + @pytest.mark.asyncio + async def test_pipeline_stage_events_in_order(self): + """pipeline_stage events must be emitted stages 1→2→3→4 in order.""" + mocks = _mock_all_stages() + kwargs = _default_pipeline_kwargs() + with ( + patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]), + patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]), + patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]), + patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]), + patch("app.services.diagnose.pipeline.SummarySynthesizer", 
mocks["app.services.diagnose.pipeline.SummarySynthesizer"]), + ): + events = await _collect_pipeline_events(**kwargs) + + stage_events = [e for e in events if e.get("type") == "pipeline_stage"] + stages = [e["stage"] for e in stage_events] + assert stages == [1, 2, 3, 4] + + @pytest.mark.asyncio + async def test_hypotheses_event_after_stage4(self): + """hypotheses event must appear after pipeline_stage stage=4.""" + mocks = _mock_all_stages() + kwargs = _default_pipeline_kwargs() + + with ( + patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]), + patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]), + patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]), + patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]), + patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]), + ): + events = await _collect_pipeline_events(**kwargs) + + stage4_idx = next( + i for i, e in enumerate(events) + if e.get("type") == "pipeline_stage" and e.get("stage") == 4 + ) + hyp_idx = next(i for i, e in enumerate(events) if e.get("type") == "hypotheses") + assert hyp_idx > stage4_idx + + @pytest.mark.asyncio + async def test_reasoning_event_emitted(self): + """reasoning event must be present when synthesizer returns text.""" + mocks = _mock_all_stages(synthesis_text="VERDICT: CRITICAL — SSH flood") + kwargs = _default_pipeline_kwargs() + + with ( + patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]), + patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]), + 
patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]), + patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]), + patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]), + ): + events = await _collect_pipeline_events(**kwargs) + + reasoning_events = [e for e in events if e.get("type") == "reasoning"] + assert len(reasoning_events) == 1 + assert "VERDICT" in reasoning_events[0]["text"] + + @pytest.mark.asyncio + async def test_done_event_is_last(self): + """done must always be the last event in the pipeline sequence.""" + mocks = _mock_all_stages() + kwargs = _default_pipeline_kwargs() + + with ( + patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]), + patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]), + patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]), + patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]), + patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]), + ): + events = await _collect_pipeline_events(**kwargs) + + assert events[-1] == {"type": "done"} + + @pytest.mark.asyncio + async def test_pipeline_wired_from_diagnose_stream(self): + """diagnose_stream routes through run_pipeline when flag is on.""" + from app.services import diagnose as diagnose_module + + entries = [_make_search_result()] + + async def fake_pipeline(**kwargs): + yield {"type": "status", "message": "Building timeline…"} + yield {"type": "pipeline_stage", "stage": 1, "name": "timeline", "message": "Built 1 clusters, 0 bursts"} + 
yield {"type": "done"} + + with ( + patch.object(diagnose_module, "MULTI_AGENT_ENABLED", True), + patch("app.services.diagnose.search", return_value=entries), + patch("app.services.diagnose.entries_in_window", return_value=[]), + patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()), + patch("app.services.diagnose.format_context_block", return_value=None), + patch("app.services.diagnose.run_pipeline", side_effect=fake_pipeline), + ): + events = [] + async for event in diagnose_module.diagnose_stream( + db_path=Path("/tmp/fake.db"), + query="ssh failures", + ): + events.append(event) + + types = [e["type"] for e in events] + assert "pipeline_stage" in types + assert types[-1] == "done" + # Legacy summarize() must NOT have been called — done event came from pipeline + assert types.count("done") == 1 + + +# --------------------------------------------------------------------------- +# 3. Empty entries: pipeline completes with done +# --------------------------------------------------------------------------- + +class TestEmptyEntries: + @pytest.mark.asyncio + async def test_empty_entries_pipeline_completes(self): + """Pipeline with entries=[] must still complete and emit done.""" + mocks = _mock_all_stages(hypotheses=[], ranked=[]) + kwargs = _default_pipeline_kwargs(entries=[]) + + with ( + patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]), + patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]), + patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]), + patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]), + patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]), + ): + events = await 
_collect_pipeline_events(**kwargs) + + types = [e["type"] for e in events] + assert "done" in types + assert types[-1] == "done" + + @pytest.mark.asyncio + async def test_empty_entries_all_stage_events_present(self): + """Even with empty entries, all 4 pipeline_stage events are emitted.""" + mocks = _mock_all_stages(hypotheses=[], ranked=[]) + kwargs = _default_pipeline_kwargs(entries=[]) + + with ( + patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]), + patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]), + patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]), + patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]), + patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]), + ): + events = await _collect_pipeline_events(**kwargs) + + stage_events = [e for e in events if e.get("type") == "pipeline_stage"] + assert len(stage_events) == 4 + + +# --------------------------------------------------------------------------- +# 4. 
No LLM: Stage 3 and Stage 5 return empty/fallback; done still emitted +# --------------------------------------------------------------------------- + +class TestNoLLM: + @pytest.mark.asyncio + async def test_no_llm_pipeline_completes_with_done(self): + """No llm_url/llm_model: pipeline runs all stages and emits done.""" + mocks = _mock_all_stages(hypotheses=[], ranked=[], synthesis_text="VERDICT: UNKNOWN — no hypotheses generated") + kwargs = _default_pipeline_kwargs() + # llm_url and llm_model already None in default kwargs + + with ( + patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]), + patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]), + patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]), + patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]), + patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]), + ): + events = await _collect_pipeline_events(**kwargs) + + assert events[-1] == {"type": "done"} + + @pytest.mark.asyncio + async def test_no_llm_no_reasoning_event_when_synthesis_empty(self): + """When synthesizer returns empty string, no reasoning event is emitted.""" + mocks = _mock_all_stages(synthesis_text="") + kwargs = _default_pipeline_kwargs() + + with ( + patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]), + patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]), + patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]), + patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", 
mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]), + patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]), + ): + events = await _collect_pipeline_events(**kwargs) + + reasoning_events = [e for e in events if e.get("type") == "reasoning"] + assert len(reasoning_events) == 0 + + +# --------------------------------------------------------------------------- +# 5. Stage 1 cluster count in pipeline_stage message +# --------------------------------------------------------------------------- + +class TestStage1Message: + @pytest.mark.asyncio + async def test_stage1_message_contains_cluster_count(self): + """pipeline_stage stage=1 message must report cluster count.""" + timeline = TimelineResult( + clusters=tuple(), + total_entries=10, + window_start=None, + window_end=None, + gap_count=0, + burst_count=3, + dominant_sources=("syslog",), + ) + classified = _make_classified(timeline) + + mock_reconstructor = MagicMock() + mock_reconstructor.return_value.reconstruct.return_value = timeline + mock_classifier = MagicMock() + mock_classifier.return_value.classify.return_value = classified + mock_hypothesizer = MagicMock() + mock_hypothesizer.return_value.hypothesize.return_value = [] + mock_suppressor = MagicMock() + mock_suppressor.return_value.suppress.return_value = [] + mock_synthesizer = MagicMock() + mock_synthesizer.return_value.synthesize.return_value = "VERDICT: INFO — nothing found" + + kwargs = _default_pipeline_kwargs() + with ( + patch("app.services.diagnose.pipeline.TimelineReconstructor", mock_reconstructor), + patch("app.services.diagnose.pipeline.SeverityClassifier", mock_classifier), + patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mock_hypothesizer), + patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mock_suppressor), + patch("app.services.diagnose.pipeline.SummarySynthesizer", mock_synthesizer), + ): + events = await 
_collect_pipeline_events(**kwargs) + + stage1 = next(e for e in events if e.get("type") == "pipeline_stage" and e.get("stage") == 1) + # 0 clusters (empty tuple), 3 bursts + assert "0" in stage1["message"] # cluster count + assert "3" in stage1["message"] # burst count + + @pytest.mark.asyncio + async def test_stage1_name_is_timeline(self): + """pipeline_stage stage=1 must have name='timeline'.""" + mocks = _mock_all_stages() + kwargs = _default_pipeline_kwargs() + + with ( + patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]), + patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]), + patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]), + patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]), + patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]), + ): + events = await _collect_pipeline_events(**kwargs) + + stage1 = next(e for e in events if e.get("type") == "pipeline_stage" and e.get("stage") == 1) + assert stage1["name"] == "timeline" diff --git a/tests/test_diagnose_suppressor.py b/tests/test_diagnose_suppressor.py new file mode 100644 index 0000000..fb13e77 --- /dev/null +++ b/tests/test_diagnose_suppressor.py @@ -0,0 +1,432 @@ +"""Tests for app/services/diagnose/suppressor.py — FalsePositiveSuppressor. + +All tests use mocking; no real model downloads are made. 
+""" +from __future__ import annotations + +import math +import sqlite3 +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +import app.services.diagnose.suppressor as sup_module +from app.services.diagnose.models import Hypothesis, RankedHypothesis +from app.services.diagnose.suppressor import FalsePositiveSuppressor + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_hypothesis( + title: str = "Test", + description: str = "A test hypothesis.", + confidence: float = 0.8, + severity: str = "ERROR", +) -> Hypothesis: + return Hypothesis( + hypothesis_id="test-id", + title=title, + description=description, + confidence=confidence, + supporting_cluster_ids=(), + runbook_refs=(), + severity=severity, # type: ignore[arg-type] + ) + + +def _make_db_with_incidents(incidents: list[tuple[str, str]], db_path: Path) -> Path: + """Create a temporary SQLite database with resolved incidents. 
Returns the db path.""" + with sqlite3.connect(str(db_path)) as conn: + conn.execute( + "CREATE TABLE incidents " + "(id INTEGER PRIMARY KEY, label TEXT, notes TEXT, ended_at TEXT)" + ) + for label, notes in incidents: + conn.execute( + "INSERT INTO incidents (label, notes, ended_at) VALUES (?, ?, ?)", + (label, notes, "2024-01-01T00:00:00"), + ) + conn.commit() + return db_path + + +def _make_empty_db(db_path: Path) -> Path: + """Create a temporary SQLite DB with no incidents table.""" + with sqlite3.connect(str(db_path)) as conn: + conn.execute("CREATE TABLE unrelated (id INTEGER PRIMARY KEY)") + conn.commit() + return db_path + + +def _make_mock_embedder( + embed_return: list[float] | None = None, + embed_batch_return: list[list[float]] | None = None, +) -> MagicMock: + """Build a mock embedder with controllable embed/embed_batch responses.""" + embedder = MagicMock() + + # Default: unit vector along first dimension + default_vec = [1.0] + [0.0] * 383 + + raw_single = embed_return if embed_return is not None else default_vec + raw_batch = embed_batch_return if embed_batch_return is not None else [default_vec] + + # Wrap scalars in numpy-like MagicMock with .tolist() + def _wrap(vec: list[float]) -> MagicMock: + m = MagicMock() + m.tolist.return_value = vec + return m + + embedder.embed.return_value = _wrap(raw_single) + embedder.embed_batch.return_value = [_wrap(v) for v in raw_batch] + return embedder + + +# --------------------------------------------------------------------------- +# Autouse fixture: reset module-level cache between tests +# --------------------------------------------------------------------------- + +@pytest.fixture(autouse=True) +def reset_suppressor_cache(): + sup_module._corpus_cache.clear() + yield + sup_module._corpus_cache.clear() + + +# --------------------------------------------------------------------------- +# Test 1: No model configured — passthrough, ranked by confidence +# 
# ---------------------------------------------------------------------------

def test_no_model_passthrough_ranked_by_confidence(tmp_path):
    """model_id='' → all novelty_score=1.0, suppress=False, ranked by confidence desc."""
    h_low = _make_hypothesis(title="Low", confidence=0.3)
    h_high = _make_hypothesis(title="High", confidence=0.9)
    h_mid = _make_hypothesis(title="Mid", confidence=0.6)

    db_path = tmp_path / "turnstone.db"
    suppressor = FalsePositiveSuppressor(model_id="")
    results = suppressor.suppress([h_low, h_high, h_mid], db_path)

    assert len(results) == 3
    assert all(isinstance(r, RankedHypothesis) for r in results)
    assert all(r.novelty_score == pytest.approx(1.0) for r in results)
    assert all(r.similarity_to_known == pytest.approx(0.0) for r in results)
    assert all(r.suppress is False for r in results)
    assert all(r.suppression_reason is None for r in results)
    # Ranked by confidence descending
    confidences = [r.hypothesis.confidence for r in results]
    assert confidences == sorted(confidences, reverse=True)


# ---------------------------------------------------------------------------
# Test 2: High similarity → suppressed
# ---------------------------------------------------------------------------

def test_high_similarity_suppresses_hypothesis(tmp_path):
    """Hypothesis with embedding nearly identical to corpus → suppress=True."""
    identical_vec = [1.0] + [0.0] * 383
    corpus_vec = [1.0] + [0.0] * 383  # cosine similarity = 1.0

    mock_embedder = _make_mock_embedder(
        embed_return=identical_vec,
        embed_batch_return=[corpus_vec],
    )

    db_path = _make_db_with_incidents(
        [("OOM killer", "Memory pressure caused OOM kill")],
        tmp_path / "turnstone.db",
    )
    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)

    with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
        results = suppressor.suppress([_make_hypothesis()], db_path)

    assert len(results) == 1
    result = results[0]
    assert result.suppress is True
    assert result.suppression_reason is not None
    assert "Similar to resolved incident" in result.suppression_reason
    assert result.similarity_to_known == pytest.approx(1.0, abs=0.01)
    assert result.novelty_score == pytest.approx(0.0, abs=0.01)


# ---------------------------------------------------------------------------
# Test 3: Low similarity → not suppressed
# ---------------------------------------------------------------------------

def test_low_similarity_does_not_suppress(tmp_path):
    """Hypothesis with embedding orthogonal to corpus → suppress=False."""
    hypothesis_vec = [1.0] + [0.0] * 383
    corpus_vec = [0.0, 1.0] + [0.0] * 382  # orthogonal → similarity = 0.0

    mock_embedder = _make_mock_embedder(
        embed_return=hypothesis_vec,
        embed_batch_return=[corpus_vec],
    )

    db_path = _make_db_with_incidents(
        [("Disk I/O", "Storage saturation caused latency")],
        tmp_path / "turnstone.db",
    )
    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)

    with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
        results = suppressor.suppress([_make_hypothesis()], db_path)

    assert len(results) == 1
    result = results[0]
    assert result.suppress is False
    assert result.suppression_reason is None
    assert result.similarity_to_known == pytest.approx(0.0, abs=0.01)
    assert result.novelty_score == pytest.approx(1.0, abs=0.01)


# ---------------------------------------------------------------------------
# Test 3b: Borderline similarity — exactly at threshold vs. just below
# ---------------------------------------------------------------------------

def test_similarity_threshold_boundary(tmp_path):
    """similarity == threshold is suppressed; similarity just below threshold is not.

    This test locks down the boundary semantics: suppress when max_sim >= threshold,
    not when novelty_score < threshold (the inverted form that was the original bug).
    With threshold=0.85:
      - similarity=0.85 → suppressed (at boundary, inclusive)
      - similarity=0.84 → NOT suppressed (just below)
    """
    threshold = 0.85
    db_path = _make_db_with_incidents(
        [("Disk I/O", "Storage saturation caused latency")],
        tmp_path / "turnstone.db",
    )

    # Corpus unit vector along first axis
    corpus_vec = [1.0] + [0.0] * 383

    for sim_value, expected_suppress in [(threshold, True), (0.84, False)]:
        # Build a hypothesis embedding whose cosine similarity to corpus_vec ≈ sim_value.
        # query = [sim, sqrt(1 - sim^2), 0, ...] is a unit vector, so its cosine
        # similarity to corpus_vec is its first component.
        # NOTE(review): the at-threshold case relies on the suppressor's float
        # arithmetic reproducing 0.85 without rounding below it — confirm.
        hyp_vec = [sim_value, math.sqrt(max(0.0, 1.0 - sim_value ** 2))] + [0.0] * 382

        mock_embedder = _make_mock_embedder(
            embed_return=hyp_vec,
            embed_batch_return=[corpus_vec],
        )

        suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=threshold)

        with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
            results = suppressor.suppress([_make_hypothesis()], db_path)

        assert len(results) == 1
        result = results[0]
        assert result.suppress is expected_suppress, (
            f"similarity={sim_value:.2f}: expected suppress={expected_suppress}, "
            f"got suppress={result.suppress} (similarity_to_known={result.similarity_to_known:.4f})"
        )


# ---------------------------------------------------------------------------
# Test 4: Empty hypotheses list returns []
# ---------------------------------------------------------------------------

def test_empty_hypotheses_returns_empty(tmp_path):
    """suppress([]) → [] regardless of model or db state."""
    db_path = tmp_path / "turnstone.db"
    suppressor = FalsePositiveSuppressor(model_id="test-model")
    results = suppressor.suppress([], db_path)
    assert results == []


#
# ---------------------------------------------------------------------------
# Test 5: Ranking by novelty_score * confidence
# ---------------------------------------------------------------------------

def test_ranking_by_novelty_times_confidence(tmp_path):
    """Results are sorted by novelty_score * confidence descending."""
    # One row per hypothesis: (title, similarity to the corpus vector, confidence).
    # Intended ranking scores, taking novelty as 1 - similarity:
    #   A: 0.9 * 0.5 = 0.45
    #   B: 0.5 * 0.9 = 0.45   (ties with A)
    #   C: 0.8 * 0.9 = 0.72   (highest, so C must come first)
    specs = [
        ("A", 0.1, 0.5),
        ("B", 0.5, 0.9),
        ("C", 0.2, 0.9),
    ]

    # The corpus is one unit vector along the first axis, so a unit vector
    # [sim, sin(acos(sim)), 0, ...] has cosine similarity `sim` to it.
    corpus_vec = [1.0] + [0.0] * 383
    hypothesis_vecs = [
        [sim, math.sin(math.acos(sim))] + [0.0] * 382
        for _, sim, _ in specs
    ]
    hypotheses = [
        _make_hypothesis(title=title, confidence=confidence)
        for title, _, confidence in specs
    ]

    calls_made = 0

    def embed_in_turn(text: str) -> MagicMock:
        # Hands out the vectors in A, B, C order, one per call, wrapping around.
        # Presumably embed() is called once per hypothesis in input order —
        # verify against the suppressor.
        nonlocal calls_made
        wrapped = MagicMock()
        wrapped.tolist.return_value = hypothesis_vecs[calls_made % len(hypothesis_vecs)]
        calls_made += 1
        return wrapped

    corpus_embedding = MagicMock()
    corpus_embedding.tolist.return_value = corpus_vec

    mock_embedder = MagicMock()
    mock_embedder.embed_batch.return_value = [corpus_embedding]
    mock_embedder.embed.side_effect = embed_in_turn

    db_path = _make_db_with_incidents(
        [("OOM", "Memory exhaustion")],
        tmp_path / "turnstone.db",
    )
    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)

    with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
        results = suppressor.suppress(hypotheses, db_path)

    assert len(results) == 3
    titles = [r.hypothesis.title for r in results]
    # H_C should be first (highest novelty*confidence score)
    assert titles[0] == "C", f"Expected C first, got {titles}"
    # Verify sort is descending by novelty*confidence
    scores = [r.novelty_score * r.hypothesis.confidence for r in results]
    assert scores == sorted(scores, reverse=True)


# ---------------------------------------------------------------------------
# Test 6: DB with no resolved incidents → novelty_score=1.0
#
--------------------------------------------------------------------------- + +def test_no_resolved_incidents_in_db_passthrough(tmp_path): + """When incidents table is empty, all hypotheses get novelty_score=1.0.""" + db_path = _make_db_with_incidents([], tmp_path / "turnstone.db") # table exists but zero rows + mock_embedder = _make_mock_embedder() + suppressor = FalsePositiveSuppressor(model_id="test-model") + + with patch.object(suppressor, "_load_embedder", return_value=mock_embedder): + results = suppressor.suppress([_make_hypothesis()], db_path) + + assert len(results) == 1 + assert results[0].novelty_score == pytest.approx(1.0) + assert results[0].suppress is False + # embed_batch should NOT have been called (empty corpus short-circuits) + mock_embedder.embed_batch.assert_not_called() + + +# --------------------------------------------------------------------------- +# Test 7: DB query failure → graceful fallback, no crash +# --------------------------------------------------------------------------- + +def test_db_query_failure_graceful_fallback(tmp_path): + """When the incidents table is missing, suppress() returns passthrough without raising.""" + db_path = _make_empty_db(tmp_path / "turnstone.db") # no 'incidents' table + mock_embedder = _make_mock_embedder() + suppressor = FalsePositiveSuppressor(model_id="test-model") + + with patch.object(suppressor, "_load_embedder", return_value=mock_embedder): + results = suppressor.suppress([_make_hypothesis()], db_path) + + assert len(results) == 1 + assert results[0].novelty_score == pytest.approx(1.0) + assert results[0].suppress is False + + +# --------------------------------------------------------------------------- +# Test 8: Embedding service unavailable (returns None) → graceful fallback +# --------------------------------------------------------------------------- + +def test_embedding_service_unavailable_passthrough(tmp_path): + """When get_embedder() returns None, suppress() falls back without 
crashing.""" + db_path = _make_db_with_incidents( + [("OOM", "Memory pressure")], + tmp_path / "turnstone.db", + ) + suppressor = FalsePositiveSuppressor(model_id="test-model") + + with patch.object(suppressor, "_load_embedder", return_value=None): + results = suppressor.suppress([_make_hypothesis(confidence=0.7)], db_path) + + assert len(results) == 1 + assert results[0].novelty_score == pytest.approx(1.0) + assert results[0].suppress is False + assert results[0].suppression_reason is None + + +# --------------------------------------------------------------------------- +# Test 9: Corpus cache invalidated when corpus changes +# --------------------------------------------------------------------------- + +def test_corpus_cache_invalidated_on_corpus_change(tmp_path): + """When the corpus changes between calls, embed_batch is called again.""" + # First DB: one incident + db_path = _make_db_with_incidents( + [("OOM", "Memory pressure")], + tmp_path / "turnstone.db", + ) + + corpus_vec_1 = [1.0] + [0.0] * 383 + corpus_vec_2 = [0.0, 1.0] + [0.0] * 382 + + hyp_vec = [1.0] + [0.0] * 383 + + # embedder will be called twice for embed_batch (different corpus each time) + mock_embedder = MagicMock() + single_m = MagicMock() + single_m.tolist.return_value = hyp_vec + + batch_m1 = MagicMock() + batch_m1.tolist.return_value = corpus_vec_1 + batch_m2 = MagicMock() + batch_m2.tolist.return_value = corpus_vec_2 + + mock_embedder.embed.return_value = single_m + mock_embedder.embed_batch.side_effect = [[batch_m1], [batch_m2]] + + suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85) + + with patch.object(suppressor, "_load_embedder", return_value=mock_embedder): + # First call — populates cache + results_1 = suppressor.suppress([_make_hypothesis()], db_path) + assert mock_embedder.embed_batch.call_count == 1 + + # Mutate the DB to add a second incident (changes corpus) + with sqlite3.connect(str(db_path)) as conn: + conn.execute( + "INSERT INTO 
incidents (label, notes, ended_at) VALUES (?, ?, ?)", + ("Disk I/O", "Storage saturation", "2024-01-02T00:00:00"), + ) + conn.commit() + + # Second call — corpus changed, should re-embed + results_2 = suppressor.suppress([_make_hypothesis()], db_path) + assert mock_embedder.embed_batch.call_count == 2, ( + "embed_batch should be called again when corpus changes" + ) + + assert len(results_1) == 1 + assert len(results_2) == 1 diff --git a/tests/test_diagnose_synthesizer.py b/tests/test_diagnose_synthesizer.py new file mode 100644 index 0000000..5229c99 --- /dev/null +++ b/tests/test_diagnose_synthesizer.py @@ -0,0 +1,285 @@ +"""Tests for app/services/diagnose/synthesizer.py — SummarySynthesizer. + +All tests use mocking; no real LLM calls are made. +""" +from __future__ import annotations + +from unittest.mock import MagicMock, patch + +from app.context.retriever import RetrievedContext +from app.services.diagnose.models import Hypothesis, RankedHypothesis, TimelineResult +from app.services.diagnose.synthesizer import SummarySynthesizer + + +# --------------------------------------------------------------------------- +# Fixture helpers +# --------------------------------------------------------------------------- + +def _make_hypothesis( + hypothesis_id: str = "h1", + title: str = "SSH flood from external IPs", + description: str = "Repeated failed login attempts from multiple IPs.", + confidence: float = 0.87, + severity: str = "CRITICAL", +) -> Hypothesis: + return Hypothesis( + hypothesis_id=hypothesis_id, + title=title, + description=description, + confidence=confidence, + supporting_cluster_ids=("c1",), + runbook_refs=(), + severity=severity, # type: ignore[arg-type] + ) + + +def _make_ranked( + hypothesis: Hypothesis | None = None, + novelty_score: float = 0.95, + similarity_to_known: float = 0.05, + suppress: bool = False, + suppression_reason: str | None = None, +) -> RankedHypothesis: + h = hypothesis or _make_hypothesis() + return RankedHypothesis( + 
hypothesis=h, + novelty_score=novelty_score, + similarity_to_known=similarity_to_known, + suppress=suppress, + suppression_reason=suppression_reason, + ) + + +def _make_timeline( + total_entries: int = 42, + n_clusters: int = 3, +) -> TimelineResult: + return TimelineResult( + clusters=tuple(), + total_entries=total_entries, + window_start="2026-01-01T00:00:00+00:00", + window_end="2026-01-01T01:00:00+00:00", + gap_count=1, + burst_count=2, + dominant_sources=("syslog", "auth"), + ) + + +def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext: + return RetrievedContext( + facts=[{"category": "network", "key": "host", "value": "heimdall", "source": "facts"}], + chunks=chunks or [{"filename": "runbook.md", "text": "Restart sshd if flooded"}], + ) + + +# --------------------------------------------------------------------------- +# Test cases +# --------------------------------------------------------------------------- + +class TestSynthesizerWithHypotheses: + """With hypotheses, result must contain VERDICT.""" + + def test_returns_verdict_string_with_llm(self): + synthesizer = SummarySynthesizer() + ranked = [_make_ranked()] + timeline = _make_timeline() + ctx = _make_ctx() + + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = { + "choices": [{"message": {"content": "VERDICT: CRITICAL — SSH flood (87% confidence)\nTIMELINE: lots of hits."}}] + } + + with patch("httpx.post", return_value=mock_resp): + result = synthesizer.synthesize( + ranked=ranked, + timeline=timeline, + ctx=ctx, + query="ssh brute force", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert "VERDICT" in result + + def test_returns_nonempty_string(self): + synthesizer = SummarySynthesizer() + ranked = [_make_ranked()] + timeline = _make_timeline() + ctx = _make_ctx() + + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = { + "choices": [{"message": {"content": "VERDICT: CRITICAL — SSH flood (87% 
confidence)"}}] + } + + with patch("httpx.post", return_value=mock_resp): + result = synthesizer.synthesize( + ranked=ranked, + timeline=timeline, + ctx=ctx, + query="why is auth failing", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert isinstance(result, str) + assert len(result) > 0 + + +class TestSynthesizerSuppressedHypotheses: + """Suppressed hypotheses must be excluded from the LLM prompt.""" + + def test_suppressed_hypotheses_excluded_from_prompt(self): + suppressed = _make_ranked( + hypothesis=_make_hypothesis( + hypothesis_id="h2", + title="Wazuh alert processing backlog", + severity="ERROR", + confidence=0.72, + ), + suppress=True, + suppression_reason="similar to 2025-04 SSH incident", + novelty_score=0.1, + ) + active = _make_ranked( + hypothesis=_make_hypothesis( + hypothesis_id="h1", + title="SSH flood from external IPs", + severity="CRITICAL", + confidence=0.87, + ), + suppress=False, + novelty_score=0.95, + ) + + captured_messages: list = [] + + def fake_post(url, json=None, headers=None, timeout=None): + if json and "payload" in json: + captured_messages.extend(json["payload"].get("messages", [])) + elif json and "messages" in json: + captured_messages.extend(json.get("messages", [])) + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = { + "choices": [{"message": {"content": "VERDICT: CRITICAL — SSH flood"}}] + } + return mock_resp + + synthesizer = SummarySynthesizer() + with patch("httpx.post", side_effect=fake_post): + synthesizer.synthesize( + ranked=[active, suppressed], + timeline=_make_timeline(), + ctx=_make_ctx(), + query="auth failures", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + # The user message should contain the active hypothesis title + # and NOT contain the suppressed one (or mark it suppressed) + user_content = next( + (m["content"] for m in captured_messages if m.get("role") == "user"), "" + ) + assert "SSH flood from external IPs" in user_content 
+ # Wazuh should not appear as a standalone top-level hypothesis + # (suppressed items are excluded from the active list sent to the LLM) + assert "Wazuh alert processing backlog" not in user_content + + +class TestSynthesizerNoLLM: + """No LLM configured: must return deterministic fallback (not empty).""" + + def test_no_llm_url_returns_fallback(self): + synthesizer = SummarySynthesizer() + ranked = [_make_ranked()] + timeline = _make_timeline() + ctx = _make_ctx() + + result = synthesizer.synthesize( + ranked=ranked, + timeline=timeline, + ctx=ctx, + query="disk errors", + ) + + assert isinstance(result, str) + assert len(result) > 0 + assert "VERDICT" in result + + def test_no_llm_model_returns_fallback(self): + synthesizer = SummarySynthesizer() + ranked = [_make_ranked()] + + result = synthesizer.synthesize( + ranked=ranked, + timeline=_make_timeline(), + ctx=_make_ctx(), + query="oom killer", + llm_url="http://localhost:11434", + # llm_model omitted + ) + + assert "VERDICT" in result + assert "SSH flood from external IPs" in result + + def test_llm_failure_returns_fallback(self): + synthesizer = SummarySynthesizer() + ranked = [_make_ranked()] + + with patch("httpx.post", side_effect=ConnectionError("refused")): + result = synthesizer.synthesize( + ranked=ranked, + timeline=_make_timeline(), + ctx=_make_ctx(), + query="why is disk full", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert "VERDICT" in result + assert len(result) > 0 + + +class TestSynthesizerEmptyRanked: + """Empty ranked list: must return deterministic fallback text, not raise.""" + + def test_empty_ranked_no_llm_returns_fallback(self): + synthesizer = SummarySynthesizer() + result = synthesizer.synthesize( + ranked=[], + timeline=_make_timeline(), + ctx=_make_ctx(), + query="check everything", + ) + + assert isinstance(result, str) + assert len(result) > 0 + assert "VERDICT" in result + + def test_empty_ranked_with_llm_returns_fallback_or_llm_text(self): + """Even with 
empty ranked, we attempt LLM and return something.""" + synthesizer = SummarySynthesizer() + + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = { + "choices": [{"message": {"content": "VERDICT: UNKNOWN — no hypotheses generated"}}] + } + + with patch("httpx.post", return_value=mock_resp): + result = synthesizer.synthesize( + ranked=[], + timeline=_make_timeline(), + ctx=_make_ctx(), + query="nothing found", + llm_url="http://localhost:11434", + llm_model="llama3", + ) + + assert isinstance(result, str) + assert len(result) > 0 diff --git a/tests/test_diagnose_timeline.py b/tests/test_diagnose_timeline.py new file mode 100644 index 0000000..0f5c6dc --- /dev/null +++ b/tests/test_diagnose_timeline.py @@ -0,0 +1,234 @@ +"""Tests for app/services/diagnose/timeline.py — TimelineReconstructor.""" +from __future__ import annotations + +from app.services.diagnose.timeline import TimelineReconstructor +from app.services.diagnose.models import TimelineResult +from app.services.search import SearchResult + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def _make_entry( + entry_id: str = "e1", + source_id: str = "src-a", + timestamp_iso: str | None = "2026-01-01T00:00:00+00:00", + severity: str | None = "INFO", + rank: float = 0.0, + text: str = "log line", + matched_patterns: list[str] | None = None, + sequence: int = 1, +) -> SearchResult: + return SearchResult( + entry_id=entry_id, + source_id=source_id, + sequence=sequence, + timestamp_iso=timestamp_iso, + severity=severity, + repeat_count=1, + out_of_order=False, + matched_patterns=matched_patterns or [], + text=text, + rank=rank, + ) + + +def _ts(offset_seconds: int) -> str: + """Return an ISO timestamp offset_seconds after 2026-01-01T00:00:00+00:00.""" + from datetime import datetime, timezone, timedelta + base = datetime(2026, 1, 1, 0, 0, 0, 
tzinfo=timezone.utc) + dt = base + timedelta(seconds=offset_seconds) + return dt.isoformat() + + +# --------------------------------------------------------------------------- +# Test cases +# --------------------------------------------------------------------------- + +class TestEmptyInput: + def test_empty_returns_empty_timeline(self): + rt = TimelineReconstructor() + result = rt.reconstruct([]) + assert result == TimelineResult( + clusters=(), + total_entries=0, + gap_count=0, + burst_count=0, + window_start=None, + window_end=None, + dominant_sources=(), + ) + + +class TestSingleEntry: + def test_single_entry_one_cluster(self): + rt = TimelineReconstructor() + entry = _make_entry(entry_id="e1", timestamp_iso=_ts(0)) + result = rt.reconstruct([entry]) + assert len(result.clusters) == 1 + cluster = result.clusters[0] + assert cluster.gap_before_seconds == 0.0 + assert cluster.burst is False + assert result.total_entries == 1 + + +class TestClusteringWithinWindow: + def test_two_entries_10s_apart_same_cluster(self): + rt = TimelineReconstructor(cluster_window_seconds=30) + entries = [ + _make_entry(entry_id="e1", timestamp_iso=_ts(0)), + _make_entry(entry_id="e2", timestamp_iso=_ts(10)), + ] + result = rt.reconstruct(entries) + assert len(result.clusters) == 1 + assert len(result.clusters[0].entries) == 2 + + +class TestClusteringOutsideWindow: + def test_two_entries_60s_apart_two_clusters(self): + rt = TimelineReconstructor(cluster_window_seconds=30) + entries = [ + _make_entry(entry_id="e1", timestamp_iso=_ts(0)), + _make_entry(entry_id="e2", timestamp_iso=_ts(60)), + ] + result = rt.reconstruct(entries) + assert len(result.clusters) == 2 + second_cluster = result.clusters[1] + assert second_cluster.gap_before_seconds >= 60.0 + + def test_gap_count_correct_for_60s_gap(self): + rt = TimelineReconstructor(cluster_window_seconds=30) + entries = [ + _make_entry(entry_id="e1", timestamp_iso=_ts(0)), + _make_entry(entry_id="e2", timestamp_iso=_ts(60)), + ] + result = 
rt.reconstruct(entries) + assert result.gap_count == 1 + + +class TestBurst: + def test_15_entries_within_3s_is_burst(self): + rt = TimelineReconstructor( + cluster_window_seconds=30, + burst_threshold=10, + burst_window_seconds=5, + ) + # All 15 entries within a 3-second window — well under burst_window=5 + entries = [ + _make_entry(entry_id=f"e{i}", timestamp_iso=_ts(i % 3), sequence=i) + for i in range(15) + ] + result = rt.reconstruct(entries) + # All should land in one cluster + assert len(result.clusters) == 1 + assert result.clusters[0].burst is True + assert result.burst_count == 1 + + +class TestNullTimestamps: + def test_null_timestamp_joins_current_cluster(self): + rt = TimelineReconstructor(cluster_window_seconds=30) + entries = [ + _make_entry(entry_id="e1", timestamp_iso=_ts(0)), + _make_entry(entry_id="e2", timestamp_iso=None), + ] + # Should not raise, and null entry should join the existing cluster + result = rt.reconstruct(entries) + assert len(result.clusters) == 1 + assert "e2" in result.clusters[0].entries + + def test_null_timestamp_does_not_start_new_cluster(self): + rt = TimelineReconstructor(cluster_window_seconds=30) + entries = [ + _make_entry(entry_id="e1", timestamp_iso=_ts(0)), + _make_entry(entry_id="e2", timestamp_iso=None), + _make_entry(entry_id="e3", timestamp_iso=_ts(5)), + ] + result = rt.reconstruct(entries) + # e3 is within 30s of e1, so all three in one cluster + assert len(result.clusters) == 1 + + def test_all_null_timestamps_one_cluster_no_crash(self): + rt = TimelineReconstructor() + entries = [ + _make_entry(entry_id="e1", timestamp_iso=None), + _make_entry(entry_id="e2", timestamp_iso=None), + ] + result = rt.reconstruct(entries) + assert len(result.clusters) == 1 + cluster = result.clusters[0] + assert cluster.start_iso is None + assert cluster.end_iso is None + assert result.window_start is None + assert result.window_end is None + + +class TestDominantSources: + def 
test_dominant_sources_ordered_by_count_descending(self): + rt = TimelineReconstructor() + # src-b has 3 entries, src-a has 1 + entries = [ + _make_entry(entry_id="e1", source_id="src-a", timestamp_iso=_ts(0)), + _make_entry(entry_id="e2", source_id="src-b", timestamp_iso=_ts(1)), + _make_entry(entry_id="e3", source_id="src-b", timestamp_iso=_ts(2)), + _make_entry(entry_id="e4", source_id="src-b", timestamp_iso=_ts(3)), + ] + result = rt.reconstruct(entries) + assert result.dominant_sources[0] == "src-b" + assert result.dominant_sources[1] == "src-a" + + +class TestRepresentativeText: + def test_representative_text_uses_highest_rank(self): + rt = TimelineReconstructor() + entries = [ + _make_entry(entry_id="e1", timestamp_iso=_ts(0), rank=-5.0, text="low score"), + _make_entry(entry_id="e2", timestamp_iso=_ts(1), rank=-1.0, text="high score"), + ] + result = rt.reconstruct(entries) + assert result.clusters[0].representative_text == "high score" + + def test_representative_text_tiebreak_on_longest_text(self): + rt = TimelineReconstructor() + entries = [ + _make_entry(entry_id="e1", timestamp_iso=_ts(0), rank=0.0, text="short"), + _make_entry(entry_id="e2", timestamp_iso=_ts(1), rank=0.0, text="much longer text here"), + ] + result = rt.reconstruct(entries) + assert result.clusters[0].representative_text == "much longer text here" + + +class TestClusterId: + def test_cluster_id_is_12_char_hex(self): + rt = TimelineReconstructor() + entry = _make_entry(entry_id="abc123", timestamp_iso=_ts(0)) + result = rt.reconstruct([entry]) + cluster_id = result.clusters[0].cluster_id + assert len(cluster_id) == 12 + assert all(c in "0123456789abcdef" for c in cluster_id) + + +class TestSeverity: + def test_critical_wins_over_error(self): + rt = TimelineReconstructor() + entries = [ + _make_entry(entry_id="e1", timestamp_iso=_ts(0), severity="ERROR"), + _make_entry(entry_id="e2", timestamp_iso=_ts(1), severity="CRITICAL"), + _make_entry(entry_id="e3", timestamp_iso=_ts(2), 
severity="INFO"), + ] + result = rt.reconstruct(entries) + assert result.clusters[0].severity == "CRITICAL" + + +class TestPatternTags: + def test_pattern_tags_union_across_entries(self): + rt = TimelineReconstructor() + entries = [ + _make_entry(entry_id="e1", timestamp_iso=_ts(0), matched_patterns=["oom-killer"]), + _make_entry(entry_id="e2", timestamp_iso=_ts(1), matched_patterns=["disk-full"]), + ] + result = rt.reconstruct(entries) + tags = set(result.clusters[0].pattern_tags) + assert "oom-killer" in tags + assert "disk-full" in tags diff --git a/tests/test_ingest_dmesg.py b/tests/test_glean_dmesg.py similarity index 97% rename from tests/test_ingest_dmesg.py rename to tests/test_glean_dmesg.py index 9e64c09..ff4fdbe 100644 --- a/tests/test_ingest_dmesg.py +++ b/tests/test_glean_dmesg.py @@ -1,7 +1,7 @@ -"""Tests for the dmesg log ingestor.""" +"""Tests for the dmesg log gleaner.""" from __future__ import annotations -from app.ingest.dmesg_log import is_dmesg_log, parse +from app.glean.dmesg_log import is_dmesg_log, parse RELATIVE_SAMPLE = """\ [ 0.000000] Linux version 6.8.0-65-generic diff --git a/tests/test_glean_fingerprint.py b/tests/test_glean_fingerprint.py new file mode 100644 index 0000000..96aca23 --- /dev/null +++ b/tests/test_glean_fingerprint.py @@ -0,0 +1,236 @@ +"""Tests for fingerprint-based incremental glean skipping (issue #30). + +Verifies that _glean_files() (and its public wrappers) skip local files whose +mtime+size fingerprint has not changed since the last glean, and that force=True +bypasses that check. 
+""" +from __future__ import annotations + +import sqlite3 +import time +from pathlib import Path + +import pytest + +from app.glean.pipeline import ( + _fingerprint, + _fp_unchanged, + _save_fingerprint, + ensure_schema, + glean_dir, + glean_file, +) +from app.glean.base import now_iso + + +# ── Fixtures ────────────────────────────────────────────────────────────────── + +@pytest.fixture() +def db_path(tmp_path: Path) -> Path: + path = tmp_path / "test.db" + ensure_schema(path) + return path + + +@pytest.fixture() +def log_file(tmp_path: Path) -> Path: + """A minimal plaintext log file.""" + f = tmp_path / "test.log" + f.write_text("May 24 10:00:00 heimdall kernel: test message\n") + return f + + +# ── Unit: fingerprint helpers ────────────────────────────────────────────────── + +class TestFingerprintHelpers: + def test_fingerprint_returns_mtime_and_size(self, log_file: Path) -> None: + mtime, size = _fingerprint(log_file) + st = log_file.stat() + assert mtime == st.st_mtime + assert size == st.st_size + + def test_fp_unchanged_returns_false_when_no_record(self, db_path: Path, log_file: Path) -> None: + conn = sqlite3.connect(str(db_path)) + mtime, size = _fingerprint(log_file) + assert _fp_unchanged(conn, log_file, mtime, size) is False + conn.close() + + def test_fp_unchanged_returns_true_after_save(self, db_path: Path, log_file: Path) -> None: + conn = sqlite3.connect(str(db_path)) + mtime, size = _fingerprint(log_file) + _save_fingerprint(conn, log_file, mtime, size, now_iso()) + conn.commit() + assert _fp_unchanged(conn, log_file, mtime, size) is True + conn.close() + + def test_fp_unchanged_returns_false_on_size_change(self, db_path: Path, log_file: Path) -> None: + conn = sqlite3.connect(str(db_path)) + mtime, size = _fingerprint(log_file) + _save_fingerprint(conn, log_file, mtime, size, now_iso()) + conn.commit() + # Simulate size change (new content appended) + assert _fp_unchanged(conn, log_file, mtime, size + 1) is False + conn.close() + + def 
test_fp_unchanged_returns_false_on_mtime_change(self, db_path: Path, log_file: Path) -> None: + conn = sqlite3.connect(str(db_path)) + mtime, size = _fingerprint(log_file) + _save_fingerprint(conn, log_file, mtime, size, now_iso()) + conn.commit() + assert _fp_unchanged(conn, log_file, mtime + 1.0, size) is False + conn.close() + + def test_save_fingerprint_upserts(self, db_path: Path, log_file: Path) -> None: + """Second save with different values replaces the first (UPSERT semantics).""" + conn = sqlite3.connect(str(db_path)) + _save_fingerprint(conn, log_file, 1000.0, 100, "2026-01-01T00:00:00Z") + conn.commit() + _save_fingerprint(conn, log_file, 2000.0, 200, "2026-01-02T00:00:00Z") + conn.commit() + row = conn.execute( + "SELECT mtime, size FROM glean_fingerprints WHERE path = ?", + (str(log_file),), + ).fetchone() + assert row == (2000.0, 200) + conn.close() + + +# ── Integration: glean_file skipping ───────────────────────────────────────── + +class TestGleanFileFingerprint: + def test_first_glean_writes_fingerprint(self, db_path: Path, log_file: Path) -> None: + glean_file(log_file, db_path) + conn = sqlite3.connect(str(db_path)) + row = conn.execute( + "SELECT mtime, size FROM glean_fingerprints WHERE path = ?", + (str(log_file),), + ).fetchone() + conn.close() + assert row is not None + mtime, size = _fingerprint(log_file) + assert row == (mtime, size) + + def test_second_glean_skips_unchanged_file(self, db_path: Path, log_file: Path) -> None: + stats_first = glean_file(log_file, db_path) + count_first = sum(stats_first.values()) + + # Re-glean without touching the file — should produce 0 new entries. 
+ stats_second = glean_file(log_file, db_path) + count_second = sum(stats_second.values()) + + assert count_first >= 1, "First glean should find at least one entry" + assert count_second == 0, "Second glean should skip unchanged file" + + def test_second_glean_runs_when_file_grows(self, db_path: Path, log_file: Path) -> None: + glean_file(log_file, db_path) + + # Append a new line and update mtime by rewriting. + original = log_file.read_text() + log_file.write_text(original + "May 24 10:01:00 heimdall kernel: second message\n") + + stats_second = glean_file(log_file, db_path) + # INSERT OR IGNORE means the original entry won't re-count, but parsing + # does happen — at minimum the new line is processed. + assert sum(stats_second.values()) >= 0 # glean ran (not skipped) + + # Confirm fingerprint updated to new size. + conn = sqlite3.connect(str(db_path)) + row = conn.execute( + "SELECT size FROM glean_fingerprints WHERE path = ?", + (str(log_file),), + ).fetchone() + conn.close() + assert row is not None + assert row[0] == log_file.stat().st_size + + def test_force_bypasses_fingerprint(self, db_path: Path, log_file: Path) -> None: + glean_file(log_file, db_path) + + # Without force: skipped. + stats_no_force = glean_file(log_file, db_path) + assert sum(stats_no_force.values()) == 0 + + # With force: glean runs (INSERT OR IGNORE means count may be 0, but + # we verify the fingerprint was re-saved with a fresh gleaned_at). 
+ conn_before = sqlite3.connect(str(db_path)) + ts_before = conn_before.execute( + "SELECT gleaned_at FROM glean_fingerprints WHERE path = ?", + (str(log_file),), + ).fetchone()[0] + conn_before.close() + + time.sleep(0.01) # ensure gleaned_at advances + glean_file(log_file, db_path, force=True) + + conn_after = sqlite3.connect(str(db_path)) + ts_after = conn_after.execute( + "SELECT gleaned_at FROM glean_fingerprints WHERE path = ?", + (str(log_file),), + ).fetchone()[0] + conn_after.close() + + assert ts_after > ts_before, "force=True should update gleaned_at timestamp" + + +# ── Integration: glean_dir skipping ────────────────────────────────────────── + +class TestGleanDirFingerprint: + def test_glean_dir_skips_unchanged_on_second_run(self, db_path: Path, tmp_path: Path) -> None: + log1 = tmp_path / "a.log" + log2 = tmp_path / "b.log" + log1.write_text("May 24 10:00:00 heimdall kernel: msg one\n") + log2.write_text("May 24 10:00:00 heimdall kernel: msg two\n") + + glean_dir(tmp_path, db_path) + + stats_second = glean_dir(tmp_path, db_path) + assert sum(stats_second.values()) == 0, "Both unchanged files should be skipped" + + def test_glean_dir_force_reruns_all(self, db_path: Path, tmp_path: Path) -> None: + log1 = tmp_path / "a.log" + log1.write_text("May 24 10:00:00 heimdall kernel: msg one\n") + + glean_dir(tmp_path, db_path) + + # force=True: runs even though nothing changed; INSERT OR IGNORE keeps DB clean. 
+ conn_before = sqlite3.connect(str(db_path)) + ts_before = conn_before.execute( + "SELECT gleaned_at FROM glean_fingerprints WHERE path = ?", + (str(log1),), + ).fetchone()[0] + conn_before.close() + + time.sleep(0.01) + glean_dir(tmp_path, db_path, force=True) + + conn_after = sqlite3.connect(str(db_path)) + ts_after = conn_after.execute( + "SELECT gleaned_at FROM glean_fingerprints WHERE path = ?", + (str(log1),), + ).fetchone()[0] + conn_after.close() + + assert ts_after > ts_before + + +# ── Schema: ensure fingerprints table created ───────────────────────────────── + +class TestEnsureSchema: + def test_fingerprints_table_exists_after_ensure_schema(self, tmp_path: Path) -> None: + db = tmp_path / "fresh.db" + ensure_schema(db) + conn = sqlite3.connect(str(db)) + tables = { + row[0] + for row in conn.execute( + "SELECT name FROM sqlite_master WHERE type='table'" + ).fetchall() + } + conn.close() + assert "glean_fingerprints" in tables + + def test_ensure_schema_idempotent(self, tmp_path: Path) -> None: + """Calling ensure_schema twice on the same DB must not raise.""" + db = tmp_path / "fresh.db" + ensure_schema(db) + ensure_schema(db) # second call — should be a no-op diff --git a/tests/test_glean_pipeline_ssh.py b/tests/test_glean_pipeline_ssh.py new file mode 100644 index 0000000..e00683a --- /dev/null +++ b/tests/test_glean_pipeline_ssh.py @@ -0,0 +1,444 @@ +"""Tests for SSH source handling in app/glean/pipeline.py. 
+ +Verifies that glean_sources() correctly: +- Dispatches SSH sources to SSHTransport (local sources unchanged) +- Routes each glean-type to the right command builder + parser +- Writes parsed entries to SQLite +- Gracefully skips sources on SSHConnectionError or SSHCommandError +""" +from __future__ import annotations + +import json +import sqlite3 +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest +import yaml + +from app.glean.pipeline import glean_sources, ensure_schema +from app.glean.ssh import SSHConnectionError, SSHCommandError + + +# ── Shared fixtures ─────────────────────────────────────────────────────────── + +JOURNALD_LINE = json.dumps({ + "__REALTIME_TIMESTAMP": "1747000000000000", + "PRIORITY": "3", + "MESSAGE": "SSH brute force detected from 192.168.1.99", + "SYSLOG_IDENTIFIER": "sshd", + "_HOSTNAME": "rack01", +}) + "\n" + +SYSLOG_LINE = "May 20 22:00:00 rack01 sshd[1234]: Failed password for invalid user admin\n" + +PLAINTEXT_LINE = "2026-05-20 22:00:00 ERROR app crashed with exit code 1\n" + +DOCKER_LINE = "2026-05-20T22:00:00.000000000Z stderr F container startup failed\n" + + +def _ssh_sources_yaml(sources: list[dict]) -> str: + return yaml.dump({"sources": sources}) + + +def _mock_transport(lines: list[str] | None = None): + """Return a mock SSHTransport context manager whose exec_stream yields given lines.""" + mock_t = MagicMock() + mock_t.exec_stream.return_value = iter(lines or []) + return mock_t + + +def _patch_transport(mock_t): + """Patch SSHTransport in pipeline so __enter__ returns mock_t.""" + p = patch("app.glean.pipeline.SSHTransport") + MockClass = p.start() + MockClass.return_value.__enter__.return_value = mock_t + MockClass.return_value.__exit__.return_value = None + return p, MockClass + + +def _entry_count(db_path: Path) -> int: + conn = sqlite3.connect(db_path) + n = conn.execute("SELECT COUNT(*) FROM log_entries").fetchone()[0] + conn.close() + return n + + +# ── journald type 
───────────────────────────────────────────────────────────── + +class TestSSHJournaldGlean: + def test_journald_entries_written_to_db(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "journald"}], + }])) + + mock_t = _mock_transport([JOURNALD_LINE]) + p, MockClass = _patch_transport(mock_t) + try: + stats = glean_sources(sources_file, db_path) + finally: + p.stop() + + assert _entry_count(db_path) >= 1 + assert any("rack01" in k for k in stats) + + def test_journald_args_passed_to_command_builder(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "journald", "args": ["--since", "1 hour ago"]}], + }])) + + mock_t = _mock_transport([JOURNALD_LINE]) + p, _ = _patch_transport(mock_t) + try: + glean_sources(sources_file, db_path) + finally: + p.stop() + + # The command passed to exec_stream must contain the args + call_args = mock_t.exec_stream.call_args[0][0] + assert "--since" in call_args + assert "1 hour ago" in call_args + + def test_journald_unit_shorthand(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "journald", "unit": "sshd"}], + }])) + + mock_t = _mock_transport([]) + p, _ = _patch_transport(mock_t) + try: + glean_sources(sources_file, db_path) + finally: + p.stop() + + call_args = mock_t.exec_stream.call_args[0][0] + assert "sshd" in call_args + + +# ── syslog type 
─────────────────────────────────────────────────────────────── + +class TestSSHSyslogGlean: + def test_syslog_entries_written_to_db(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01-syslog", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "syslog", "path": "/var/log/syslog"}], + }])) + + mock_t = _mock_transport([SYSLOG_LINE]) + p, _ = _patch_transport(mock_t) + try: + stats = glean_sources(sources_file, db_path) + finally: + p.stop() + + assert _entry_count(db_path) >= 1 + + def test_syslog_command_contains_path(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "syslog", "path": "/var/log/auth.log"}], + }])) + + mock_t = _mock_transport([]) + p, _ = _patch_transport(mock_t) + try: + glean_sources(sources_file, db_path) + finally: + p.stop() + + call_args = mock_t.exec_stream.call_args[0][0] + assert "/var/log/auth.log" in call_args + + +# ── plaintext type ──────────────────────────────────────────────────────────── + +class TestSSHPlaintextGlean: + def test_plaintext_entries_written_to_db(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01-app", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "plaintext", "path": "/var/log/app/error.log"}], + }])) + + mock_t = _mock_transport([PLAINTEXT_LINE]) + p, _ = _patch_transport(mock_t) + try: + stats = glean_sources(sources_file, db_path) + finally: + p.stop() + + assert _entry_count(db_path) >= 1 + + def 
test_plaintext_command_contains_path(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "plaintext", "path": "/opt/myapp/app.log"}], + }])) + + mock_t = _mock_transport([]) + p, _ = _patch_transport(mock_t) + try: + glean_sources(sources_file, db_path) + finally: + p.stop() + + call_args = mock_t.exec_stream.call_args[0][0] + assert "/opt/myapp/app.log" in call_args + + +# ── docker type ─────────────────────────────────────────────────────────────── + +class TestSSHDockerGlean: + def test_docker_single_container_command_issued(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "docker", "containers": ["myapp"]}], + }])) + + mock_t = _mock_transport([DOCKER_LINE]) + p, _ = _patch_transport(mock_t) + try: + glean_sources(sources_file, db_path) + finally: + p.stop() + + call_args = mock_t.exec_stream.call_args[0][0] + assert "myapp" in call_args + + def test_docker_multiple_containers_exec_per_container(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "docker", "containers": ["app", "nginx"]}], + }])) + + mock_t = MagicMock() + mock_t.exec_stream.return_value = iter([]) + p, _ = _patch_transport(mock_t) + try: + glean_sources(sources_file, db_path) + finally: + p.stop() + + # One exec_stream call per container + assert mock_t.exec_stream.call_count == 2 + all_cmds = " ".join(c[0][0] for c 
in mock_t.exec_stream.call_args_list) + assert "app" in all_cmds + assert "nginx" in all_cmds + + +# ── error handling ──────────────────────────────────────────────────────────── + +class TestSSHGleanErrorHandling: + def test_connection_error_skips_source_returns_empty_stats(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "unreachable", + "transport": "ssh", + "host": "192.168.99.99", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "journald"}], + }])) + + with patch("app.glean.pipeline.SSHTransport") as MockClass: + MockClass.return_value.__enter__.side_effect = SSHConnectionError("no route") + MockClass.return_value.__exit__.return_value = None + stats = glean_sources(sources_file, db_path) + + assert _entry_count(db_path) == 0 + # Stats for the source should either be absent or zero + for v in stats.values(): + assert v == 0 + + def test_command_error_skips_item_continues_next(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + # Two glean items: first raises SSHCommandError, second yields a valid line + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [ + {"type": "journald"}, + {"type": "syslog", "path": "/var/log/syslog"}, + ], + }])) + + mock_t = MagicMock() + # side_effect list: exception instances are raised; other values are returned + mock_t.exec_stream.side_effect = [ + SSHCommandError("journalctl: command not found"), # raised on first call + iter([SYSLOG_LINE]), # returned on second call + ] + + p, _ = _patch_transport(mock_t) + try: + # Should not raise — bad item is skipped, good item is processed + stats = glean_sources(sources_file, db_path) + finally: + p.stop() + + # The syslog line should have been written + assert _entry_count(db_path) >= 1 + + def 
test_unknown_glean_type_skipped(self, tmp_path): + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "mqtt"}], # not a valid remote type + }])) + + mock_t = _mock_transport([]) + p, _ = _patch_transport(mock_t) + try: + stats = glean_sources(sources_file, db_path) # must not raise + finally: + p.stop() + + assert _entry_count(db_path) == 0 + + +# ── mixed local + SSH sources ───────────────────────────────────────────────── + +class TestMixedLocalAndSSH: + def test_local_and_ssh_both_processed(self, tmp_path): + # Local syslog file + local_log = tmp_path / "local.log" + local_log.write_text(SYSLOG_LINE) + + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([ + {"id": "local-syslog", "path": str(local_log)}, + { + "id": "remote01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [{"type": "syslog", "path": "/var/log/syslog"}], + }, + ])) + + mock_t = _mock_transport([SYSLOG_LINE]) + p, _ = _patch_transport(mock_t) + try: + stats = glean_sources(sources_file, db_path) + finally: + p.stop() + + # Both sources should have contributed entries + assert _entry_count(db_path) >= 2 + assert "local-syslog" in stats + assert any("remote01" in k for k in stats) + + def test_local_only_sources_never_calls_ssh(self, tmp_path): + local_log = tmp_path / "local.log" + local_log.write_text(SYSLOG_LINE) + + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([ + {"id": "local", "path": str(local_log)}, + ])) + + with patch("app.glean.pipeline.SSHTransport") as MockClass: + glean_sources(sources_file, db_path) + MockClass.assert_not_called() + + +# ── multiple glean items per SSH 
source ─────────────────────────────────────── + +class TestMultipleGleanItemsPerHost: + def test_one_connection_multiple_commands(self, tmp_path): + """One SSHTransport instance is shared across all glean items for a host.""" + sources_file = tmp_path / "sources.yaml" + db_path = tmp_path / "test.db" + sources_file.write_text(_ssh_sources_yaml([{ + "id": "rack01", + "transport": "ssh", + "host": "192.168.1.10", + "user": "admin", + "key_path": "~/.ssh/id_ed25519", + "glean": [ + {"type": "journald"}, + {"type": "syslog", "path": "/var/log/syslog"}, + {"type": "plaintext", "path": "/var/log/app.log"}, + ], + }])) + + mock_t = _mock_transport([]) + p, MockClass = _patch_transport(mock_t) + try: + glean_sources(sources_file, db_path) + finally: + p.stop() + + # SSHTransport() should be instantiated only once for the one host + MockClass.assert_called_once() + # exec_stream should be called once per glean item + assert mock_t.exec_stream.call_count == 3 diff --git a/tests/test_ingest_qbittorrent.py b/tests/test_glean_qbittorrent.py similarity index 98% rename from tests/test_ingest_qbittorrent.py rename to tests/test_glean_qbittorrent.py index 4b3c874..5c5d5bf 100644 --- a/tests/test_ingest_qbittorrent.py +++ b/tests/test_glean_qbittorrent.py @@ -1,9 +1,9 @@ -"""Tests for the qBittorrent log ingestor.""" +"""Tests for the qBittorrent log gleaner.""" from __future__ import annotations import pytest -from app.ingest.qbittorrent import is_qbit_log, parse +from app.glean.qbittorrent import is_qbit_log, parse # --------------------------------------------------------------------------- # Classic format sample (pre-5.x GUI builds) diff --git a/tests/test_glean_ssh.py b/tests/test_glean_ssh.py new file mode 100644 index 0000000..9a240ad --- /dev/null +++ b/tests/test_glean_ssh.py @@ -0,0 +1,185 @@ +"""Tests for SSH transport layer (app/glean/ssh.py). + +All SSH network I/O is mocked — no real SSH connection required. 
+""" +from __future__ import annotations + +import io +from unittest.mock import MagicMock, patch, call + +import pytest + +from app.glean.ssh import ( + SSHTransport, + SSHConnectionError, + SSHCommandError, + _build_journald_command, + _build_syslog_command, + _build_plaintext_command, + _build_docker_command, +) + + +# ── Command builders ────────────────────────────────────────────────────────── + +class TestBuildJournaldCommand: + def test_no_args_returns_base_command(self): + cmd = _build_journald_command({}) + assert "journalctl" in cmd + assert "-o json" in cmd + + def test_args_list_appended(self): + cmd = _build_journald_command({"args": ["--since", "2 hours ago", "--unit", "sshd"]}) + assert "--since" in cmd + assert "2 hours ago" in cmd + assert "--unit" in cmd + assert "sshd" in cmd + + def test_unit_shorthand(self): + cmd = _build_journald_command({"unit": "docker"}) + assert "--unit docker" in cmd or "--unit=docker" in cmd + + +class TestBuildSyslogCommand: + def test_returns_cat_command(self): + cmd = _build_syslog_command({"path": "/var/log/syslog"}) + assert "cat" in cmd + assert "/var/log/syslog" in cmd + + def test_default_path_when_omitted(self): + cmd = _build_syslog_command({}) + assert "cat" in cmd + assert "/var/log" in cmd + + +class TestBuildPlaintextCommand: + def test_cat_with_path(self): + cmd = _build_plaintext_command({"path": "/var/log/app/error.log"}) + assert "cat" in cmd + assert "/var/log/app/error.log" in cmd + + def test_raises_without_path(self): + with pytest.raises((ValueError, KeyError)): + _build_plaintext_command({}) + + +class TestBuildDockerCommand: + def test_single_container(self): + cmd = _build_docker_command({"containers": ["myapp"]}) + assert "myapp" in cmd + + def test_multiple_containers_returns_list(self): + cmds = _build_docker_command({"containers": ["app", "nginx"]}) + # Multiple containers → must produce a command per container OR joined + assert "app" in (cmds if isinstance(cmds, str) else " ".join(cmds)) 
+ assert "nginx" in (cmds if isinstance(cmds, str) else " ".join(cmds)) + + def test_raises_without_containers(self): + with pytest.raises((ValueError, KeyError)): + _build_docker_command({}) + + +# ── SSHTransport context manager ────────────────────────────────────────────── + +def _mock_ssh_client(stdout_lines: list[str] | None = None): + """Return a mock SSHClient whose exec_command yields the given lines.""" + client = MagicMock() + stdout = MagicMock() + stdout.__iter__ = MagicMock(return_value=iter(stdout_lines or [])) + stderr = MagicMock() + stderr.read.return_value = b"" + client.exec_command.return_value = (MagicMock(), stdout, stderr) + return client + + +class TestSSHTransportConnect: + def test_connects_with_key_path(self, tmp_path): + key_file = tmp_path / "id_ed25519" + key_file.write_bytes(b"fake-key") + with patch("app.glean.ssh.paramiko.SSHClient") as MockClient: + MockClient.return_value = _mock_ssh_client() + with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)): + pass + MockClient.return_value.connect.assert_called_once() + call_kwargs = MockClient.return_value.connect.call_args + assert call_kwargs.kwargs.get("hostname") == "10.0.0.1" or \ + call_kwargs.args[0] == "10.0.0.1" + + def test_disconnects_on_exit(self, tmp_path): + key_file = tmp_path / "id_ed25519" + key_file.write_bytes(b"fake-key") + with patch("app.glean.ssh.paramiko.SSHClient") as MockClient: + mock_client = _mock_ssh_client() + MockClient.return_value = mock_client + with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)): + pass + mock_client.close.assert_called_once() + + def test_disconnects_on_exception(self, tmp_path): + key_file = tmp_path / "id_ed25519" + key_file.write_bytes(b"fake-key") + with patch("app.glean.ssh.paramiko.SSHClient") as MockClient: + mock_client = _mock_ssh_client() + MockClient.return_value = mock_client + with pytest.raises(RuntimeError): + with SSHTransport(host="10.0.0.1", user="admin", 
key_path=str(key_file)): + raise RuntimeError("boom") + mock_client.close.assert_called_once() + + def test_raises_ssh_connection_error_on_auth_failure(self, tmp_path): + import paramiko + key_file = tmp_path / "id_ed25519" + key_file.write_bytes(b"fake-key") + with patch("app.glean.ssh.paramiko.SSHClient") as MockClient: + MockClient.return_value.connect.side_effect = paramiko.AuthenticationException("denied") + with pytest.raises(SSHConnectionError, match="auth"): + with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)): + pass + + def test_raises_ssh_connection_error_on_no_route(self, tmp_path): + import paramiko + key_file = tmp_path / "id_ed25519" + key_file.write_bytes(b"fake-key") + with patch("app.glean.ssh.paramiko.SSHClient") as MockClient: + MockClient.return_value.connect.side_effect = paramiko.SSHException("no route") + with pytest.raises(SSHConnectionError): + with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)): + pass + + +class TestSSHTransportExecStream: + def test_yields_stdout_lines(self, tmp_path): + key_file = tmp_path / "id_ed25519" + key_file.write_bytes(b"fake-key") + lines = ["line one\n", "line two\n", "line three\n"] + with patch("app.glean.ssh.paramiko.SSHClient") as MockClient: + MockClient.return_value = _mock_ssh_client(lines) + with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)) as t: + result = list(t.exec_stream("echo hello")) + assert result == lines + + def test_raises_ssh_command_error_on_nonzero_exit(self, tmp_path): + key_file = tmp_path / "id_ed25519" + key_file.write_bytes(b"fake-key") + with patch("app.glean.ssh.paramiko.SSHClient") as MockClient: + mock_client = _mock_ssh_client([]) + # Simulate non-zero exit code + channel = MagicMock() + channel.recv_exit_status.return_value = 1 + mock_client.exec_command.return_value[1].channel = channel + mock_client.exec_command.return_value[2].read.return_value = b"command not found" + MockClient.return_value = mock_client + 
with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)) as t: + with pytest.raises(SSHCommandError, match="command not found"): + list(t.exec_stream("notacommand")) + + def test_strips_trailing_newlines(self, tmp_path): + key_file = tmp_path / "id_ed25519" + key_file.write_bytes(b"fake-key") + lines = [" line with spaces \n"] + with patch("app.glean.ssh.paramiko.SSHClient") as MockClient: + MockClient.return_value = _mock_ssh_client(lines) + with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)) as t: + # exec_stream should yield the raw lines; stripping is parser's job + result = list(t.exec_stream("echo hello")) + assert len(result) == 1 diff --git a/tests/test_ingest_syslog.py b/tests/test_glean_syslog.py similarity index 96% rename from tests/test_ingest_syslog.py rename to tests/test_glean_syslog.py index cde2d43..b7e0846 100644 --- a/tests/test_ingest_syslog.py +++ b/tests/test_glean_syslog.py @@ -1,7 +1,7 @@ -"""Tests for the syslog (RFC 3164) ingestor.""" +"""Tests for the syslog (RFC 3164) gleaner.""" from __future__ import annotations -from app.ingest.syslog import is_syslog, parse +from app.glean.syslog import is_syslog, parse SYSLOG_SAMPLE = """\ May 11 14:23:01 xanderland sshd[1234]: Accepted publickey for x from 192.168.1.1 port 54321 ssh2 diff --git a/tests/test_ingest_tautulli.py b/tests/test_glean_tautulli.py similarity index 96% rename from tests/test_ingest_tautulli.py rename to tests/test_glean_tautulli.py index a3820f8..4b12b08 100644 --- a/tests/test_ingest_tautulli.py +++ b/tests/test_glean_tautulli.py @@ -1,10 +1,10 @@ -"""Tests for the Tautulli webhook ingestor.""" +"""Tests for the Tautulli webhook gleaner.""" from __future__ import annotations import pytest from unittest.mock import patch -from app.ingest.tautulli import is_tautulli_payload, parse_webhook +from app.glean.tautulli import is_tautulli_payload, parse_webhook # --------------------------------------------------------------------------- @@ 
-253,7 +253,7 @@ class TestEndpoint: @pytest.fixture def client(self, tmp_path): from fastapi.testclient import TestClient - from app.ingest.pipeline import ensure_schema + from app.glean.pipeline import ensure_schema import app.rest as rest_module db = tmp_path / "test.db" @@ -267,14 +267,14 @@ class TestEndpoint: def test_missing_action_returns_400(self, client): resp = client.post( - "/turnstone/api/ingest/tautulli", + "/turnstone/api/glean/tautulli", json={"session_key": "x"}, ) assert resp.status_code == 400 def test_wrong_token_returns_403(self, tmp_path): from fastapi.testclient import TestClient - from app.ingest.pipeline import ensure_schema + from app.glean.pipeline import ensure_schema import app.rest as rest_module db = tmp_path / "test.db" @@ -288,7 +288,7 @@ class TestEndpoint: patch.object(rest_module, "_compiled_patterns", []): with TestClient(rest_module.app, raise_server_exceptions=True) as c: resp = c.post( - "/turnstone/api/ingest/tautulli", + "/turnstone/api/glean/tautulli", json=_ERROR_PAYLOAD, headers={"X-Tautulli-Token": "wrong"}, ) @@ -296,7 +296,7 @@ class TestEndpoint: def test_valid_payload_returns_200(self, client): resp = client.post( - "/turnstone/api/ingest/tautulli", + "/turnstone/api/glean/tautulli", json=_ERROR_PAYLOAD, ) assert resp.status_code == 200 diff --git a/tests/test_ingest_wazuh.py b/tests/test_glean_wazuh.py similarity index 96% rename from tests/test_ingest_wazuh.py rename to tests/test_glean_wazuh.py index d218ebd..98c0458 100644 --- a/tests/test_ingest_wazuh.py +++ b/tests/test_glean_wazuh.py @@ -1,11 +1,11 @@ -"""Tests for the Wazuh alert ingestor.""" +"""Tests for the Wazuh alert gleaner.""" from __future__ import annotations import json from datetime import datetime -from app.ingest.wazuh import is_wazuh_alert, parse -from app.ingest.pipeline import _detect_format +from app.glean.wazuh import is_wazuh_alert, parse +from app.glean.pipeline import _detect_format _ALERT = { "timestamp": 
"2024-01-15T10:23:45.123+0000", diff --git a/tests/test_service_blocklist.py b/tests/test_service_blocklist.py index 893a076..f64d6e6 100644 --- a/tests/test_service_blocklist.py +++ b/tests/test_service_blocklist.py @@ -8,7 +8,7 @@ from pathlib import Path class TestSchema: def test_blocklist_candidates_table_exists(self, tmp_path): - from app.ingest.pipeline import ensure_schema + from app.glean.pipeline import ensure_schema db = tmp_path / "test.db" ensure_schema(db) conn = sqlite3.connect(str(db)) @@ -16,7 +16,7 @@ class TestSchema: assert "blocklist_candidates" in tables def test_blocklist_candidates_columns(self, tmp_path): - from app.ingest.pipeline import ensure_schema + from app.glean.pipeline import ensure_schema db = tmp_path / "test.db" ensure_schema(db) conn = sqlite3.connect(str(db)) @@ -28,7 +28,7 @@ class TestSchema: } def test_status_default_is_pending(self, tmp_path): - from app.ingest.pipeline import ensure_schema + from app.glean.pipeline import ensure_schema import uuid db = tmp_path / "test.db" ensure_schema(db) @@ -89,7 +89,7 @@ class TestTelemetry: class TestExtraction: @pytest.fixture def db(self, tmp_path): - from app.ingest.pipeline import ensure_schema + from app.glean.pipeline import ensure_schema p = tmp_path / "test.db" ensure_schema(p) return p @@ -195,7 +195,7 @@ class TestExtraction: class TestCandidateManagement: @pytest.fixture def db_with_candidate(self, tmp_path): - from app.ingest.pipeline import ensure_schema + from app.glean.pipeline import ensure_schema import sqlite3, uuid db = tmp_path / "test.db" ensure_schema(db) diff --git a/tests/test_services_diagnose.py b/tests/test_services_diagnose.py index 2cb0e01..e50debb 100644 --- a/tests/test_services_diagnose.py +++ b/tests/test_services_diagnose.py @@ -54,7 +54,7 @@ def test_keywords_cleaned_of_extra_spaces(): def test_diagnose_with_explicit_window_sets_time_detected(tmp_path): - from app.ingest.pipeline import ensure_schema + from app.glean.pipeline import ensure_schema db 
= tmp_path / "test.db" ensure_schema(db) result = diagnose(db, query="plex", since="2026-05-11T14:00:00+00:00", until="2026-05-11T15:00:00+00:00") diff --git a/web/src/components/QuickCapture.vue b/web/src/components/QuickCapture.vue index 1d71b14..0d40398 100644 --- a/web/src/components/QuickCapture.vue +++ b/web/src/components/QuickCapture.vue @@ -104,7 +104,7 @@

No {{ severityFilter }} entries in this result set.

diff --git a/web/src/views/DashboardView.vue b/web/src/views/DashboardView.vue index 8ca5720..98a9c4f 100644 --- a/web/src/views/DashboardView.vue +++ b/web/src/views/DashboardView.vue @@ -10,7 +10,7 @@ class="w-2 h-2 rounded-full flex-shrink-0" > - {{ watchActive ? `Live — ${watchSources.length} source${watchSources.length !== 1 ? 's' : ''} watched` : 'Manual ingest mode' }} + {{ watchActive ? `Live — ${watchSources.length} source${watchSources.length !== 1 ? 's' : ''} watched` : 'Manual glean mode' }} @@ -20,8 +20,8 @@ class="flex items-center gap-2 rounded border border-surface-border bg-surface-raised px-4 py-2.5 text-xs text-text-dim" > - Live watch active — last event: {{ shortTs(stats.last_ingested) }}. Waiting for new entries to arrive. - Last ingested: {{ shortTs(stats.last_ingested) }} — 24h counts reflect this window, not today. + Live watch active — last event: {{ shortTs(stats.last_gleaned) }}. Waiting for new entries to arrive. + Last gleaned: {{ shortTs(stats.last_gleaned) }} — 24h counts reflect this window, not today. 
@@ -171,7 +171,7 @@ interface StatsResponse { criticals_24h: number errors_24h: number suppressed_criticals: number - last_ingested: string | null + last_gleaned: string | null source_health: SourceHealth[] recent_criticals: Array<{ entry_id: string @@ -186,7 +186,7 @@ interface WatchSourceStatus { source_id: string type: string running: boolean - entries_ingested: number + entries_gleaned: number last_event: string | null error: string | null } @@ -211,8 +211,8 @@ const watchActive = computed(() => ) const isStale = computed(() => { - if (!stats.value?.last_ingested) return false - const age = Date.now() - new Date(stats.value.last_ingested).getTime() + if (!stats.value?.last_gleaned) return false + const age = Date.now() - new Date(stats.value.last_gleaned).getTime() return age > 25 * 60 * 60 * 1000 // older than 25h }) diff --git a/web/src/views/LogSearchView.vue b/web/src/views/LogSearchView.vue index 1f03371..f7ea3b2 100644 --- a/web/src/views/LogSearchView.vue +++ b/web/src/views/LogSearchView.vue @@ -106,7 +106,7 @@

No results for "{{ store.query }}"

-

Try broader terms or check the Sources tab to confirm data is ingested.

+

Try broader terms or check the Sources tab to confirm data is gleaned.

diff --git a/web/src/views/SourcesView.vue b/web/src/views/SourcesView.vue index 81029ef..599fce9 100644 --- a/web/src/views/SourcesView.vue +++ b/web/src/views/SourcesView.vue @@ -3,7 +3,7 @@

Log Sources

-

All hosts and services in the ingested corpus.

+

All hosts and services in the gleaned corpus.