Merge pull request 'feat(diagnose): 5-stage multi-agent diagnose pipeline (#29)' (#39) from feat/29-multi-agent-diagnose into main

This commit is contained in:
pyr0ball 2026-05-25 19:59:34 -07:00
commit f302f27350
76 changed files with 6640 additions and 635 deletions

View file

@ -23,6 +23,6 @@
# Remote endpoint to push diagnostic bundles for escalation.
# TURNSTONE_BUNDLE_ENDPOINT=https://example.com/api/bundles
# --- Periodic batch ingest ---
# Seconds between automatic ingest runs from sources.yaml. Set to 0 to disable.
# TURNSTONE_INGEST_INTERVAL=900
# --- Periodic batch glean ---
# Seconds between automatic glean runs from sources.yaml. Set to 0 to disable.
# TURNSTONE_GLEAN_INTERVAL=900

View file

@ -28,8 +28,8 @@ Service logs (journald, Docker, syslog, Caddy, Plex, arr stack, qBittorrent, dme
## Features
- **Multi-source ingest** — journald, Docker, syslog, Caddy, dmesg, Plex, Servarr (arr stack), qBittorrent, plaintext; paths configured in `patterns/sources.yaml`
- **Pattern tagging** — named regex patterns applied at ingest time (`service_restart`, `auth_failure`, `oom`, `segfault`, `disk_full`, `timeout`, …); extend in `patterns/default.yaml`
- **Multi-source glean** — journald, Docker, syslog, Caddy, dmesg, Plex, Servarr (arr stack), qBittorrent, plaintext; paths configured in `patterns/sources.yaml`
- **Pattern tagging** — named regex patterns applied at glean time (`service_restart`, `auth_failure`, `oom`, `segfault`, `disk_full`, `timeout`, …); extend in `patterns/default.yaml`
- **Full-text search** — SQLite FTS5 index across all ingested entries; filter by source, severity, time window
- **Natural-language time queries** — "what happened yesterday morning", "show me errors from the last 3 hours"; powered by dateparser
- **Incident management** — create, label, and track incidents; attach supporting log entries
@ -101,13 +101,13 @@ sources:
path: /var/log/caddy/access.log
```
For `journald` sources, run `scripts/export_journal.sh` on the host before each ingest (e.g. via cron). Missing paths are skipped with a warning — safe to leave entries for services that are temporarily down.
For `journald` sources, run `scripts/export_journal.sh` on the host before each glean (e.g. via cron). Missing paths are skipped with a warning — safe to leave entries for services that are temporarily down.
---
## Pattern library
Named patterns in `patterns/default.yaml` are matched against every log entry at ingest time. Matched pattern names are stored and used to boost search relevance for diagnostic queries.
Named patterns in `patterns/default.yaml` are matched against every log entry at glean time. Matched pattern names are stored and used to boost search relevance for diagnostic queries.
```yaml
patterns:
@ -157,7 +157,7 @@ Copy `.env.example` to `.env` (or pass as `-e` flags to Docker/Podman). All vari
| `TURNSTONE_PATTERNS` | `./patterns` | Pattern directory (default.yaml, sources.yaml, watch.yaml). |
| `TURNSTONE_SOURCE_HOST` | `unknown` | Host identifier stamped on ingested entries. |
| `TURNSTONE_BUNDLE_ENDPOINT` | — | Remote URL to push diagnostic bundles for escalation. |
| `TURNSTONE_INGEST_INTERVAL` | `900` | Seconds between automatic batch ingest runs. Set to `0` to disable. |
| `TURNSTONE_GLEAN_INTERVAL` | `900` | Seconds between automatic batch glean runs. Set to `0` to disable. |
---

View file

@ -1,64 +1,81 @@
"""Ollama embedding client with sqlite-vec storage — BSL licensed."""
"""Context chunk embedding — BSL licensed.
Thin wrapper around app.services.embeddings that handles the DB I/O for
context_chunks. All backend configuration (model, device, backend type) is
delegated to the service layer via TURNSTONE_EMBED_* env vars.
Re-exports EMBEDDING_AVAILABLE so callers that imported it from here continue
to work without changes.
"""
from __future__ import annotations
import logging
import sqlite3
import struct
from pathlib import Path
import httpx
from app.services.embeddings import (
EMBEDDING_AVAILABLE, # re-export for backward compat
get_embedder,
pack_vector,
)
__all__ = ["EMBEDDING_AVAILABLE", "embed_chunks"]
logger = logging.getLogger(__name__)
EMBEDDING_AVAILABLE: bool = False
try:
import sqlite_vec # type: ignore[import] # noqa: F401
EMBEDDING_AVAILABLE = True
logger.debug("sqlite-vec loaded — embedding pipeline enabled")
except ImportError:
logger.debug("sqlite-vec not available — embedding pipeline disabled")
def embed_chunks(
db_path: Path,
document_id: str,
llm_url: str,
model: str = "nomic-embed-text",
# Legacy params kept for backward compat — ignored when the ST backend is active.
llm_url: str = "",
model: str = "",
timeout: float = 60.0,
) -> int:
"""Embed all unembedded chunks for a document. Returns count embedded. No-op when EMBEDDING_AVAILABLE is False."""
if not EMBEDDING_AVAILABLE:
"""Embed all un-embedded chunks for *document_id*.
Uses the configured embedder (sentence-transformers by default; Ollama when
TURNSTONE_EMBED_BACKEND=ollama). Returns the count of newly embedded chunks.
Returns 0 silently when no embedder is available.
The legacy ``llm_url`` and ``model`` parameters are accepted but ignored when
the sentence-transformers backend is active — configure via env vars instead.
"""
embedder = get_embedder()
if embedder is None:
return 0
conn = sqlite3.connect(str(db_path))
conn.execute("PRAGMA journal_mode=WAL")
conn.row_factory = sqlite3.Row
rows = conn.execute(
"SELECT id, text FROM context_chunks WHERE document_id = ? AND embedding IS NULL",
(document_id,),
).fetchall()
if not rows:
conn.close()
return 0
texts = [r["text"] for r in rows]
ids = [r["id"] for r in rows]
count = 0
for row in rows:
try:
resp = httpx.post(
f"{llm_url.rstrip('/')}/api/embeddings",
json={"model": model, "prompt": row["text"]},
timeout=timeout,
)
resp.raise_for_status()
vector: list[float] = resp.json().get("embedding") or []
if vector:
blob = struct.pack(f"{len(vector)}f", *vector)
vectors = embedder.embed_batch(texts)
for chunk_id, vec in zip(ids, vectors):
blob = pack_vector(vec)
conn.execute(
"UPDATE context_chunks SET embedding = ? WHERE id = ?",
(blob, row["id"]),
(blob, chunk_id),
)
count += 1
except Exception as exc:
logger.warning("Embedding chunk %s failed: %s", row["id"], exc)
conn.commit()
except Exception as exc:
logger.warning("Batch embedding failed for document %s: %s", document_id, exc)
finally:
conn.close()
logger.debug("Embedded %d chunk(s) for document %s", count, document_id)
return count

View file

@ -1,10 +1,30 @@
"""Context retrieval — structured keyword lookup (Free) + chunk search — MIT licensed."""
"""Context retrieval — structured keyword lookup (Free) + chunk search — MIT licensed.
Two retrieval modes for context_chunks:
Vector search — cosine similarity over stored embeddings (when available)
Keyword search — LIKE-based fallback when no embedder is configured
Both modes are called from retrieve_context(); the best available mode is used
automatically so callers need not check EMBEDDING_AVAILABLE themselves.
"""
from __future__ import annotations
import logging
import sqlite3
from dataclasses import dataclass, field
from pathlib import Path
import numpy as np
from app.services.embeddings import (
EMBEDDING_AVAILABLE,
cosine_similarity,
get_embedder,
unpack_vector,
)
logger = logging.getLogger(__name__)
@dataclass
class RetrievedContext:
@ -12,6 +32,8 @@ class RetrievedContext:
chunks: list[dict[str, str]] = field(default_factory=list)
# ── Structured fact retrieval (always runs) ───────────────────────────────────
def get_relevant_facts(db_path: Path, query: str) -> list[dict[str, str]]:
"""Keyword match against context_facts. Always runs — Free tier."""
try:
@ -42,8 +64,68 @@ def get_relevant_facts(db_path: Path, query: str) -> list[dict[str, str]]:
return []
def _search_chunks(db_path: Path, query: str) -> list[dict[str, str]]:
"""Keyword search across context_chunks. Fallback when no embeddings."""
# ── Chunk retrieval: vector path ──────────────────────────────────────────────
def _search_chunks_vector(
    db_path: Path,
    query: str,
    top_k: int = 3,
) -> list[dict[str, str]]:
    """Cosine similarity search over embedded context_chunks.

    Loads all stored embeddings into memory and scores them in-process.
    Skips any chunk whose BLOB size does not match the current model
    dimension (stale embeddings from a previous model — they will be
    re-embedded on the next document upload).

    Args:
        db_path: SQLite database holding ``context_chunks`` and
            ``context_documents``.
        query: Free-text query to embed and compare against stored chunks.
        top_k: Maximum number of results; values <= 0 yield an empty list.

    Returns:
        At most *top_k* ``{"text", "filename"}`` dicts ordered by similarity
        descending. Empty when no embedder is configured, when embedding the
        query fails, or when the database query raises
        ``sqlite3.OperationalError``.
    """
    # A negative top_k would otherwise slice "all but the last N" below.
    if top_k <= 0:
        return []

    embedder = get_embedder()
    if embedder is None:
        return []

    try:
        query_vec: np.ndarray = embedder.embed(query)
        model_dim: int = embedder.dim
    except Exception as exc:
        logger.warning("Query embedding failed: %s", exc)
        return []

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.row_factory = sqlite3.Row
            rows = conn.execute(
                "SELECT cc.id, cc.text, cc.embedding, cd.filename"
                " FROM context_chunks cc"
                " JOIN context_documents cd ON cc.document_id = cd.id"
                " WHERE cc.embedding IS NOT NULL"
            ).fetchall()
        finally:
            # Close even when the query raises so the handle is not leaked.
            conn.close()
    except sqlite3.OperationalError:
        return []

    scored: list[tuple[float, dict[str, str]]] = []
    for row in rows:
        blob: bytes = row["embedding"]
        # Guard against blobs from a different-dimension model. The check
        # assumes 4 bytes per component, as the original `// 4` guard did;
        # an exact match also rejects blobs with stray trailing bytes.
        if len(blob) != model_dim * 4:
            continue
        try:
            chunk_vec = unpack_vector(blob)
            score = cosine_similarity(query_vec, chunk_vec)
            scored.append((score, {"text": row["text"], "filename": row["filename"]}))
        except Exception:
            # One undecodable chunk must not sink the whole search.
            continue

    scored.sort(key=lambda t: t[0], reverse=True)
    return [item for _, item in scored[:top_k]]
# ── Chunk retrieval: keyword fallback ─────────────────────────────────────────
def _search_chunks_keyword(db_path: Path, query: str) -> list[dict[str, str]]:
"""LIKE-based keyword search across context_chunks. Fallback when no embedder."""
try:
conn = sqlite3.connect(str(db_path))
conn.execute("PRAGMA journal_mode=WAL")
@ -66,16 +148,29 @@ def _search_chunks(db_path: Path, query: str) -> list[dict[str, str]]:
return []
# ── Public interface ──────────────────────────────────────────────────────────
def retrieve_context(db_path: Path, query: str) -> RetrievedContext:
"""Retrieve structured facts and relevant chunks for a query."""
return RetrievedContext(
facts=get_relevant_facts(db_path, query),
chunks=_search_chunks(db_path, query),
)
"""Retrieve structured facts and relevant chunks for a query.
Chunk retrieval uses vector search when an embedder is available and at
least one embedded chunk exists; falls back to keyword search otherwise.
"""
facts = get_relevant_facts(db_path, query)
if EMBEDDING_AVAILABLE:
chunks = _search_chunks_vector(db_path, query)
if not chunks:
# Vector search returned nothing (no embedded chunks yet) — fall back.
chunks = _search_chunks_keyword(db_path, query)
else:
chunks = _search_chunks_keyword(db_path, query)
return RetrievedContext(facts=facts, chunks=chunks)
def format_context_block(ctx: RetrievedContext) -> str | None:
"""Format context for injection into LLM prompt. Returns None when empty."""
"""Format context for injection into an LLM prompt. Returns None when empty."""
lines: list[str] = []
if ctx.facts:
lines.append("Known environment facts:")

View file

@ -4,7 +4,7 @@ from __future__ import annotations
import json
from typing import Iterator
from app.ingest.base import (
from app.glean.base import (
SourceState, apply_patterns, epoch_float_to_iso,
make_entry_id, now_iso,
)

View file

@ -18,7 +18,7 @@ import re
from datetime import datetime, timezone
from typing import Iterator
from app.ingest.base import (
from app.glean.base import (
SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry

View file

@ -10,7 +10,7 @@ from app.context.chunker import process_upload
from app.context.store import add_document, add_fact
def ingest_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]:
def glean_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]:
"""Process an uploaded file and write to context store. Returns result summary."""
doc_type, facts, chunks = process_upload(filename, content)

View file

@ -4,7 +4,7 @@ from __future__ import annotations
import json
from typing import Iterator
from app.ingest.base import (
from app.glean.base import (
SourceState, apply_patterns, detect_severity,
make_entry_id, now_iso,
)

View file

@ -4,7 +4,7 @@ from __future__ import annotations
import json
from typing import Iterator
from app.ingest.base import (
from app.glean.base import (
SourceState, apply_patterns, epoch_micros_to_iso,
make_entry_id, now_iso, SYSLOG_PRIORITY,
)

View file

@ -1,10 +1,10 @@
"""Live MQTT ingest subscriber for Turnstone.
"""Live MQTT glean subscriber for Turnstone.
Reads ``type: mqtt`` entries from sources.yaml and subscribes to each broker
in the background. Incoming messages are normalized to RetrievedEntry and
written to the Turnstone SQLite database as they arrive.
This runs as an asyncio task alongside the batch ingest scheduler. It is
This runs as an asyncio task alongside the batch glean scheduler. It is
started from the FastAPI lifespan in rest.py.
MQTT source config format in sources.yaml::

616
app/glean/pipeline.py Normal file
View file

@ -0,0 +1,616 @@
"""Glean pipeline: auto-detect format, parse, write to SQLite."""
from __future__ import annotations
import json
import logging
import re
import sqlite3
from pathlib import Path
from typing import Iterator
import yaml
from app.glean import caddy, dmesg_log, docker_log, journald, plaintext, plex, qbittorrent, servarr, syslog, wazuh
from app.glean.base import _compile, load_patterns, now_iso
from app.glean.ssh import (
SSHTransport,
SSHConnectionError,
SSHCommandError,
_build_docker_command,
_build_journald_command,
_build_plaintext_command,
_build_syslog_command,
)
from app.services.models import LogPattern, RetrievedEntry
from app.services.search import build_fts_index
logger = logging.getLogger(__name__)
# DDL for every table and index used by Turnstone's SQLite database.
# The script is run through executescript() by ensure_schema() and by each
# glean entry point in this module, so every statement uses IF NOT EXISTS and
# is safe to re-run against an existing database.
_SCHEMA = """
CREATE TABLE IF NOT EXISTS log_entries (
id TEXT PRIMARY KEY,
source_id TEXT NOT NULL,
sequence INTEGER NOT NULL,
timestamp_raw TEXT,
timestamp_iso TEXT,
ingest_time TEXT NOT NULL,
severity TEXT,
repeat_count INTEGER DEFAULT 1,
out_of_order INTEGER DEFAULT 0,
matched_patterns TEXT DEFAULT '[]',
text TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_source ON log_entries(source_id);
CREATE INDEX IF NOT EXISTS idx_timestamp ON log_entries(timestamp_iso);
CREATE INDEX IF NOT EXISTS idx_ts_repeat ON log_entries(timestamp_iso, repeat_count);
CREATE INDEX IF NOT EXISTS idx_severity ON log_entries(severity);
CREATE INDEX IF NOT EXISTS idx_patterns ON log_entries(matched_patterns);
CREATE TABLE IF NOT EXISTS incidents (
id TEXT PRIMARY KEY,
label TEXT NOT NULL,
issue_type TEXT NOT NULL DEFAULT '',
started_at TEXT,
ended_at TEXT,
notes TEXT NOT NULL DEFAULT '',
created_at TEXT NOT NULL,
severity TEXT NOT NULL DEFAULT 'medium'
);
CREATE INDEX IF NOT EXISTS idx_incidents_time ON incidents(started_at, ended_at);
CREATE TABLE IF NOT EXISTS received_bundles (
id TEXT PRIMARY KEY,
source_host TEXT NOT NULL,
issue_type TEXT NOT NULL DEFAULT '',
label TEXT NOT NULL,
severity TEXT NOT NULL DEFAULT 'medium',
started_at TEXT,
bundled_at TEXT NOT NULL,
entry_count INTEGER NOT NULL DEFAULT 0,
bundle_json TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at);
CREATE INDEX IF NOT EXISTS idx_bundles_type ON received_bundles(issue_type);
CREATE TABLE IF NOT EXISTS context_facts (
id TEXT PRIMARY KEY,
category TEXT NOT NULL,
key TEXT NOT NULL,
value TEXT NOT NULL,
source TEXT,
created_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category);
CREATE INDEX IF NOT EXISTS idx_facts_key ON context_facts(key);
CREATE TABLE IF NOT EXISTS context_documents (
id TEXT PRIMARY KEY,
filename TEXT NOT NULL,
doc_type TEXT NOT NULL,
full_text TEXT NOT NULL,
file_size INTEGER,
uploaded_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS context_chunks (
id TEXT PRIMARY KEY,
document_id TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
text TEXT NOT NULL,
embedding BLOB
);
CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id);
CREATE TABLE IF NOT EXISTS blocklist_candidates (
id TEXT PRIMARY KEY,
domain_or_ip TEXT NOT NULL,
source_device_ip TEXT,
source_device_name TEXT,
first_seen TEXT NOT NULL,
last_seen TEXT NOT NULL,
hit_count INTEGER DEFAULT 1,
status TEXT DEFAULT 'pending',
pushed_at TEXT,
log_evidence TEXT DEFAULT '[]',
matched_rule TEXT,
llm_score REAL,
llm_reason TEXT
);
CREATE INDEX IF NOT EXISTS idx_blocklist_device ON blocklist_candidates(source_device_ip);
CREATE INDEX IF NOT EXISTS idx_blocklist_status ON blocklist_candidates(status);
CREATE INDEX IF NOT EXISTS idx_blocklist_domain ON blocklist_candidates(domain_or_ip);
CREATE TABLE IF NOT EXISTS glean_fingerprints (
path TEXT PRIMARY KEY,
mtime REAL NOT NULL,
size INTEGER NOT NULL,
gleaned_at TEXT NOT NULL
);
"""
def ensure_schema(db_path: Path) -> None:
    """Create all tables and apply additive column migrations.

    Safe to call on every startup: the schema script only uses
    ``CREATE ... IF NOT EXISTS`` and a migration whose column already exists
    is skipped.

    Args:
        db_path: Location of the SQLite database file.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        # Additive column migrations. SQLite has no "ADD COLUMN IF NOT EXISTS",
        # so re-running one raises OperationalError("duplicate column name: ...").
        for stmt in [
            "ALTER TABLE incidents ADD COLUMN issue_type TEXT NOT NULL DEFAULT ''",
        ]:
            try:
                conn.execute(stmt)
            except sqlite3.OperationalError as exc:
                # Only the "already applied" case is expected; surface anything
                # else (locked database, missing table) instead of hiding it.
                if "duplicate column name" not in str(exc).lower():
                    logger.warning("Schema migration failed (%s): %s", stmt, exc)
        conn.commit()
    finally:
        # Release the handle even when schema creation fails.
        conn.close()
def _fingerprint(path: Path) -> tuple[float, int]:
"""Return (mtime, size) for a file — cheap identity check, no content read needed."""
st = path.stat()
return st.st_mtime, st.st_size
def _fp_unchanged(conn: sqlite3.Connection, path: Path, mtime: float, size: int) -> bool:
"""Return True only when the stored fingerprint exactly matches (mtime, size).
A smaller size (log rotation) or a larger size (new lines appended) both
return False so the caller re-gleams the file.
"""
row = conn.execute(
"SELECT mtime, size FROM glean_fingerprints WHERE path = ?",
(str(path),),
).fetchone()
if row is None:
return False
return row[0] == mtime and row[1] == size
def _save_fingerprint(
conn: sqlite3.Connection,
path: Path,
mtime: float,
size: int,
gleaned_at: str,
) -> None:
"""Upsert the fingerprint for *path* after a successful glean."""
conn.execute(
"""
INSERT OR REPLACE INTO glean_fingerprints (path, mtime, size, gleaned_at)
VALUES (?, ?, ?, ?)
""",
(str(path), mtime, size, gleaned_at),
)
def _detect_format(first_line: str) -> str:
try:
obj = json.loads(first_line)
if "__REALTIME_TIMESTAMP" in obj:
return "journald"
if "SOURCE" in obj and str(obj.get("SOURCE", "")).startswith("docker:"):
return "docker"
if wazuh.is_wazuh_alert(obj):
return "wazuh"
if "ts" in obj and ("msg" in obj or "message" in obj or "request" in obj):
return "caddy"
except (json.JSONDecodeError, AttributeError):
pass
if plex.is_plex_log(first_line):
return "plex"
if qbittorrent.is_qbit_log(first_line):
return "qbittorrent"
if servarr.is_servarr_log(first_line):
return "servarr"
if dmesg_log.is_dmesg_log(first_line):
return "dmesg"
if syslog.is_syslog(first_line):
return "syslog"
return "plaintext"
def _parse_file(
    path: Path,
    compiled: list[tuple[LogPattern, object]],
    ingest_time: str,
    source_id: str | None = None,
) -> Iterator[RetrievedEntry]:
    """Yield the entries of one log file using the parser for its format.

    The format is sniffed from the first line, which is then replayed in
    front of the remaining lines so the chosen parser sees the whole file.
    An empty file yields nothing. When *source_id* is falsy the file stem is
    used instead.
    """
    if not source_id:
        source_id = path.stem

    with path.open("r", errors="replace") as handle:
        remaining = iter(handle)
        try:
            head = next(remaining)
        except StopIteration:
            # Empty file: nothing to sniff, nothing to parse.
            return

        fmt = _detect_format(head.strip())
        logger.info("Detected format %r for %s", fmt, path.name)

        def replay():
            yield head
            yield from remaining

        # Format name -> parser module; unknown formats use plaintext.
        parser_modules = {
            "journald": journald,
            "wazuh": wazuh,
            "docker": docker_log,
            "caddy": caddy,
            "plex": plex,
            "qbittorrent": qbittorrent,
            "servarr": servarr,
            "dmesg": dmesg_log,
            "syslog": syslog,
        }
        chosen = parser_modules.get(fmt, plaintext)
        yield from chosen.parse(replay(), source_id, compiled, ingest_time)
def _write_batch(conn: sqlite3.Connection, batch: list[RetrievedEntry]) -> None:
conn.executemany(
"""
INSERT OR IGNORE INTO log_entries
(id, source_id, sequence, timestamp_raw, timestamp_iso,
ingest_time, severity, repeat_count, out_of_order,
matched_patterns, text)
VALUES (?,?,?,?,?,?,?,?,?,?,?)
""",
[
(
e.entry_id, e.source_id, e.sequence,
e.timestamp_raw, e.timestamp_iso, e.ingest_time,
e.severity, e.repeat_count, int(e.out_of_order),
json.dumps(list(e.matched_patterns)), e.text,
)
for e in batch
],
)
def _glean_files(
    files: list[Path],
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    source_id_map: dict[Path, str] | None = None,
    force: bool = False,
) -> dict[str, int]:
    """Parse *files* and write their entries to the SQLite database.

    A file whose ``(mtime, size)`` fingerprint matches the stored one is
    skipped unless *force* is set. The FTS index is rebuilt once at the end.

    Args:
        files: Log files to glean.
        db_path: Target SQLite database; the schema is created if missing.
        pattern_file: Pattern YAML; defaults to ``patterns/default.yaml``.
        batch_size: Entries written and committed per batch.
        source_id_map: Optional ``{path: source_id}`` overrides; files not
            listed use their stem as source id.
        force: Re-glean even when the fingerprint is unchanged.

    Returns:
        ``{source_id: entries_submitted}``. Skipped files contribute 0, and
        entries later dropped by ``INSERT OR IGNORE`` are still counted.
    """
    pattern_file = pattern_file or Path("patterns/default.yaml")
    compiled = _compile(load_patterns(pattern_file))
    ingest_time = now_iso()
    source_id_map = source_id_map or {}

    stats: dict[str, int] = {}
    skipped: list[str] = []

    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        conn.commit()

        for log_file in files:
            source_id = source_id_map.get(log_file, log_file.stem)

            # Fingerprint check: skip files whose mtime+size haven't changed.
            # It is taken before parsing, so a file that grows while being
            # parsed no longer matches and is picked up again next run.
            mtime, size = _fingerprint(log_file)
            if not force and _fp_unchanged(conn, log_file, mtime, size):
                logger.debug("Skipping unchanged file: %s", log_file.name)
                skipped.append(log_file.name)
                stats[source_id] = stats.get(source_id, 0)
                continue

            count = 0
            batch: list[RetrievedEntry] = []
            for entry in _parse_file(log_file, compiled, ingest_time, source_id=source_id):
                batch.append(entry)
                if len(batch) >= batch_size:
                    _write_batch(conn, batch)
                    conn.commit()
                    count += len(batch)
                    batch.clear()
            if batch:
                _write_batch(conn, batch)
                conn.commit()
                count += len(batch)

            # Recorded only after the whole file was written successfully.
            _save_fingerprint(conn, log_file, mtime, size, ingest_time)
            conn.commit()
            stats[source_id] = stats.get(source_id, 0) + count
            logger.info("Gleaned %d entries from %s (source: %s)", count, log_file.name, source_id)
    finally:
        # Close the handle even when a parser or a write raises mid-run.
        conn.close()

    if skipped:
        logger.info("Skipped %d unchanged file(s): %s", len(skipped), ", ".join(skipped))
    logger.info("Building FTS index...")
    build_fts_index(db_path)
    logger.info("FTS index ready")
    return stats
def _stream_and_write(
transport: SSHTransport,
cmd: str,
parser,
source_id: str,
compiled: list[tuple[LogPattern, object]],
ingest_time: str,
conn: sqlite3.Connection,
batch_size: int,
) -> int:
"""Stream *cmd* output through *parser* and write entries to *conn*.
Catches SSHCommandError per-item so one bad command doesn't abort the rest
of the glean items for this host. Returns the number of entries written.
"""
count = 0
batch: list[RetrievedEntry] = []
try:
for entry in parser(transport.exec_stream(cmd), source_id, compiled, ingest_time):
batch.append(entry)
if len(batch) >= batch_size:
_write_batch(conn, batch)
conn.commit()
count += len(batch)
batch.clear()
if batch:
_write_batch(conn, batch)
conn.commit()
count += len(batch)
except SSHCommandError as exc:
logger.warning("SSH command failed for source %r (cmd: %s): %s", source_id, cmd, exc)
logger.info("Gleaned %d entries from SSH source %s", count, source_id)
return count
def _glean_ssh_source(
    src: dict,  # type: ignore[type-arg]
    compiled: list[tuple[LogPattern, object]],
    ingest_time: str,
    conn: sqlite3.Connection,
    batch_size: int,
) -> dict[str, int]:
    """Glean every item in the ``glean:`` list of one SSH source.

    A single SSHTransport connection is opened for *src* and reused for all
    of its items, so the handshake happens once per host per run.

    Returns ``{source_id: entry_count}`` with one key per item (one per
    container for multi-container docker items). If the connection cannot be
    established, a warning is logged and whatever was collected so far is
    returned.
    """
    host_id = src.get("id", src.get("host", "unknown"))
    host = src["host"]
    user = src["user"]
    key_path = str(Path(src["key_path"]).expanduser())
    port = int(src.get("port", 22))
    glean_items: list[dict] = src.get("glean", [])  # type: ignore[type-arg]

    # Item types that map to exactly one remote command:
    # (type name, command builder, line parser).
    single_command = (
        ("journald", _build_journald_command, journald.parse),
        ("syslog", _build_syslog_command, syslog.parse),
        ("plaintext", _build_plaintext_command, plaintext.parse),
    )

    totals: dict[str, int] = {}
    try:
        with SSHTransport(host=host, user=user, key_path=key_path, port=port) as transport:
            for item in glean_items:
                kind = item.get("type", "plaintext")
                # Un-labelled items fall back to "<host_id>/<type>".
                item_id = item.get("id") or f"{host_id}/{kind}"

                handler = next((h for h in single_command if kind == h[0]), None)
                if handler is not None:
                    _, build_cmd, parse = handler
                    written = _stream_and_write(
                        transport, build_cmd(item), parse, item_id,
                        compiled, ingest_time, conn, batch_size,
                    )
                    totals[item_id] = totals.get(item_id, 0) + written
                elif kind == "docker":
                    cmds = _build_docker_command(item)
                    if isinstance(cmds, str):
                        cmds = [cmds]
                    containers: list[str] = item.get("containers", [])
                    for idx, cmd in enumerate(cmds):
                        # Prefer the container name as the last path segment;
                        # fall back to the command index.
                        name = containers[idx] if idx < len(containers) else str(idx)
                        container_id = f"{item_id}/{name}" if len(cmds) > 1 else item_id
                        written = _stream_and_write(
                            transport, cmd, docker_log.parse, container_id,
                            compiled, ingest_time, conn, batch_size,
                        )
                        totals[container_id] = totals.get(container_id, 0) + written
                else:
                    logger.warning(
                        "Unknown SSH glean type %r for source %r — skipping item",
                        kind, host_id,
                    )
    except SSHConnectionError as exc:
        logger.warning("SSH connection failed for source %r: %s", host_id, exc)
    return totals
def glean_ssh_source(
    src: dict,  # type: ignore[type-arg]
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
) -> dict[str, int]:
    """Glean a single SSH source dict and write the results to *db_path*.

    Public wrapper around :func:`_glean_ssh_source` for the REST layer. It
    owns the DB connection, pattern compilation and the FTS rebuild so
    callers don't have to.

    Args:
        src: One SSH source entry (``host``, ``user``, ``key_path``, optional
            ``port``/``id`` and a ``glean`` list).
        db_path: Target SQLite database; the schema is created if missing.
        pattern_file: Pattern YAML; defaults to ``patterns/default.yaml``.
        batch_size: Entries written and committed per batch.

    Returns:
        Stats mapping ``{sub_source_id: entry_count}``.
    """
    effective_pattern_file = pattern_file or Path("patterns/default.yaml")
    compiled = _compile(load_patterns(effective_pattern_file))
    ingest_time = now_iso()

    conn = sqlite3.connect(str(db_path))
    try:
        # Schema setup sits inside the try so a failure here cannot leak
        # the connection either.
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        conn.commit()
        stats = _glean_ssh_source(src, compiled, ingest_time, conn, batch_size)
    finally:
        conn.close()

    logger.info("Rebuilding FTS index after SSH source glean...")
    build_fts_index(db_path)
    return stats
def glean_dir(
    corpus_dir: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    force: bool = False,
) -> dict[str, int]:
    """Glean every ``*.jsonl`` and ``*.log`` file directly inside *corpus_dir*.

    The ``.jsonl`` files come first, then the ``.log`` files, each group in
    sorted order. With ``force=True`` the fingerprint check is bypassed and
    every file is gleaned again whether or not it changed.
    """
    jsonl_files = sorted(corpus_dir.glob("*.jsonl"))
    log_files = sorted(corpus_dir.glob("*.log"))
    return _glean_files(jsonl_files + log_files, db_path, pattern_file, batch_size, force=force)
def glean_file(
    log_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    force: bool = False,
) -> dict[str, int]:
    """Glean one log file of any supported format.

    With ``force=True`` the file is gleaned again even if its fingerprint
    has not changed since the previous run.
    """
    only_this_file = [log_file]
    return _glean_files(only_this_file, db_path, pattern_file, force=force)
def glean_sources(
    sources_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    force: bool = False,
) -> dict[str, int]:
    """Glean all sources listed in a sources.yaml config file.

    Supports two source types:

    Local file sources (default)::

        sources:
          - id: sonarr
            path: /opt/sonarr/config/logs/sonarr.0.txt

    SSH remote sources (``transport: ssh``)::

        sources:
          - id: rack01
            transport: ssh
            host: 192.168.1.10
            user: admin
            key_path: ~/.ssh/id_ed25519
            glean:
              - type: journald
                args: ["--since", "2 hours ago"]
              - type: syslog
                path: /var/log/syslog
              - type: plaintext
                path: /var/log/app/error.log
              - type: docker
                containers: [myapp, nginx]

    Missing local paths and SSH connection failures are logged as warnings
    so the cron keeps running when a source is temporarily down. An empty
    config file, a missing ``sources`` key and malformed entries (not a
    mapping, or a local source without ``path``) are treated the same way.

    Args:
        sources_file: Path to the sources.yaml file.
        db_path: Target SQLite database.
        pattern_file: Pattern YAML; defaults to ``patterns/default.yaml``.
        batch_size: Entries written and committed per batch.
        force: Bypass fingerprint checks for local files.

    Returns:
        ``{source_id: entry_count}`` across local and SSH sources; empty
        when nothing usable is configured.
    """
    with open(sources_file) as f:
        # safe_load returns None for an empty document.
        config = yaml.safe_load(f)
    configured = config.get("sources") if isinstance(config, dict) else None

    local_sources: list[dict] = []  # type: ignore[type-arg]
    ssh_sources: list[dict] = []  # type: ignore[type-arg]
    for src in configured or []:
        if not isinstance(src, dict):
            logger.warning("Ignoring malformed source entry: %r", src)
            continue
        if src.get("transport") == "ssh":
            ssh_sources.append(src)
        else:
            local_sources.append(src)

    # ── Local file sources ─────────────────────────────────────────────────
    files: list[Path] = []
    source_id_map: dict[Path, str] = {}
    for src in local_sources:
        raw_path = src.get("path")
        if not raw_path:
            logger.warning("Source %r has no path, skipping", src.get("id", "?"))
            continue
        path = Path(raw_path)
        if not path.exists():
            logger.warning("Source %r not found, skipping: %s", src.get("id", "?"), path)
            continue
        files.append(path)
        if "id" in src:
            source_id_map[path] = src["id"]

    if not files and not ssh_sources:
        logger.warning("No sources found — check sources.yaml paths")
        return {}

    stats: dict[str, int] = {}
    if files:
        stats.update(_glean_files(files, db_path, pattern_file, batch_size, source_id_map, force=force))

    # ── SSH remote sources ─────────────────────────────────────────────────
    if not ssh_sources:
        return stats

    # Compile patterns once, share across all SSH sources in this run.
    effective_pattern_file = pattern_file or Path("patterns/default.yaml")
    compiled = _compile(load_patterns(effective_pattern_file))
    ingest_time = now_iso()

    conn = sqlite3.connect(str(db_path))
    try:
        # Schema setup sits inside the try so a failure cannot leak the handle.
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        conn.commit()
        for src in ssh_sources:
            ssh_stats = _glean_ssh_source(src, compiled, ingest_time, conn, batch_size)
            for k, v in ssh_stats.items():
                stats[k] = stats.get(k, 0) + v
    finally:
        conn.close()

    # _glean_files already rebuilt the index for local files, but that ran
    # before the SSH entries were written, so rebuild once more here.
    logger.info("Rebuilding FTS index after SSH glean...")
    build_fts_index(db_path)
    return stats

View file

@ -10,7 +10,7 @@ import re
from datetime import datetime, timezone
from typing import Iterator
from app.ingest.base import (
from app.glean.base import (
SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry

View file

@ -12,7 +12,7 @@ import re
from datetime import datetime, timezone
from typing import Iterator
from app.ingest.base import (
from app.glean.base import (
SourceState, apply_patterns, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry

View file

@ -18,7 +18,7 @@ import re
from datetime import datetime, timezone
from typing import Iterator
from app.ingest.base import (
from app.glean.base import (
SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry

View file

@ -12,7 +12,7 @@ import re
from datetime import datetime, timezone
from typing import Iterator
from app.ingest.base import (
from app.glean.base import (
SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry

225
app/glean/ssh.py Normal file
View file

@ -0,0 +1,225 @@
"""SSH transport layer for remote log gleaning (issue #22).
Wraps Paramiko to provide a clean context-manager interface for executing
remote commands and streaming their stdout output. All format parsing is
delegated to the existing per-format parsers (journald, syslog, plaintext,
docker); this module is transport only.
Key design choices:
- Key-based auth only; no password prompts in a daemon context.
- exec_stream is a generator; exit-status check fires after all lines are
yielded, so callers must drain the iterator (e.g. list()) to trigger it.
- Command builders live here because they encode SSH/remote-execution idioms
(journalctl flags, docker logs invocation) that the generic parsers don't
need to know about.
Example sources.yaml snippet::
sources:
- id: rack01
transport: ssh
host: 192.168.1.10
user: admin
key_path: ~/.ssh/id_ed25519
glean:
- type: journald
args: ["--since", "2 hours ago"]
- type: syslog
path: /var/log/syslog
- type: plaintext
path: /var/log/app/error.log
- type: docker
containers: [myapp, nginx]
"""
from __future__ import annotations
import shlex
from collections.abc import Iterator
from typing import Union
import paramiko
__all__ = [
"SSHConnectionError",
"SSHCommandError",
"SSHTransport",
"_build_journald_command",
"_build_syslog_command",
"_build_plaintext_command",
"_build_docker_command",
]
# Default syslog path used when none is specified in the source spec.
_SYSLOG_DEFAULT_PATH = "/var/log/syslog"
# ── Custom exceptions ─────────────────────────────────────────────────────────
class SSHConnectionError(Exception):
    """Signals that an SSH session could not be opened or authenticated."""
class SSHCommandError(Exception):
    """Signals that a remote command finished with a non-zero exit status."""
# ── Transport context manager ─────────────────────────────────────────────────
class SSHTransport:
    """Context manager wrapping a Paramiko SSH connection.

    Opens the connection on ``__enter__`` and closes it on ``__exit__``,
    even if an exception propagates.  Key-based authentication only.

    Usage::

        with SSHTransport(host="10.0.0.1", user="admin",
                          key_path="~/.ssh/id_ed25519") as t:
            for line in t.exec_stream("journalctl -o json --since '1 hour ago'"):
                process(line)
    """

    def __init__(
        self,
        host: str,
        user: str,
        key_path: str,
        port: int = 22,
    ) -> None:
        """Store connection parameters; no network activity happens here.

        Args:
            host: Hostname or IP address of the remote machine.
            user: Login name on the remote machine.
            key_path: Path to the private key file; a leading ``~`` is
                expanded when the connection is opened.
            port: TCP port of the SSH daemon.
        """
        self._host = host
        self._user = user
        self._key_path = key_path
        self._port = port
        self._client: paramiko.SSHClient | None = None

    # ── context manager protocol ──────────────────────────────────────────────

    def __enter__(self) -> "SSHTransport":
        """Open and authenticate the SSH connection.

        Raises:
            SSHConnectionError: on authentication failure, SSH protocol
                errors, or OS-level failures (unreachable host, timeout,
                unreadable key file).
        """
        import os  # local import: the module-level import block is not part of this class

        client = paramiko.SSHClient()
        # NOTE(security): AutoAddPolicy accepts host keys that are not yet known,
        # so the first connection to a host is not protected against
        # impersonation.  Kept as-is to preserve behaviour.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            client.connect(
                hostname=self._host,
                username=self._user,
                # The documented usage passes "~/.ssh/..."; expand it here so
                # the key file is found regardless of what the caller did.
                key_filename=os.path.expanduser(self._key_path),
                port=self._port,
            )
        except paramiko.AuthenticationException as exc:
            client.close()
            raise SSHConnectionError(
                f"SSH auth failed for {self._user}@{self._host}: {exc}"
            ) from exc
        except (paramiko.SSHException, OSError) as exc:
            # OSError covers refused/timed-out sockets and a missing key file,
            # none of which are SSHException subclasses.
            client.close()
            raise SSHConnectionError(
                f"SSH connection failed to {self._host}: {exc}"
            ) from exc
        self._client = client
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:  # type: ignore[override]
        """Close the connection if one is open; never suppresses exceptions."""
        if self._client is not None:
            self._client.close()
            self._client = None
        # Return None (falsy) so any in-flight exception is not suppressed.

    # ── remote execution ──────────────────────────────────────────────────────

    def exec_stream(self, command: str) -> Iterator[str]:
        """Execute *command* on the remote host and yield stdout lines.

        The exit-status check runs after all stdout lines have been yielded,
        so callers must drain the iterator to trigger it::

            list(transport.exec_stream(cmd))  # raises if exit != 0

        Raises:
            SSHConnectionError: if called outside a ``with`` block.
            SSHCommandError: if the remote command exits non-zero.
        """
        if self._client is None:
            raise SSHConnectionError(
                "Not connected — use SSHTransport as a context manager"
            )
        _, stdout, stderr = self._client.exec_command(command)
        # NOTE(review): stderr is only read after stdout is exhausted; a command
        # that writes a very large amount to stderr may stall — confirm whether
        # this matters for the commands issued here.
        for line in stdout:
            yield line
        exit_code = stdout.channel.recv_exit_status()
        # Guard against MagicMock in tests: only treat real integer exit codes.
        if isinstance(exit_code, int) and exit_code != 0:
            error_msg = stderr.read().decode(errors="replace").strip()
            raise SSHCommandError(
                f"Command failed (exit {exit_code}): {error_msg}"
            )
# ── Command builders ──────────────────────────────────────────────────────────
def _build_journald_command(spec: dict) -> str: # type: ignore[type-arg]
"""Build a ``journalctl`` command string from a glean source spec.
Spec keys:
- ``args`` list of extra journalctl arguments appended verbatim.
- ``unit`` shorthand for ``--unit <name>`` (inserted before ``args``).
Returns a single shell command string.
"""
parts = ["journalctl", "-o json", "--no-pager"]
if "unit" in spec:
parts.append(f"--unit {spec['unit']}")
if "args" in spec:
parts.extend(spec["args"])
return " ".join(parts)
def _build_syslog_command(spec: dict) -> str:  # type: ignore[type-arg]
    """Return the shell command that dumps a syslog-format file.

    Spec keys:
      - ``path``: file to read; when absent, ``_SYSLOG_DEFAULT_PATH`` is used.

    The path is shell-quoted before being placed after ``cat``.
    """
    log_path = spec.get("path", _SYSLOG_DEFAULT_PATH)
    quoted_path = shlex.quote(log_path)
    return "cat " + quoted_path
def _build_plaintext_command(spec: dict) -> str: # type: ignore[type-arg]
"""Build a ``cat`` command for an arbitrary plaintext log file.
Spec keys:
- ``path`` **required** path to the log file.
Raises:
KeyError: if ``path`` is absent from the spec.
"""
path = spec["path"] # intentional KeyError if missing — callers must supply it
return f"cat {shlex.quote(path)}"
def _build_docker_command(
spec: dict, # type: ignore[type-arg]
) -> Union[str, list[str]]:
"""Build ``docker logs`` command(s) for one or more named containers.
Spec keys:
- ``containers`` **required** list of container names or IDs.
Returns a single command string when there is one container, or a list
of command strings when there are multiple (one command per container so
each can be streamed independently).
Raises:
KeyError: if ``containers`` is absent from the spec.
ValueError: if ``containers`` is an empty list.
"""
containers = spec["containers"] # intentional KeyError if missing
if not containers:
raise ValueError("'containers' must be a non-empty list")
commands = [f"docker logs {shlex.quote(c)}" for c in containers]
return commands[0] if len(commands) == 1 else commands

View file

@ -14,7 +14,7 @@ import re
from datetime import datetime, timezone
from typing import Iterator
from app.ingest.base import (
from app.glean.base import (
SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry

View file

@ -5,7 +5,7 @@ Tautulli sends all template values as strings, so all fields are treated as str.
"""
from __future__ import annotations
from app.ingest.base import (
from app.glean.base import (
apply_patterns,
epoch_float_to_iso,
make_entry_id,

View file

@ -22,7 +22,7 @@ import json
from datetime import datetime, timezone
from typing import Iterator
from app.ingest.base import (
from app.glean.base import (
SourceState, apply_patterns, make_entry_id, now_iso,
)
from app.services.models import LogPattern, RetrievedEntry

View file

@ -1,328 +0,0 @@
"""Ingest pipeline: auto-detect format, parse, write to SQLite."""
from __future__ import annotations
import json
import logging
import re
import sqlite3
from pathlib import Path
from typing import Iterator
import yaml
from app.ingest import caddy, dmesg_log, docker_log, journald, plaintext, plex, qbittorrent, servarr, syslog, wazuh
from app.ingest.base import _compile, load_patterns, now_iso
from app.services.models import LogPattern, RetrievedEntry
from app.services.search import build_fts_index
logger = logging.getLogger(__name__)
_SCHEMA = """
CREATE TABLE IF NOT EXISTS log_entries (
id TEXT PRIMARY KEY,
source_id TEXT NOT NULL,
sequence INTEGER NOT NULL,
timestamp_raw TEXT,
timestamp_iso TEXT,
ingest_time TEXT NOT NULL,
severity TEXT,
repeat_count INTEGER DEFAULT 1,
out_of_order INTEGER DEFAULT 0,
matched_patterns TEXT DEFAULT '[]',
text TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_source ON log_entries(source_id);
CREATE INDEX IF NOT EXISTS idx_timestamp ON log_entries(timestamp_iso);
CREATE INDEX IF NOT EXISTS idx_ts_repeat ON log_entries(timestamp_iso, repeat_count);
CREATE INDEX IF NOT EXISTS idx_severity ON log_entries(severity);
CREATE INDEX IF NOT EXISTS idx_patterns ON log_entries(matched_patterns);
CREATE TABLE IF NOT EXISTS incidents (
id TEXT PRIMARY KEY,
label TEXT NOT NULL,
issue_type TEXT NOT NULL DEFAULT '',
started_at TEXT,
ended_at TEXT,
notes TEXT NOT NULL DEFAULT '',
created_at TEXT NOT NULL,
severity TEXT NOT NULL DEFAULT 'medium'
);
CREATE INDEX IF NOT EXISTS idx_incidents_time ON incidents(started_at, ended_at);
CREATE TABLE IF NOT EXISTS received_bundles (
id TEXT PRIMARY KEY,
source_host TEXT NOT NULL,
issue_type TEXT NOT NULL DEFAULT '',
label TEXT NOT NULL,
severity TEXT NOT NULL DEFAULT 'medium',
started_at TEXT,
bundled_at TEXT NOT NULL,
entry_count INTEGER NOT NULL DEFAULT 0,
bundle_json TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at);
CREATE INDEX IF NOT EXISTS idx_bundles_type ON received_bundles(issue_type);
CREATE TABLE IF NOT EXISTS context_facts (
id TEXT PRIMARY KEY,
category TEXT NOT NULL,
key TEXT NOT NULL,
value TEXT NOT NULL,
source TEXT,
created_at TEXT NOT NULL
);
CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category);
CREATE INDEX IF NOT EXISTS idx_facts_key ON context_facts(key);
CREATE TABLE IF NOT EXISTS context_documents (
id TEXT PRIMARY KEY,
filename TEXT NOT NULL,
doc_type TEXT NOT NULL,
full_text TEXT NOT NULL,
file_size INTEGER,
uploaded_at TEXT NOT NULL
);
CREATE TABLE IF NOT EXISTS context_chunks (
id TEXT PRIMARY KEY,
document_id TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL,
text TEXT NOT NULL,
embedding BLOB
);
CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id);
CREATE TABLE IF NOT EXISTS blocklist_candidates (
id TEXT PRIMARY KEY,
domain_or_ip TEXT NOT NULL,
source_device_ip TEXT,
source_device_name TEXT,
first_seen TEXT NOT NULL,
last_seen TEXT NOT NULL,
hit_count INTEGER DEFAULT 1,
status TEXT DEFAULT 'pending',
pushed_at TEXT,
log_evidence TEXT DEFAULT '[]',
matched_rule TEXT,
llm_score REAL,
llm_reason TEXT
);
CREATE INDEX IF NOT EXISTS idx_blocklist_device ON blocklist_candidates(source_device_ip);
CREATE INDEX IF NOT EXISTS idx_blocklist_status ON blocklist_candidates(status);
CREATE INDEX IF NOT EXISTS idx_blocklist_domain ON blocklist_candidates(domain_or_ip);
"""
def ensure_schema(db_path: Path) -> None:
    """Create all tables and apply additive migrations. Safe to call on every startup.

    Args:
        db_path: Location of the SQLite database file.

    The connection is always closed, including when schema creation fails.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        # Additive column migrations.  SQLite raises OperationalError
        # ("duplicate column name") when the column already exists; that case
        # is expected and ignored.  Any other failure is logged rather than
        # silently dropped, but still does not abort startup.
        for stmt in [
            "ALTER TABLE incidents ADD COLUMN issue_type TEXT NOT NULL DEFAULT ''",
        ]:
            try:
                conn.execute(stmt)
            except sqlite3.OperationalError as exc:
                if "duplicate column name" not in str(exc).lower():
                    logger.warning("Schema migration skipped (%s): %s", stmt, exc)
        conn.commit()
    finally:
        conn.close()
def _detect_format(first_line: str) -> str:
    """Guess the log format of a file from its first line.

    JSON-object lines are checked first (journald, docker, wazuh, caddy);
    anything else is passed through the text-format detectors in order and
    finally falls back to ``"plaintext"``.

    Args:
        first_line: First line of the file, already stripped by the caller.

    Returns:
        One of ``"journald"``, ``"docker"``, ``"wazuh"``, ``"caddy"``,
        ``"plex"``, ``"qbittorrent"``, ``"servarr"``, ``"dmesg"``,
        ``"syslog"`` or ``"plaintext"``.
    """
    try:
        obj = json.loads(first_line)
        # A line such as "123", "null" or "true" is valid JSON but not an
        # object; membership tests on it would raise TypeError, so only
        # dicts are inspected for the JSON-based formats.
        if isinstance(obj, dict):
            if "__REALTIME_TIMESTAMP" in obj:
                return "journald"
            if "SOURCE" in obj and str(obj.get("SOURCE", "")).startswith("docker:"):
                return "docker"
            if wazuh.is_wazuh_alert(obj):
                return "wazuh"
            if "ts" in obj and ("msg" in obj or "message" in obj or "request" in obj):
                return "caddy"
    except (json.JSONDecodeError, AttributeError):
        pass
    if plex.is_plex_log(first_line):
        return "plex"
    if qbittorrent.is_qbit_log(first_line):
        return "qbittorrent"
    if servarr.is_servarr_log(first_line):
        return "servarr"
    if dmesg_log.is_dmesg_log(first_line):
        return "dmesg"
    if syslog.is_syslog(first_line):
        return "syslog"
    return "plaintext"
def _parse_file(
    path: Path,
    compiled: list[tuple[LogPattern, object]],
    ingest_time: str,
    source_id: str | None = None,
) -> Iterator[RetrievedEntry]:
    """Open *path*, sniff its format from the first line, and yield parsed entries.

    Args:
        path: Log file to read (opened in text mode with ``errors="replace"``).
        compiled: Compiled patterns handed through to the format parser.
        ingest_time: Timestamp string handed through to the format parser.
        source_id: Source label; defaults to the file's stem.

    Yields nothing for an empty file.
    """
    source_id = source_id or path.stem
    # Format name -> parser module; anything not listed uses plaintext.
    handlers = {
        "journald": journald,
        "wazuh": wazuh,
        "docker": docker_log,
        "caddy": caddy,
        "plex": plex,
        "qbittorrent": qbittorrent,
        "servarr": servarr,
        "dmesg": dmesg_log,
        "syslog": syslog,
    }
    with path.open("r", errors="replace") as handle:
        remaining = iter(handle)
        head = next(remaining, None)
        if head is None:
            return
        fmt = _detect_format(head.strip())
        logger.info("Detected format %r for %s", fmt, path.name)

        def replay():
            # Re-attach the sniffed first line in front of the rest of the file.
            yield head
            yield from remaining

        parser_module = handlers.get(fmt, plaintext)
        yield from parser_module.parse(replay(), source_id, compiled, ingest_time)
def _write_batch(conn: sqlite3.Connection, batch: list[RetrievedEntry]) -> None:
conn.executemany(
"""
INSERT OR IGNORE INTO log_entries
(id, source_id, sequence, timestamp_raw, timestamp_iso,
ingest_time, severity, repeat_count, out_of_order,
matched_patterns, text)
VALUES (?,?,?,?,?,?,?,?,?,?,?)
""",
[
(
e.entry_id, e.source_id, e.sequence,
e.timestamp_raw, e.timestamp_iso, e.ingest_time,
e.severity, e.repeat_count, int(e.out_of_order),
json.dumps(list(e.matched_patterns)), e.text,
)
for e in batch
],
)
def _ingest_files(
    files: list[Path],
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    source_id_map: dict[Path, str] | None = None,
) -> dict[str, int]:
    """Parse each file and write its entries to SQLite, then rebuild the FTS index.

    Args:
        files: Log files to ingest, processed in order.
        db_path: Location of the SQLite database file.
        pattern_file: Pattern definitions; defaults to ``patterns/default.yaml``.
        batch_size: Number of entries written and committed per batch.
        source_id_map: Optional per-file source label; files not listed use
            their stem.

    Returns:
        Mapping of source id to the number of entries parsed for it.

    The database connection is closed even when parsing or writing raises;
    in that case the exception propagates and the FTS index is not rebuilt.
    """
    pattern_file = pattern_file or Path("patterns/default.yaml")
    patterns = load_patterns(pattern_file)
    compiled = _compile(patterns)
    ingest_time = now_iso()
    source_id_map = source_id_map or {}
    stats: dict[str, int] = {}
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        conn.commit()
        for log_file in files:
            source_id = source_id_map.get(log_file, log_file.stem)
            count = 0
            batch: list[RetrievedEntry] = []
            for entry in _parse_file(log_file, compiled, ingest_time, source_id=source_id):
                batch.append(entry)
                if len(batch) >= batch_size:
                    _write_batch(conn, batch)
                    conn.commit()
                    count += len(batch)
                    batch.clear()
            # Flush the final partial batch.
            if batch:
                _write_batch(conn, batch)
                conn.commit()
                count += len(batch)
            stats[source_id] = stats.get(source_id, 0) + count
            logger.info("Ingested %d entries from %s (source: %s)", count, log_file.name, source_id)
    finally:
        conn.close()
    logger.info("Building FTS index...")
    build_fts_index(db_path)
    logger.info("FTS index ready")
    return stats
def ingest(
    corpus_dir: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
) -> dict[str, int]:
    """Ingest every ``.jsonl`` and ``.log`` file found directly in *corpus_dir*.

    ``.jsonl`` files are processed first, then ``.log`` files, each group in
    sorted order.
    """
    files: list[Path] = []
    for pattern in ("*.jsonl", "*.log"):
        files.extend(sorted(corpus_dir.glob(pattern)))
    return _ingest_files(files, db_path, pattern_file, batch_size)
def ingest_file(
    log_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
) -> dict[str, int]:
    """Ingest one log file of any supported format and return its entry counts."""
    single_file = [log_file]
    return _ingest_files(single_file, db_path, pattern_file)
def ingest_sources(
    sources_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
) -> dict[str, int]:
    """Ingest all sources listed in a sources.yaml config file.

    sources.yaml format::

        sources:
          - id: sonarr
            path: /opt/sonarr/config/logs/sonarr.0.txt
          - id: qbittorrent
            path: /opt/qbittorrent/config/data/logs/qbittorrent.log

    Missing paths are skipped with a warning so the cron keeps running
    when a service is temporarily down.  The same applies to entries that
    have no ``path`` key at all, and an empty file or an empty ``sources:``
    key is treated as "no sources" rather than an error.

    Returns:
        Mapping of source id to entry count, or ``{}`` when nothing was found.
    """
    with open(sources_file) as f:
        # safe_load returns None for an empty document.
        config = yaml.safe_load(f) or {}
    files: list[Path] = []
    source_id_map: dict[Path, str] = {}
    # ``sources:`` with no items parses as None, hence the ``or []``.
    for src in config.get("sources") or []:
        raw_path = src.get("path")
        if not raw_path:
            logger.warning("Source %r has no path, skipping", src.get("id", "?"))
            continue
        path = Path(raw_path)
        if not path.exists():
            logger.warning("Source %r not found, skipping: %s", src.get("id", "?"), path)
            continue
        files.append(path)
        if "id" in src:
            source_id_map[path] = src["id"]
    if not files:
        logger.warning("No source files found — check sources.yaml paths")
        return {}
    return _ingest_files(files, db_path, pattern_file, batch_size, source_id_map)

View file

@ -94,7 +94,7 @@ def search_logs(
severity: Filter by level EMERGENCY, ALERT, CRITICAL, ERROR, WARN, NOTICE, INFO, DEBUG.
source: Partial match on source_id. Format is 'corpus:host:service'.
Example: 'example-node:caddy' matches all Caddy entries from example-node.
pattern: Filter by named pattern tag applied at ingest time.
pattern: Filter by named pattern tag applied at glean time.
Known tags: auth_failure, connection_lost, oom, segfault, disk_full,
timeout, caddy_tls_error, caddy_config_error, caddy_auth_error,
caddy_upstream_error, service_restart, service_update,
@ -176,7 +176,7 @@ def list_log_sources() -> str:
"""
sources = list_sources(DB_PATH)
if not sources:
return "No log sources found. Has the corpus been ingested? Run: python scripts/ingest_corpus.py"
return "No log sources found. Has the corpus been gleaned? Run: python scripts/glean_corpus.py"
lines = [f"Corpus: {DB_PATH}", f"Sources ({len(sources)} total):\n"]
for s in sources:
@ -192,7 +192,7 @@ def list_log_sources() -> str:
if __name__ == "__main__":
if not DB_PATH.exists():
logger.error("Database not found: %s", DB_PATH)
logger.error("Run: python scripts/ingest_corpus.py <corpus_dir> <db_path>")
logger.error("Run: python scripts/glean_corpus.py <corpus_dir> <db_path>")
sys.exit(1)
logger.info("Starting Turnstone MCP server (DB: %s)", DB_PATH)
mcp.run()

View file

@ -27,10 +27,10 @@ from fastapi.responses import FileResponse, RedirectResponse, StreamingResponse
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from app.ingest.pipeline import ensure_schema, ingest_file as _ingest_file
from app.ingest.base import load_compiled_patterns, now_iso
from app.ingest.tautulli import parse_webhook as _parse_tautulli
from app.ingest.wazuh import is_wazuh_alert as _is_wazuh_alert, parse as _parse_wazuh
from app.glean.pipeline import ensure_schema, glean_file as _glean_file, glean_ssh_source as _glean_ssh_source
from app.glean.base import load_compiled_patterns, now_iso
from app.glean.tautulli import parse_webhook as _parse_tautulli
from app.glean.wazuh import is_wazuh_alert as _is_wazuh_alert, parse as _parse_wazuh
from app.services.blocklist import (
BlocklistCandidate,
get_candidate,
@ -71,11 +71,11 @@ from app.context.store import (
delete_document as _delete_document,
)
from app.context.retriever import retrieve_context as _retrieve_context, format_context_block
from app.ingest.doc_upload import ingest_upload as _ingest_upload
from app.glean.doc_upload import glean_upload as _glean_upload
from app.context.wizard import get_schema as _wizard_schema, advance_step, is_complete, apply_session
from app.context.chunker import UnsupportedDocType, FileTooLarge
from app.tasks.ingest_scheduler import get_state as _ingest_state, run_once as _run_ingest, scheduler_loop as _scheduler_loop, submit_matched as _submit_matched
from app.ingest.mqtt_subscriber import run_mqtt_subscribers as _run_mqtt_subscribers
from app.tasks.glean_scheduler import get_state as _glean_state, run_once as _run_glean, scheduler_loop as _scheduler_loop, submit_matched as _submit_matched
from app.glean.mqtt_subscriber import run_mqtt_subscribers as _run_mqtt_subscribers
DB_PATH = Path(os.environ.get("TURNSTONE_DB", Path(__file__).parent.parent / "data" / "turnstone.db"))
PREFS_PATH = DB_PATH.parent / "preferences.json"
@ -84,7 +84,7 @@ SOURCE_HOST = os.environ.get("TURNSTONE_SOURCE_HOST", "unknown")
BUNDLE_ENDPOINT = os.environ.get("TURNSTONE_BUNDLE_ENDPOINT", "")
PATTERN_DIR = Path(os.environ.get("TURNSTONE_PATTERNS", Path(__file__).parent.parent / "patterns"))
PATTERN_FILE = PATTERN_DIR / "default.yaml"
INGEST_INTERVAL = int(os.environ.get("TURNSTONE_INGEST_INTERVAL", "900"))
GLEAN_INTERVAL = int(os.environ.get("TURNSTONE_GLEAN_INTERVAL", "900"))
SUBMIT_ENDPOINT = os.environ.get("TURNSTONE_SUBMIT_ENDPOINT", "").rstrip("/")
# GPU inference server URL.
@ -119,14 +119,14 @@ async def _lifespan(app: FastAPI):
sources_file = PATTERN_DIR / "sources.yaml"
_scheduler_task: asyncio.Task | None = None
if INGEST_INTERVAL > 0 and sources_file.exists():
if GLEAN_INTERVAL > 0 and sources_file.exists():
_scheduler_task = asyncio.create_task(
_scheduler_loop(
sources_file, DB_PATH, PATTERN_FILE, INGEST_INTERVAL,
sources_file, DB_PATH, PATTERN_FILE, GLEAN_INTERVAL,
submit_endpoint=SUBMIT_ENDPOINT or None,
source_host=SOURCE_HOST,
),
name="ingest-scheduler",
name="glean-scheduler",
)
_mqtt_task: asyncio.Task | None = None
@ -433,6 +433,72 @@ def list_sources() -> dict:
return {"sources": _list_sources(DB_PATH)}
@router.get("/api/sources/configured")
def list_configured_sources() -> dict:
    """Return every source in sources.yaml, enriched with DB stats.

    Unlike ``/api/sources`` (which is DB-only), this endpoint reads sources.yaml
    so SSH sources appear even before their first successful glean.  DB entry
    counts, error counts, and timestamps are aggregated and merged in.

    For SSH sources, sub-source IDs (e.g. ``rack01/journald``) are summed to
    produce a single aggregate stat row for the top-level host entry.

    Returns:
        ``{"sources": [...]}``; the list is empty when sources.yaml is absent,
        empty, or has an empty ``sources:`` key.
    """
    sources_file = PATTERN_DIR / "sources.yaml"
    if not sources_file.exists():
        return {"sources": []}

    with open(sources_file) as f:
        config = yaml.safe_load(f) or {}

    # Fetch all DB source stats once; key by source_id for O(1) lookup.
    db_stats: dict[str, dict] = {}
    try:
        for row in _list_sources(DB_PATH):
            db_stats[row["source_id"]] = row
    except Exception:
        # Deliberate best-effort: the DB may not exist on first run, in which
        # case every source is reported with zero counts.
        pass

    result = []
    # ``sources:`` with no items parses as None, hence the ``or []``.
    for src in config.get("sources") or []:
        transport = src.get("transport", "local")
        src_id = src.get("id", "")
        entry: dict = {"id": src_id, "transport": transport}

        if transport != "ssh":
            entry["path"] = src.get("path", "")
            db = db_stats.get(src_id, {})
            entry["entry_count"] = db.get("entry_count", 0)
            entry["error_count"] = db.get("error_count", 0)
            entry["earliest"] = db.get("earliest")
            entry["latest"] = db.get("latest")
        else:
            entry["host"] = src.get("host", "")
            entry["user"] = src.get("user", "")
            # ``glean:`` with no items parses as None; treat it as an empty list.
            glean_items: list[dict] = src.get("glean") or []
            entry["glean_types"] = sorted({item.get("type", "plaintext") for item in glean_items})
            entry["glean_items"] = glean_items

            # Aggregate sub-source DB rows that belong to this SSH host.
            # Sub-sources use IDs like "{host_id}/{type}" or "{host_id}/{type}/{container}".
            prefix = src_id + "/"
            matching_rows = [
                v for k, v in db_stats.items()
                if k.startswith(prefix) or k == src_id
            ]
            entry["entry_count"] = sum(r.get("entry_count", 0) for r in matching_rows)
            entry["error_count"] = sum(r.get("error_count", 0) for r in matching_rows)
            earliests = [r["earliest"] for r in matching_rows if r.get("earliest")]
            latests = [r["latest"] for r in matching_rows if r.get("latest")]
            entry["earliest"] = min(earliests) if earliests else None
            entry["latest"] = max(latests) if latests else None

        result.append(entry)

    return {"sources": result}
@router.delete("/api/sources/{source_id}")
def delete_source(source_id: str) -> dict:
"""Delete all log entries (and FTS index rows) for a given source."""
@ -448,9 +514,22 @@ def delete_source(source_id: str) -> dict:
return {"deleted": deleted, "source_id": source_id}
@router.post("/api/sources/{source_id}/ingest")
def reingest_source(source_id: str, background_tasks: BackgroundTasks) -> dict:
"""Trigger a re-ingest for a configured source from sources.yaml."""
@router.post("/api/sources/{source_id}/glean")
def reglean_source(
source_id: str,
background_tasks: BackgroundTasks,
force: Annotated[bool, Query(description="Bypass fingerprint check and re-glean even if file is unchanged")] = False,
) -> dict:
"""Trigger a re-glean for a configured source from sources.yaml.
Handles both local file sources and SSH remote sources. For SSH sources,
the glean runs in the foreground and rebuilds the FTS index before returning
(same behaviour as local sources; callers can rely on the count being final
when the response arrives).
Use ``?force=true`` to bypass the fingerprint cache and re-glean the file
even if mtime and size appear unchanged since the last run.
"""
sources_file = PATTERN_DIR / "sources.yaml"
if not sources_file.exists():
raise HTTPException(status_code=404, detail="sources.yaml not found")
@ -459,21 +538,31 @@ def reingest_source(source_id: str, background_tasks: BackgroundTasks) -> dict:
matching = [s for s in config.get("sources", []) if s.get("id") == source_id]
if not matching:
raise HTTPException(status_code=404, detail=f"Source {source_id!r} not in sources.yaml")
src_path = Path(matching[0]["path"])
src = matching[0]
if src.get("transport") == "ssh":
# SSH sources: open connection, glean all items, rebuild FTS inline.
# Fingerprint skipping applies only to local file sources.
stats = _glean_ssh_source(src, DB_PATH, PATTERN_FILE)
return {"source_id": source_id, "gleaned": sum(stats.values())}
# Local file source.
src_path = Path(src["path"])
if not src_path.exists():
raise HTTPException(status_code=422, detail=f"Path does not exist: {src_path}")
stats = _ingest_file(src_path, DB_PATH, PATTERN_FILE)
stats = _glean_file(src_path, DB_PATH, PATTERN_FILE, force=force)
background_tasks.add_task(build_fts_index, DB_PATH)
return {"source_id": source_id, "ingested": stats.get(source_id, sum(stats.values()))}
return {"source_id": source_id, "gleaned": stats.get(source_id, sum(stats.values()))}
@router.post("/api/ingest/upload")
async def ingest_upload(
@router.post("/api/glean/upload")
async def glean_upload(
file: UploadFile,
source_id: Annotated[str | None, Query(description="Override source ID (defaults to filename)")] = None,
background_tasks: BackgroundTasks = None,
) -> dict:
"""Accept a multipart log file, auto-detect format, ingest into DB."""
"""Accept a multipart log file, auto-detect format, glean into DB."""
sid = source_id or Path(file.filename or "upload").stem
content = await file.read()
with tempfile.NamedTemporaryFile(
@ -483,13 +572,13 @@ async def ingest_upload(
tmp.write(content)
tmp_path = Path(tmp.name)
try:
stats = _ingest_file(tmp_path, DB_PATH, PATTERN_FILE)
stats = _glean_file(tmp_path, DB_PATH, PATTERN_FILE)
finally:
tmp_path.unlink(missing_ok=True)
if background_tasks is not None:
background_tasks.add_task(build_fts_index, DB_PATH)
total = sum(stats.values())
return {"source_id": sid, "ingested": total, "stats": stats}
return {"source_id": sid, "gleaned": total, "stats": stats}
class BatchEntry(BaseModel):
@ -506,20 +595,20 @@ class BatchEntry(BaseModel):
text: str
class BatchIngestRequest(BaseModel):
class BatchGleanRequest(BaseModel):
source_host: str = "unknown"
entries: list[BatchEntry]
@router.post("/api/ingest/batch")
def ingest_batch(payload: BatchIngestRequest, background_tasks: BackgroundTasks) -> dict:
@router.post("/api/glean/batch")
def glean_batch(payload: BatchGleanRequest, background_tasks: BackgroundTasks) -> dict:
"""Accept pre-parsed log entries from a remote Turnstone instance (submission protocol).
Used by nodes with TURNSTONE_SUBMIT_ENDPOINT configured to push their
pattern-matched entries to a central receiving instance.
"""
if not payload.entries:
return {"ingested": 0}
return {"gleaned": 0}
conn = sqlite3.connect(str(DB_PATH))
conn.execute("PRAGMA journal_mode=WAL")
conn.executemany(
@ -550,13 +639,13 @@ def ingest_batch(payload: BatchIngestRequest, background_tasks: BackgroundTasks)
conn.commit()
conn.close()
background_tasks.add_task(build_fts_index, DB_PATH)
return {"ingested": len(payload.entries), "source_host": payload.source_host}
return {"gleaned": len(payload.entries), "source_host": payload.source_host}
@router.get("/api/tasks/ingest/status")
def ingest_task_status() -> dict:
"""Return the current state of the periodic batch ingest scheduler."""
s = _ingest_state()
@router.get("/api/tasks/glean/status")
def glean_task_status() -> dict:
"""Return the current state of the periodic glean scheduler."""
s = _glean_state()
return {
"running": s.running,
"run_count": s.run_count,
@ -565,8 +654,8 @@ def ingest_task_status() -> dict:
"last_stats": s.last_stats,
"last_error": s.last_error,
"next_run_at": s.next_run_at,
"interval_s": INGEST_INTERVAL,
"scheduler_active": INGEST_INTERVAL > 0 and (PATTERN_DIR / "sources.yaml").exists(),
"interval_s": GLEAN_INTERVAL,
"scheduler_active": GLEAN_INTERVAL > 0 and (PATTERN_DIR / "sources.yaml").exists(),
"submit_endpoint": SUBMIT_ENDPOINT or None,
"last_submitted_at": s.last_submitted_at,
"last_submit_count": s.last_submit_count,
@ -574,21 +663,28 @@ def ingest_task_status() -> dict:
}
@router.post("/api/tasks/ingest")
async def trigger_ingest() -> dict:
"""Manually trigger a batch ingest of all configured sources. No-ops if already running."""
@router.post("/api/tasks/glean")
async def trigger_glean(
force: Annotated[bool, Query(description="Bypass fingerprint check and re-glean all sources")] = False,
) -> dict:
"""Manually trigger a glean of all configured sources. No-ops if already running.
Use ``?force=true`` to bypass the fingerprint cache and re-glean every local
file source even when mtime and size are unchanged since the last run.
"""
sources_file = PATTERN_DIR / "sources.yaml"
if not sources_file.exists():
raise HTTPException(status_code=404, detail="sources.yaml not found — configure log sources first")
return await _run_ingest(
return await _run_glean(
sources_file, DB_PATH, PATTERN_FILE,
submit_endpoint=SUBMIT_ENDPOINT or None,
source_host=SOURCE_HOST,
force=force,
)
@router.post("/api/ingest/wazuh/alert")
async def ingest_wazuh_alert(
@router.post("/api/glean/wazuh/alert")
async def glean_wazuh_alert(
alert: dict,
source_id: Annotated[str | None, Query(description="Source label (defaults to 'wazuh')")] = None,
background_tasks: BackgroundTasks = None,
@ -769,8 +865,8 @@ def _tautulli_write_entry(conn: sqlite3.Connection, entry) -> None:
)
@router.post("/api/ingest/tautulli")
def ingest_tautulli(
@router.post("/api/glean/tautulli")
def glean_tautulli(
payload: dict,
request: Request,
background_tasks: BackgroundTasks,

View file

@ -0,0 +1,357 @@
"""Frictionless diagnose service — NL time extraction + layered log search.
This module is the public interface for the diagnose package.
Full implementation lives here so that patch("app.services.diagnose._HAS_DATEPARSER")
and patch("app.services.diagnose._search_dates") continue to target the correct
namespace, preserving backward compatibility with existing tests.
The verbatim original is preserved in legacy.py for reference.
"""
from __future__ import annotations
import asyncio
import dataclasses
import logging
import os
import re
from collections.abc import AsyncGenerator
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any
from app.context.retriever import retrieve_context, format_context_block
from app.services.llm import summarize
from app.services.search import SearchResult, entries_in_window, search
from app.services.diagnose.pipeline import run_pipeline
logger = logging.getLogger(__name__)
try:
from dateparser.search import search_dates as _search_dates # type: ignore[import]
_HAS_DATEPARSER = True
except ImportError:
_search_dates = None # type: ignore[assignment]
_HAS_DATEPARSER = False
_RELATIVE_RE = re.compile(
r"\b(?:last|past)\s+(?:(?P<n>\d+)|(?P<approx>a\s+few|few|couple(?:\s+of)?|several))?\s*(?P<unit>minute|hour|day|week)s?\b",
re.IGNORECASE,
)
_RELATIVE_UNITS = {"minute": 1, "hour": 60, "day": 1440, "week": 10080}
# Fuzzy quantifiers map to a reasonable span so "last few hours" → 3h window
_APPROX_N = 3
def _relative_window(match: re.Match) -> tuple[str, str]:
    """Turn a ``_RELATIVE_RE`` match into a ``(since_iso, until_iso)`` pair.

    The count comes from the explicit number when present, from ``_APPROX_N``
    when a fuzzy quantifier ("few", "couple of", ...) matched, and is 1
    otherwise ("last hour").  The window ends now.
    """
    unit_key = match.group("unit").lower()
    explicit = match.group("n")
    if explicit:
        count = int(explicit)
    elif match.group("approx"):
        count = _APPROX_N
    else:
        count = 1
    span_minutes = count * _RELATIVE_UNITS[unit_key]
    window_start = _last_n_minutes(span_minutes)
    window_end = _now_iso()
    return window_start, window_end
def parse_time_window(query: str) -> tuple[str | None, str | None, str]:
    """Pull a time window out of a natural-language query.

    Returns ``(since_iso, until_iso, keywords)``. ``keywords`` is the query
    with the recognised time phrase removed; if removing it leaves nothing,
    the original query is returned instead. When no time phrase is
    recognised the window is the last 60 minutes and ``keywords`` is the
    unmodified query.
    """
    # Relative phrases ("last hour", "past 30 minutes") are handled by our own
    # regex first because dateparser misreads them as absolute times.
    relative = _RELATIVE_RE.search(query)
    if relative is not None:
        start_iso, end_iso = _relative_window(relative)
        remainder = query[: relative.start()] + query[relative.end() :]
        remainder = re.sub(r"\s{2,}", " ", remainder).strip()
        return start_iso, end_iso, remainder or query

    hits = None
    if _HAS_DATEPARSER and _search_dates is not None:
        # Give dateparser the user's UTC offset so a bare "3:35 am" is read as
        # local time; PREFER_DATES_FROM=past picks the most recent occurrence.
        # NOTE(review): the offset is truncated to whole hours, so zones such
        # as UTC+5:30 are passed as "UTC+5" — confirm whether that matters.
        utc_offset = datetime.now().astimezone().utcoffset()
        offset_seconds = utc_offset.total_seconds() if utc_offset else 0
        whole_hours = int(offset_seconds / 3600)
        sign = "+" if whole_hours >= 0 else ""
        tz_name = f"UTC{sign}{whole_hours}"
        parser_settings = {
            "PREFER_DATES_FROM": "past",
            "TIMEZONE": tz_name,
            "RETURN_AS_TIMEZONE_AWARE": True,
        }
        try:
            hits = _search_dates(query, languages=["en"], settings=parser_settings)
        except Exception as exc:
            logger.warning(
                "dateparser failed (%s) on query %r — falling back to 60-min window",
                type(exc).__name__,
                query,
            )
            hits = None

    if not hits:
        return _last_n_minutes(60), _now_iso(), query

    # Only the first recognised phrase is used; the window is ±30 minutes.
    phrase, moment = hits[0]
    if moment.tzinfo is None:
        moment = moment.replace(tzinfo=timezone.utc)
    else:
        # Normalise to UTC so ISO strings compare correctly in SQLite.
        moment = moment.astimezone(timezone.utc)
    half_window = timedelta(minutes=30)
    start_iso = (moment - half_window).isoformat()
    end_iso = (moment + half_window).isoformat()
    remainder = re.sub(r"\s{2,}", " ", query.replace(phrase, " ").strip())
    return start_iso, end_iso, remainder or query
def diagnose(
    db_path: Path,
    query: str,
    since: str | None = None,
    until: str | None = None,
    source_filter: str | None = None,
    llm_url: str | None = None,
    llm_model: str | None = None,
    llm_api_key: str | None = None,
) -> dict[str, Any]:
    """Synchronous layered log search with natural-language time extraction.

    Combines keyword (OR-mode) hits with a capped sample of everything in the
    time window, de-duplicates by ``entry_id``, orders chronologically and
    keeps at most 200 entries. Returns a dict with ``summary`` (counts and
    window), ``reasoning`` (LLM text, or ``None`` when no LLM is configured)
    and ``entries``.
    """
    if since is not None and until is not None:
        # Caller pinned the full window: the query is used verbatim.
        time_detected = True
        keywords = query
    else:
        inferred_since, inferred_until, keywords = parse_time_window(query)
        since = since or inferred_since
        until = until or inferred_until
        # A time phrase was found iff the parser stripped something.
        time_detected = keywords != query

    fts_matches = search(
        db_path,
        query=keywords,
        since=since,
        until=until,
        source_filter=source_filter,
        limit=150,
        or_mode=True,
    )
    ambient = entries_in_window(
        db_path,
        since=since,
        until=until,
        source_filter=source_filter,
        limit=50,
        per_source_cap=15,
    )

    # First occurrence wins, so keyword hits take precedence over window hits.
    unique: dict[str, SearchResult] = {}
    for hit in fts_matches + ambient:
        unique.setdefault(hit.entry_id, hit)

    def _chronological(hit: SearchResult) -> tuple[str, Any]:
        # Entries without a timestamp sort after every ISO string.
        return (hit.timestamp_iso or "\xff", hit.sequence)

    combined = sorted(unique.values(), key=_chronological)[:200]

    by_severity: dict[str, int] = dict.fromkeys(
        ("CRITICAL", "ERROR", "WARN", "INFO"), 0
    )
    by_source: dict[str, int] = {}
    for hit in combined:
        level = (hit.severity or "INFO").upper()
        if level in by_severity:
            by_severity[level] += 1
        by_source[hit.source_id] = by_source.get(hit.source_id, 0) + 1

    reasoning: str | None = (
        summarize(
            query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key
        )
        if llm_url and llm_model
        else None
    )

    summary = {
        "total": len(combined),
        "window_start": since,
        "window_end": until,
        "time_detected": time_detected,
        "by_severity": by_severity,
        "by_source": by_source,
    }
    return {"summary": summary, "reasoning": reasoning, "entries": combined}
async def diagnose_stream(
    db_path: Path,
    query: str,
    since: str | None = None,
    until: str | None = None,
    source_filter: str | None = None,
    llm_url: str | None = None,
    llm_model: str | None = None,
    llm_api_key: str | None = None,
) -> AsyncGenerator[dict[str, Any], None]:
    """Async generator yielding SSE event dicts for the diagnose pipeline.

    Blocking work (time parsing, context retrieval, DB search, LLM call) is
    pushed to worker threads with ``asyncio.to_thread``.

    Yields events in order:
        {"type": "status", "message": ...}               progress (several)
        {"type": "context", "facts": ..., "chunks": ...} retrieved context
        {"type": "summary", "data": {...}}               window + severity/source counts
        {"type": "entries", "data": [...]}               merged log entries as dicts

    then, when ``MULTI_AGENT_ENABLED`` is true, every event produced by
    ``run_pipeline`` (which emits its own "done"); otherwise:
        {"type": "reasoning", "text": ...}               only if an LLM is configured,
                                                         entries exist and text came back
        {"type": "done"}
    """
    keywords = query.strip()
    # An empty query plus a source filter means "just show me this source".
    source_browse = not keywords and source_filter is not None

    if source_browse:
        # No keyword — browsing a source directly. Use 24h window; skip FTS entirely.
        yield {"type": "status", "message": f"Loading {source_filter}"}
        since = since or _last_n_minutes(60 * 24)
        until = until or _now_iso()
        time_detected = False
    else:
        yield {"type": "status", "message": "Parsing time window…"}
        # Both bounds supplied by the caller counts as an explicit window.
        time_detected = since is not None and until is not None
        if not time_detected:
            parsed_since, parsed_until, keywords = await asyncio.to_thread(
                parse_time_window, query
            )
            # Any bound the caller did supply still wins over the parsed one.
            since = since or parsed_since
            until = until or parsed_until
            # The parser returns the query unchanged when it stripped nothing.
            time_detected = keywords != query

    yield {"type": "status", "message": "Loading environment context…"}
    ctx = await asyncio.to_thread(lambda: retrieve_context(db_path, query))
    context_block = format_context_block(ctx)
    yield {
        "type": "context",
        "facts": ctx.facts,
        "chunks": ctx.chunks,
    }

    yield {"type": "status", "message": "Searching logs…"}
    if source_browse:
        # Browsing: no keyword search, just up to 200 entries from the window.
        keyword_hits: list[SearchResult] = []
        window_hits = await asyncio.to_thread(
            lambda: entries_in_window(
                db_path,
                since,
                until,
                source_filter=source_filter,
                limit=200,
            )
        )
    else:
        # Run the keyword search and the window sample concurrently.
        keyword_hits, window_hits = await asyncio.gather(
            asyncio.to_thread(
                lambda: search(
                    db_path,
                    keywords,
                    source_filter=source_filter,
                    since=since,
                    until=until,
                    limit=150,
                    or_mode=True,
                )
            ),
            asyncio.to_thread(
                lambda: entries_in_window(
                    db_path,
                    since,
                    until,
                    source_filter=source_filter,
                    limit=50,
                    per_source_cap=15,
                )
            ),
        )

    # De-duplicate by entry_id, keeping the first occurrence (keyword hits first).
    seen: set[str] = set()
    merged: list[SearchResult] = []
    for r in keyword_hits + window_hits:
        if r.entry_id not in seen:
            seen.add(r.entry_id)
            merged.append(r)

    # Chronological order; entries without a timestamp sort last ("\xff").
    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
        :200
    ]

    by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
    by_source: dict[str, int] = {}
    for r in combined:
        # Missing severity counts as INFO; levels outside the four buckets
        # (e.g. DEBUG) are not counted.
        sev = (r.severity or "INFO").upper()
        if sev in by_severity:
            by_severity[sev] += 1
        by_source[r.source_id] = by_source.get(r.source_id, 0) + 1

    yield {
        "type": "summary",
        "data": {
            "total": len(combined),
            "window_start": since,
            "window_end": until,
            "time_detected": time_detected,
            "by_severity": by_severity,
            "by_source": by_source,
        },
    }
    yield {"type": "entries", "data": [dataclasses.asdict(r) for r in combined]}

    if MULTI_AGENT_ENABLED:
        # Multi-agent mode: delegate everything after the entries event.
        async for event in run_pipeline(
            db_path=db_path,
            entries=combined,
            ctx=ctx,
            query=query,
            since=since,
            until=until,
            llm_url=llm_url,
            llm_model=llm_model,
            llm_api_key=llm_api_key,
        ):
            yield event
        return  # pipeline emits its own "done" event

    # Single-call mode: one LLM summary, skipped when there is nothing to analyse.
    if llm_url and llm_model and combined:
        yield {"type": "status", "message": "Analyzing with LLM…"}
        reasoning = await asyncio.to_thread(
            lambda: summarize(
                query,
                combined,
                llm_url,
                llm_model,
                llm_api_key,
                context_block=context_block,
            )
        )
        if reasoning:
            yield {"type": "reasoning", "text": reasoning}

    yield {"type": "done"}
def _now_iso() -> str:
return datetime.now(timezone.utc).isoformat()
def _last_n_minutes(n: int) -> str:
return (datetime.now(timezone.utc) - timedelta(minutes=n)).isoformat()
__all__ = [
"diagnose",
"diagnose_stream",
"parse_time_window",
]
# Feature flag for the multi-agent diagnose pipeline. True only when the
# TURNSTONE_MULTI_AGENT_DIAGNOSE environment variable equals "true"
# (case-insensitive); unset or any other value leaves it off. When on,
# diagnose_stream() delegates to run_pipeline() instead of making a single
# summarize() call. The variable is read once, when this module is imported.
MULTI_AGENT_ENABLED = (
    os.getenv("TURNSTONE_MULTI_AGENT_DIAGNOSE", "false").lower() == "true"
)

View file

@ -0,0 +1,249 @@
"""Stage 2: Severity Classifier — ML with two fallback levels.
Classification strategy (in priority order):
Path A ML: Hugging Face text-classification pipeline, loaded lazily.
Path B pattern_tags: Map cluster.pattern_tags through the loaded pattern
severity dict; pick the highest severity across matching tags.
Path C regex: Call detect_severity() from app.glean.base on the cluster's
representative_text.
Each cluster is classified independently. The ``classifier_used`` field on the
returned ``ClassifiedTimeline`` reflects the primary path (the one that governed
the overall classification session, not individual cluster fallbacks).
"""
from __future__ import annotations
import logging
import os
from pathlib import Path
from typing import Any
from app.services.diagnose.models import (
ClassifiedTimeline,
EventCluster,
SeverityLabel,
TimelineResult,
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Module-level ML singleton — reset to None between tests via the fixture
# ---------------------------------------------------------------------------
_ml_classifier: Any | None = None
def _get_ml_classifier(model_id: str, device: str) -> Any:
"""Return the cached HF pipeline, loading it on first call."""
global _ml_classifier # noqa: PLW0603
if _ml_classifier is None:
from transformers import pipeline as hf_pipeline # type: ignore[import-untyped]
_ml_classifier = hf_pipeline(
"text-classification", model=model_id, device=device
)
return _ml_classifier
# ---------------------------------------------------------------------------
# Label mapping
# ---------------------------------------------------------------------------
_LABEL_MAP: dict[str, SeverityLabel] = {
"ERROR": "ERROR",
"WARNING": "WARN",
"WARN": "WARN",
"INFO": "INFO",
"DEBUG": "DEBUG",
"CRITICAL": "CRITICAL",
}
_CRITICAL_KEYWORDS: frozenset[str] = frozenset(
{
"panic",
"oom",
"fatal",
"critical",
"kernel panic",
"out of memory",
"segfault",
"segmentation fault",
}
)
_SEVERITY_ORDER: dict[str | None, int] = {
"CRITICAL": 5,
"ERROR": 4,
"WARN": 3,
"WARNING": 3,
"INFO": 2,
"DEBUG": 1,
None: 0,
}
def _map_label(label: str, score: float, text: str) -> SeverityLabel:
"""Apply the severity shim: promote to CRITICAL or demote to DEBUG where warranted."""
upper = label.upper()
if upper == "ERROR" and score > 0.95 and any(
k in text.lower() for k in _CRITICAL_KEYWORDS
):
return "CRITICAL"
if upper == "INFO" and score < 0.4:
return "DEBUG"
return _LABEL_MAP.get(upper, "UNKNOWN") # type: ignore[return-value]
def _highest_from_tags(
tags: tuple[str, ...], severity_map: dict[str, str]
) -> SeverityLabel | None:
"""Return the highest severity from the pattern_tags that appear in severity_map."""
best: str | None = None
best_rank = -1
for tag in tags:
sev = severity_map.get(tag)
rank = _SEVERITY_ORDER.get(sev, 0)
if rank > best_rank:
best_rank = rank
best = sev
if best is None:
return None
normalised = "WARN" if best.upper() == "WARNING" else best.upper()
return normalised # type: ignore[return-value]
# ---------------------------------------------------------------------------
# SeverityClassifier
# ---------------------------------------------------------------------------
class SeverityClassifier:
    """Classify each EventCluster's severity using ML, patterns, or regex fallback.

    Parameters
    ----------
    model_id:
        Hugging Face model identifier. When empty (default), ML is skipped.
    device:
        Torch device string passed to the HF pipeline (e.g. ``"cpu"`` or ``"cuda:0"``).
    pattern_file:
        Path to the YAML pattern file. When ``None`` the classifier looks for
        ``default.yaml`` inside the directory named by the
        ``TURNSTONE_PATTERNS`` environment variable.
    """

    def __init__(
        self,
        model_id: str = "",
        device: str = "cpu",
        pattern_file: Path | None = None,
    ) -> None:
        self._model_id = model_id
        self._device = device
        self._pattern_file: Path | None = pattern_file
        # pattern name -> severity string, filled lazily by _ensure_patterns_loaded
        self._pattern_severity: dict[str, str] = {}
        self._patterns_loaded = False

    # ------------------------------------------------------------------
    # Lazy loaders
    # ------------------------------------------------------------------
    def _resolve_pattern_file(self) -> Path | None:
        """Resolve the pattern file from the constructor arg or the env var.

        Returns ``None`` when neither is set.
        """
        if self._pattern_file is not None:
            return self._pattern_file
        env_dir = os.environ.get("TURNSTONE_PATTERNS")
        if env_dir:
            return Path(env_dir) / "default.yaml"
        return None

    def _ensure_patterns_loaded(self) -> None:
        """Populate ``_pattern_severity`` from the pattern YAML file (once).

        A failure to load the file is logged and leaves the map empty rather
        than propagating: the regex path needs no patterns, so classification
        can still proceed. The load is not retried on later calls.
        """
        if self._patterns_loaded:
            return
        # Set the flag first so a failing load is attempted only once.
        self._patterns_loaded = True
        path = self._resolve_pattern_file()
        if path is None:
            return
        from app.glean.base import load_patterns

        try:
            patterns = load_patterns(path)
            self._pattern_severity = {p.name: p.severity for p in patterns}
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "Could not load severity patterns from %s (%s: %s) — "
                "continuing without pattern_tags classification",
                path,
                type(exc).__name__,
                exc,
            )
            self._pattern_severity = {}

    # ------------------------------------------------------------------
    # Per-cluster classification helpers
    # ------------------------------------------------------------------
    def _classify_cluster_ml(self, cluster: EventCluster) -> SeverityLabel | None:
        """Attempt ML classification. Returns None on any inference failure.

        Failures include the pipeline failing to load; each one is logged at
        warning level with the exception type and message.
        """
        try:
            pipe = _get_ml_classifier(self._model_id, self._device)
            results = pipe(cluster.representative_text)
            if not results:
                return None
            hit = results[0]  # only the first prediction is used
            return _map_label(hit["label"], hit["score"], cluster.representative_text)
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "ML inference failed for cluster %s (%s: %s) — falling back",
                cluster.cluster_id,
                type(exc).__name__,
                exc,
            )
            return None

    def _classify_cluster_pattern_tags(
        self, cluster: EventCluster
    ) -> SeverityLabel | None:
        """Derive severity from the cluster's pattern_tags. Returns None if no match."""
        return _highest_from_tags(cluster.pattern_tags, self._pattern_severity)

    def _classify_cluster_regex(self, cluster: EventCluster) -> SeverityLabel:
        """Classify by scanning representative_text with the severity regex.

        Always returns a label: ``"INFO"`` when nothing is detected or the
        detected value is not in ``_LABEL_MAP``.
        """
        from app.glean.base import detect_severity

        raw = detect_severity(cluster.representative_text)
        if raw is None:
            return "INFO"
        return _LABEL_MAP.get(raw.upper(), "INFO")  # type: ignore[return-value]

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def classify(self, timeline: TimelineResult) -> ClassifiedTimeline:
        """Classify every cluster in *timeline* and return a ``ClassifiedTimeline``.

        Each cluster tries ML (if a model id is set), then pattern tags (if
        any patterns loaded), then regex. ``classifier_used`` names the
        highest-priority path available for this call, not the path that
        produced each individual cluster's label.
        """
        self._ensure_patterns_loaded()

        # Determine which primary path governs this session
        ml_available = bool(self._model_id)
        patterns_available = bool(self._pattern_severity)
        if ml_available:
            classifier_used: str = "ml"
        elif patterns_available:
            classifier_used = "pattern_tags"
        else:
            classifier_used = "regex"

        cluster_severities: dict[str, SeverityLabel] = {}
        for cluster in timeline.clusters:
            severity: SeverityLabel | None = None
            if ml_available:
                severity = self._classify_cluster_ml(cluster)
            if severity is None and patterns_available:
                severity = self._classify_cluster_pattern_tags(cluster)
            if severity is None:
                # Regex never returns None, so every cluster gets a label.
                severity = self._classify_cluster_regex(cluster)
            cluster_severities[cluster.cluster_id] = severity

        return ClassifiedTimeline(
            timeline=timeline,
            cluster_severities=cluster_severities,
            classifier_used=classifier_used,  # type: ignore[arg-type]
            model_id=self._model_id if ml_available else None,
        )

View file

@ -0,0 +1,216 @@
"""Stage 3: Root-Cause Hypothesizer — LLM + RAG context."""
from __future__ import annotations
import json
import logging
from uuid import uuid4
import httpx
from app.context.retriever import RetrievedContext
from app.services.diagnose.models import (
ClassifiedTimeline,
EventCluster,
Hypothesis,
SeverityLabel,
)
logger = logging.getLogger(__name__)
_VALID_SEVERITIES: frozenset[str] = frozenset({"CRITICAL", "ERROR", "WARN", "INFO", "DEBUG"})
_SYSTEM_PROMPT = (
"You are a Linux sysadmin log analyst. Analyze the following clustered log timeline "
"and generate 2-4 root cause hypotheses as a JSON array.\n\n"
"Each hypothesis must follow this exact JSON schema:\n"
'{"title": str (≤80 chars), "description": str (2-4 sentences), '
'"confidence": float (0.0-1.0), "severity": str (one of: CRITICAL, ERROR, WARN, INFO), '
'"supporting_clusters": [str list of cluster IDs]}\n\n'
"Return ONLY a valid JSON array. No prose, no markdown, no explanation outside the JSON."
)
def _coerce_float(val: object, default: float) -> float:
"""Safely coerce LLM output to float, returning default on failure."""
try:
return float(val) # type: ignore[arg-type]
except (TypeError, ValueError):
return default
def _validate_severity(s: str) -> SeverityLabel:
"""Map a raw severity string to a valid SeverityLabel, defaulting to ERROR."""
upper = s.upper()
if upper == "WARNING":
return "WARN"
return upper if upper in _VALID_SEVERITIES else "ERROR" # type: ignore[return-value]
def _cluster_summary(cluster: EventCluster, severity: str) -> str:
"""Build a condensed single-line summary of a cluster for the prompt."""
sources = ", ".join(list(cluster.source_ids)[:3])
patterns = ", ".join(list(cluster.pattern_tags)[:5])
text_preview = cluster.representative_text[:200]
summary = (
f"[{severity}] {cluster.start_iso or 'unknown'} "
f"({sources}) — {text_preview}"
)
if patterns:
summary += f" [patterns: {patterns}]"
return summary
def _extract_content(resp_json: dict) -> str | None:
"""Pull text content from an OpenAI-compat chat completion response."""
choices = resp_json.get("choices") or []
if not choices:
return None
return (choices[0].get("message", {}).get("content") or "").strip() or None
class RootCauseHypothesizer:
    """Generate ranked root-cause hypotheses from a classified log timeline."""

    def __init__(self, max_hypotheses: int = 4) -> None:
        # Upper bound requested in the prompt and enforced when parsing.
        self._max_hypotheses = max_hypotheses

    def hypothesize(
        self,
        classified: ClassifiedTimeline,
        ctx: RetrievedContext,
        query: str,
        llm_url: str | None = None,
        llm_model: str | None = None,
        llm_api_key: str | None = None,
    ) -> list[Hypothesis]:
        """Generate hypotheses from a classified timeline and RAG context.

        Returns an empty list when no LLM is configured, there are no clusters
        to analyse, the LLM call fails, or its reply cannot be parsed.
        """
        if not llm_url or not llm_model:
            return []
        clusters = classified.timeline.clusters
        if not clusters:
            return []

        # One line per cluster; the classifier's label wins over the cluster's own.
        cluster_lines = [
            _cluster_summary(c, classified.cluster_severities.get(c.cluster_id, c.severity))
            for c in clusters
        ]
        cluster_block = "\n".join(cluster_lines)

        # At most 5 context chunks, each cut to 300 characters.
        context_parts: list[str] = []
        for chunk in ctx.chunks[:5]:
            filename = chunk.get("filename", "unknown")
            # "text" may be present but None; treat that as empty.
            text = str(chunk.get("text") or "")[:300]
            context_parts.append(f"[{filename}] {text}")
        context_block = "\n".join(context_parts) if context_parts else "(none)"

        user_message = (
            f"Query: {query}\n\n"
            f"Context from runbooks and known patterns:\n{context_block}\n\n"
            f"Log timeline (clustered, {len(clusters)} clusters):\n{cluster_block}\n\n"
            f"Generate up to {self._max_hypotheses} hypotheses. Return JSON array only."
        )
        messages = [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ]
        raw_response = self._call_llm(
            llm_url=llm_url,
            llm_model=llm_model,
            llm_api_key=llm_api_key,
            messages=messages,
        )
        if raw_response is None:
            return []
        return self._parse_response(raw_response)

    def _call_llm(
        self,
        llm_url: str,
        llm_model: str,
        llm_api_key: str | None,
        messages: list[dict],
    ) -> str | None:
        """Send messages to the LLM and return raw text content.

        Tries the task-based endpoint first; a 200 response is final (even if
        it carries no content). Any other outcome falls back to the
        OpenAI-compatible ``/v1/chat/completions`` endpoint. Returns ``None``
        when the fallback fails too.
        """
        headers = {"Authorization": f"Bearer {llm_api_key}"} if llm_api_key else {}

        # Try cf-orch task-based endpoint first.
        task_url = f"{llm_url.rstrip('/')}/api/inference/task"
        try:
            resp = httpx.post(
                task_url,
                json={
                    "product": "turnstone",
                    "task": "log_analysis",
                    "payload": {"messages": messages, "stream": False},
                },
                headers=headers,
                timeout=120.0,
            )
            if resp.status_code == 200:
                return _extract_content(resp.json())
            if resp.status_code != 404:
                # Error statuses raise here and are handled by the except below.
                resp.raise_for_status()
            logger.debug(
                "No task assignment for turnstone.log_analysis — falling back to direct model"
            )
        except Exception as exc:
            logger.debug("Task endpoint unavailable (%s) — falling back to direct model", exc)

        # Fallback: OpenAI-compat endpoint with explicit model name.
        try:
            resp = httpx.post(
                f"{llm_url.rstrip('/')}/v1/chat/completions",
                json={"model": llm_model, "messages": messages, "stream": False},
                headers=headers,
                timeout=120.0,
            )
            resp.raise_for_status()
            return _extract_content(resp.json())
        except Exception as exc:
            logger.warning(
                "LLM hypothesizer failed (%s): %s", type(exc).__name__, exc
            )
            return None

    @staticmethod
    def _strip_code_fence(raw: str) -> str:
        """Remove a surrounding markdown code fence (``` or ```json) if present."""
        text = raw.strip()
        if not text.startswith("```"):
            return text
        # Drop the opening fence line, which may carry a language tag.
        newline = text.find("\n")
        text = text[newline + 1 :] if newline != -1 else text[3:]
        text = text.rstrip()
        if text.endswith("```"):
            text = text[:-3]
        return text.strip()

    @classmethod
    def _load_json_array(cls, raw: str) -> object:
        """Decode the reply as JSON, tolerating fences and surrounding prose.

        Raises ``json.JSONDecodeError`` when no decodable JSON is found.
        """
        cleaned = cls._strip_code_fence(raw)
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError:
            # Models sometimes add prose around the array; try the outermost
            # bracketed span before giving up.
            start, end = cleaned.find("["), cleaned.rfind("]")
            if start == -1 or end <= start:
                raise
            return json.loads(cleaned[start : end + 1])

    def _parse_response(self, raw: str) -> list[Hypothesis]:
        """Parse the LLM JSON response into a list of Hypothesis objects.

        Tolerates markdown fences and prose around the JSON array. Non-dict
        items are skipped; at most ``max_hypotheses`` leading items are
        considered. Confidence is clamped to [0.0, 1.0] and cluster ids are
        coerced to strings. Returns ``[]`` when no JSON array can be decoded.
        """
        try:
            data = self._load_json_array(raw)
        except json.JSONDecodeError:
            logger.warning(
                "Hypothesizer: invalid JSON from LLM (truncated): %.120s", raw
            )
            return []
        if not isinstance(data, list):
            logger.warning(
                "Hypothesizer: expected JSON array, got %s", type(data).__name__
            )
            return []

        hypotheses: list[Hypothesis] = []
        for item in data[: self._max_hypotheses]:
            if not isinstance(item, dict):
                continue
            severity_raw = item.get("severity", "ERROR")
            severity = _validate_severity(str(severity_raw))

            confidence = _coerce_float(item.get("confidence"), 0.5)
            if confidence != confidence:  # NaN never compares equal to itself
                confidence = 0.5
            confidence = min(1.0, max(0.0, confidence))

            raw_ids = item.get("supporting_clusters") or []
            if isinstance(raw_ids, str):
                # A bare id, not a sequence of characters.
                raw_ids = [raw_ids]
            elif not isinstance(raw_ids, (list, tuple)):
                raw_ids = []
            cluster_ids = tuple(str(cid) for cid in raw_ids)

            hypothesis = Hypothesis(
                hypothesis_id=str(uuid4()),
                title=str(item.get("title", "Unknown"))[:80],
                description=str(item.get("description", "")),
                confidence=confidence,
                supporting_cluster_ids=cluster_ids,
                runbook_refs=(),
                severity=severity,
            )
            hypotheses.append(hypothesis)
        return hypotheses

View file

@ -1,4 +1,5 @@
"""Frictionless diagnose service — NL time extraction + layered log search."""
from __future__ import annotations
import asyncio
@ -18,6 +19,7 @@ logger = logging.getLogger(__name__)
try:
from dateparser.search import search_dates as _search_dates # type: ignore[import]
_HAS_DATEPARSER = True
except ImportError:
_search_dates = None # type: ignore[assignment]
@ -68,17 +70,25 @@ def parse_time_window(query: str) -> tuple[str | None, str | None, str]:
results = _search_dates(
query,
languages=["en"],
settings={"PREFER_DATES_FROM": "past", "TIMEZONE": tz_str, "RETURN_AS_TIMEZONE_AWARE": True},
settings={
"PREFER_DATES_FROM": "past",
"TIMEZONE": tz_str,
"RETURN_AS_TIMEZONE_AWARE": True,
},
)
except Exception:
logger.warning("dateparser failed on query %r — falling back to 60-min window", query)
logger.warning(
"dateparser failed on query %r — falling back to 60-min window", query
)
results = None
if results:
phrase, dt = results[0]
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
else:
dt = dt.astimezone(timezone.utc) # normalise to UTC for SQLite string compare
dt = dt.astimezone(
timezone.utc
) # normalise to UTC for SQLite string compare
since = (dt - timedelta(minutes=30)).isoformat()
until = (dt + timedelta(minutes=30)).isoformat()
keywords = re.sub(r"\s{2,}", " ", query.replace(phrase, " ").strip())
@ -107,8 +117,23 @@ def diagnose(
else:
keywords = query
keyword_hits = search(db_path, query=keywords, since=since, until=until, source_filter=source_filter, limit=150, or_mode=True)
window_hits = entries_in_window(db_path, since=since, until=until, source_filter=source_filter, limit=50, per_source_cap=15)
keyword_hits = search(
db_path,
query=keywords,
since=since,
until=until,
source_filter=source_filter,
limit=150,
or_mode=True,
)
window_hits = entries_in_window(
db_path,
since=since,
until=until,
source_filter=source_filter,
limit=50,
per_source_cap=15,
)
seen: set[str] = set()
merged: list[SearchResult] = []
@ -117,7 +142,9 @@ def diagnose(
seen.add(r.entry_id)
merged.append(r)
combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[:200]
combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
:200
]
by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
by_source: dict[str, int] = {}
@ -129,7 +156,9 @@ def diagnose(
reasoning: str | None = None
if llm_url and llm_model:
reasoning = summarize(query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key)
reasoning = summarize(
query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key
)
return {
"summary": {
@ -177,7 +206,9 @@ async def diagnose_stream(
yield {"type": "status", "message": "Parsing time window…"}
time_detected = since is not None and until is not None
if not time_detected:
parsed_since, parsed_until, keywords = await asyncio.to_thread(parse_time_window, query)
parsed_since, parsed_until, keywords = await asyncio.to_thread(
parse_time_window, query
)
since = since or parsed_since
until = until or parsed_until
time_detected = keywords != query
@ -197,23 +228,34 @@ async def diagnose_stream(
keyword_hits: list[SearchResult] = []
window_hits = await asyncio.to_thread(
lambda: entries_in_window(
db_path, since, until,
source_filter=source_filter, limit=200,
db_path,
since,
until,
source_filter=source_filter,
limit=200,
)
)
else:
keyword_hits, window_hits = await asyncio.gather(
asyncio.to_thread(
lambda: search(
db_path, keywords,
source_filter=source_filter, since=since, until=until,
limit=150, or_mode=True,
db_path,
keywords,
source_filter=source_filter,
since=since,
until=until,
limit=150,
or_mode=True,
)
),
asyncio.to_thread(
lambda: entries_in_window(
db_path, since, until,
source_filter=source_filter, limit=50, per_source_cap=15,
db_path,
since,
until,
source_filter=source_filter,
limit=50,
per_source_cap=15,
)
),
)
@ -225,7 +267,9 @@ async def diagnose_stream(
seen.add(r.entry_id)
merged.append(r)
combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[:200]
combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
:200
]
by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
by_source: dict[str, int] = {}
@ -251,7 +295,14 @@ async def diagnose_stream(
if llm_url and llm_model and combined:
yield {"type": "status", "message": "Analyzing with LLM…"}
reasoning = await asyncio.to_thread(
lambda: summarize(query, combined, llm_url, llm_model, llm_api_key, context_block=context_block)
lambda: summarize(
query,
combined,
llm_url,
llm_model,
llm_api_key,
context_block=context_block,
)
)
if reasoning:
yield {"type": "reasoning", "text": reasoning}

View file

@ -0,0 +1,72 @@
"""Pipeline data types for the multi-agent diagnose pipeline."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Literal
SeverityLabel = Literal["CRITICAL", "ERROR", "WARN", "INFO", "DEBUG", "UNKNOWN"]
@dataclass(frozen=True)
class EventCluster:
    """A time-correlated group of log entries within the timeline.

    Immutable. Produced by the timeline stage and consumed by the classifier
    and hypothesizer stages.
    """

    # Identifier used as the key in ClassifiedTimeline.cluster_severities.
    cluster_id: str
    entries: tuple[str, ...]  # entry_id refs
    # Start/end of the cluster; presumably ISO-8601 strings, None when unknown
    # — TODO confirm format against the timeline stage.
    start_iso: str | None
    end_iso: str | None
    # Presumably end minus start, in seconds — confirm against the timeline stage.
    duration_seconds: float
    source_ids: tuple[str, ...]
    # Pattern names; the classifier looks these up in its pattern-severity map.
    pattern_tags: tuple[str, ...]
    # The cluster's own severity; used as a fallback when the classifier has
    # no entry for this cluster.
    severity: SeverityLabel
    # NOTE(review): the criteria for a "burst" are defined by the timeline
    # stage, not visible here.
    burst: bool
    # Presumably the quiet time preceding this cluster, in seconds — confirm.
    gap_before_seconds: float
    # Text handed to the ML and regex classifiers and previewed in LLM prompts.
    representative_text: str
@dataclass(frozen=True)
class TimelineResult:
    """Structured timeline of event clusters built from log entries.

    Immutable output of pipeline stage 1.
    """

    # The clusters; later stages iterate over these.
    clusters: tuple[EventCluster, ...]
    # Presumably the number of log entries the timeline was built from — confirm.
    total_entries: int
    # Presumably the bounds of the covered time span; None when unknown — confirm.
    window_start: str | None
    window_end: str | None
    # NOTE(review): what counts as a gap is decided by the timeline stage.
    gap_count: int
    # Reported in the stage-1 progress message.
    burst_count: int
    # Presumably the most frequent source ids — ordering/size not visible here.
    dominant_sources: tuple[str, ...]
@dataclass(frozen=True)
class ClassifiedTimeline:
    """Timeline annotated with a classifier-assigned severity per cluster.

    The severities may come from the ML model, pattern tags, or the regex
    fallback; ``classifier_used`` records which path was the primary one.
    """

    # The timeline that was classified, unchanged.
    timeline: TimelineResult
    # cluster_id -> assigned severity. The dataclass is frozen, but this dict
    # itself remains mutable.
    cluster_severities: dict[str, SeverityLabel]
    # Highest-priority classification path available for the run; individual
    # clusters may still have fallen back to a lower path.
    classifier_used: Literal["ml", "pattern_tags", "regex"]
    # Hugging Face model id when the ML path was enabled, otherwise None.
    model_id: str | None
@dataclass(frozen=True)
class Hypothesis:
    """A root-cause hypothesis generated by Stage 3.

    Immutable. Built by the hypothesizer from one item of the LLM's JSON reply.
    """

    # Unique id; the hypothesizer assigns a random UUID string.
    hypothesis_id: str
    # Short headline; the hypothesizer truncates it to 80 characters.
    title: str
    description: str
    # Model-reported confidence; the prompt asks for a value in 0.0-1.0.
    confidence: float
    # cluster_id values the model cited as evidence.
    supporting_cluster_ids: tuple[str, ...]
    # The hypothesizer currently always sets this to an empty tuple.
    runbook_refs: tuple[str, ...]
    severity: SeverityLabel
@dataclass(frozen=True)
class RankedHypothesis:
    """A hypothesis enriched by Stage 4 false-positive suppression."""

    # The Stage 3 hypothesis being ranked.
    hypothesis: Hypothesis
    # Presumably 1.0 means fully novel (the suppressor's module docs say 1.0 is
    # used when no embedding model is available) — confirm the scale.
    novelty_score: float
    # Presumably the best cosine similarity against resolved incidents — confirm.
    similarity_to_known: float
    # True when the hypothesis is considered a likely false positive; the
    # pipeline counts these as "suppressed".
    suppress: bool
    # Presumably None when suppress is False — confirm.
    suppression_reason: str | None

View file

@ -0,0 +1,132 @@
"""Multi-agent diagnose pipeline orchestrator — Stage 1–5 wiring."""
from __future__ import annotations
import asyncio
import dataclasses
import logging
from collections.abc import AsyncGenerator
from pathlib import Path
from typing import Any
from app.context.retriever import RetrievedContext
from app.services.diagnose.classifier import SeverityClassifier
from app.services.diagnose.hypothesizer import RootCauseHypothesizer
from app.services.diagnose.suppressor import FalsePositiveSuppressor
from app.services.diagnose.synthesizer import SummarySynthesizer
from app.services.diagnose.timeline import TimelineReconstructor
from app.services.search import SearchResult
logger = logging.getLogger(__name__)
async def run_pipeline(
    db_path: Path,
    entries: list[SearchResult],
    ctx: RetrievedContext,
    query: str,
    since: str | None,  # reserved for future range-filtering in stage queries (#29 follow-up)
    until: str | None,  # reserved for future range-filtering in stage queries (#29 follow-up)
    llm_url: str | None,
    llm_model: str | None,
    llm_api_key: str | None,
) -> AsyncGenerator[dict[str, Any], None]:
    """Async generator that runs all 5 pipeline stages and yields SSE event dicts.

    Each stage is a blocking call executed in a worker thread via
    ``asyncio.to_thread``; stages run strictly one after another. Every stage
    object is constructed with its default arguments. No exceptions are caught
    here, so a failing stage ends the stream without a "done" event.

    Stages:
        1. TimelineReconstructor   — cluster log entries by time
        2. SeverityClassifier      — annotate clusters with severity
        3. RootCauseHypothesizer   — generate hypotheses via LLM
        4. FalsePositiveSuppressor — rank and suppress known patterns
        5. SummarySynthesizer      — produce a narrative diagnosis

    Yields events in order:
        {"type": "status", "message": "Building timeline…"}
        {"type": "pipeline_stage", "stage": 1, ...}
        {"type": "pipeline_stage", "stage": 2, ...}
        {"type": "pipeline_stage", "stage": 3, ...}
        {"type": "pipeline_stage", "stage": 4, ...}
        {"type": "hypotheses", "data": [...]}
        {"type": "status", "message": "Synthesizing…"}
        {"type": "reasoning", "text": "..."}   only when synthesis produces text
        {"type": "done"}

    ``since`` and ``until`` are accepted but not used yet.
    """
    # Stage 1: Timeline reconstruction
    yield {"type": "status", "message": "Building timeline…"}
    timeline = await asyncio.to_thread(
        TimelineReconstructor().reconstruct, entries
    )
    n_clusters = len(timeline.clusters)
    burst = timeline.burst_count
    yield {
        "type": "pipeline_stage",
        "stage": 1,
        "name": "timeline",
        "message": f"Built {n_clusters} clusters, {burst} bursts",
    }

    # Stage 2: Severity classification
    classified = await asyncio.to_thread(
        SeverityClassifier().classify, timeline
    )
    # Tally clusters per severity for the progress message (sorted by label).
    sev_counts: dict[str, int] = {}
    for sev in classified.cluster_severities.values():
        sev_counts[sev] = sev_counts.get(sev, 0) + 1
    counts_str = ", ".join(f"{k}:{v}" for k, v in sorted(sev_counts.items()))
    yield {
        "type": "pipeline_stage",
        "stage": 2,
        "name": "classifier",
        "message": f"{classified.classifier_used} classifier: {counts_str}",
    }

    # Stage 3: Root-cause hypotheses
    hypotheses = await asyncio.to_thread(
        RootCauseHypothesizer().hypothesize,
        classified,
        ctx,
        query,
        llm_url,
        llm_model,
        llm_api_key,
    )
    yield {
        "type": "pipeline_stage",
        "stage": 3,
        "name": "hypotheses",
        "message": f"{len(hypotheses)} hypotheses generated",
    }

    # Stage 4: False-positive suppression
    ranked = await asyncio.to_thread(
        FalsePositiveSuppressor().suppress, hypotheses, db_path
    )
    suppressed = sum(1 for rh in ranked if rh.suppress)
    active = len(ranked) - suppressed
    yield {
        "type": "pipeline_stage",
        "stage": 4,
        "name": "suppressor",
        "message": f"{suppressed} suppressed, {active} active",
    }
    # All ranked hypotheses are sent, suppressed ones included; consumers can
    # filter on the "suppress" field.
    yield {
        "type": "hypotheses",
        "data": [dataclasses.asdict(rh) for rh in ranked],
    }

    # Stage 5: Summary synthesis
    # Unlike stages 1-4 this stage emits no "pipeline_stage" event.
    yield {"type": "status", "message": "Synthesizing…"}
    synthesis_text = await asyncio.to_thread(
        SummarySynthesizer().synthesize,
        ranked,
        timeline,
        ctx,
        query,
        llm_url,
        llm_model,
        llm_api_key,
    )
    if synthesis_text:
        yield {"type": "reasoning", "text": synthesis_text}

    yield {"type": "done"}

View file

@ -0,0 +1,275 @@
"""Stage 4: False-Positive Suppressor — embedding cosine similarity.
Compares each hypothesis against a corpus of resolved incidents using
embedding cosine similarity. Hypotheses that closely match a previously
resolved incident are suppressed as likely false positives.
When no embedding model is configured or the service is unavailable, all
hypotheses pass through with novelty_score=1.0 (full novelty assumed).
"""
from __future__ import annotations
import logging
import sqlite3
from pathlib import Path
from typing import Any
from app.services.diagnose.models import Hypothesis, RankedHypothesis
logger = logging.getLogger(__name__)
# Module-level corpus cache: db_path_str -> (corpus_texts, embeddings)
# Invalidated when the corpus text list changes between calls.
_corpus_cache: dict[str, tuple[list[str], Any]] = {}
# ---------------------------------------------------------------------------
# Cosine similarity helpers
# ---------------------------------------------------------------------------
try:
import numpy as np
def _cosine_similarities(
query_emb: list[float], corpus_embs: list[list[float]]
) -> list[float]:
"""Batch cosine similarity of one query embedding against all corpus embeddings."""
q = np.array(query_emb, dtype=np.float32)
c = np.array(corpus_embs, dtype=np.float32)
q_norm = q / (np.linalg.norm(q) + 1e-10)
c_norm = c / (np.linalg.norm(c, axis=1, keepdims=True) + 1e-10)
return list(c_norm @ q_norm)
_HAS_NUMPY = True
except ImportError: # pragma: no cover
import math
_HAS_NUMPY = False
def _dot(a: list[float], b: list[float]) -> float:
return sum(x * y for x, y in zip(a, b))
def _norm(a: list[float]) -> float:
return math.sqrt(sum(x * x for x in a)) + 1e-10
def _cosine(a: list[float], b: list[float]) -> float:
return _dot(a, b) / (_norm(a) * _norm(b))
def _cosine_similarities(
query_emb: list[float], corpus_embs: list[list[float]]
) -> list[float]:
return [_cosine(query_emb, c) for c in corpus_embs]
# ---------------------------------------------------------------------------
# DB helpers
# ---------------------------------------------------------------------------
def _fetch_resolved_incidents(db_path: Path) -> list[str]:
"""Fetch resolved incident texts from SQLite.
Returns a list of non-empty combined strings for each resolved incident.
Returns an empty list on any error (missing table, connection failure, etc.).
"""
try:
with sqlite3.connect(str(db_path)) as conn:
cursor = conn.execute(
"SELECT label, notes FROM incidents WHERE ended_at IS NOT NULL LIMIT 200"
)
rows = cursor.fetchall()
except sqlite3.OperationalError as exc:
logger.warning("Could not query resolved incidents (%s) — treating as empty corpus", exc)
return []
except sqlite3.Error as exc:
# Catches all remaining SQLite-family errors (IntegrityError, DatabaseError, etc.)
logger.warning("Unexpected SQLite error fetching resolved incidents (%s) — treating as empty corpus", exc)
return []
texts: list[str] = []
for label, notes in rows:
label = (label or "").strip()
notes = (notes or "").strip()
combined = f"{label}. {notes}" if label and notes else (label or notes)
if combined:
texts.append(combined)
return texts
# ---------------------------------------------------------------------------
# Public class
# ---------------------------------------------------------------------------
class FalsePositiveSuppressor:
    """Stage 4 of the multi-agent diagnose pipeline.

    Embeds each hypothesis and compares it, by cosine similarity, against the
    embedded texts of previously resolved incidents. A hypothesis whose best
    match reaches ``similarity_threshold`` is flagged ``suppress=True``.

    Every degraded path — empty ``model_id``, no embedder, no resolved
    incidents, or a failed corpus embedding — returns all hypotheses
    unsuppressed with ``novelty_score=1.0``.
    """

    def __init__(
        self,
        model_id: str = "",
        device: str = "cpu",
        similarity_threshold: float = 0.85,
    ) -> None:
        """Configure the suppressor.

        Args:
            model_id: Suppression only runs when this is non-empty. The embedder
                itself comes from ``get_embedder()``, which takes no arguments.
            device: Stored for future use when get_embedder() supports device
                selection; not read anywhere in this class.
            similarity_threshold: Suppress when cosine similarity to a known
                resolved incident is >= this value.
        """
        self._model_id = model_id
        self._device = device
        self._similarity_threshold = similarity_threshold

    def suppress(
        self,
        hypotheses: list[Hypothesis],
        db_path: Path,
    ) -> list[RankedHypothesis]:
        """Rank hypotheses by novelty, suppressing those matching resolved incidents.

        Args:
            hypotheses: Candidate hypotheses from Stage 3.
            db_path: Path to the Turnstone SQLite database containing incidents.

        Returns:
            List of RankedHypothesis sorted by (novelty_score * confidence)
            descending. Suppressed items are kept in the list, flagged.
        """
        if not hypotheses:
            return []

        # No model configured — full passthrough, rank by confidence only.
        if not self._model_id:
            return self._passthrough(hypotheses)

        # Attempt to obtain an embedder; fall back to passthrough on failure.
        embedder = self._load_embedder()
        if embedder is None:
            logger.warning(
                "Embedding service unavailable for model %r — skipping suppression",
                self._model_id,
            )
            return self._passthrough(hypotheses)

        # Fetch corpus texts from DB; fall back to passthrough if corpus is empty.
        corpus_texts = _fetch_resolved_incidents(db_path)
        if not corpus_texts:
            logger.debug("No resolved incidents found — all hypotheses treated as novel")
            return self._passthrough(hypotheses)

        # Embed corpus (with caching).
        corpus_embeddings = self._get_corpus_embeddings(embedder, corpus_texts, db_path)
        if not corpus_embeddings:
            # Corpus embedding failed (already logged). Without this guard every
            # hypothesis would be embedded and then scored against nothing.
            return self._passthrough(hypotheses)

        # Score each hypothesis and sort by novelty * confidence descending.
        ranked = [
            self._score_hypothesis(h, embedder, corpus_embeddings)
            for h in hypotheses
        ]
        ranked.sort(key=lambda rh: rh.novelty_score * rh.hypothesis.confidence, reverse=True)
        return ranked

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _score_hypothesis(
        self,
        hypothesis: Hypothesis,
        embedder: Any,
        corpus_embeddings: list[list[float]],
    ) -> RankedHypothesis:
        """Score a single hypothesis against the resolved incident corpus.

        The hypothesis is embedded as ``"<title>. <description>"`` and its
        highest similarity to any corpus vector decides suppression.
        """
        try:
            query_text = f"{hypothesis.title}. {hypothesis.description}"
            h_emb = embedder.embed(query_text)
            # Convert numpy array to plain Python list for _cosine_similarities
            h_emb_list: list[float] = h_emb.tolist() if hasattr(h_emb, "tolist") else list(h_emb)
            sims = _cosine_similarities(h_emb_list, corpus_embeddings)
            max_sim = float(max(sims)) if sims else 0.0
        except Exception as exc:
            # Broad catch is intentional: catches unknown embedder runtime errors
            # (e.g. CUDA OOM, backend crashes) so one bad hypothesis never halts the pipeline.
            logger.warning("Embedding failed for hypothesis %r: %s — treating as novel", hypothesis.title, exc)
            return RankedHypothesis(
                hypothesis=hypothesis,
                novelty_score=1.0,
                similarity_to_known=0.0,
                suppress=False,
                suppression_reason=None,
            )

        # Cosine similarity lies in [-1, 1] and float32 rounding can push it
        # just past 1.0; clamp so novelty stays in [0, 1].
        novelty_score = min(1.0, max(0.0, 1.0 - max_sim))
        suppress = bool(max_sim >= self._similarity_threshold)
        suppression_reason = (
            f"Similar to resolved incident (similarity {max_sim:.2f})"
            if suppress
            else None
        )
        return RankedHypothesis(
            hypothesis=hypothesis,
            novelty_score=novelty_score,
            similarity_to_known=max_sim,
            suppress=suppress,
            suppression_reason=suppression_reason,
        )

    def _load_embedder(self) -> Any | None:
        """Load the embedding service. Returns None if unavailable."""
        try:
            from app.services.embeddings import get_embedder

            return get_embedder()
        except Exception as exc:
            # Broad catch is intentional: get_embedder() may raise on import or
            # backend init failures from any number of third-party libraries.
            logger.warning("Failed to import/initialise embedding service: %s", exc)
            return None

    def _get_corpus_embeddings(
        self,
        embedder: Any,
        corpus_texts: list[str],
        db_path: Path,
    ) -> list[list[float]]:
        """Return cached corpus embeddings, re-embedding if the corpus has changed.

        The cache is keyed by ``str(db_path)`` and reused only when the cached
        text list equals ``corpus_texts``. Returns an empty list when embedding
        fails; failures are not cached.
        """
        cache_key = str(db_path)
        cached = _corpus_cache.get(cache_key)
        if cached is not None:
            cached_texts, cached_embeddings = cached
            if cached_texts == corpus_texts:
                return cached_embeddings

        logger.debug("Embedding corpus of %d resolved incidents", len(corpus_texts))
        try:
            raw_embeddings = embedder.embed_batch(corpus_texts)
            # Normalise each embedding to a plain Python list for portability
            corpus_embeddings: list[list[float]] = [
                e.tolist() if hasattr(e, "tolist") else list(e)
                for e in raw_embeddings
            ]
        except Exception as exc:
            # Broad catch is intentional: embed_batch() may raise from any backend
            # (network timeout, CUDA error, etc.) — treat as empty corpus so the
            # pipeline can continue without suppression.
            logger.warning("Corpus embedding failed: %s — treating as empty corpus", exc)
            return []

        _corpus_cache[cache_key] = (corpus_texts, corpus_embeddings)
        return corpus_embeddings

    def _passthrough(self, hypotheses: list[Hypothesis]) -> list[RankedHypothesis]:
        """Return all hypotheses as non-suppressed, ranked by confidence descending."""
        ranked = [
            RankedHypothesis(
                hypothesis=h,
                novelty_score=1.0,
                similarity_to_known=0.0,
                suppress=False,
                suppression_reason=None,
            )
            for h in hypotheses
        ]
        ranked.sort(key=lambda rh: rh.hypothesis.confidence, reverse=True)
        return ranked

View file

@ -0,0 +1,210 @@
"""Stage 5: Summary Synthesizer — deterministic narrative from ranked hypotheses.
Streaming upgrade (async SSE chunks) is tracked as a follow-up enhancement.
This implementation is synchronous to match the rest of the pipeline.
"""
from __future__ import annotations
import logging
import httpx
from app.context.retriever import RetrievedContext
from app.services.diagnose.models import RankedHypothesis, TimelineResult
logger = logging.getLogger(__name__)
# System prompt sent as the "system" message of the Stage 5 synthesis call.
# It fixes the persona and the five-section response layout (verdict, timeline,
# root causes, recommended actions, open questions) the model is asked to follow.
_SYSTEM_PROMPT = (
"You are a Linux sysadmin diagnosing a system incident. "
"Write a concise, actionable incident diagnosis.\n\n"
"Format your response exactly as:\n"
"1. VERDICT: [CRITICAL|ERROR|WARN|INFO] — <what happened> (<X>% confidence)\n"
"2. TIMELINE: <what the logs show in sequence, 2-3 sentences>\n"
"3. ROOT CAUSES:\n"
" - <hypothesis 1 title> (<confidence>%)\n"
" - <hypothesis 2 title> (<confidence>%)\n"
"4. RECOMMENDED ACTIONS:\n"
" - <action based on hypotheses>\n"
"5. INVESTIGATE FURTHER: <open questions, if any>"
)
def _extract_content(resp_json: dict) -> str | None:
"""Pull text content from an OpenAI-compat chat completion response."""
choices = resp_json.get("choices") or []
if not choices:
return None
return (choices[0].get("message", {}).get("content") or "").strip() or None
def _build_hypothesis_block(ranked: list[RankedHypothesis]) -> str:
"""Build the hypothesis block for the prompt (non-suppressed only, top 3)."""
active = [rh for rh in ranked if not rh.suppress][:3]
if not active:
return "(none)"
lines: list[str] = []
for rh in active:
h = rh.hypothesis
conf_pct = int(h.confidence * 100)
similar = (
f"Yes — suppressed, {rh.suppression_reason}"
if rh.suppress and rh.suppression_reason
else "No"
)
novelty = f"{rh.novelty_score:.2f}"
lines.append(
f"- [{h.severity}, {conf_pct}%] {h.title}\n"
f" Similar resolved incident? {similar} (novelty {novelty})"
)
return "\n".join(lines)
def _build_context_block(ctx: RetrievedContext) -> str:
"""Build the runbook context block for the prompt."""
parts: list[str] = []
for chunk in ctx.chunks[:5]:
filename = chunk.get("filename", "unknown")
text = chunk.get("text", "")[:300]
parts.append(f"[{filename}] {text}")
return "\n".join(parts) if parts else "(none)"
def _deterministic_fallback(
ranked: list[RankedHypothesis],
timeline: TimelineResult,
) -> str:
"""Build a deterministic fallback text when no LLM is available."""
active = [rh for rh in ranked if not rh.suppress][:3]
if active:
top = active[0]
verdict_severity = top.hypothesis.severity
verdict_title = top.hypothesis.title
verdict_conf = int(top.hypothesis.confidence * 100)
elif ranked:
top = ranked[0]
verdict_severity = top.hypothesis.severity
verdict_title = top.hypothesis.title
verdict_conf = int(top.hypothesis.confidence * 100)
else:
verdict_severity = "UNKNOWN"
verdict_title = "No hypotheses generated"
verdict_conf = 0
root_causes = ", ".join(
rh.hypothesis.title for rh in (active or ranked[:3])
) or "None"
return (
f"VERDICT: {verdict_severity}{verdict_title} ({verdict_conf}% confidence)\n"
f"TIMELINE: {timeline.total_entries} entries across {len(timeline.clusters)} clusters.\n"
f"ROOT CAUSES: {root_causes}"
)
class SummarySynthesizer:
    """Stage 5 of the multi-agent diagnose pipeline.

    Synthesizes a human-readable incident narrative from ranked hypotheses,
    the reconstructed timeline, and RAG context. When no LLM is configured,
    or the LLM yields nothing usable, returns a deterministic fallback built
    from the hypothesis data.
    """

    def synthesize(
        self,
        ranked: list[RankedHypothesis],
        timeline: TimelineResult,
        ctx: RetrievedContext,
        query: str,
        llm_url: str | None = None,
        llm_model: str | None = None,
        llm_api_key: str | None = None,
    ) -> str:
        """Return synthesis text (single string, synchronous).

        Args:
            ranked: Ranked hypotheses from Stage 4.
            timeline: Reconstructed timeline from Stage 1.
            ctx: Retrieved runbook context.
            query: The user's original question.
            llm_url: Base URL of the LLM service; falsy disables the LLM.
            llm_model: Model name; falsy disables the LLM.
            llm_api_key: Optional bearer token.

        Returns:
            The LLM's narrative, or the deterministic fallback when no LLM URL
            or model is provided, or when the LLM call fails or returns nothing.
        """
        fallback = _deterministic_fallback(ranked, timeline)
        if not llm_url or not llm_model:
            return fallback

        hypothesis_block = _build_hypothesis_block(ranked)
        context_block = _build_context_block(ctx)
        dominant = ", ".join(timeline.dominant_sources[:5]) or "none"
        user_message = (
            f"Query: {query}\n\n"
            f"Timeline summary:\n"
            f"- {len(timeline.clusters)} clusters, "
            f"{timeline.burst_count} bursts, "
            f"{timeline.gap_count} silence gaps\n"
            f"- Primary sources: {dominant}\n\n"
            f"Top hypotheses:\n{hypothesis_block}\n\n"
            f"Context from runbooks:\n{context_block}"
        )
        messages = [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ]
        result = self._call_llm(
            llm_url=llm_url,
            llm_model=llm_model,
            llm_api_key=llm_api_key,
            messages=messages,
        )
        return result if result else fallback

    def _call_llm(
        self,
        llm_url: str,
        llm_model: str,
        llm_api_key: str | None,
        messages: list[dict],
    ) -> str | None:
        """Send messages to the LLM and return raw text content.

        Tries the cf-orch task endpoint first, falls back to direct OpenAI-compat.
        The fallback is taken on a 404, on any exception, and when the task
        endpoint answers 200 without usable content.

        Returns:
            The extracted content, or None when both attempts yield nothing.
        """
        headers = {"Authorization": f"Bearer {llm_api_key}"} if llm_api_key else {}
        base_url = llm_url.rstrip("/")

        try:
            resp = httpx.post(
                f"{base_url}/api/inference/task",
                json={
                    "product": "turnstone",
                    "task": "log_analysis",
                    "payload": {"messages": messages, "stream": False},
                },
                headers=headers,
                timeout=120.0,
            )
            if resp.status_code == 200:
                content = _extract_content(resp.json())
                if content:
                    return content
                # A 200 without content is handled like the other failures:
                # fall through to the direct endpoint rather than giving up.
                logger.debug(
                    "Task endpoint returned no usable content — falling back to direct model"
                )
            else:
                if resp.status_code != 404:
                    resp.raise_for_status()
                logger.debug(
                    "No task assignment for turnstone.log_analysis — falling back to direct model"
                )
        except Exception as exc:
            # Broad on purpose: the direct endpoint below is still worth trying.
            logger.debug(
                "Task endpoint unavailable (%s) — falling back to direct model", exc
            )

        try:
            resp = httpx.post(
                f"{base_url}/v1/chat/completions",
                json={"model": llm_model, "messages": messages, "stream": False},
                headers=headers,
                timeout=120.0,
            )
            resp.raise_for_status()
            return _extract_content(resp.json())
        except Exception as exc:
            logger.warning(
                "LLM synthesizer failed (%s): %s", type(exc).__name__, exc
            )
            return None

View file

@ -0,0 +1,272 @@
"""Stage 1: Timeline Reconstructor — pure Python, no ML."""
from __future__ import annotations
import hashlib
import logging
from collections import defaultdict
from datetime import datetime, timezone
from app.services.diagnose.models import EventCluster, TimelineResult
from app.services.search import SearchResult
logger = logging.getLogger(__name__)
# Rank table used to compare severity labels: a larger number is more severe.
# "WARN" and "WARNING" share a rank, and a missing severity (None) ranks lowest.
_SEVERITY_ORDER: dict[str | None, int] = {
"CRITICAL": 5,
"ERROR": 4,
"WARN": 3,
"WARNING": 3,
"INFO": 2,
"DEBUG": 1,
None: 0,
}
def _parse_iso(s: str) -> datetime | None:
"""Parse ISO 8601 string to UTC-aware datetime. Returns None on parse failure."""
try:
dt = datetime.fromisoformat(s)
except ValueError:
logger.warning("Unparseable timestamp in log entry, treating as None: %r", s)
return None
if dt.tzinfo is None:
logger.debug("Naive timestamp treated as UTC: %s", s)
dt = dt.replace(tzinfo=timezone.utc)
return dt.astimezone(timezone.utc)
def _sort_key(e: SearchResult) -> tuple[int, str]:
"""Sort key: timestamped entries first (ascending), then None-timestamp entries."""
if e.timestamp_iso is None:
return (1, "")
return (0, e.timestamp_iso)
def _highest_severity(entries: list[SearchResult]) -> str:
    """Pick the most severe label found among ``entries``.

    Labels are ranked through ``_SEVERITY_ORDER``; labels absent from that
    table rank 0, the same as a missing severity. On equal rank the earliest
    entry wins. ``"WARNING"`` is reported as ``"WARN"``, and ``"UNKNOWN"`` is
    returned when there are no entries or the winning severity is None.
    """
    # max() keeps the first of several equally ranked items.
    winner = max(
        (entry.severity for entry in entries),
        key=lambda label: _SEVERITY_ORDER.get(label, 0),
        default=None,
    )
    if winner is None:
        # SeverityLabel requires a valid literal, so None becomes "UNKNOWN".
        return "UNKNOWN"
    # Normalise WARNING -> WARN for the output type.
    return "WARN" if winner == "WARNING" else winner
def _representative_text(entries: list[SearchResult]) -> str:
"""Return text of the entry with highest rank; tie-break on longest text."""
if not entries:
return ""
best = max(entries, key=lambda e: (e.rank, len(e.text)))
return best.text
def _cluster_id(entry_ids: list[str]) -> str:
"""Compute a 12-char hex cluster ID from a sorted list of entry IDs."""
payload = ",".join(sorted(entry_ids)).encode()
return hashlib.sha1(payload).hexdigest()[:12] # noqa: S324 — not used for security
def _make_event_cluster(
    cluster_entries: list[SearchResult],
    gap_before_seconds: float,
    burst_threshold: int,
    burst_window_seconds: int,
) -> EventCluster:
    """Assemble the EventCluster describing one group of entries.

    Args:
        cluster_entries: Entries belonging to the cluster.
        gap_before_seconds: Gap to the previous cluster, stored as given.
        burst_threshold: Minimum entry count for the cluster to be a burst.
        burst_window_seconds: Maximum duration for the cluster to be a burst.

    Returns:
        An EventCluster whose start/end are the earliest and latest parseable
        timestamps (None, with zero duration, when nothing parses).
    """
    # Collect every timestamp that is present and parses.
    moments = []
    for entry in cluster_entries:
        if entry.timestamp_iso is None:
            continue
        parsed = _parse_iso(entry.timestamp_iso)
        if parsed is not None:
            moments.append(parsed)

    if moments:
        earliest, latest = min(moments), max(moments)
        start_iso: str | None = earliest.isoformat()
        end_iso: str | None = latest.isoformat()
        duration_seconds = (latest - earliest).total_seconds()
    else:
        start_iso = None
        end_iso = None
        duration_seconds = 0.0

    entry_ids = [entry.entry_id for entry in cluster_entries]
    # A burst needs enough entries packed into a short enough span.
    is_burst = not (
        len(cluster_entries) < burst_threshold
        or duration_seconds > burst_window_seconds
    )
    distinct_sources = sorted({entry.source_id for entry in cluster_entries})
    distinct_tags = sorted(
        {tag for entry in cluster_entries for tag in entry.matched_patterns}
    )

    return EventCluster(
        cluster_id=_cluster_id(entry_ids),
        entries=tuple(entry_ids),
        start_iso=start_iso,
        end_iso=end_iso,
        duration_seconds=duration_seconds,
        source_ids=tuple(distinct_sources),
        pattern_tags=tuple(distinct_tags),
        severity=_highest_severity(cluster_entries),  # type: ignore[arg-type]  # SeverityLabel is a Literal; _highest_severity returns a compatible str
        burst=is_burst,
        gap_before_seconds=gap_before_seconds,
        representative_text=_representative_text(cluster_entries),
    )
class TimelineReconstructor:
    """Turn a flat list of log entries into an ordered series of event clusters.

    Pure Python — no ML or LLM calls. This is Stage 1 of the multi-agent
    diagnose pipeline.
    """

    def __init__(
        self,
        cluster_window_seconds: int = 30,
        burst_threshold: int = 10,
        burst_window_seconds: int = 5,
        gap_significance_seconds: int = 30,
    ) -> None:
        """Store the clustering parameters.

        Args:
            cluster_window_seconds: Maximum distance from a cluster's first
                timestamp for an entry to join that cluster.
            burst_threshold: Entry count at which a cluster may be a burst.
            burst_window_seconds: Maximum duration of a burst cluster.
            gap_significance_seconds: Gaps longer than this count as silence gaps.
        """
        self._cluster_window = cluster_window_seconds
        self._burst_threshold = burst_threshold
        self._burst_window = burst_window_seconds
        self._gap_significance_seconds: int = gap_significance_seconds

    def _sort_entries(self, entries: list[SearchResult]) -> list[SearchResult]:
        """Return a sorted copy: timestamped entries ascending, then the rest."""
        ordered = list(entries)
        ordered.sort(key=_sort_key)
        return ordered

    def _group_into_raw_clusters(
        self, sorted_entries: list[SearchResult]
    ) -> list[list[SearchResult]]:
        """Split already-sorted entries into time-window groups.

        A group is anchored at its first parseable timestamp. A new group starts
        when an entry lies more than the cluster window after the anchor.
        Entries with a missing or malformed timestamp stay in the current group.
        """
        groups: list[list[SearchResult]] = []
        bucket: list[SearchResult] = []
        anchor: datetime | None = None

        for entry in sorted_entries:
            moment = (
                None
                if entry.timestamp_iso is None
                else _parse_iso(entry.timestamp_iso)
            )
            if moment is None:
                # Missing or malformed timestamp — join whatever group is open.
                bucket.append(entry)
            elif anchor is None:
                # First usable timestamp seen so far: it becomes the anchor.
                anchor = moment
                bucket.append(entry)
            elif (moment - anchor).total_seconds() > self._cluster_window:
                # Outside the window — close the open group and start another.
                groups.append(bucket)
                bucket = [entry]
                anchor = moment
            else:
                bucket.append(entry)

        if bucket:
            groups.append(bucket)
        return groups

    def _build_cluster(
        self,
        cluster_entries: list[SearchResult],
        prev_end_iso: str | None,
    ) -> EventCluster:
        """Build one EventCluster, measuring the gap since the previous cluster.

        The gap is 0.0 unless the previous end timestamp and at least one of
        this cluster's timestamps both parse.
        """
        gap_before = 0.0
        if prev_end_iso is not None:
            moments = []
            for entry in cluster_entries:
                if entry.timestamp_iso is None:
                    continue
                parsed = _parse_iso(entry.timestamp_iso)
                if parsed is not None:
                    moments.append(parsed)
            if moments:
                previous_end = _parse_iso(prev_end_iso)
                if previous_end is not None:
                    gap_before = (min(moments) - previous_end).total_seconds()

        return _make_event_cluster(
            cluster_entries,
            gap_before_seconds=gap_before,
            burst_threshold=self._burst_threshold,
            burst_window_seconds=self._burst_window,
        )

    def _dominant_sources_tuple(self, entries: list[SearchResult]) -> tuple[str, ...]:
        """List source_ids from most to least frequent; ties keep first-seen order."""
        tally: dict[str, int] = {}
        for entry in entries:
            tally[entry.source_id] = tally.get(entry.source_id, 0) + 1
        by_count = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        return tuple(source for source, _count in by_count)

    def reconstruct(self, entries: list[SearchResult]) -> TimelineResult:
        """Produce the structured timeline for ``entries``.

        Returns:
            A TimelineResult with the clusters in order, the overall window
            (first cluster's start to last cluster's end), and counts of
            significant gaps and bursts. An empty input yields an empty result.
        """
        if not entries:
            return TimelineResult(
                clusters=(),
                total_entries=0,
                window_start=None,
                window_end=None,
                gap_count=0,
                burst_count=0,
                dominant_sources=(),
            )

        groups = self._group_into_raw_clusters(self._sort_entries(entries))

        built: list[EventCluster] = []
        last_end: str | None = None
        for group in groups:
            cluster = self._build_cluster(group, last_end)
            built.append(cluster)
            last_end = cluster.end_iso

        clusters_tuple = tuple(built)
        significant_gaps = len(
            [
                cluster
                for cluster in clusters_tuple
                if cluster.gap_before_seconds > self._gap_significance_seconds
            ]
        )
        bursts = len([cluster for cluster in clusters_tuple if cluster.burst])
        first_start = clusters_tuple[0].start_iso if clusters_tuple else None
        last_finish = clusters_tuple[-1].end_iso if clusters_tuple else None

        return TimelineResult(
            clusters=clusters_tuple,
            total_entries=len(entries),
            window_start=first_start,
            window_end=last_finish,
            gap_count=significant_gaps,
            burst_count=bursts,
            dominant_sources=self._dominant_sources_tuple(entries),
        )

229
app/services/embeddings.py Normal file
View file

@ -0,0 +1,229 @@
"""Configurable embedding service — BSL licensed.
Backends:
sentence_transformers local in-process inference (default, no server needed)
ollama HTTP to a running Ollama instance
Configuration (env vars):
TURNSTONE_EMBED_BACKEND sentence_transformers | ollama (default: sentence_transformers)
TURNSTONE_EMBED_MODEL model name/path (backend-specific default)
TURNSTONE_EMBED_DEVICE cpu | cuda (default: cpu; ST backend only)
TURNSTONE_LLM_URL Ollama base URL (default: http://localhost:11434)
When no backend is importable/reachable, EMBEDDING_AVAILABLE is False and all
embed calls return empty arrays — callers must handle this gracefully.
"""
from __future__ import annotations
import logging
import os
import struct
from typing import Protocol, runtime_checkable
import numpy as np
logger = logging.getLogger(__name__)
# ── Public availability flag ──────────────────────────────────────────────────
EMBEDDING_AVAILABLE: bool = False
# ── Config ────────────────────────────────────────────────────────────────────
# Backend selector, lower-cased so the comparison in get_embedder() is
# case-insensitive with respect to the env var's value.
_BACKEND = os.environ.get("TURNSTONE_EMBED_BACKEND", "sentence_transformers").lower()
# Device passed to the sentence-transformers backend.
_DEVICE = os.environ.get("TURNSTONE_EMBED_DEVICE", "cpu").lower()
# Base URL passed to the Ollama backend.
_LLM_URL = os.environ.get("TURNSTONE_LLM_URL", "http://localhost:11434")
# BAAI/bge-small-en-v1.5: 33MB, MIT, 49M downloads/month, 384-dim, 512-token max.
# Benchmarked as the best quality-to-size ratio in the field (MTEB 62.17).
# all-MiniLM-L6-v2 is a viable lighter alternative (23MB, 256-token max) if
# inference speed is the primary constraint.
# Per-backend default model names, used when TURNSTONE_EMBED_MODEL is unset.
_DEFAULT_MODEL: dict[str, str] = {
"sentence_transformers": "BAAI/bge-small-en-v1.5",
"ollama": "nomic-embed-text",
}
# Effective model: the env var wins; otherwise the backend's default; for a
# backend name missing from _DEFAULT_MODEL the all-MiniLM-L6-v2 name is used.
_MODEL = os.environ.get(
"TURNSTONE_EMBED_MODEL",
_DEFAULT_MODEL.get(_BACKEND, "sentence-transformers/all-MiniLM-L6-v2"),
)
# ── Protocol ──────────────────────────────────────────────────────────────────
@runtime_checkable
class Embedder(Protocol):
    """Structural contract shared by every embedding backend.

    The protocol is runtime-checkable, so ``isinstance(obj, Embedder)`` passes
    for any object exposing these four members.
    """

    @property
    def model_name(self) -> str:
        """Identifier of the underlying model, for display and logging."""
        ...

    @property
    def dim(self) -> int:
        """Length of the vectors this model produces."""
        ...

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed several strings; one 1-D float32 array per input."""
        ...

    def embed(self, text: str) -> np.ndarray:
        """Embed one string into a 1-D float32 array of length ``dim``."""
        ...
# ── sentence-transformers backend ─────────────────────────────────────────────
class SentenceTransformerEmbedder:
    """In-process embedding backend built on the sentence-transformers library.

    The library is imported lazily in the constructor, so this module can be
    imported without it installed. The model is downloaded from HuggingFace on
    first instantiation and cached at ~/.cache/huggingface/.
    """

    def __init__(self, model_name: str = _MODEL, device: str = _DEVICE) -> None:
        """Load ``model_name`` on ``device`` and measure its output dimension."""
        from sentence_transformers import SentenceTransformer  # type: ignore[import]

        logger.info("Loading embedding model %r on device %r ...", model_name, device)
        self._model = SentenceTransformer(model_name, device=device)
        self._model_name = model_name
        # Measure the dimension from a real encode instead of hard-coding it.
        probe = self._model.encode("test")
        self._dim: int = int(probe.shape[0])
        logger.info("Embedding model ready — dim=%d", self._dim)

    @property
    def dim(self) -> int:
        """Dimension measured at construction time."""
        return self._dim

    @property
    def model_name(self) -> str:
        """Name of the loaded model."""
        return self._model_name

    def embed(self, text: str) -> np.ndarray:
        """Encode one string into a normalised float32 vector."""
        encoded = self._model.encode(
            text, convert_to_numpy=True, normalize_embeddings=True
        )
        return encoded.astype(np.float32)

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Encode many strings (batch size 32); an empty input returns []."""
        if not texts:
            return []
        encoded = self._model.encode(
            texts, convert_to_numpy=True, normalize_embeddings=True, batch_size=32
        )
        vectors: list[np.ndarray] = []
        for row in encoded:
            vectors.append(row.astype(np.float32))
        return vectors
# ── Ollama backend ────────────────────────────────────────────────────────────
class OllamaEmbedder:
    """HTTP embedding via a running Ollama instance.

    Requests go to ``<llm_url>/api/embeddings`` through one ``httpx.Client``
    created in the constructor. Construction never raises for an unreachable
    server: the dimension probe swallows errors and falls back to 768.
    """

    def __init__(
        self,
        model_name: str = _MODEL,
        llm_url: str = _LLM_URL,
        timeout: float = 30.0,
    ) -> None:
        """Create the HTTP client and probe the model's embedding dimension.

        Args:
            model_name: Ollama model used for embedding.
            llm_url: Base URL of the Ollama server; a trailing slash is ignored.
            timeout: Per-request timeout in seconds.
        """
        import httpx  # already a project dependency

        self._model_name = model_name
        self._url = f"{llm_url.rstrip('/')}/api/embeddings"
        self._timeout = timeout
        self._client = httpx.Client(timeout=timeout)
        # Probe dimension with a test call
        self._dim = self._probe_dim()

    def _probe_dim(self) -> int:
        """Return the embedding length for a probe string, or 768 on failure.

        An empty embedding counts as a failed probe; otherwise ``dim`` would be
        reported as 0.
        """
        try:
            vec = self._raw_embed("probe")
        except Exception as exc:
            logger.warning("Ollama dim probe failed (%s) — defaulting to 768", exc)
            return 768
        if not vec:
            logger.warning(
                "Ollama dim probe failed (%s) — defaulting to 768",
                "empty embedding in response",
            )
            return 768
        return len(vec)

    def _raw_embed(self, text: str) -> list[float]:
        """POST one prompt and return the ``embedding`` field ([] if absent).

        Raises whatever httpx raises for transport errors or an error status.
        """
        resp = self._client.post(
            self._url, json={"model": self._model_name, "prompt": text}
        )
        resp.raise_for_status()
        return resp.json().get("embedding") or []

    @property
    def dim(self) -> int:
        """Dimension found by the construction-time probe (768 if it failed)."""
        return self._dim

    @property
    def model_name(self) -> str:
        """Name of the Ollama model in use."""
        return self._model_name

    def embed(self, text: str) -> np.ndarray:
        """Embed one string as a float32 array (empty if the server sent none)."""
        vec = self._raw_embed(text)
        return np.array(vec, dtype=np.float32)

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed each string with its own request, preserving input order."""
        return [self.embed(t) for t in texts]
# ── Singleton factory ─────────────────────────────────────────────────────────
_embedder: Embedder | None = None


def get_embedder() -> Embedder | None:
    """Return the process-wide embedder, creating it on first use.

    The backend is chosen by ``_BACKEND``. On success the instance is stored
    and ``EMBEDDING_AVAILABLE`` is set to True. On failure a warning is logged
    and None is returned; because nothing is stored, the next call tries again.
    Callers should check EMBEDDING_AVAILABLE or test the result for None.
    """
    global _embedder, EMBEDDING_AVAILABLE

    if _embedder is not None:
        return _embedder

    if _BACKEND == "sentence_transformers":
        try:
            created = SentenceTransformerEmbedder(_MODEL, _DEVICE)
        except ImportError:
            logger.warning(
                "sentence-transformers not installed — embeddings disabled. "
                "Install with: pip install sentence-transformers"
            )
        except Exception as exc:
            logger.warning("Failed to load sentence-transformers model %r: %s", _MODEL, exc)
        else:
            _embedder = created
            EMBEDDING_AVAILABLE = True
    elif _BACKEND == "ollama":
        try:
            created = OllamaEmbedder(_MODEL, _LLM_URL)
        except Exception as exc:
            logger.warning("Ollama embedder init failed: %s", exc)
        else:
            _embedder = created
            EMBEDDING_AVAILABLE = True
    else:
        logger.warning("Unknown TURNSTONE_EMBED_BACKEND %r — embeddings disabled", _BACKEND)

    return _embedder
# ── BLOB serialisation helpers ────────────────────────────────────────────────
def pack_vector(vec: np.ndarray) -> bytes:
    """Encode a 1-D vector as float32 values in a bytes object for BLOB storage.

    The input is cast to float32 first; the bytes use the platform's native
    ``struct`` float layout.
    """
    values = vec.astype(np.float32).tolist()
    layout = f"{len(values)}f"
    return struct.pack(layout, *values)
def unpack_vector(blob: bytes) -> np.ndarray:
    """Decode a BLOB of packed float32 values into a float32 numpy vector.

    The element count is the blob length divided by four (bytes per float32).
    """
    count, _remainder = divmod(len(blob), 4)
    floats = struct.unpack(f"{count}f", blob)
    return np.array(floats, dtype=np.float32)
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine of the angle between two vectors.

    Both norms are computed here, so the inputs need not be pre-normalised.
    Returns 0.0 when either vector has zero norm.
    """
    left_norm = np.linalg.norm(a)
    right_norm = np.linalg.norm(b)
    if not (left_norm != 0.0 and right_norm != 0.0):
        return 0.0
    scale = left_norm * right_norm
    return float(np.dot(a, b) / scale)

View file

@ -6,7 +6,7 @@ import sqlite3
import uuid
from pathlib import Path
from app.ingest.base import now_iso
from app.glean.base import now_iso
from app.services.models import Incident, ReceivedBundle
from app.services.search import SearchResult, entries_in_window, search

View file

@ -10,7 +10,7 @@ class RetrievedEntry:
entry_id: str
source_id: str # log file path or service name
sequence: int # original line number — ingest order, not wall-clock order
sequence: int # original line number — glean order, not wall-clock order
timestamp_raw: str | None # timestamp as it appeared in the log
timestamp_iso: str | None # parsed to ISO 8601 for sorting; None if unparseable
ingest_time: str # when Turnstone indexed this entry (wall clock)
@ -25,7 +25,7 @@ class RetrievedEntry:
@dataclass(frozen=True)
class LogPattern:
"""A named regex pattern for tagging entries at ingest time."""
"""A named regex pattern for tagging entries at glean time."""
name: str # e.g. "device_disconnect", "auth_failure"
pattern: str # regex string

View file

@ -451,9 +451,8 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis
else:
suppressed += 1
# When did we last ingest anything?
last_row = conn.execute("SELECT MAX(ingest_time) AS t FROM log_entries").fetchone()
last_ingested: str | None = last_row["t"] if last_row else None
last_gleaned: str | None = last_row["t"] if last_row else None
conn.close()
@ -465,7 +464,7 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis
"source_health": source_health,
"recent_criticals": recent_criticals,
"suppressed_criticals": suppressed,
"last_ingested": last_ingested,
"last_gleaned": last_gleaned,
}

View file

@ -1,10 +1,10 @@
"""Periodic batch ingest scheduler with optional CF submission.
"""Periodic batch glean scheduler with optional CF submission.
Runs ingest_sources on a configurable interval (TURNSTONE_INGEST_INTERVAL env var,
Runs glean_sources on a configurable interval (TURNSTONE_GLEAN_INTERVAL env var,
default 900s / 15 min). Set to 0 to disable.
When TURNSTONE_SUBMIT_ENDPOINT is set, pushes pattern-matched entries to a remote
Turnstone instance (the CF receiving store) after each ingest run.
Turnstone instance (the CF receiving store) after each glean run.
"""
from __future__ import annotations
@ -19,7 +19,7 @@ from typing import Any
import httpx
from app.ingest.pipeline import ingest_sources
from app.glean.pipeline import glean_sources
logger = logging.getLogger(__name__)
@ -96,14 +96,14 @@ async def submit_matched(
if not entries:
return {"ok": True, "submitted": 0, "skipped": True}
url = f"{submit_endpoint.rstrip('/')}/turnstone/api/ingest/batch"
url = f"{submit_endpoint.rstrip('/')}/turnstone/api/glean/batch"
payload = {"source_host": source_host, "entries": entries}
try:
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.post(url, json=payload)
resp.raise_for_status()
result = resp.json()
submitted = result.get("ingested", len(entries))
submitted = result.get("gleaned", len(entries))
_state.last_submitted_at = datetime.now(tz=timezone.utc).isoformat()
_state.last_submit_count = submitted
_state.last_submit_error = None
@ -121,10 +121,15 @@ async def run_once(
pattern_file: Path | None = None,
submit_endpoint: str | None = None,
source_host: str = "unknown",
force: bool = False,
) -> dict[str, Any]:
"""Ingest all sources once, then submit matched entries if configured."""
"""Ingest all sources once, then submit matched entries if configured.
Pass ``force=True`` to bypass fingerprint checks and re-glean all local
file sources regardless of whether they appear unchanged.
"""
if _lock.locked():
return {"ok": False, "error": "ingest already running", "skipped": True}
return {"ok": False, "error": "glean already running", "skipped": True}
async with _lock:
_state.running = True
@ -133,7 +138,7 @@ async def run_once(
loop = asyncio.get_running_loop()
stats: dict[str, int] = await loop.run_in_executor(
None,
lambda: ingest_sources(sources_file, db_path, pattern_file),
lambda: glean_sources(sources_file, db_path, pattern_file, force=force),
)
duration = (datetime.now(tz=timezone.utc) - started).total_seconds()
_state.last_run_at = started.isoformat()
@ -141,14 +146,14 @@ async def run_once(
_state.last_stats = stats
_state.last_error = None
_state.run_count += 1
logger.info("Batch ingest complete in %.1fs — %s", duration, stats)
logger.info("Batch glean complete in %.1fs — %s", duration, stats)
except Exception as exc:
duration = (datetime.now(tz=timezone.utc) - started).total_seconds()
_state.last_run_at = started.isoformat()
_state.last_duration_s = round(duration, 2)
_state.last_error = str(exc)
_state.run_count += 1
logger.error("Batch ingest failed: %s", exc)
logger.error("Batch glean failed: %s", exc)
_state.running = False
return {"ok": False, "error": str(exc)}
finally:
@ -168,7 +173,7 @@ async def scheduler_loop(
submit_endpoint: str | None = None,
source_host: str = "unknown",
) -> None:
"""Run ingest + optional submission every interval_s seconds until cancelled."""
"""Run glean + optional submission every interval_s seconds until cancelled."""
logger.info("Ingest scheduler started — interval %ds, sources: %s", interval_s, sources_file)
if submit_endpoint:
logger.info("Submission enabled — endpoint: %s", submit_endpoint)

View file

@ -1,4 +1,4 @@
"""Live watch: tail active log sources and ingest entries in near-real-time.
"""Live watch: tail active log sources and glean entries in near-real-time.
Each WatchSource runs a subprocess (journalctl -f, podman/docker logs -f)
in a daemon thread and pipes lines through the existing ingestors into SQLite.
@ -18,12 +18,12 @@ from typing import Iterator
import yaml
from app.ingest import journald as journald_parser, syslog as syslog_parser
from app.ingest import plaintext as plaintext_parser, servarr as servarr_parser, plex as plex_parser
from app.ingest import qbittorrent as qbit_parser, caddy as caddy_parser
from app.ingest.pipeline import _detect_format
from app.ingest.base import _compile, load_patterns, now_iso
from app.ingest.pipeline import _write_batch, _SCHEMA
from app.glean import journald as journald_parser, syslog as syslog_parser
from app.glean import plaintext as plaintext_parser, servarr as servarr_parser, plex as plex_parser
from app.glean import qbittorrent as qbit_parser, caddy as caddy_parser
from app.glean.pipeline import _detect_format
from app.glean.base import _compile, load_patterns, now_iso
from app.glean.pipeline import _write_batch, _SCHEMA
from app.services.search import build_fts_index
from app.services.models import RetrievedEntry
@ -85,7 +85,7 @@ class WatchSource:
"source_id": self.config.source_id,
"type": self.config.source_type,
"running": self._thread is not None and self._thread.is_alive(),
"entries_ingested": self._entry_count,
"entries_gleaned": self._entry_count,
"last_event": self._last_event,
"error": self._error,
}

View file

@ -39,7 +39,7 @@ notification agent:
## Webhook URL
```
http://<turnstone-host>:8534/turnstone/api/ingest/tautulli
http://<turnstone-host>:8534/turnstone/api/glean/tautulli
```
Replace `<turnstone-host>` with the hostname or IP of the machine running

View file

@ -2,7 +2,7 @@
"""Turnstone Harvester — collect logs and ship them to a Turnstone instance.
Subcommands:
push Read sources.yaml, POST each log file to Turnstone /api/ingest/upload
push Read sources.yaml, POST each log file to Turnstone /api/glean/upload
incident Tag an incident on the remote Turnstone instance
Usage:
@ -97,8 +97,8 @@ def cmd_push(args: argparse.Namespace) -> int:
logger.warning("No sources defined in %s", sources_path)
return 0
upload_url = args.url.rstrip("/") + "/turnstone/api/ingest/upload"
total_ingested = 0
upload_url = args.url.rstrip("/") + "/turnstone/api/glean/upload"
total_gleaned = 0
errors = 0
for src in sources:
@ -110,9 +110,9 @@ def cmd_push(args: argparse.Namespace) -> int:
logger.info("Pushing %s (%s) ...", src_id, src_path)
try:
result = _post_file(upload_url, src_path, src_id)
count = result.get("ingested", 0)
total_ingested += count
logger.info(" %s: %d entries ingested", src_id, count)
count = result.get("gleaned", 0)
total_gleaned += count
logger.info(" %s: %d entries gleaned", src_id, count)
except urllib.error.HTTPError as exc:
logger.error(" %s: HTTP %d%s", src_id, exc.code, exc.read().decode(errors="replace"))
errors += 1
@ -120,7 +120,7 @@ def cmd_push(args: argparse.Namespace) -> int:
logger.error(" %s: %s", src_id, exc)
errors += 1
logger.info("Done. Total ingested: %d entries, errors: %d", total_ingested, errors)
logger.info("Done. Total gleaned: %d entries, errors: %d", total_gleaned, errors)
return 1 if errors else 0

View file

@ -46,6 +46,6 @@ sources:
# Wazuh SIEM — alerts.json on the Wazuh manager
# Turnstone auto-detects this format; source_id is qualified per agent automatically.
# For push-based ingestion from Wazuh custom integrations, use:
# POST /api/ingest/wazuh/alert (single alert JSON body)
# POST /api/glean/wazuh/alert (single alert JSON body)
# - id: wazuh
# path: /var/ossec/logs/alerts/alerts.json

View file

@ -120,9 +120,9 @@ usage() {
echo -e " ${GREEN}dev${NC} uvicorn --reload (:${API_PORT}) + Vite HMR (:${VITE_PORT})"
echo ""
echo " Data:"
echo -e " ${GREEN}ingest PATH [DB]${NC} Ingest a log file or corpus directory"
echo -e " ${GREEN}ingest-plex [HOST]${NC} Pull Plex log from Cass (or HOST) and ingest"
echo -e " ${GREEN}ingest-qbit [HOST]${NC} Pull qBittorrent log locally or from HOST via SSH"
echo -e " ${GREEN}glean PATH [DB]${NC} Glean a log file or corpus directory"
echo -e " ${GREEN}glean-plex [HOST]${NC} Pull Plex log from Cass (or HOST) and glean"
echo -e " ${GREEN}glean-qbit [HOST]${NC} Pull qBittorrent log locally or from HOST via SSH"
echo -e " ${GREEN}build-fts${NC} Rebuild the FTS search index"
echo ""
echo " Tests:"
@ -134,8 +134,8 @@ usage() {
echo " Examples:"
echo " ./manage.sh start"
echo " ./manage.sh dev"
echo " ./manage.sh ingest corpus/raw/"
echo " ./manage.sh ingest corpus/raw/ data/custom.db"
echo " ./manage.sh glean corpus/raw/"
echo " ./manage.sh glean corpus/raw/ data/custom.db"
echo ""
}
@ -231,15 +231,15 @@ case "$CMD" in
(cd web && npm run dev -- --port "$VITE_PORT")
;;
ingest)
glean)
if [[ $# -lt 1 ]]; then
error "Usage: ./manage.sh ingest <file_or_dir> [DB_PATH]"
error "Usage: ./manage.sh glean <file_or_dir> [DB_PATH]"
fi
info "Ingesting $1${2:-$DB}"
"$PYTHON" scripts/ingest_corpus.py "$1" "${2:-$DB}"
info "Gleaning $1${2:-$DB}"
"$PYTHON" scripts/glean_corpus.py "$1" "${2:-$DB}"
;;
ingest-plex)
glean-plex)
PLEX_HOST="${1:-cass}"
PLEX_LOG_DIR="/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs"
TMP_DIR="/tmp/turnstone-plex-$$"
@ -264,16 +264,16 @@ case "$CMD" in
ssh "$PLEX_HOST" "cat '${remote_path}'" > "$local_path"
done
info "Ingesting ${#REMOTE_LOGS[@]} log file(s) into ${DB}"
info "Gleaning ${#REMOTE_LOGS[@]} log file(s) into ${DB}"
for f in "$TMP_DIR"/*.log; do
"$PYTHON" scripts/ingest_corpus.py "$f" "$DB"
"$PYTHON" scripts/glean_corpus.py "$f" "$DB"
done
rm -rf "$TMP_DIR"
info "Done. Restarting server…"
exec bash "$0" restart
;;
ingest-qbit)
glean-qbit)
QBIT_HOST="${1:-}"
# Default log locations in priority order
QBIT_LOG_PATHS=(
@ -316,8 +316,8 @@ case "$CMD" in
info "${LOCAL_LOG}"
fi
info "Ingesting into ${DB}"
"$PYTHON" scripts/ingest_corpus.py "${TMP_DIR}"/*.log "$DB"
info "Gleaning into ${DB}"
"$PYTHON" scripts/glean_corpus.py "${TMP_DIR}"/*.log "$DB"
rm -rf "$TMP_DIR"
info "Done. Restarting server…"
exec bash "$0" restart

View file

@ -1,4 +1,4 @@
# Turnstone pattern library — named regex patterns for log tagging at ingest time.
# Turnstone pattern library — named regex patterns for log tagging at glean time.
# Each matched pattern name is stored on RetrievedEntry.matched_patterns and
# used to boost retrieval relevance for diagnostic queries.
#
@ -128,6 +128,21 @@ patterns:
severity: ERROR
description: NFS mount or RPC timeout
- name: service_crash_loop
pattern: "(restart counter is at [0-9]|start request repeated too quickly|Restart limit hit)"
severity: WARN
description: systemd service crash-looping — restart counter incrementing or rate-limit hit; check for DNS resolution failures, missing dependencies, or bad config
- name: pkg_daemon_restart
pattern: "(invoke-rc\\.d|Unit process.*(apt-get|dpkg|preinst).*remains running after unit stopped|Stopped.*service.*openssh|Restarting.*OpenBSD Secure Shell)"
severity: WARN
description: Package manager restarted a system daemon — active SSH or service sessions may have been interrupted
- name: ssh_forward_conflict
pattern: "(channel_setup_fwd_listener_tcpip: cannot listen to port|error: bind.*Address already in use)"
severity: WARN
description: SSH port-forward conflict — previous session port still bound; stale sessions accumulating or rapid reconnects
# Add device/service-specific patterns below this line:
- name: qbit_tracker_error

View file

@ -1,15 +1,15 @@
# Turnstone log sources — Heimdall cluster ingest.
# Turnstone log sources — Heimdall cluster glean.
# Covers: Heimdall (local), Navi, Sif, Cass, Strahl (SSH-collected),
# Docker services on Heimdall, and network device syslog.
#
# Collected by scripts/collect_cluster_logs.sh before each ingest run.
# Collected by scripts/collect_cluster_logs.sh before each glean run.
# All paths are container-side (/data/ = bind-mount of /devl/turnstone-cluster/data/).
#
# Cron (collect + ingest, every 15 min):
# Cron (collect + glean, every 15 min):
# */15 * * * * bash /Library/Development/CircuitForge/turnstone/scripts/collect_cluster_logs.sh && \
# docker exec turnstone-cluster python scripts/ingest_corpus.py \
# docker exec turnstone-cluster python scripts/glean_corpus.py \
# --sources /patterns/sources-cluster.yaml --db /data/turnstone.db \
# >> /var/log/turnstone-cluster-ingest.log 2>&1
# >> /var/log/turnstone-cluster-glean.log 2>&1
sources:
# ── Heimdall (local) ─────────────────────────────────────────────────────────

View file

@ -1,8 +1,8 @@
# Turnstone log sources — edit this file to add or remove services.
# NOTE: the system-journal entry requires export_journal.sh to run on the HOST
# before the container ingest step. See crontab setup instructions in the README.
# Run ingest manually:
# sudo podman exec turnstone python scripts/ingest_corpus.py \
# before the container glean step. See crontab setup instructions in the README.
# Run glean manually:
# sudo podman exec turnstone python scripts/glean_corpus.py \
# --sources /patterns/sources.yaml --db /data/turnstone.db
#
# Paths here are container-side paths under the /opt bind mount.
@ -12,7 +12,7 @@
sources:
# ── System (exported by export_journal.sh on the host) ───────────────────
# journal-export.jsonl and dmesg-export.txt are written to /opt/turnstone/data/
# by the export script before each ingest run.
# by the export script before each glean run.
- id: system-journal
path: /data/journal-export.jsonl
@ -73,7 +73,7 @@ sources:
# ── MQTT / IoT (live — subscribe mode, no path needed) ───────────────────
# Requires: pip install circuitforge-core[mqtt]
# These sources are handled by the live MQTT subscriber task (not batch ingest).
# These sources are handled by the live MQTT subscriber task (not batch glean).
# Uncomment and configure to enable.
#
# Meshtastic MQTT bridge (node must have MQTT uplink enabled):

View file

@ -2,7 +2,7 @@
# podman-standalone.sh — Turnstone rootful Podman setup (no Compose)
#
# For hosts running system Podman (non-rootless) with systemd.
# Turnstone is a diagnostic log intelligence layer — ingest service logs,
# Turnstone is a diagnostic log intelligence layer — glean service logs,
# search by symptom, and view incidents in a lightweight web UI.
#
# ── Prerequisites ────────────────────────────────────────────────────────────
@ -28,18 +28,18 @@
# sudo systemctl daemon-reload
# sudo systemctl enable --now turnstone
#
# ── Ingesting logs ────────────────────────────────────────────────────────────
# ── Gleaning logs ─────────────────────────────────────────────────────────────
# All service logs under /opt are accessible inside the container.
# Sources are configured in patterns/sources.yaml (bind-mounted at /patterns/).
#
# To ingest all sources (run manually or via cron):
# To glean all sources (run manually or via cron):
#
# sudo podman exec turnstone python scripts/ingest_corpus.py \
# sudo podman exec turnstone python scripts/glean_corpus.py \
# --sources /patterns/sources.yaml --db /data/turnstone.db
#
# Example cron (every 15 minutes, add to root's crontab with: sudo crontab -e):
# */15 * * * * podman exec turnstone python scripts/ingest_corpus.py \
# --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-ingest.log 2>&1
# */15 * * * * podman exec turnstone python scripts/glean_corpus.py \
# --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-glean.log 2>&1
#
# To add a new log source: edit /opt/turnstone/patterns/sources.yaml — no restart needed.
#
@ -73,7 +73,7 @@ TZ=America/Los_Angeles
#
# ── Orchard submission (opt-in telemetry) ────────────────────────────────────
# Set TURNSTONE_SUBMIT_ENDPOINT to push pattern-matched log entries to a CF
# receiving instance after each ingest run. Only matched entries are sent —
# receiving instance after each glean run. Only matched entries are sent —
# no raw log content. Used to build Avocet training data.
#
# export TURNSTONE_SUBMIT_ENDPOINT=https://harvest.circuitforge.tech/contrib2
@ -142,8 +142,8 @@ echo "Check container health with:"
echo " sudo podman ps"
echo " sudo podman logs turnstone"
echo ""
echo "To ingest all sources now:"
echo " sudo podman exec turnstone python scripts/ingest_corpus.py \\"
echo "To glean all sources now:"
echo " sudo podman exec turnstone python scripts/glean_corpus.py \\"
echo " --sources /patterns/sources.yaml --db /data/turnstone.db"
echo ""
echo "To add a new source: edit /opt/turnstone/patterns/sources.yaml — no restart needed."

View file

@ -6,3 +6,4 @@ aiofiles>=23.0.0
python-multipart>=0.0.9
dateparser>=1.2.0
httpx>=0.27.0
paramiko

View file

@ -1,4 +1,4 @@
"""CLI: build (or update) the FTS5 full-text search index after ingest."""
"""CLI: build (or update) the FTS5 full-text search index after glean."""
from __future__ import annotations
import sys
@ -13,7 +13,7 @@ if __name__ == "__main__":
if not db_path.exists():
print(f"ERROR: database not found: {db_path}", file=sys.stderr)
print("Run ingest first: python scripts/ingest_corpus.py", file=sys.stderr)
print("Run glean first: python scripts/glean_corpus.py", file=sys.stderr)
sys.exit(1)
print(f"Building FTS index for {db_path} ...")

View file

@ -20,7 +20,7 @@ SSH_OPTS="-o ConnectTimeout=5 -o BatchMode=yes -o StrictHostKeyChecking=no"
PYTHON=/devl/miniconda3/envs/cf/bin/python
# Command prefix used below to load collected logs into the cluster DB.
# Points at glean_corpus.py, the name every other caller in this repo now uses
# (assumes scripts/ingest_corpus.py no longer exists after the rename — confirm).
# The variable keeps its old name because later lines still expand ${INGEST}.
INGEST="${PYTHON} /Library/Development/CircuitForge/turnstone/scripts/glean_corpus.py"
DB=/devl/turnstone-cluster/data/turnstone.db
LOG=/devl/turnstone-cluster/data/ingest.log
LOG=/devl/turnstone-cluster/data/glean.log
mkdir -p "${DATA_DIR}"
@ -141,7 +141,7 @@ fi
# Remote journals (explicit source IDs via YAML)
${INGEST} --sources /devl/turnstone-cluster/patterns/sources-cluster.yaml --db "${DB}"
# Docker and Plex logs (source IDs derived from filenames by directory ingest)
# Docker and Plex logs (source IDs derived from filenames by directory glean)
for dir in "${HEIMDALL_DIR}" "${NAVI_DIR}" "${STRAHL_DIR}" "${PLEX_DIR}"; do
[[ -d "${dir}" ]] && ls "${dir}"/*.jsonl "${dir}"/*.log 2>/dev/null | grep -q . && \
${INGEST} "${dir}" "${DB}" || true

View file

@ -1,5 +1,5 @@
#!/usr/bin/env bash
# Export recent system messages to files the Turnstone container can ingest.
# Export recent system messages to files the Turnstone container can glean.
#
# Exports:
# journal-export.jsonl — journald (if journalctl is available)
@ -11,11 +11,11 @@
# Usage (standalone):
# sudo bash /opt/turnstone/scripts/export_journal.sh
#
# Cron (combined with ingest):
# Cron (combined with glean):
# */15 * * * * bash /opt/turnstone/scripts/export_journal.sh && \
# podman exec turnstone python scripts/glean_corpus.py \
# --sources /patterns/sources.yaml --db /data/turnstone.db \
# >> /var/log/turnstone-ingest.log 2>&1
# >> /var/log/turnstone-glean.log 2>&1
set -euo pipefail

View file

@ -1,11 +1,11 @@
"""CLI: ingest a log file or corpus directory into the Turnstone SQLite database.
"""CLI: glean a log file or corpus directory into the Turnstone SQLite database.
Usage:
# Single file or directory (legacy)
python scripts/ingest_corpus.py <file_or_dir> [db_path]
python scripts/glean_corpus.py <file_or_dir> [db_path]
# Sources config (multi-service)
python scripts/ingest_corpus.py --sources <sources.yaml> [--db <db_path>]
python scripts/glean_corpus.py --sources <sources.yaml> [--db <db_path>]
"""
from __future__ import annotations
@ -17,7 +17,7 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.ingest.pipeline import ingest, ingest_file, ingest_sources
from app.glean.pipeline import glean_dir, glean_file, glean_sources
def _print_stats(stats: dict[str, int]) -> None:
@ -33,33 +33,33 @@ if __name__ == "__main__":
if not args:
print(
"Usage:\n"
" ingest_corpus.py <file_or_dir> [db_path]\n"
" ingest_corpus.py --sources <sources.yaml> [--db <db_path>]",
" glean_corpus.py <file_or_dir> [db_path]\n"
" glean_corpus.py --sources <sources.yaml> [--db <db_path>]",
file=sys.stderr,
)
sys.exit(1)
if args[0] == "--sources":
if len(args) < 2:
print("Usage: ingest_corpus.py --sources <sources.yaml> [--db <db_path>]", file=sys.stderr)
print("Usage: glean_corpus.py --sources <sources.yaml> [--db <db_path>]", file=sys.stderr)
sys.exit(1)
sources_file = Path(args[1])
db_path = Path("data/turnstone.db")
if "--db" in args:
db_path = Path(args[args.index("--db") + 1])
db_path.parent.mkdir(parents=True, exist_ok=True)
print(f"Ingesting sources from {sources_file}{db_path}")
stats = ingest_sources(sources_file, db_path)
print(f"Gleaning sources from {sources_file}{db_path}")
stats = glean_sources(sources_file, db_path)
_print_stats(stats)
else:
target = Path(args[0])
db_path = Path(args[1]) if len(args) > 1 else Path("data/turnstone.db")
db_path.parent.mkdir(parents=True, exist_ok=True)
print(f"Ingesting {target}{db_path}")
print(f"Gleaning {target}{db_path}")
if target.is_file():
stats = ingest_file(target, db_path)
stats = glean_file(target, db_path)
elif target.is_dir():
stats = ingest(target, db_path)
stats = glean_dir(target, db_path)
else:
print(f"Error: {target} is not a file or directory", file=sys.stderr)
sys.exit(1)

View file

@ -3,7 +3,7 @@ import sqlite3
import pytest
from pathlib import Path
from app.ingest.doc_upload import ingest_upload
from app.glean.doc_upload import glean_upload
from app.context.store import list_facts, list_documents
from app.context.chunker import UnsupportedDocType
@ -40,7 +40,7 @@ services:
ports:
- "32400:32400"
"""
result = ingest_upload(db, "docker-compose.yml", yaml_bytes)
result = glean_upload(db, "docker-compose.yml", yaml_bytes)
assert result["doc_type"] == "yaml"
assert result["facts_written"] >= 1
assert result["chunks_written"] >= 1
@ -53,7 +53,7 @@ services:
def test_ingest_markdown_no_facts(db):
md = b"# Runbook\n\nRestart plex with `systemctl restart plex`."
result = ingest_upload(db, "runbook.md", md)
result = glean_upload(db, "runbook.md", md)
assert result["doc_type"] == "markdown"
assert result["facts_written"] == 0
assert result["chunks_written"] >= 1
@ -61,4 +61,4 @@ def test_ingest_markdown_no_facts(db):
def test_ingest_raises_on_bad_type(db):
with pytest.raises(UnsupportedDocType):
ingest_upload(db, "report.pdf", b"data")
glean_upload(db, "report.pdf", b"data")

View file

@ -1,13 +1,17 @@
"""Tests for app/context/embedder.py — graceful no-op without sqlite-vec."""
"""Tests for app/context/embedder.py — delegates to app.services.embeddings."""
import sqlite3
import struct
from pathlib import Path
from unittest.mock import patch
from unittest.mock import MagicMock, patch
import numpy as np
import pytest
from app.context import embedder as emb_mod
@pytest.fixture
def db(tmp_path):
@pytest.fixture()
def db(tmp_path: Path) -> Path:
db_path = tmp_path / "t.db"
conn = sqlite3.connect(str(db_path))
conn.executescript("""
@ -20,34 +24,78 @@ def db(tmp_path):
REFERENCES context_documents(id) ON DELETE CASCADE,
chunk_index INTEGER NOT NULL, text TEXT NOT NULL, embedding BLOB
);
INSERT INTO context_documents VALUES ('d1','test.md','markdown','hello',5,'2026-01-01T00:00:00+00:00');
INSERT INTO context_documents
VALUES ('d1','test.md','markdown','hello',5,'2026-01-01T00:00:00+00:00');
INSERT INTO context_chunks VALUES ('c1','d1',0,'hello world',NULL);
INSERT INTO context_chunks VALUES ('c2','d1',1,'second chunk',NULL);
""")
conn.commit()
conn.close()
return db_path
def test_embed_skipped_when_extension_absent(db):
with patch.object(emb_mod, "EMBEDDING_AVAILABLE", False):
count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434")
def _mock_embedder(dim: int = 3) -> MagicMock:
"""Return a mock Embedder that returns constant dim-length vectors."""
m = MagicMock()
m.dim = dim
m.embed_batch.return_value = [np.zeros(dim, dtype=np.float32)] * 10
return m
class TestEmbedChunks:
def test_returns_zero_when_no_embedder(self, db: Path) -> None:
with patch("app.context.embedder.get_embedder", return_value=None):
count = emb_mod.embed_chunks(db, "d1")
assert count == 0
def test_embed_calls_ollama_when_available(db):
import httpx
class FakeResponse:
status_code = 200
def raise_for_status(self): pass
def json(self): return {"embedding": [0.1, 0.2, 0.3]}
with patch.object(emb_mod, "EMBEDDING_AVAILABLE", True), \
patch("app.context.embedder.httpx.post", return_value=FakeResponse()):
count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434")
assert count == 1
# Verify blob was written
def test_returns_zero_when_no_unembedded_chunks(self, db: Path) -> None:
# Pre-fill both chunks with a blob
blob = struct.pack("3f", 0.1, 0.2, 0.3)
conn = sqlite3.connect(str(db))
row = conn.execute("SELECT embedding FROM context_chunks WHERE id='c1'").fetchone()
conn.execute("UPDATE context_chunks SET embedding=?", (blob,))
conn.commit()
conn.close()
assert row[0] is not None
embedder = _mock_embedder()
with patch("app.context.embedder.get_embedder", return_value=embedder):
count = emb_mod.embed_chunks(db, "d1")
assert count == 0
embedder.embed_batch.assert_not_called()
def test_embeds_all_null_chunks(self, db: Path) -> None:
embedder = _mock_embedder(dim=3)
with patch("app.context.embedder.get_embedder", return_value=embedder):
count = emb_mod.embed_chunks(db, "d1")
assert count == 2 # two chunks in fixture
def test_blobs_written_to_db(self, db: Path) -> None:
vec = np.array([0.1, 0.2, 0.3], dtype=np.float32)
embedder = _mock_embedder(dim=3)
embedder.embed_batch.return_value = [vec, vec]
with patch("app.context.embedder.get_embedder", return_value=embedder):
emb_mod.embed_chunks(db, "d1")
conn = sqlite3.connect(str(db))
rows = conn.execute(
"SELECT embedding FROM context_chunks WHERE document_id='d1'"
).fetchall()
conn.close()
for (blob,) in rows:
assert blob is not None
unpacked = struct.unpack(f"{len(blob)//4}f", blob)
assert len(unpacked) == 3
def test_legacy_llm_url_param_accepted(self, db: Path) -> None:
"""Ensure backward-compat signature still works (llm_url ignored)."""
embedder = _mock_embedder()
with patch("app.context.embedder.get_embedder", return_value=embedder):
count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434", "nomic-embed-text")
assert count == 2
def test_embed_batch_error_returns_zero(self, db: Path) -> None:
embedder = _mock_embedder()
embedder.embed_batch.side_effect = RuntimeError("model exploded")
with patch("app.context.embedder.get_embedder", return_value=embedder):
count = emb_mod.embed_chunks(db, "d1")
assert count == 0

View file

@ -2,7 +2,7 @@
import sqlite3
from pathlib import Path
import pytest
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
def test_context_tables_created(tmp_path):

View file

@ -9,7 +9,7 @@ from unittest.mock import MagicMock, patch
@pytest.fixture
def client(tmp_path):
from fastapi.testclient import TestClient
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
import app.rest as rest_module
db = tmp_path / "test.db"
@ -25,7 +25,7 @@ def client(tmp_path):
@pytest.fixture
def client_with_candidate(tmp_path):
from fastapi.testclient import TestClient
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
import app.rest as rest_module
import sqlite3, uuid

View file

@ -0,0 +1,245 @@
"""Tests for app/services/diagnose/classifier.py — SeverityClassifier.
All ML-path tests mock ``transformers.pipeline`` so no model weights are
downloaded during the test suite.
"""
from __future__ import annotations
from dataclasses import FrozenInstanceError
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, patch
import pytest
import app.services.diagnose.classifier as clf_module
from app.services.diagnose.classifier import SeverityClassifier
from app.services.diagnose.models import ClassifiedTimeline, EventCluster, TimelineResult
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
@pytest.fixture(autouse=True)
def reset_ml_singleton():
    """Clear the classifier module's ML singleton around every test.

    Runs automatically for each test in this module: the module-level
    ``_ml_classifier`` is set to ``None`` before the test body and again
    once the test has finished.
    """

    def _clear() -> None:
        clf_module._ml_classifier = None

    _clear()  # start each test without a previously stored classifier
    yield
    _clear()  # and leave nothing behind for the next test
# ---------------------------------------------------------------------------
# Test-object builders
# ---------------------------------------------------------------------------
def _make_cluster(
    representative_text: str = "test log",
    pattern_tags: tuple[str, ...] = (),
    severity: str = "INFO",
) -> EventCluster:
    """Build a single-entry ``EventCluster`` with id ``"abc123"``.

    Only the representative text, pattern tags and severity vary between
    tests; every other field is pinned to a neutral constant.
    """
    constants: dict[str, Any] = {
        "cluster_id": "abc123",
        "entries": ("e1",),
        "start_iso": None,
        "end_iso": None,
        "duration_seconds": 0.0,
        "source_ids": ("src",),
        "burst": False,
        "gap_before_seconds": 0.0,
    }
    return EventCluster(
        representative_text=representative_text,
        pattern_tags=pattern_tags,
        severity=severity,
        **constants,
    )
def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult:
    """Wrap ``clusters`` in a ``TimelineResult`` whose summary fields are all zero/empty."""
    blank_summary: dict[str, Any] = dict(
        total_entries=0,
        window_start=None,
        window_end=None,
        gap_count=0,
        burst_count=0,
        dominant_sources=(),
    )
    return TimelineResult(clusters=clusters, **blank_summary)
def _mock_hf_pipeline(label: str, score: float) -> MagicMock:
"""Return a mock HF pipeline callable that always yields one result."""
pipe = MagicMock()
pipe.return_value = [{"label": label, "score": score}]
return pipe
# ---------------------------------------------------------------------------
# Path A — ML classification
# ---------------------------------------------------------------------------
class TestMLPath:
    """Path A: severities decided from a mocked ML pipeline result."""

    # Dotted path of the factory that is patched to hand back the fake pipeline.
    _FACTORY = "app.services.diagnose.classifier._get_ml_classifier"

    def _classify(self, label: str, score: float, text: str) -> ClassifiedTimeline:
        """Classify a one-cluster timeline while the ML factory returns a canned pipeline."""
        fake_pipe = _mock_hf_pipeline(label, score)
        timeline = _make_timeline((_make_cluster(text),))
        with patch(self._FACTORY, return_value=fake_pipe):
            return SeverityClassifier(model_id="fake/model").classify(timeline)

    def test_ml_error_maps_to_error(self) -> None:
        """An ERROR label at score 0.98 yields ERROR and reports the ML path and model id."""
        outcome = self._classify("ERROR", 0.98, "disk error detected")
        assert outcome.cluster_severities["abc123"] == "ERROR"
        assert outcome.classifier_used == "ml"
        assert outcome.model_id == "fake/model"

    def test_ml_critical_promotion(self) -> None:
        """ERROR at score 0.97 on 'kernel panic' text is promoted to CRITICAL."""
        outcome = self._classify("ERROR", 0.97, "kernel panic: not syncing VFS")
        assert outcome.cluster_severities["abc123"] == "CRITICAL"

    def test_ml_debug_demotion(self) -> None:
        """INFO at the low score 0.3 is demoted to DEBUG."""
        outcome = self._classify("INFO", 0.3, "routine ping")
        assert outcome.cluster_severities["abc123"] == "DEBUG"

    def test_ml_warning_maps_to_warn(self) -> None:
        """The model's WARNING label is reported as WARN."""
        outcome = self._classify("WARNING", 0.85, "low disk space")
        assert outcome.cluster_severities["abc123"] == "WARN"
# ---------------------------------------------------------------------------
# Path B — pattern_tags fallback
# ---------------------------------------------------------------------------
class TestPatternTagsPath:
    """Path B: severity resolved from a cluster's pattern tags via a pattern file."""

    def test_pattern_tags_resolve_error_severity(self, tmp_path: Path) -> None:
        """Cluster with pattern_tag 'service_crash_loop' → ERROR from pattern file."""
        # Write a throwaway pattern file declaring the tag with severity ERROR.
        # NOTE(review): leading whitespace inside these YAML literals looks
        # collapsed in this view — confirm the list/mapping indentation
        # against the real file before copying them.
        pattern_yaml = tmp_path / "default.yaml"
        pattern_yaml.write_text(
            "patterns:\n"
            " - name: service_crash_loop\n"
            " pattern: crash\n"
            " severity: ERROR\n"
            " description: Service crashed in a loop\n"
        )
        # Empty model_id: the assertions below expect the pattern_tags path, not ML.
        clf = SeverityClassifier(model_id="", pattern_file=pattern_yaml)
        # The cluster's own severity stays at the helper default ("INFO");
        # only the tag links it to the ERROR entry in the pattern file.
        cluster = _make_cluster(
            representative_text="service crashed",
            pattern_tags=("service_crash_loop",),
        )
        result = clf.classify(_make_timeline((cluster,)))
        assert result.cluster_severities["abc123"] == "ERROR"
        assert result.classifier_used == "pattern_tags"
        assert result.model_id is None
# ---------------------------------------------------------------------------
# Path C — regex fallback
# ---------------------------------------------------------------------------
class TestRegexPath:
    """Path C: neither a model id nor a pattern file is supplied."""

    def test_regex_detects_error(self) -> None:
        """Text starting with 'ERROR:' is classified ERROR via the regex path."""
        timeline = _make_timeline((_make_cluster("ERROR: disk full"),))
        classifier = SeverityClassifier(model_id="")
        outcome = classifier.classify(timeline)
        assert outcome.cluster_severities["abc123"] == "ERROR"
        assert outcome.classifier_used == "regex"

    def test_regex_defaults_to_info_when_no_match(self) -> None:
        """Text without any severity keyword ends up as INFO."""
        timeline = _make_timeline((_make_cluster("mount: disk mounted successfully"),))
        classifier = SeverityClassifier(model_id="")
        outcome = classifier.classify(timeline)
        assert outcome.cluster_severities["abc123"] == "INFO"
# ---------------------------------------------------------------------------
# Fallback behaviour
# ---------------------------------------------------------------------------
class TestImportErrorFallback:
    """Behaviour when obtaining the ML classifier raises ImportError."""

    def test_transformers_import_error_falls_back_to_pattern_tags(
        self, tmp_path: Path
    ) -> None:
        """ImportError from transformers → clean fallback to pattern_tags path."""
        # Pattern file mapping the 'auth_failure' tag to ERROR.
        # NOTE(review): leading whitespace inside these YAML literals looks
        # collapsed in this view — confirm the list/mapping indentation
        # against the real file before copying them.
        pattern_yaml = tmp_path / "default.yaml"
        pattern_yaml.write_text(
            "patterns:\n"
            " - name: auth_failure\n"
            " pattern: auth\n"
            " severity: ERROR\n"
            " description: Auth failure\n"
        )

        # Stand-in for the ML factory that fails the way a missing
        # 'transformers' package would; accepts any arguments.
        def _raising_get_ml(*_args: Any, **_kwargs: Any) -> None:
            raise ImportError("No module named 'transformers'")

        with patch(
            "app.services.diagnose.classifier._get_ml_classifier",
            side_effect=_raising_get_ml,
        ):
            # A non-empty model_id is passed so the ML path is requested.
            clf = SeverityClassifier(model_id="fake/model", pattern_file=pattern_yaml)
            cluster = _make_cluster(
                representative_text="auth failed",
                pattern_tags=("auth_failure",),
            )
            result = clf.classify(_make_timeline((cluster,)))
        # ML was attempted (classifier_used == "ml") but pattern_tags resolved it
        assert result.classifier_used == "ml"
        assert result.cluster_severities["abc123"] == "ERROR"
# ---------------------------------------------------------------------------
# Edge cases
# ---------------------------------------------------------------------------
class TestEdgeCases:
    """Boundary behaviour: empty input and immutability of the result."""

    def test_empty_timeline_produces_empty_severities(self) -> None:
        """A timeline without clusters classifies to an empty mapping without raising."""
        outcome = SeverityClassifier(model_id="").classify(_make_timeline())
        assert isinstance(outcome, ClassifiedTimeline)
        assert outcome.cluster_severities == {}
        assert outcome.classifier_used == "regex"

    def test_classified_timeline_is_frozen(self) -> None:
        """Assigning to a field of the result raises FrozenInstanceError."""
        one_cluster = _make_timeline((_make_cluster(),))
        outcome = SeverityClassifier(model_id="").classify(one_cluster)
        with pytest.raises(FrozenInstanceError):
            setattr(outcome, "classifier_used", "ml")

View file

@ -0,0 +1,486 @@
"""Tests for app/services/diagnose/hypothesizer.py — RootCauseHypothesizer.
All tests use mocking; no real LLM calls are made.
"""
from __future__ import annotations
import json
import re
from typing import Any
from unittest.mock import MagicMock, patch
import pytest
from app.context.retriever import RetrievedContext
from app.services.diagnose.hypothesizer import RootCauseHypothesizer
from app.services.diagnose.models import (
ClassifiedTimeline,
EventCluster,
Hypothesis,
TimelineResult,
)
# ---------------------------------------------------------------------------
# Fixture helpers
# ---------------------------------------------------------------------------
def _make_cluster(
    cluster_id: str = "c1",
    representative_text: str = "kernel: oom-killer invoked",
    severity: str = "ERROR",
    source_ids: tuple[str, ...] = ("syslog",),
    pattern_tags: tuple[str, ...] = ("oom",),
    start_iso: str | None = "2024-01-01T00:00:00+00:00",
) -> EventCluster:
    """Build a single-entry EventCluster; every argument has a default."""
    # Fields not exposed as parameters are fixed to simple constant values.
    fixed_fields = dict(
        entries=("e1",),
        end_iso=None,
        duration_seconds=1.0,
        burst=False,
        gap_before_seconds=0.0,
    )
    return EventCluster(
        cluster_id=cluster_id,
        start_iso=start_iso,
        source_ids=source_ids,
        pattern_tags=pattern_tags,
        severity=severity,  # type: ignore[arg-type]
        representative_text=representative_text,
        **fixed_fields,
    )
def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult:
    """Wrap *clusters* in a TimelineResult with no window, gaps or bursts.

    ``total_entries`` is set to the number of clusters supplied.
    """
    cluster_count = len(clusters)
    return TimelineResult(
        clusters=clusters,
        total_entries=cluster_count,
        window_start=None,
        window_end=None,
        gap_count=0,
        burst_count=0,
        dominant_sources=(),
    )
def _make_classified(
    clusters: tuple[EventCluster, ...] = (),
    cluster_severities: dict | None = None,
) -> ClassifiedTimeline:
    """Build a ClassifiedTimeline around *clusters*.

    When *cluster_severities* is None, the mapping is derived from each
    cluster's own ``severity`` attribute, keyed by ``cluster_id``.
    """
    severities = (
        {c.cluster_id: c.severity for c in clusters}
        if cluster_severities is None
        else cluster_severities
    )
    return ClassifiedTimeline(
        timeline=_make_timeline(clusters),
        cluster_severities=severities,
        classifier_used="pattern_tags",
        model_id=None,
    )
def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext:
    """Build a RetrievedContext with no facts and the given chunks.

    Args:
        chunks: Context chunks to embed. ``None`` (the default) selects a
            single sample runbook chunk. An explicit empty list is honoured
            and produces a context with no chunks.

    Returns:
        A RetrievedContext with ``facts=[]`` and the chosen chunks.
    """
    # Test for None explicitly: a truthiness check (`chunks or [...]`) would
    # silently replace a deliberately empty list with the sample chunk.
    if chunks is None:
        chunks = [{"text": "Memory pressure runbook.", "filename": "runbook.md"}]
    return RetrievedContext(facts=[], chunks=chunks)
def _llm_json_response(items: list[dict[str, Any]]) -> MagicMock:
"""Build a mock httpx.Response that returns the given list as JSON."""
mock_resp = MagicMock()
mock_resp.status_code = 200
mock_resp.json.return_value = {
"choices": [{"message": {"content": json.dumps(items)}}]
}
return mock_resp
_SAMPLE_HYPOTHESES = [
{
"title": "OOM killer terminated critical process",
"description": "The kernel invoked the OOM killer due to memory exhaustion. A process was terminated unexpectedly. This caused service disruption.",
"confidence": 0.85,
"severity": "CRITICAL",
"supporting_clusters": ["c1"],
},
{
"title": "Disk I/O saturation",
"description": "High disk I/O latency was detected. Write operations stalled causing log backpressure. Check iostat for device utilisation.",
"confidence": 0.6,
"severity": "ERROR",
"supporting_clusters": ["c2"],
},
]
# ---------------------------------------------------------------------------
# Test 1: Valid JSON response returns correct Hypothesis objects
# ---------------------------------------------------------------------------
def test_valid_json_response_returns_hypotheses():
    """A well-formed JSON array from the LLM becomes Hypothesis objects with matching fields."""
    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()
    llm_reply = _llm_json_response(_SAMPLE_HYPOTHESES)

    with patch("httpx.post", return_value=llm_reply):
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="why is memory failing?",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(results) == 2
    first, second = results[0], results[1]
    assert isinstance(first, Hypothesis)
    assert first.title == "OOM killer terminated critical process"
    assert first.confidence == pytest.approx(0.85)
    assert first.severity == "CRITICAL"
    # The JSON key "supporting_clusters" surfaces as a tuple attribute.
    assert first.supporting_cluster_ids == ("c1",)
    assert second.title == "Disk I/O saturation"
    assert second.severity == "ERROR"
# ---------------------------------------------------------------------------
# Test 2: hypothesis_id is a non-empty UUID string on each result
# ---------------------------------------------------------------------------
_UUID_RE = re.compile(
r"^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$"
)
def test_hypothesis_id_is_uuid():
    """Every Hypothesis returned has its own, distinct UUID v4 hypothesis_id."""
    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()
    llm_reply = _llm_json_response(_SAMPLE_HYPOTHESES)

    with patch("httpx.post", return_value=llm_reply):
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(results) == 2
    for hyp in results:
        assert hyp.hypothesis_id, "hypothesis_id must not be empty"
        assert _UUID_RE.match(hyp.hypothesis_id), (
            f"hypothesis_id {hyp.hypothesis_id!r} is not a UUID v4"
        )

    # Each ID must be distinct
    seen_ids = [hyp.hypothesis_id for hyp in results]
    assert len(set(seen_ids)) == len(seen_ids), "hypothesis_ids must be unique"
# ---------------------------------------------------------------------------
# Test 3: Malformed JSON response returns [] with a logged warning
# ---------------------------------------------------------------------------
def test_malformed_json_returns_empty_and_warns(caplog):
    """Non-JSON LLM content makes hypothesize() return [] and log a JSON-related warning."""
    import logging

    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()

    # HTTP 200 whose message content is not parseable as JSON.
    garbled_reply = MagicMock()
    garbled_reply.status_code = 200
    garbled_reply.json.return_value = {
        "choices": [{"message": {"content": "not valid json"}}]
    }

    with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=garbled_reply):
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert results == []
    logged_messages = [record.message for record in caplog.records]
    assert any("invalid JSON" in msg or "JSON" in msg for msg in logged_messages)
# ---------------------------------------------------------------------------
# Test 4: Non-list JSON (dict) returns []
# ---------------------------------------------------------------------------
def test_non_list_json_returns_empty(caplog):
    """A JSON object (not an array) from the LLM makes hypothesize() return []."""
    import logging

    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()

    # Valid JSON, but a dict rather than the expected list.
    object_reply = MagicMock()
    object_reply.status_code = 200
    object_reply.json.return_value = {
        "choices": [{"message": {"content": '{"error": "oops"}'}}]
    }

    with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=object_reply):
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert results == []
    lowered_messages = [record.message.lower() for record in caplog.records]
    assert any("array" in msg or "list" in msg for msg in lowered_messages)
# ---------------------------------------------------------------------------
# Test 5: Empty clusters returns [] without any LLM call
# ---------------------------------------------------------------------------
def test_empty_clusters_returns_empty_no_llm_call():
    """With no clusters in the ClassifiedTimeline, the result is [] and httpx.post is never called."""
    empty_classified = _make_classified(clusters=())
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()

    with patch("httpx.post") as http_post:
        results = hypothesizer.hypothesize(
            empty_classified,
            ctx,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert results == []
    http_post.assert_not_called()
# ---------------------------------------------------------------------------
# Test 6: No LLM URL returns [] without any HTTP call
# ---------------------------------------------------------------------------
def test_no_llm_url_returns_empty_no_http_call():
    """llm_url=None: hypothesize() returns [] and makes no HTTP request."""
    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()

    with patch("httpx.post") as http_post:
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url=None,
            llm_model="llama3",
        )

    assert results == []
    http_post.assert_not_called()
def test_empty_llm_url_returns_empty_no_http_call():
    """llm_url="": hypothesize() returns [] and makes no HTTP request."""
    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()

    with patch("httpx.post") as http_post:
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url="",
            llm_model="llama3",
        )

    assert results == []
    http_post.assert_not_called()
def test_no_llm_model_returns_empty_no_http_call():
    """llm_model=None: hypothesize() returns [] and makes no HTTP request."""
    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()

    with patch("httpx.post") as http_post:
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url="http://localhost:11434",
            llm_model=None,
        )

    assert results == []
    http_post.assert_not_called()
# ---------------------------------------------------------------------------
# Test 7: max_hypotheses is respected
# ---------------------------------------------------------------------------
def test_max_hypotheses_respected():
    """Six items from the LLM with max_hypotheses=3 yields exactly three results."""
    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer(max_hypotheses=3)

    # Twice as many items as the configured cap.
    oversized_reply = []
    for i in range(6):
        oversized_reply.append(
            {
                "title": f"Hypothesis {i}",
                "description": "Some description. A second sentence. Third sentence here.",
                "confidence": 0.5,
                "severity": "ERROR",
                "supporting_clusters": ["c1"],
            }
        )
    llm_reply = _llm_json_response(oversized_reply)

    with patch("httpx.post", return_value=llm_reply):
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(results) == 3
# ---------------------------------------------------------------------------
# Test 8: Severity validation — WARNING → WARN, garbage → ERROR
# ---------------------------------------------------------------------------
def test_severity_warning_maps_to_warn():
    """A severity of 'WARNING' in the LLM reply comes back as 'WARN'."""
    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()

    warning_item = {
        "title": "A warning severity hypothesis",
        "description": "Test description. Second sentence. Third.",
        "confidence": 0.7,
        "severity": "WARNING",
        "supporting_clusters": ["c1"],
    }
    llm_reply = _llm_json_response([warning_item])

    with patch("httpx.post", return_value=llm_reply):
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(results) == 1
    assert results[0].severity == "WARN"
def test_severity_garbage_maps_to_error():
    """A severity string the hypothesizer does not recognise comes back as 'ERROR'."""
    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()

    garbage_item = {
        "title": "A garbage severity hypothesis",
        "description": "Test description. Second sentence. Third.",
        "confidence": 0.4,
        "severity": "GARBAGE",
        "supporting_clusters": ["c1"],
    }
    llm_reply = _llm_json_response([garbage_item])

    with patch("httpx.post", return_value=llm_reply):
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(results) == 1
    assert results[0].severity == "ERROR"
# ---------------------------------------------------------------------------
# Test 9: Confidence field works with string floats from the LLM
# ---------------------------------------------------------------------------
def test_confidence_string_float_coercion():
    """A numeric string confidence ("0.8") is returned as the float 0.8."""
    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()

    string_confidence_item = {
        "title": "String confidence test",
        "description": "Some description. Second sentence. Third.",
        "confidence": "0.8",  # LLM returned a string, not a float
        "severity": "INFO",
        "supporting_clusters": ["c1"],
    }
    llm_reply = _llm_json_response([string_confidence_item])

    with patch("httpx.post", return_value=llm_reply):
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(results) == 1
    confidence = results[0].confidence
    assert isinstance(confidence, float)
    assert confidence == pytest.approx(0.8)
# ---------------------------------------------------------------------------
# Test 10: Non-numeric confidence string falls back to default 0.5
# ---------------------------------------------------------------------------
def test_non_numeric_confidence_uses_default():
    """A non-numeric confidence ('high') must not raise; the result carries 0.5."""
    classified = _make_classified(clusters=(_make_cluster(),))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()

    non_numeric_item = {
        "title": "t",
        "description": "d",
        "confidence": "high",
        "severity": "ERROR",
        "supporting_clusters": [],
    }
    llm_reply = _llm_json_response([non_numeric_item])

    with patch("httpx.post", return_value=llm_reply):
        results = hypothesizer.hypothesize(
            classified,
            ctx,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(results) == 1
    confidence = results[0].confidence
    assert isinstance(confidence, float)
    assert confidence == pytest.approx(0.5)

View file

@ -0,0 +1,489 @@
"""Tests for app/services/diagnose/pipeline.py and __init__.py feature flag wiring.
All tests use mocking; no real LLM, ML, or DB calls are made.
"""
from __future__ import annotations
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock, patch
import pytest
from app.context.retriever import RetrievedContext
from app.services.diagnose.models import (
ClassifiedTimeline,
Hypothesis,
RankedHypothesis,
TimelineResult,
)
from app.services.search import SearchResult
# ---------------------------------------------------------------------------
# Shared helpers
# ---------------------------------------------------------------------------
def _make_search_result(
    entry_id: str = "e1",
    source_id: str = "syslog",
    timestamp_iso: str | None = "2026-01-01T00:00:00+00:00",
    severity: str | None = "ERROR",
    text: str = "ssh: invalid user",
) -> SearchResult:
    """Build a SearchResult for tests; fields without a parameter are fixed constants."""
    constant_fields = dict(
        sequence=1,
        repeat_count=1,
        out_of_order=False,
        matched_patterns=["ssh_fail"],
        rank=1.0,
    )
    return SearchResult(
        entry_id=entry_id,
        source_id=source_id,
        timestamp_iso=timestamp_iso,
        severity=severity,
        text=text,
        **constant_fields,
    )
def _make_ctx() -> RetrievedContext:
    """Return a RetrievedContext with no facts and no chunks."""
    empty_ctx = RetrievedContext(facts=[], chunks=[])
    return empty_ctx
def _make_timeline(n_clusters: int = 2) -> TimelineResult:
    """Return a fixed TimelineResult covering a one-hour window.

    The result always has an empty ``clusters`` tuple, five total entries
    and one burst. ``n_clusters`` is accepted but not used by the body.
    """
    window = ("2026-01-01T00:00:00+00:00", "2026-01-01T01:00:00+00:00")
    return TimelineResult(
        clusters=(),
        total_entries=5,
        window_start=window[0],
        window_end=window[1],
        gap_count=0,
        burst_count=1,
        dominant_sources=("syslog",),
    )
def _make_classified(timeline: TimelineResult | None = None) -> ClassifiedTimeline:
    """Wrap *timeline* (or a default one) in a regex-classified ClassifiedTimeline.

    The severity mapping is empty and no model id is recorded.
    """
    return ClassifiedTimeline(
        timeline=timeline or _make_timeline(),
        cluster_severities={},
        classifier_used="regex",
        model_id=None,
    )
def _make_hypothesis(
    hypothesis_id: str = "h1",
    title: str = "SSH flood",
    confidence: float = 0.87,
    severity: str = "CRITICAL",
) -> Hypothesis:
    """Build a Hypothesis supported by cluster "c1" with no runbook references."""
    fixed_fields = dict(
        description="Multiple failed SSH attempts.",
        supporting_cluster_ids=("c1",),
        runbook_refs=(),
    )
    return Hypothesis(
        hypothesis_id=hypothesis_id,
        title=title,
        confidence=confidence,
        severity=severity,  # type: ignore[arg-type]
        **fixed_fields,
    )
def _make_ranked(hypothesis: Hypothesis | None = None, suppress: bool = False) -> RankedHypothesis:
    """Wrap *hypothesis* (or a default one) in a RankedHypothesis.

    Novelty and similarity scores are fixed. ``suppression_reason`` is set
    only when *suppress* is true.
    """
    if suppress:
        reason = "similar to known"
    else:
        reason = None
    return RankedHypothesis(
        hypothesis=hypothesis or _make_hypothesis(),
        novelty_score=0.95,
        similarity_to_known=0.05,
        suppress=suppress,
        suppression_reason=reason,
    )
# ---------------------------------------------------------------------------
# Helper: collect all events from run_pipeline
# ---------------------------------------------------------------------------
async def _collect_pipeline_events(**kwargs) -> list[dict[str, Any]]:
    """Drain ``run_pipeline(**kwargs)`` and return its events in emission order."""
    # Imported lazily, at call time, as in the rest of this module's tests.
    from app.services.diagnose.pipeline import run_pipeline

    return [event async for event in run_pipeline(**kwargs)]
def _default_pipeline_kwargs(entries=None, db_path=None) -> dict:
    """Build the keyword arguments passed to ``run_pipeline`` in these tests.

    Args:
        entries: Search results to feed the pipeline. ``None`` selects a
            single default SearchResult. An explicit empty list is passed
            through unchanged, so callers can exercise the empty case.
        db_path: Database path. ``None`` selects ``/tmp/fake.db``.

    Returns:
        A dict with db_path, entries, a fresh context, the query, the
        since/until window, and all three LLM settings set to None.
    """
    # Compare against None instead of relying on truthiness: `entries or [...]`
    # would silently replace a deliberately empty list with the default entry.
    if entries is None:
        entries = [_make_search_result()]
    if db_path is None:
        db_path = Path("/tmp/fake.db")
    return dict(
        db_path=db_path,
        entries=entries,
        ctx=_make_ctx(),
        query="ssh brute force",
        since="2026-01-01T00:00:00+00:00",
        until="2026-01-01T01:00:00+00:00",
        llm_url=None,
        llm_model=None,
        llm_api_key=None,
    )
# ---------------------------------------------------------------------------
# Mock factories for all 5 stage classes
# ---------------------------------------------------------------------------
def _mock_all_stages(
    hypotheses=None,
    ranked=None,
    synthesis_text="VERDICT: CRITICAL — SSH flood (87% confidence)",
):
    """Return a dict mapping each pipeline stage's patch target to a mock class.

    Instantiating a mock class yields an object whose stage method
    (reconstruct / classify / hypothesize / suppress / synthesize) returns a
    canned value. ``hypotheses`` and ``ranked`` fall back to one default item
    each only when they are None, so an explicit empty list is honoured.
    """

    def _stage_class(method_name, canned_result):
        # Mock class whose instances return canned_result from method_name.
        stage_cls = MagicMock()
        getattr(stage_cls.return_value, method_name).return_value = canned_result
        return stage_cls

    timeline = _make_timeline()
    classified = _make_classified(timeline)
    if hypotheses is None:
        hypotheses = [_make_hypothesis()]
    if ranked is None:
        ranked = [_make_ranked()]

    return {
        "app.services.diagnose.pipeline.TimelineReconstructor": _stage_class("reconstruct", timeline),
        "app.services.diagnose.pipeline.SeverityClassifier": _stage_class("classify", classified),
        "app.services.diagnose.pipeline.RootCauseHypothesizer": _stage_class("hypothesize", hypotheses),
        "app.services.diagnose.pipeline.FalsePositiveSuppressor": _stage_class("suppress", ranked),
        "app.services.diagnose.pipeline.SummarySynthesizer": _stage_class("synthesize", synthesis_text),
    }
# ---------------------------------------------------------------------------
# 1. Feature flag off: legacy summarize() path runs, not run_pipeline
# ---------------------------------------------------------------------------
class TestFeatureFlagOff:
    """diagnose_stream behaviour with MULTI_AGENT_ENABLED patched to False."""

    @pytest.mark.asyncio
    async def test_legacy_path_when_flag_off(self):
        """With MULTI_AGENT_ENABLED=False, run_pipeline is never called."""
        from app.services import diagnose as diagnose_module

        search_hits = [_make_search_result()]
        with (
            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", False),
            patch("app.services.diagnose.search", return_value=search_hits),
            patch("app.services.diagnose.entries_in_window", return_value=[]),
            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
            patch("app.services.diagnose.format_context_block", return_value=None),
            patch("app.services.diagnose.run_pipeline") as pipeline_spy,
            patch("app.services.diagnose.summarize", return_value=None),
        ):
            stream = diagnose_module.diagnose_stream(
                db_path=Path("/tmp/fake.db"),
                query="ssh failures",
                llm_url=None,
                llm_model=None,
            )
            events = [event async for event in stream]

        # run_pipeline must NOT have been called
        pipeline_spy.assert_not_called()

        # SSE sequence must end with done
        event_types = [event["type"] for event in events]
        assert "done" in event_types
        assert event_types[-1] == "done"

    @pytest.mark.asyncio
    async def test_legacy_done_event_is_last(self):
        """Legacy path: done is always the last event."""
        from app.services import diagnose as diagnose_module

        with (
            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", False),
            patch("app.services.diagnose.search", return_value=[]),
            patch("app.services.diagnose.entries_in_window", return_value=[]),
            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
            patch("app.services.diagnose.format_context_block", return_value=None),
        ):
            stream = diagnose_module.diagnose_stream(
                db_path=Path("/tmp/fake.db"),
                query="check logs",
            )
            events = [event async for event in stream]

        assert events[-1] == {"type": "done"}
# ---------------------------------------------------------------------------
# 2. Feature flag on, all stages mocked: verify SSE event sequence
# ---------------------------------------------------------------------------
class TestFeatureFlagOn:
    """SSE event sequence from run_pipeline with all five stage classes mocked."""

    @pytest.mark.asyncio
    async def test_pipeline_stage_events_in_order(self):
        """pipeline_stage events must be emitted stages 1→2→3→4 in order."""
        stage_mocks = _mock_all_stages()
        pipeline_kwargs = _default_pipeline_kwargs()
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", stage_mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", stage_mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", stage_mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", stage_mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", stage_mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            events = await _collect_pipeline_events(**pipeline_kwargs)

        emitted_stages = [
            event["stage"] for event in events if event.get("type") == "pipeline_stage"
        ]
        assert emitted_stages == [1, 2, 3, 4]

    @pytest.mark.asyncio
    async def test_hypotheses_event_after_stage4(self):
        """hypotheses event must appear after pipeline_stage stage=4."""
        stage_mocks = _mock_all_stages()
        pipeline_kwargs = _default_pipeline_kwargs()
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", stage_mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", stage_mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", stage_mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", stage_mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", stage_mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            events = await _collect_pipeline_events(**pipeline_kwargs)

        # Position of the first stage-4 event and of the first hypotheses event.
        stage4_idx = next(
            idx
            for idx, event in enumerate(events)
            if event.get("type") == "pipeline_stage" and event.get("stage") == 4
        )
        hyp_idx = next(
            idx for idx, event in enumerate(events) if event.get("type") == "hypotheses"
        )
        assert hyp_idx > stage4_idx

    @pytest.mark.asyncio
    async def test_reasoning_event_emitted(self):
        """reasoning event must be present when synthesizer returns text."""
        stage_mocks = _mock_all_stages(synthesis_text="VERDICT: CRITICAL — SSH flood")
        pipeline_kwargs = _default_pipeline_kwargs()
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", stage_mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", stage_mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", stage_mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", stage_mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", stage_mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            events = await _collect_pipeline_events(**pipeline_kwargs)

        reasoning = [event for event in events if event.get("type") == "reasoning"]
        assert len(reasoning) == 1
        assert "VERDICT" in reasoning[0]["text"]

    @pytest.mark.asyncio
    async def test_done_event_is_last(self):
        """done must always be the last event in the pipeline sequence."""
        stage_mocks = _mock_all_stages()
        pipeline_kwargs = _default_pipeline_kwargs()
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", stage_mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", stage_mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", stage_mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", stage_mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", stage_mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            events = await _collect_pipeline_events(**pipeline_kwargs)

        assert events[-1] == {"type": "done"}

    @pytest.mark.asyncio
    async def test_pipeline_wired_from_diagnose_stream(self):
        """diagnose_stream routes through run_pipeline when flag is on."""
        from app.services import diagnose as diagnose_module

        search_hits = [_make_search_result()]

        async def fake_pipeline(**kwargs):
            # Stand-in for run_pipeline: three canned events ending in done.
            yield {"type": "status", "message": "Building timeline…"}
            yield {"type": "pipeline_stage", "stage": 1, "name": "timeline", "message": "Built 1 clusters, 0 bursts"}
            yield {"type": "done"}

        with (
            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", True),
            patch("app.services.diagnose.search", return_value=search_hits),
            patch("app.services.diagnose.entries_in_window", return_value=[]),
            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
            patch("app.services.diagnose.format_context_block", return_value=None),
            patch("app.services.diagnose.run_pipeline", side_effect=fake_pipeline),
        ):
            stream = diagnose_module.diagnose_stream(
                db_path=Path("/tmp/fake.db"),
                query="ssh failures",
            )
            events = [event async for event in stream]

        event_types = [event["type"] for event in events]
        assert "pipeline_stage" in event_types
        assert event_types[-1] == "done"
        # Legacy summarize() must NOT have been called — done event came from pipeline
        assert event_types.count("done") == 1
# ---------------------------------------------------------------------------
# 3. Empty entries: pipeline completes with done
# ---------------------------------------------------------------------------
class TestEmptyEntries:
    """run_pipeline must still complete when it is given no log entries."""

    @pytest.mark.asyncio
    async def test_empty_entries_pipeline_completes(self):
        """Pipeline with entries=[] must still complete and emit done."""
        mocks = _mock_all_stages(hypotheses=[], ranked=[])
        kwargs = _default_pipeline_kwargs(entries=[])
        # Force the empty list explicitly: the helper substitutes a default
        # entry for any falsy `entries`, which would defeat this test.
        kwargs["entries"] = []
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            events = await _collect_pipeline_events(**kwargs)

        types = [e["type"] for e in events]
        assert "done" in types
        assert types[-1] == "done"

    @pytest.mark.asyncio
    async def test_empty_entries_all_stage_events_present(self):
        """Even with empty entries, all 4 pipeline_stage events are emitted."""
        mocks = _mock_all_stages(hypotheses=[], ranked=[])
        kwargs = _default_pipeline_kwargs(entries=[])
        # Force the empty list explicitly: the helper substitutes a default
        # entry for any falsy `entries`, which would defeat this test.
        kwargs["entries"] = []
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            events = await _collect_pipeline_events(**kwargs)

        stage_events = [e for e in events if e.get("type") == "pipeline_stage"]
        assert len(stage_events) == 4
# ---------------------------------------------------------------------------
# 4. No LLM: Stage 3 and Stage 5 return empty/fallback; done still emitted
# ---------------------------------------------------------------------------
class TestNoLLM:
    """Pipeline behaviour when no LLM URL or model is configured."""

    @pytest.mark.asyncio
    async def test_no_llm_pipeline_completes_with_done(self):
        """No llm_url/llm_model: pipeline runs all stages and emits done."""
        stage_mocks = _mock_all_stages(
            hypotheses=[],
            ranked=[],
            synthesis_text="VERDICT: UNKNOWN — no hypotheses generated",
        )
        # llm_url and llm_model already None in default kwargs
        pipeline_kwargs = _default_pipeline_kwargs()
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", stage_mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", stage_mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", stage_mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", stage_mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", stage_mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            events = await _collect_pipeline_events(**pipeline_kwargs)

        assert events[-1] == {"type": "done"}

    @pytest.mark.asyncio
    async def test_no_llm_no_reasoning_event_when_synthesis_empty(self):
        """When synthesizer returns empty string, no reasoning event is emitted."""
        stage_mocks = _mock_all_stages(synthesis_text="")
        pipeline_kwargs = _default_pipeline_kwargs()
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", stage_mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", stage_mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", stage_mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", stage_mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", stage_mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            events = await _collect_pipeline_events(**pipeline_kwargs)

        reasoning = [event for event in events if event.get("type") == "reasoning"]
        assert len(reasoning) == 0
# ---------------------------------------------------------------------------
# 5. Stage 1 cluster count in pipeline_stage message
# ---------------------------------------------------------------------------
class TestStage1Message:
    """Checks on the ``pipeline_stage`` event emitted for stage 1 (timeline)."""

    @pytest.mark.asyncio
    async def test_stage1_message_contains_cluster_count(self):
        """pipeline_stage stage=1 message must report cluster count."""
        # The assertions below look for the digits "0" (cluster count) and "3"
        # (burst count) in the message.  The remaining numeric fields use values
        # containing neither digit, so the checks cannot be satisfied by, e.g.,
        # the total entry count or the gap count being echoed in the message.
        timeline = TimelineResult(
            clusters=tuple(),
            total_entries=17,
            window_start=None,
            window_end=None,
            gap_count=5,
            burst_count=3,
            dominant_sources=("syslog",),
        )
        classified = _make_classified(timeline)

        # One mock class per stage; calling the class yields an instance whose
        # stage method returns the canned value.
        mock_reconstructor = MagicMock()
        mock_reconstructor.return_value.reconstruct.return_value = timeline
        mock_classifier = MagicMock()
        mock_classifier.return_value.classify.return_value = classified
        mock_hypothesizer = MagicMock()
        mock_hypothesizer.return_value.hypothesize.return_value = []
        mock_suppressor = MagicMock()
        mock_suppressor.return_value.suppress.return_value = []
        mock_synthesizer = MagicMock()
        mock_synthesizer.return_value.synthesize.return_value = "VERDICT: INFO — nothing found"

        kwargs = _default_pipeline_kwargs()
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", mock_reconstructor),
            patch("app.services.diagnose.pipeline.SeverityClassifier", mock_classifier),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mock_hypothesizer),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mock_suppressor),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", mock_synthesizer),
        ):
            events = await _collect_pipeline_events(**kwargs)

        stage1 = next(e for e in events if e.get("type") == "pipeline_stage" and e.get("stage") == 1)
        # 0 clusters (empty tuple), 3 bursts
        assert "0" in stage1["message"]  # cluster count
        assert "3" in stage1["message"]  # burst count

    @pytest.mark.asyncio
    async def test_stage1_name_is_timeline(self):
        """pipeline_stage stage=1 must have name='timeline'."""
        mocks = _mock_all_stages()
        kwargs = _default_pipeline_kwargs()
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            events = await _collect_pipeline_events(**kwargs)
        stage1 = next(e for e in events if e.get("type") == "pipeline_stage" and e.get("stage") == 1)
        assert stage1["name"] == "timeline"

View file

@ -0,0 +1,432 @@
"""Tests for app/services/diagnose/suppressor.py — FalsePositiveSuppressor.
All tests use mocking; no real model downloads are made.
"""
from __future__ import annotations
import math
import sqlite3
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
import app.services.diagnose.suppressor as sup_module
from app.services.diagnose.models import Hypothesis, RankedHypothesis
from app.services.diagnose.suppressor import FalsePositiveSuppressor
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_hypothesis(
    title: str = "Test",
    description: str = "A test hypothesis.",
    confidence: float = 0.8,
    severity: str = "ERROR",
) -> Hypothesis:
    """Build a ``Hypothesis`` with a fixed id and no cluster or runbook references."""
    fields = {
        "hypothesis_id": "test-id",
        "title": title,
        "description": description,
        "confidence": confidence,
        "supporting_cluster_ids": (),
        "runbook_refs": (),
        "severity": severity,
    }
    return Hypothesis(**fields)
def _make_db_with_incidents(incidents: list[tuple[str, str]], db_path: Path) -> Path:
"""Create a temporary SQLite database with resolved incidents. Returns the db path."""
with sqlite3.connect(str(db_path)) as conn:
conn.execute(
"CREATE TABLE incidents "
"(id INTEGER PRIMARY KEY, label TEXT, notes TEXT, ended_at TEXT)"
)
for label, notes in incidents:
conn.execute(
"INSERT INTO incidents (label, notes, ended_at) VALUES (?, ?, ?)",
(label, notes, "2024-01-01T00:00:00"),
)
conn.commit()
return db_path
def _make_empty_db(db_path: Path) -> Path:
"""Create a temporary SQLite DB with no incidents table."""
with sqlite3.connect(str(db_path)) as conn:
conn.execute("CREATE TABLE unrelated (id INTEGER PRIMARY KEY)")
conn.commit()
return db_path
def _make_mock_embedder(
embed_return: list[float] | None = None,
embed_batch_return: list[list[float]] | None = None,
) -> MagicMock:
"""Build a mock embedder with controllable embed/embed_batch responses."""
embedder = MagicMock()
# Default: unit vector along first dimension
default_vec = [1.0] + [0.0] * 383
raw_single = embed_return if embed_return is not None else default_vec
raw_batch = embed_batch_return if embed_batch_return is not None else [default_vec]
# Wrap scalars in numpy-like MagicMock with .tolist()
def _wrap(vec: list[float]) -> MagicMock:
m = MagicMock()
m.tolist.return_value = vec
return m
embedder.embed.return_value = _wrap(raw_single)
embedder.embed_batch.return_value = [_wrap(v) for v in raw_batch]
return embedder
# ---------------------------------------------------------------------------
# Autouse fixture: reset module-level cache between tests
# ---------------------------------------------------------------------------
@pytest.fixture(autouse=True)
def reset_suppressor_cache():
    """Empty the suppressor module's ``_corpus_cache`` before and after each test."""

    def _purge() -> None:
        # Looked up on the module each time so the current cache object is cleared.
        sup_module._corpus_cache.clear()

    _purge()
    yield
    _purge()
# ---------------------------------------------------------------------------
# Test 1: No model configured — passthrough, ranked by confidence
# ---------------------------------------------------------------------------
def test_no_model_passthrough_ranked_by_confidence(tmp_path):
    """model_id='' → all novelty_score=1.0, suppress=False, ranked by confidence desc."""
    candidates = [
        _make_hypothesis(title="Low", confidence=0.3),
        _make_hypothesis(title="High", confidence=0.9),
        _make_hypothesis(title="Mid", confidence=0.6),
    ]
    under_test = FalsePositiveSuppressor(model_id="")

    ranked = under_test.suppress(candidates, tmp_path / "turnstone.db")

    assert len(ranked) == 3
    # Every result is an untouched passthrough.
    for item in ranked:
        assert isinstance(item, RankedHypothesis)
    for item in ranked:
        assert item.novelty_score == pytest.approx(1.0)
    for item in ranked:
        assert item.similarity_to_known == pytest.approx(0.0)
    for item in ranked:
        assert item.suppress is False
    for item in ranked:
        assert item.suppression_reason is None
    # Output order follows confidence, highest first.
    observed = [item.hypothesis.confidence for item in ranked]
    assert observed == sorted(observed, reverse=True)
# ---------------------------------------------------------------------------
# Test 2: High similarity → suppressed
# ---------------------------------------------------------------------------
def test_high_similarity_suppresses_hypothesis(tmp_path):
    """A hypothesis whose embedding matches the corpus embedding is suppressed."""
    # Query and corpus share the same direction, so cosine similarity is 1.0.
    query_embedding = [1.0] + [0.0] * 383
    incident_embedding = [1.0] + [0.0] * 383
    embedder = _make_mock_embedder(
        embed_return=query_embedding,
        embed_batch_return=[incident_embedding],
    )
    incidents_db = _make_db_with_incidents(
        [("OOM killer", "Memory pressure caused OOM kill")],
        tmp_path / "turnstone.db",
    )
    under_test = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)

    with patch.object(under_test, "_load_embedder", return_value=embedder):
        ranked = under_test.suppress([_make_hypothesis()], incidents_db)

    assert len(ranked) == 1
    only = ranked[0]
    assert only.suppress is True
    assert only.suppression_reason is not None
    assert "Similar to resolved incident" in only.suppression_reason
    assert only.similarity_to_known == pytest.approx(1.0, abs=0.01)
    assert only.novelty_score == pytest.approx(0.0, abs=0.01)
# ---------------------------------------------------------------------------
# Test 3: Low similarity → not suppressed
# ---------------------------------------------------------------------------
def test_low_similarity_does_not_suppress(tmp_path):
    """A hypothesis whose embedding is orthogonal to the corpus is kept."""
    # The two vectors point along different axes, so cosine similarity is 0.0.
    query_embedding = [1.0] + [0.0] * 383
    incident_embedding = [0.0, 1.0] + [0.0] * 382
    embedder = _make_mock_embedder(
        embed_return=query_embedding,
        embed_batch_return=[incident_embedding],
    )
    incidents_db = _make_db_with_incidents(
        [("Disk I/O", "Storage saturation caused latency")],
        tmp_path / "turnstone.db",
    )
    under_test = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)

    with patch.object(under_test, "_load_embedder", return_value=embedder):
        ranked = under_test.suppress([_make_hypothesis()], incidents_db)

    assert len(ranked) == 1
    only = ranked[0]
    assert only.suppress is False
    assert only.suppression_reason is None
    assert only.similarity_to_known == pytest.approx(0.0, abs=0.01)
    assert only.novelty_score == pytest.approx(1.0, abs=0.01)
# ---------------------------------------------------------------------------
# Test 3b: Borderline similarity — exactly at threshold vs. just below
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
    ("sim_value", "expected_suppress"),
    [
        (0.85, True),   # at the boundary: inclusive, suppressed
        (0.84, False),  # just below: not suppressed
    ],
)
def test_similarity_threshold_boundary(tmp_path, sim_value, expected_suppress):
    """similarity == threshold is suppressed; similarity just below threshold is not.

    This test locks down the boundary semantics: suppress when max_sim >= threshold,
    not when novelty_score < threshold (the inverted form that was the original bug).

    With threshold=0.85:
      - similarity=0.85 suppressed (at boundary, inclusive)
      - similarity=0.84 NOT suppressed (just below)

    Each case runs as its own parametrized test, so one failing case does not
    hide the other and each starts with a fresh database.
    """
    db_path = _make_db_with_incidents(
        [("Disk I/O", "Storage saturation caused latency")],
        tmp_path / "turnstone.db",
    )
    # Corpus unit vector along first axis
    corpus_vec = [1.0] + [0.0] * 383
    # Hypothesis embedding [sim, sqrt(1 - sim^2), 0, ...]: a unit-length vector
    # whose cosine similarity to corpus_vec is sim_value.
    hyp_vec = [sim_value, math.sqrt(max(0.0, 1.0 - sim_value ** 2))] + [0.0] * 382
    mock_embedder = _make_mock_embedder(
        embed_return=hyp_vec,
        embed_batch_return=[corpus_vec],
    )
    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
    with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
        results = suppressor.suppress([_make_hypothesis()], db_path)
    assert len(results) == 1
    result = results[0]
    assert result.suppress is expected_suppress, (
        f"similarity={sim_value:.2f}: expected suppress={expected_suppress}, "
        f"got suppress={result.suppress} (similarity_to_known={result.similarity_to_known:.4f})"
    )
# ---------------------------------------------------------------------------
# Test 4: Empty hypotheses list returns []
# ---------------------------------------------------------------------------
def test_empty_hypotheses_returns_empty(tmp_path):
    """Suppressing an empty hypothesis list yields an empty list."""
    under_test = FalsePositiveSuppressor(model_id="test-model")
    # The database file is never created; only its path is handed over.
    outcome = under_test.suppress([], tmp_path / "turnstone.db")
    assert outcome == []
# ---------------------------------------------------------------------------
# Test 5: Ranking by novelty_score * confidence
# ---------------------------------------------------------------------------
def test_ranking_by_novelty_times_confidence(tmp_path):
    """Results are sorted by novelty_score * confidence descending.

    The corpus is a single unit vector along the first axis.  Each hypothesis
    embedding is a unit vector built to have a chosen cosine similarity to it:

      - A: similarity 0.1 (novelty 0.9), confidence 0.5 → score 0.45
      - B: similarity 0.5 (novelty 0.5), confidence 0.9 → score 0.45
      - C: similarity 0.2 (novelty 0.8), confidence 0.9 → score 0.72 (highest)

    Expected order: C first, then A and B in either order.
    """

    def _unit_vector(similarity: float) -> list[float]:
        # [cos θ, sin θ, 0, ...] with cos θ == similarity.
        return [similarity, math.sin(math.acos(similarity))] + [0.0] * 382

    corpus_embedding = [1.0] + [0.0] * 383
    # Embeddings are handed out in call order: A, B, C (wrapping around after that).
    embedding_queue = [_unit_vector(0.1), _unit_vector(0.5), _unit_vector(0.2)]
    hypotheses = [
        _make_hypothesis(title="A", confidence=0.5),
        _make_hypothesis(title="B", confidence=0.9),
        _make_hypothesis(title="C", confidence=0.9),
    ]

    calls_seen = 0

    def _next_embedding(text: str) -> MagicMock:
        nonlocal calls_seen
        reply = MagicMock()
        reply.tolist.return_value = embedding_queue[calls_seen % len(embedding_queue)]
        calls_seen += 1
        return reply

    corpus_item = MagicMock()
    corpus_item.tolist.return_value = corpus_embedding
    embedder = MagicMock()
    embedder.embed_batch.return_value = [corpus_item]
    embedder.embed.side_effect = _next_embedding

    incidents_db = _make_db_with_incidents(
        [("OOM", "Memory exhaustion")],
        tmp_path / "turnstone.db",
    )
    under_test = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
    with patch.object(under_test, "_load_embedder", return_value=embedder):
        results = under_test.suppress(hypotheses, incidents_db)

    assert len(results) == 3
    titles = [r.hypothesis.title for r in results]
    # H_C should be first (highest novelty*confidence score)
    assert titles[0] == "C", f"Expected C first, got {titles}"
    # The whole list must be in non-increasing score order.
    combined = [r.novelty_score * r.hypothesis.confidence for r in results]
    assert combined == sorted(combined, reverse=True)
# ---------------------------------------------------------------------------
# Test 6: DB with no resolved incidents → novelty_score=1.0
# ---------------------------------------------------------------------------
def test_no_resolved_incidents_in_db_passthrough(tmp_path):
    """An incidents table with zero rows leaves hypotheses at novelty_score=1.0."""
    # The table is created but nothing is inserted.
    incidents_db = _make_db_with_incidents([], tmp_path / "turnstone.db")
    embedder = _make_mock_embedder()
    under_test = FalsePositiveSuppressor(model_id="test-model")

    with patch.object(under_test, "_load_embedder", return_value=embedder):
        ranked = under_test.suppress([_make_hypothesis()], incidents_db)

    assert len(ranked) == 1
    only = ranked[0]
    assert only.novelty_score == pytest.approx(1.0)
    assert only.suppress is False
    # With nothing to compare against, the corpus must never be embedded.
    embedder.embed_batch.assert_not_called()
# ---------------------------------------------------------------------------
# Test 7: DB query failure → graceful fallback, no crash
# ---------------------------------------------------------------------------
def test_db_query_failure_graceful_fallback(tmp_path):
    """A database lacking the incidents table yields passthrough results, not an error."""
    broken_db = _make_empty_db(tmp_path / "turnstone.db")  # has no 'incidents' table
    embedder = _make_mock_embedder()
    under_test = FalsePositiveSuppressor(model_id="test-model")

    with patch.object(under_test, "_load_embedder", return_value=embedder):
        ranked = under_test.suppress([_make_hypothesis()], broken_db)

    assert len(ranked) == 1
    only = ranked[0]
    assert only.novelty_score == pytest.approx(1.0)
    assert only.suppress is False
# ---------------------------------------------------------------------------
# Test 8: Embedding service unavailable (returns None) → graceful fallback
# ---------------------------------------------------------------------------
def test_embedding_service_unavailable_passthrough(tmp_path):
    """If ``_load_embedder`` yields None, suppress() returns passthrough results."""
    incidents_db = _make_db_with_incidents(
        [("OOM", "Memory pressure")],
        tmp_path / "turnstone.db",
    )
    under_test = FalsePositiveSuppressor(model_id="test-model")
    candidate = _make_hypothesis(confidence=0.7)

    # Simulate an embedder that could not be loaded.
    with patch.object(under_test, "_load_embedder", return_value=None):
        ranked = under_test.suppress([candidate], incidents_db)

    assert len(ranked) == 1
    only = ranked[0]
    assert only.novelty_score == pytest.approx(1.0)
    assert only.suppress is False
    assert only.suppression_reason is None
# ---------------------------------------------------------------------------
# Test 9: Corpus cache invalidated when corpus changes
# ---------------------------------------------------------------------------
def test_corpus_cache_invalidated_on_corpus_change(tmp_path):
    """Adding an incident between two suppress() calls triggers a second embed_batch."""

    def _array_like(values: list[float]) -> MagicMock:
        stub = MagicMock()
        stub.tolist.return_value = values
        return stub

    # Start with a database that holds a single incident.
    incidents_db = _make_db_with_incidents(
        [("OOM", "Memory pressure")],
        tmp_path / "turnstone.db",
    )
    first_corpus = [1.0] + [0.0] * 383
    second_corpus = [0.0, 1.0] + [0.0] * 382
    query_embedding = [1.0] + [0.0] * 383

    embedder = MagicMock()
    embedder.embed.return_value = _array_like(query_embedding)
    # A different corpus embedding is returned on each embed_batch call.
    embedder.embed_batch.side_effect = [
        [_array_like(first_corpus)],
        [_array_like(second_corpus)],
    ]

    under_test = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
    with patch.object(under_test, "_load_embedder", return_value=embedder):
        # Initial run fills the cache.
        before = under_test.suppress([_make_hypothesis()], incidents_db)
        assert embedder.embed_batch.call_count == 1

        # Insert another incident so the corpus differs from the cached one.
        with sqlite3.connect(str(incidents_db)) as conn:
            conn.execute(
                "INSERT INTO incidents (label, notes, ended_at) VALUES (?, ?, ?)",
                ("Disk I/O", "Storage saturation", "2024-01-02T00:00:00"),
            )
            conn.commit()

        # The changed corpus must be embedded afresh.
        after = under_test.suppress([_make_hypothesis()], incidents_db)
        assert embedder.embed_batch.call_count == 2, (
            "embed_batch should be called again when corpus changes"
        )
    assert len(before) == 1
    assert len(after) == 1

View file

@ -0,0 +1,285 @@
"""Tests for app/services/diagnose/synthesizer.py — SummarySynthesizer.
All tests use mocking; no real LLM calls are made.
"""
from __future__ import annotations
from unittest.mock import MagicMock, patch
from app.context.retriever import RetrievedContext
from app.services.diagnose.models import Hypothesis, RankedHypothesis, TimelineResult
from app.services.diagnose.synthesizer import SummarySynthesizer
# ---------------------------------------------------------------------------
# Fixture helpers
# ---------------------------------------------------------------------------
def _make_hypothesis(
    hypothesis_id: str = "h1",
    title: str = "SSH flood from external IPs",
    description: str = "Repeated failed login attempts from multiple IPs.",
    confidence: float = 0.87,
    severity: str = "CRITICAL",
) -> Hypothesis:
    """Build a ``Hypothesis`` supported by cluster ``c1`` with no runbook references."""
    fields = {
        "hypothesis_id": hypothesis_id,
        "title": title,
        "description": description,
        "confidence": confidence,
        "supporting_cluster_ids": ("c1",),
        "runbook_refs": (),
        "severity": severity,
    }
    return Hypothesis(**fields)
def _make_ranked(
    hypothesis: Hypothesis | None = None,
    novelty_score: float = 0.95,
    similarity_to_known: float = 0.05,
    suppress: bool = False,
    suppression_reason: str | None = None,
) -> RankedHypothesis:
    """Wrap a hypothesis in a ``RankedHypothesis``.

    When ``hypothesis`` is falsy, the default from ``_make_hypothesis()`` is used.
    """
    wrapped = hypothesis if hypothesis else _make_hypothesis()
    fields = {
        "hypothesis": wrapped,
        "novelty_score": novelty_score,
        "similarity_to_known": similarity_to_known,
        "suppress": suppress,
        "suppression_reason": suppression_reason,
    }
    return RankedHypothesis(**fields)
def _make_timeline(
    total_entries: int = 42,
    n_clusters: int = 3,
) -> TimelineResult:
    """Build a ``TimelineResult`` covering a fixed one-hour window with no clusters.

    NOTE(review): ``n_clusters`` is accepted but not used — ``clusters`` is
    always an empty tuple.
    """
    fields = {
        "clusters": tuple(),
        "total_entries": total_entries,
        "window_start": "2026-01-01T00:00:00+00:00",
        "window_end": "2026-01-01T01:00:00+00:00",
        "gap_count": 1,
        "burst_count": 2,
        "dominant_sources": ("syslog", "auth"),
    }
    return TimelineResult(**fields)
def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext:
    """Build a ``RetrievedContext`` with one fixed fact and the given chunks.

    A falsy ``chunks`` (``None`` or an empty list) is replaced by a single
    runbook chunk.
    """
    host_fact = {"category": "network", "key": "host", "value": "heimdall", "source": "facts"}
    if chunks:
        selected = chunks
    else:
        selected = [{"filename": "runbook.md", "text": "Restart sshd if flooded"}]
    return RetrievedContext(facts=[host_fact], chunks=selected)
# ---------------------------------------------------------------------------
# Test cases
# ---------------------------------------------------------------------------
class TestSynthesizerWithHypotheses:
    """With hypotheses, result must contain VERDICT."""

    @staticmethod
    def _llm_reply(content: str) -> MagicMock:
        """Fake HTTP 200 response whose JSON body carries one chat choice."""
        reply = MagicMock()
        reply.status_code = 200
        reply.json.return_value = {"choices": [{"message": {"content": content}}]}
        return reply

    def test_returns_verdict_string_with_llm(self):
        under_test = SummarySynthesizer()
        reply = self._llm_reply(
            "VERDICT: CRITICAL — SSH flood (87% confidence)\nTIMELINE: lots of hits."
        )
        # httpx.post is replaced so no network request is made.
        with patch("httpx.post", return_value=reply):
            text = under_test.synthesize(
                ranked=[_make_ranked()],
                timeline=_make_timeline(),
                ctx=_make_ctx(),
                query="ssh brute force",
                llm_url="http://localhost:11434",
                llm_model="llama3",
            )
        assert "VERDICT" in text

    def test_returns_nonempty_string(self):
        under_test = SummarySynthesizer()
        reply = self._llm_reply("VERDICT: CRITICAL — SSH flood (87% confidence)")
        with patch("httpx.post", return_value=reply):
            text = under_test.synthesize(
                ranked=[_make_ranked()],
                timeline=_make_timeline(),
                ctx=_make_ctx(),
                query="why is auth failing",
                llm_url="http://localhost:11434",
                llm_model="llama3",
            )
        assert isinstance(text, str)
        assert len(text) > 0
class TestSynthesizerSuppressedHypotheses:
    """Suppressed hypotheses must be excluded from the LLM prompt."""

    def test_suppressed_hypotheses_excluded_from_prompt(self):
        hidden = _make_ranked(
            hypothesis=_make_hypothesis(
                hypothesis_id="h2",
                title="Wazuh alert processing backlog",
                severity="ERROR",
                confidence=0.72,
            ),
            suppress=True,
            suppression_reason="similar to 2025-04 SSH incident",
            novelty_score=0.1,
        )
        visible = _make_ranked(
            hypothesis=_make_hypothesis(
                hypothesis_id="h1",
                title="SSH flood from external IPs",
                severity="CRITICAL",
                confidence=0.87,
            ),
            suppress=False,
            novelty_score=0.95,
        )
        seen_messages: list = []

        def fake_post(url, json=None, headers=None, timeout=None):
            # Record chat messages whether they sit under a "payload" wrapper
            # or at the top level of the request body.
            body = json or {}
            if "payload" in body:
                holder = body["payload"]
            elif "messages" in body:
                holder = body
            else:
                holder = None
            if holder is not None:
                seen_messages.extend(holder.get("messages", []))
            reply = MagicMock()
            reply.status_code = 200
            reply.json.return_value = {
                "choices": [{"message": {"content": "VERDICT: CRITICAL — SSH flood"}}]
            }
            return reply

        under_test = SummarySynthesizer()
        with patch("httpx.post", side_effect=fake_post):
            under_test.synthesize(
                ranked=[visible, hidden],
                timeline=_make_timeline(),
                ctx=_make_ctx(),
                query="auth failures",
                llm_url="http://localhost:11434",
                llm_model="llama3",
            )

        # Take the first user-role message; fall back to "" when none was captured.
        user_prompt = ""
        for message in seen_messages:
            if message.get("role") == "user":
                user_prompt = message["content"]
                break
        # The active hypothesis title must be present in the prompt...
        assert "SSH flood from external IPs" in user_prompt
        # ...while the suppressed hypothesis title must be absent from it.
        assert "Wazuh alert processing backlog" not in user_prompt
class TestSynthesizerNoLLM:
    """No LLM configured: must return deterministic fallback (not empty)."""

    def test_no_llm_url_returns_fallback(self):
        # Neither llm_url nor llm_model is supplied.
        text = SummarySynthesizer().synthesize(
            ranked=[_make_ranked()],
            timeline=_make_timeline(),
            ctx=_make_ctx(),
            query="disk errors",
        )
        assert isinstance(text, str)
        assert len(text) > 0
        assert "VERDICT" in text

    def test_no_llm_model_returns_fallback(self):
        # A URL is given, but llm_model is left out.
        text = SummarySynthesizer().synthesize(
            ranked=[_make_ranked()],
            timeline=_make_timeline(),
            ctx=_make_ctx(),
            query="oom killer",
            llm_url="http://localhost:11434",
        )
        assert "VERDICT" in text
        assert "SSH flood from external IPs" in text

    def test_llm_failure_returns_fallback(self):
        under_test = SummarySynthesizer()
        candidates = [_make_ranked()]
        # Every httpx.post call raises, simulating an unreachable endpoint.
        with patch("httpx.post", side_effect=ConnectionError("refused")):
            text = under_test.synthesize(
                ranked=candidates,
                timeline=_make_timeline(),
                ctx=_make_ctx(),
                query="why is disk full",
                llm_url="http://localhost:11434",
                llm_model="llama3",
            )
        assert "VERDICT" in text
        assert len(text) > 0
class TestSynthesizerEmptyRanked:
    """Empty ranked list: must return deterministic fallback text, not raise."""

    def test_empty_ranked_no_llm_returns_fallback(self):
        text = SummarySynthesizer().synthesize(
            ranked=[],
            timeline=_make_timeline(),
            ctx=_make_ctx(),
            query="check everything",
        )
        assert isinstance(text, str)
        assert len(text) > 0
        assert "VERDICT" in text

    def test_empty_ranked_with_llm_returns_fallback_or_llm_text(self):
        """With no hypotheses and an LLM configured, some non-empty text comes back."""
        canned = MagicMock()
        canned.status_code = 200
        canned.json.return_value = {
            "choices": [{"message": {"content": "VERDICT: UNKNOWN — no hypotheses generated"}}]
        }
        under_test = SummarySynthesizer()
        with patch("httpx.post", return_value=canned):
            text = under_test.synthesize(
                ranked=[],
                timeline=_make_timeline(),
                ctx=_make_ctx(),
                query="nothing found",
                llm_url="http://localhost:11434",
                llm_model="llama3",
            )
        assert isinstance(text, str)
        assert len(text) > 0

View file

@ -0,0 +1,234 @@
"""Tests for app/services/diagnose/timeline.py — TimelineReconstructor."""
from __future__ import annotations
from app.services.diagnose.timeline import TimelineReconstructor
from app.services.diagnose.models import TimelineResult
from app.services.search import SearchResult
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_entry(
    entry_id: str = "e1",
    source_id: str = "src-a",
    timestamp_iso: str | None = "2026-01-01T00:00:00+00:00",
    severity: str | None = "INFO",
    rank: float = 0.0,
    text: str = "log line",
    matched_patterns: list[str] | None = None,
    sequence: int = 1,
) -> SearchResult:
    """Build a ``SearchResult`` with ``repeat_count=1`` and ``out_of_order=False``.

    A falsy ``matched_patterns`` becomes a new empty list.
    """
    patterns = matched_patterns if matched_patterns else []
    fields = {
        "entry_id": entry_id,
        "source_id": source_id,
        "sequence": sequence,
        "timestamp_iso": timestamp_iso,
        "severity": severity,
        "repeat_count": 1,
        "out_of_order": False,
        "matched_patterns": patterns,
        "text": text,
        "rank": rank,
    }
    return SearchResult(**fields)
def _ts(offset_seconds: int) -> str:
"""Return an ISO timestamp offset_seconds after 2026-01-01T00:00:00+00:00."""
from datetime import datetime, timezone, timedelta
base = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
dt = base + timedelta(seconds=offset_seconds)
return dt.isoformat()
# ---------------------------------------------------------------------------
# Test cases
# ---------------------------------------------------------------------------
class TestEmptyInput:
    """Reconstructing from no entries."""

    def test_empty_returns_empty_timeline(self):
        # Zero counts, no clusters or sources, and an undefined window.
        expected = TimelineResult(
            clusters=(),
            total_entries=0,
            gap_count=0,
            burst_count=0,
            window_start=None,
            window_end=None,
            dominant_sources=(),
        )
        actual = TimelineReconstructor().reconstruct([])
        assert actual == expected
class TestSingleEntry:
    """Reconstructing from exactly one entry."""

    def test_single_entry_one_cluster(self):
        lone = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
        timeline = TimelineReconstructor().reconstruct([lone])

        assert len(timeline.clusters) == 1
        only_cluster = timeline.clusters[0]
        # The cluster has no preceding gap and is not a burst.
        assert only_cluster.gap_before_seconds == 0.0
        assert only_cluster.burst is False
        assert timeline.total_entries == 1
class TestClusteringWithinWindow:
    """Entries closer together than the cluster window."""

    def test_two_entries_10s_apart_same_cluster(self):
        reconstructor = TimelineReconstructor(cluster_window_seconds=30)
        first = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
        second = _make_entry(entry_id="e2", timestamp_iso=_ts(10))

        timeline = reconstructor.reconstruct([first, second])

        # 10 s apart with a 30 s window: both entries share one cluster.
        assert len(timeline.clusters) == 1
        assert len(timeline.clusters[0].entries) == 2
class TestClusteringOutsideWindow:
    """Entries further apart than the cluster window."""

    def test_two_entries_60s_apart_two_clusters(self):
        reconstructor = TimelineReconstructor(cluster_window_seconds=30)
        first = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
        second = _make_entry(entry_id="e2", timestamp_iso=_ts(60))

        timeline = reconstructor.reconstruct([first, second])

        # 60 s apart with a 30 s window: the entries are split.
        assert len(timeline.clusters) == 2
        later = timeline.clusters[1]
        assert later.gap_before_seconds >= 60.0

    def test_gap_count_correct_for_60s_gap(self):
        reconstructor = TimelineReconstructor(cluster_window_seconds=30)
        first = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
        second = _make_entry(entry_id="e2", timestamp_iso=_ts(60))

        timeline = reconstructor.reconstruct([first, second])

        assert timeline.gap_count == 1
class TestBurst:
    """Burst detection on a dense run of entries."""

    def test_15_entries_within_3s_is_burst(self):
        reconstructor = TimelineReconstructor(
            cluster_window_seconds=30,
            burst_threshold=10,
            burst_window_seconds=5,
        )
        # Timestamps cycle through offsets 0, 1 and 2 seconds, so all 15
        # entries fall well inside the 5-second burst window.
        dense = []
        for idx in range(15):
            dense.append(
                _make_entry(entry_id=f"e{idx}", timestamp_iso=_ts(idx % 3), sequence=idx)
            )

        timeline = reconstructor.reconstruct(dense)

        # Everything lands in a single cluster, which is flagged as a burst.
        assert len(timeline.clusters) == 1
        assert timeline.clusters[0].burst is True
        assert timeline.burst_count == 1
class TestNullTimestamps:
    """Entries whose ``timestamp_iso`` is None."""

    def test_null_timestamp_joins_current_cluster(self):
        reconstructor = TimelineReconstructor(cluster_window_seconds=30)
        dated = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
        undated = _make_entry(entry_id="e2", timestamp_iso=None)

        # Must not raise; the undated entry is attached to the open cluster.
        timeline = reconstructor.reconstruct([dated, undated])

        assert len(timeline.clusters) == 1
        assert "e2" in timeline.clusters[0].entries

    def test_null_timestamp_does_not_start_new_cluster(self):
        reconstructor = TimelineReconstructor(cluster_window_seconds=30)
        sequence_of_entries = [
            _make_entry(entry_id="e1", timestamp_iso=_ts(0)),
            _make_entry(entry_id="e2", timestamp_iso=None),
            _make_entry(entry_id="e3", timestamp_iso=_ts(5)),
        ]

        timeline = reconstructor.reconstruct(sequence_of_entries)

        # e3 is only 5 s after e1 (window 30 s), so all three stay together.
        assert len(timeline.clusters) == 1

    def test_all_null_timestamps_one_cluster_no_crash(self):
        undated_pair = [
            _make_entry(entry_id="e1", timestamp_iso=None),
            _make_entry(entry_id="e2", timestamp_iso=None),
        ]

        timeline = TimelineReconstructor().reconstruct(undated_pair)

        assert len(timeline.clusters) == 1
        only_cluster = timeline.clusters[0]
        # With no timestamps at all, cluster and overall bounds stay None.
        assert only_cluster.start_iso is None
        assert only_cluster.end_iso is None
        assert timeline.window_start is None
        assert timeline.window_end is None
class TestDominantSources:
    """Ordering of ``dominant_sources``."""

    def test_dominant_sources_ordered_by_count_descending(self):
        # One entry from src-a followed by three from src-b, one second apart.
        source_sequence = ["src-a", "src-b", "src-b", "src-b"]
        entries = []
        for offset, source in enumerate(source_sequence):
            entries.append(
                _make_entry(entry_id=f"e{offset + 1}", source_id=source, timestamp_iso=_ts(offset))
            )

        timeline = TimelineReconstructor().reconstruct(entries)

        assert timeline.dominant_sources[0] == "src-b"
        assert timeline.dominant_sources[1] == "src-a"
class TestRepresentativeText:
    """Selection of a cluster's ``representative_text``."""

    def test_representative_text_uses_highest_rank(self):
        weaker = _make_entry(entry_id="e1", timestamp_iso=_ts(0), rank=-5.0, text="low score")
        stronger = _make_entry(entry_id="e2", timestamp_iso=_ts(1), rank=-1.0, text="high score")

        timeline = TimelineReconstructor().reconstruct([weaker, stronger])

        # The entry with the greater rank value (-1.0 > -5.0) supplies the text.
        assert timeline.clusters[0].representative_text == "high score"

    def test_representative_text_tiebreak_on_longest_text(self):
        brief = _make_entry(entry_id="e1", timestamp_iso=_ts(0), rank=0.0, text="short")
        wordy = _make_entry(entry_id="e2", timestamp_iso=_ts(1), rank=0.0, text="much longer text here")

        timeline = TimelineReconstructor().reconstruct([brief, wordy])

        # Equal ranks: the longer text is expected to win.
        assert timeline.clusters[0].representative_text == "much longer text here"
class TestClusterId:
    """Format of the ``cluster_id`` assigned to a reconstructed cluster."""

    def test_cluster_id_is_12_char_hex(self):
        single_entry = _make_entry(entry_id="abc123", timestamp_iso=_ts(0))
        result = TimelineReconstructor().reconstruct([single_entry])
        cluster_id = result.clusters[0].cluster_id

        assert len(cluster_id) == 12
        lowercase_hex = "0123456789abcdef"
        assert all(character in lowercase_hex for character in cluster_id)
class TestSeverity:
    """Severity reported for a cluster that mixes several entry severities."""

    def test_critical_wins_over_error(self):
        severities = ("ERROR", "CRITICAL", "INFO")
        entries = [
            _make_entry(
                entry_id=f"e{position + 1}",
                timestamp_iso=_ts(position),
                severity=level,
            )
            for position, level in enumerate(severities)
        ]
        result = TimelineReconstructor().reconstruct(entries)
        first_cluster = result.clusters[0]
        assert first_cluster.severity == "CRITICAL"
class TestPatternTags:
    """Pattern tags exposed by a cluster built from entries with different matches."""

    def test_pattern_tags_union_across_entries(self):
        first = _make_entry(
            entry_id="e1", timestamp_iso=_ts(0), matched_patterns=["oom-killer"]
        )
        second = _make_entry(
            entry_id="e2", timestamp_iso=_ts(1), matched_patterns=["disk-full"]
        )
        result = TimelineReconstructor().reconstruct([first, second])
        tags = set(result.clusters[0].pattern_tags)
        # Tags from both entries must be present on the shared cluster.
        for expected_tag in ("oom-killer", "disk-full"):
            assert expected_tag in tags

View file

@ -1,7 +1,7 @@
"""Tests for the dmesg log ingestor."""
"""Tests for the dmesg log gleaner."""
from __future__ import annotations
from app.ingest.dmesg_log import is_dmesg_log, parse
from app.glean.dmesg_log import is_dmesg_log, parse
RELATIVE_SAMPLE = """\
[ 0.000000] Linux version 6.8.0-65-generic

View file

@ -0,0 +1,236 @@
"""Tests for fingerprint-based incremental glean skipping (issue #30).
Verifies that _glean_files() (and its public wrappers) skip local files whose
mtime+size fingerprint has not changed since the last glean, and that force=True
bypasses that check.
"""
from __future__ import annotations
import sqlite3
import time
from pathlib import Path
import pytest
from app.glean.pipeline import (
_fingerprint,
_fp_unchanged,
_save_fingerprint,
ensure_schema,
glean_dir,
glean_file,
)
from app.glean.base import now_iso
# ── Fixtures ──────────────────────────────────────────────────────────────────
@pytest.fixture()
def db_path(tmp_path: Path) -> Path:
    """Path of a database file under ``tmp_path`` on which ``ensure_schema`` has been run."""
    database = tmp_path / "test.db"
    ensure_schema(database)
    return database
@pytest.fixture()
def log_file(tmp_path: Path) -> Path:
    """One-line log file written under ``tmp_path``."""
    target = tmp_path / "test.log"
    target.write_text("May 24 10:00:00 heimdall kernel: test message\n")
    return target
# ── Unit: fingerprint helpers ──────────────────────────────────────────────────
class TestFingerprintHelpers:
    """Unit checks for ``_fingerprint``, ``_fp_unchanged`` and ``_save_fingerprint``."""

    @staticmethod
    def _connection_with_saved_fingerprint(db_path: Path, log_file: Path):
        """Save and commit the current fingerprint of ``log_file``.

        Returns ``(conn, mtime, size)``; the caller closes ``conn``.
        """
        conn = sqlite3.connect(str(db_path))
        mtime, size = _fingerprint(log_file)
        _save_fingerprint(conn, log_file, mtime, size, now_iso())
        conn.commit()
        return conn, mtime, size

    def test_fingerprint_returns_mtime_and_size(self, log_file: Path) -> None:
        observed_mtime, observed_size = _fingerprint(log_file)
        stat_result = log_file.stat()
        assert observed_mtime == stat_result.st_mtime
        assert observed_size == stat_result.st_size

    def test_fp_unchanged_returns_false_when_no_record(self, db_path: Path, log_file: Path) -> None:
        conn = sqlite3.connect(str(db_path))
        current_mtime, current_size = _fingerprint(log_file)
        # No fingerprint has been saved for this path yet.
        verdict = _fp_unchanged(conn, log_file, current_mtime, current_size)
        assert verdict is False
        conn.close()

    def test_fp_unchanged_returns_true_after_save(self, db_path: Path, log_file: Path) -> None:
        conn, mtime, size = self._connection_with_saved_fingerprint(db_path, log_file)
        verdict = _fp_unchanged(conn, log_file, mtime, size)
        assert verdict is True
        conn.close()

    def test_fp_unchanged_returns_false_on_size_change(self, db_path: Path, log_file: Path) -> None:
        conn, mtime, size = self._connection_with_saved_fingerprint(db_path, log_file)
        # Pretend the file grew by one byte since the fingerprint was saved.
        verdict = _fp_unchanged(conn, log_file, mtime, size + 1)
        assert verdict is False
        conn.close()

    def test_fp_unchanged_returns_false_on_mtime_change(self, db_path: Path, log_file: Path) -> None:
        conn, mtime, size = self._connection_with_saved_fingerprint(db_path, log_file)
        # Same size, but the modification time moved forward.
        verdict = _fp_unchanged(conn, log_file, mtime + 1.0, size)
        assert verdict is False
        conn.close()

    def test_save_fingerprint_upserts(self, db_path: Path, log_file: Path) -> None:
        """A later save for the same path replaces the stored mtime and size."""
        conn = sqlite3.connect(str(db_path))
        successive_saves = (
            (1000.0, 100, "2026-01-01T00:00:00Z"),
            (2000.0, 200, "2026-01-02T00:00:00Z"),
        )
        for mtime, size, gleaned_at in successive_saves:
            _save_fingerprint(conn, log_file, mtime, size, gleaned_at)
            conn.commit()
        stored = conn.execute(
            "SELECT mtime, size FROM glean_fingerprints WHERE path = ?",
            (str(log_file),),
        ).fetchone()
        assert stored == (2000.0, 200)
        conn.close()
# ── Integration: glean_file skipping ─────────────────────────────────────────
class TestGleanFileFingerprint:
    """Integration checks: ``glean_file`` records fingerprints and skips unchanged files."""

    @staticmethod
    def _fingerprint_row(db_path: Path, log_file: Path, query: str):
        """Run ``query`` (one ``path = ?`` placeholder) for ``log_file`` and return the first row.

        The connection is closed in ``finally`` so a failing query does not
        leave an open handle behind.
        """
        conn = sqlite3.connect(str(db_path))
        try:
            return conn.execute(query, (str(log_file),)).fetchone()
        finally:
            conn.close()

    def test_first_glean_writes_fingerprint(self, db_path: Path, log_file: Path) -> None:
        glean_file(log_file, db_path)
        row = self._fingerprint_row(
            db_path,
            log_file,
            "SELECT mtime, size FROM glean_fingerprints WHERE path = ?",
        )
        assert row is not None
        mtime, size = _fingerprint(log_file)
        assert row == (mtime, size)

    def test_second_glean_skips_unchanged_file(self, db_path: Path, log_file: Path) -> None:
        stats_first = glean_file(log_file, db_path)
        count_first = sum(stats_first.values())
        # Re-glean without touching the file — should produce 0 new entries.
        stats_second = glean_file(log_file, db_path)
        count_second = sum(stats_second.values())
        assert count_first >= 1, "First glean should find at least one entry"
        assert count_second == 0, "Second glean should skip unchanged file"

    def test_second_glean_runs_when_file_grows(self, db_path: Path, log_file: Path) -> None:
        glean_file(log_file, db_path)
        # Append a new line by rewriting the file; its size (and usually mtime) changes.
        original = log_file.read_text()
        log_file.write_text(original + "May 24 10:01:00 heimdall kernel: second message\n")
        stats_second = glean_file(log_file, db_path)
        # The previous ``>= 0`` check could never fail. A skipped file reports 0
        # (see test_second_glean_skips_unchanged_file), so a re-glean that picks
        # up the appended line must report at least one entry.
        assert sum(stats_second.values()) >= 1, "Grown file should be re-gleaned"
        # Confirm fingerprint updated to new size.
        row = self._fingerprint_row(
            db_path,
            log_file,
            "SELECT size FROM glean_fingerprints WHERE path = ?",
        )
        assert row is not None
        assert row[0] == log_file.stat().st_size

    def test_force_bypasses_fingerprint(self, db_path: Path, log_file: Path) -> None:
        gleaned_at_query = "SELECT gleaned_at FROM glean_fingerprints WHERE path = ?"
        glean_file(log_file, db_path)
        # Without force: skipped.
        stats_no_force = glean_file(log_file, db_path)
        assert sum(stats_no_force.values()) == 0
        # With force the entry count may still be 0 (per the original note,
        # inserts are INSERT OR IGNORE), so the observable effect checked here
        # is a refreshed gleaned_at on the fingerprint row.
        ts_before = self._fingerprint_row(db_path, log_file, gleaned_at_query)[0]
        time.sleep(0.01)  # ensure gleaned_at advances
        glean_file(log_file, db_path, force=True)
        ts_after = self._fingerprint_row(db_path, log_file, gleaned_at_query)[0]
        assert ts_after > ts_before, "force=True should update gleaned_at timestamp"
# ── Integration: glean_dir skipping ──────────────────────────────────────────
class TestGleanDirFingerprint:
    """Integration checks: ``glean_dir`` honours fingerprints and the ``force`` flag."""

    @staticmethod
    def _gleaned_at(db_path: Path, path: Path):
        """Read the ``gleaned_at`` value stored for ``path`` in ``glean_fingerprints``."""
        conn = sqlite3.connect(str(db_path))
        value = conn.execute(
            "SELECT gleaned_at FROM glean_fingerprints WHERE path = ?",
            (str(path),),
        ).fetchone()[0]
        conn.close()
        return value

    def test_glean_dir_skips_unchanged_on_second_run(self, db_path: Path, tmp_path: Path) -> None:
        contents = {
            "a.log": "May 24 10:00:00 heimdall kernel: msg one\n",
            "b.log": "May 24 10:00:00 heimdall kernel: msg two\n",
        }
        for file_name, text in contents.items():
            (tmp_path / file_name).write_text(text)

        glean_dir(tmp_path, db_path)
        rerun_stats = glean_dir(tmp_path, db_path)
        assert sum(rerun_stats.values()) == 0, "Both unchanged files should be skipped"

    def test_glean_dir_force_reruns_all(self, db_path: Path, tmp_path: Path) -> None:
        log1 = tmp_path / "a.log"
        log1.write_text("May 24 10:00:00 heimdall kernel: msg one\n")
        glean_dir(tmp_path, db_path)

        # Nothing changes on disk between the two runs; only force=True differs.
        stamp_before = self._gleaned_at(db_path, log1)
        time.sleep(0.01)
        glean_dir(tmp_path, db_path, force=True)
        stamp_after = self._gleaned_at(db_path, log1)
        assert stamp_after > stamp_before
# ── Schema: ensure fingerprints table created ─────────────────────────────────
class TestEnsureSchema:
    """``ensure_schema`` creates the fingerprints table and can be called repeatedly."""

    def test_fingerprints_table_exists_after_ensure_schema(self, tmp_path: Path) -> None:
        db = tmp_path / "fresh.db"
        ensure_schema(db)

        conn = sqlite3.connect(str(db))
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'"
        ).fetchall()
        table_names = set()
        for row in rows:
            table_names.add(row[0])
        conn.close()

        assert "glean_fingerprints" in table_names

    def test_ensure_schema_idempotent(self, tmp_path: Path) -> None:
        """Running ensure_schema a second time against the same file must not raise."""
        db = tmp_path / "fresh.db"
        for _attempt in range(2):
            ensure_schema(db)

View file

@ -0,0 +1,444 @@
"""Tests for SSH source handling in app/glean/pipeline.py.
Verifies that glean_sources() correctly:
- Dispatches SSH sources to SSHTransport (local sources unchanged)
- Routes each glean-type to the right command builder + parser
- Writes parsed entries to SQLite
- Gracefully skips sources on SSHConnectionError or SSHCommandError
"""
from __future__ import annotations
import json
import sqlite3
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
import yaml
from app.glean.pipeline import glean_sources, ensure_schema
from app.glean.ssh import SSHConnectionError, SSHCommandError
# ── Shared fixtures ───────────────────────────────────────────────────────────
# Sample input lines fed to the mocked SSH transport in the tests below.
# One JSON object with journald-style field names, newline-terminated.
JOURNALD_LINE = json.dumps({
"__REALTIME_TIMESTAMP": "1747000000000000",
"PRIORITY": "3",
"MESSAGE": "SSH brute force detected from 192.168.1.99",
"SYSLOG_IDENTIFIER": "sshd",
"_HOSTNAME": "rack01",
}) + "\n"
# "Mon DD HH:MM:SS host prog[pid]: message" style line.
SYSLOG_LINE = "May 20 22:00:00 rack01 sshd[1234]: Failed password for invalid user admin\n"
# Free-form application log line with a leading date/time and level.
PLAINTEXT_LINE = "2026-05-20 22:00:00 ERROR app crashed with exit code 1\n"
# Timestamp / stream / flag / message line used for the docker glean type.
DOCKER_LINE = "2026-05-20T22:00:00.000000000Z stderr F container startup failed\n"
def _ssh_sources_yaml(sources: list[dict]) -> str:
    """Dump ``sources`` as YAML text under a single top-level ``sources`` key."""
    document = {"sources": sources}
    return yaml.dump(document)
def _mock_transport(lines: list[str] | None = None):
"""Return a mock SSHTransport context manager whose exec_stream yields given lines."""
mock_t = MagicMock()
mock_t.exec_stream.return_value = iter(lines or [])
return mock_t
def _patch_transport(mock_t):
    """Start a patch of ``app.glean.pipeline.SSHTransport`` that yields ``mock_t`` on ``with``.

    Returns ``(patcher, mocked_class)``. The patch is already started; the
    caller is responsible for calling ``patcher.stop()``.
    """
    patcher = patch("app.glean.pipeline.SSHTransport")
    mocked_class = patcher.start()
    instance = mocked_class.return_value
    instance.__enter__.return_value = mock_t
    instance.__exit__.return_value = None
    return patcher, mocked_class
def _entry_count(db_path: Path) -> int:
conn = sqlite3.connect(db_path)
n = conn.execute("SELECT COUNT(*) FROM log_entries").fetchone()[0]
conn.close()
return n
# ── journald type ─────────────────────────────────────────────────────────────
class TestSSHJournaldGlean:
    """``glean_sources`` with an SSH source whose glean item has type ``journald``."""

    @staticmethod
    def _write_sources(tmp_path, glean_item):
        """Write a one-host SSH sources file carrying ``glean_item``.

        Returns ``(sources_file, db_path)``, both located under ``tmp_path``.
        """
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        host = {
            "id": "rack01",
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": [glean_item],
        }
        sources_file.write_text(_ssh_sources_yaml([host]))
        return sources_file, db_path

    def test_journald_entries_written_to_db(self, tmp_path):
        sources_file, db_path = self._write_sources(tmp_path, {"type": "journald"})
        transport = _mock_transport([JOURNALD_LINE])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            stats = glean_sources(sources_file, db_path)
        finally:
            patcher.stop()
        assert _entry_count(db_path) >= 1
        assert any("rack01" in key for key in stats)

    def test_journald_args_passed_to_command_builder(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path, {"type": "journald", "args": ["--since", "1 hour ago"]}
        )
        transport = _mock_transport([JOURNALD_LINE])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()
        # First positional argument of the most recent exec_stream call.
        issued_command = transport.exec_stream.call_args[0][0]
        assert "--since" in issued_command
        assert "1 hour ago" in issued_command

    def test_journald_unit_shorthand(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path, {"type": "journald", "unit": "sshd"}
        )
        transport = _mock_transport([])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()
        issued_command = transport.exec_stream.call_args[0][0]
        assert "sshd" in issued_command
# ── syslog type ───────────────────────────────────────────────────────────────
class TestSSHSyslogGlean:
    """``glean_sources`` with an SSH source whose glean item has type ``syslog``."""

    def test_syslog_entries_written_to_db(self, tmp_path):
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        remote = dict(
            id="rack01-syslog",
            transport="ssh",
            host="192.168.1.10",
            user="admin",
            key_path="~/.ssh/id_ed25519",
            glean=[dict(type="syslog", path="/var/log/syslog")],
        )
        sources_file.write_text(_ssh_sources_yaml([remote]))

        transport = _mock_transport([SYSLOG_LINE])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()
        assert _entry_count(db_path) >= 1

    def test_syslog_command_contains_path(self, tmp_path):
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        remote = dict(
            id="rack01",
            transport="ssh",
            host="192.168.1.10",
            user="admin",
            key_path="~/.ssh/id_ed25519",
            glean=[dict(type="syslog", path="/var/log/auth.log")],
        )
        sources_file.write_text(_ssh_sources_yaml([remote]))

        transport = _mock_transport([])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()
        # The configured remote path must appear in the command sent over SSH.
        issued_command = transport.exec_stream.call_args[0][0]
        assert "/var/log/auth.log" in issued_command
# ── plaintext type ────────────────────────────────────────────────────────────
class TestSSHPlaintextGlean:
    """``glean_sources`` with an SSH source whose glean item has type ``plaintext``."""

    @staticmethod
    def _plaintext_host(source_id, remote_path):
        """Source definition for one SSH host with a single plaintext glean item."""
        return {
            "id": source_id,
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": [{"type": "plaintext", "path": remote_path}],
        }

    def test_plaintext_entries_written_to_db(self, tmp_path):
        sources_file, db_path = tmp_path / "sources.yaml", tmp_path / "test.db"
        host = self._plaintext_host("rack01-app", "/var/log/app/error.log")
        sources_file.write_text(_ssh_sources_yaml([host]))

        transport = _mock_transport([PLAINTEXT_LINE])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()
        assert _entry_count(db_path) >= 1

    def test_plaintext_command_contains_path(self, tmp_path):
        sources_file, db_path = tmp_path / "sources.yaml", tmp_path / "test.db"
        host = self._plaintext_host("rack01", "/opt/myapp/app.log")
        sources_file.write_text(_ssh_sources_yaml([host]))

        transport = _mock_transport([])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()
        issued_command = transport.exec_stream.call_args[0][0]
        assert "/opt/myapp/app.log" in issued_command
# ── docker type ───────────────────────────────────────────────────────────────
class TestSSHDockerGlean:
    """``glean_sources`` with an SSH source whose glean item has type ``docker``."""

    @staticmethod
    def _write_docker_sources(tmp_path, containers):
        """Write a sources file for host ``rack01`` gleaning ``containers``.

        Returns ``(sources_file, db_path)``.
        """
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        host = {
            "id": "rack01",
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": [{"type": "docker", "containers": containers}],
        }
        sources_file.write_text(_ssh_sources_yaml([host]))
        return sources_file, db_path

    def test_docker_single_container_command_issued(self, tmp_path):
        sources_file, db_path = self._write_docker_sources(tmp_path, ["myapp"])
        transport = _mock_transport([DOCKER_LINE])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()
        issued_command = transport.exec_stream.call_args[0][0]
        assert "myapp" in issued_command

    def test_docker_multiple_containers_exec_per_container(self, tmp_path):
        sources_file, db_path = self._write_docker_sources(tmp_path, ["app", "nginx"])
        transport = MagicMock()
        transport.exec_stream.return_value = iter([])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()

        # Two containers are configured, so two commands are expected.
        assert transport.exec_stream.call_count == 2
        recorded_calls = transport.exec_stream.call_args_list
        all_cmds = " ".join(recorded[0][0] for recorded in recorded_calls)
        for container in ("app", "nginx"):
            assert container in all_cmds
# ── error handling ────────────────────────────────────────────────────────────
class TestSSHGleanErrorHandling:
    """``glean_sources`` must survive SSH failures and unknown glean types."""

    @staticmethod
    def _write_sources(tmp_path, source_id, host_address, glean_items):
        """Write a single-host SSH sources file; return ``(sources_file, db_path)``."""
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        host = {
            "id": source_id,
            "transport": "ssh",
            "host": host_address,
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": glean_items,
        }
        sources_file.write_text(_ssh_sources_yaml([host]))
        return sources_file, db_path

    def test_connection_error_skips_source_returns_empty_stats(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path, "unreachable", "192.168.99.99", [{"type": "journald"}]
        )
        with patch("app.glean.pipeline.SSHTransport") as MockClass:
            # Entering the transport context fails before any command can run.
            context = MockClass.return_value
            context.__enter__.side_effect = SSHConnectionError("no route")
            context.__exit__.return_value = None
            stats = glean_sources(sources_file, db_path)
        assert _entry_count(db_path) == 0
        # Any stats that are reported for the source must be zero.
        assert all(value == 0 for value in stats.values())

    def test_command_error_skips_item_continues_next(self, tmp_path):
        # Two glean items: the first command fails, the second yields one line.
        sources_file, db_path = self._write_sources(
            tmp_path,
            "rack01",
            "192.168.1.10",
            [
                {"type": "journald"},
                {"type": "syslog", "path": "/var/log/syslog"},
            ],
        )
        transport = MagicMock()
        # With a side_effect list, exception instances are raised and any other
        # value is returned, one item per call.
        failing_call = SSHCommandError("journalctl: command not found")
        succeeding_call = iter([SYSLOG_LINE])
        transport.exec_stream.side_effect = [failing_call, succeeding_call]
        patcher, _mocked_class = _patch_transport(transport)
        try:
            # Must not raise: the failing item is skipped, the next one runs.
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()
        assert _entry_count(db_path) >= 1

    def test_unknown_glean_type_skipped(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path, "rack01", "192.168.1.10", [{"type": "mqtt"}]  # not a valid remote type
        )
        transport = _mock_transport([])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)  # must not raise
        finally:
            patcher.stop()
        assert _entry_count(db_path) == 0
# ── mixed local + SSH sources ─────────────────────────────────────────────────
class TestMixedLocalAndSSH:
    """Sources files that combine local paths with SSH hosts."""

    @staticmethod
    def _local_syslog(tmp_path):
        """Create ``local.log`` under ``tmp_path`` containing one syslog line."""
        local_log = tmp_path / "local.log"
        local_log.write_text(SYSLOG_LINE)
        return local_log

    def test_local_and_ssh_both_processed(self, tmp_path):
        local_log = self._local_syslog(tmp_path)
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"

        local_source = {"id": "local-syslog", "path": str(local_log)}
        remote_source = {
            "id": "remote01",
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": [{"type": "syslog", "path": "/var/log/syslog"}],
        }
        sources_file.write_text(_ssh_sources_yaml([local_source, remote_source]))

        transport = _mock_transport([SYSLOG_LINE])
        patcher, _mocked_class = _patch_transport(transport)
        try:
            stats = glean_sources(sources_file, db_path)
        finally:
            patcher.stop()

        # One entry from the local file plus one from the mocked remote stream.
        assert _entry_count(db_path) >= 2
        assert "local-syslog" in stats
        assert any("remote01" in key for key in stats)

    def test_local_only_sources_never_calls_ssh(self, tmp_path):
        local_log = self._local_syslog(tmp_path)
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        only_local = [{"id": "local", "path": str(local_log)}]
        sources_file.write_text(_ssh_sources_yaml(only_local))

        with patch("app.glean.pipeline.SSHTransport") as MockClass:
            glean_sources(sources_file, db_path)
            MockClass.assert_not_called()
# ── multiple glean items per SSH source ───────────────────────────────────────
class TestMultipleGleanItemsPerHost:
    """Several glean items configured on a single SSH host."""

    def test_one_connection_multiple_commands(self, tmp_path):
        """All glean items of one host go through a single SSHTransport instantiation."""
        glean_items = [
            {"type": "journald"},
            {"type": "syslog", "path": "/var/log/syslog"},
            {"type": "plaintext", "path": "/var/log/app.log"},
        ]
        host = {
            "id": "rack01",
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": glean_items,
        }
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        sources_file.write_text(_ssh_sources_yaml([host]))

        transport = _mock_transport([])
        patcher, MockClass = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()

        # One host, so the transport class is constructed exactly once ...
        MockClass.assert_called_once()
        # ... while each of the three glean items issues its own command.
        assert transport.exec_stream.call_count == 3

View file

@ -1,9 +1,9 @@
"""Tests for the qBittorrent log ingestor."""
"""Tests for the qBittorrent log gleaner."""
from __future__ import annotations
import pytest
from app.ingest.qbittorrent import is_qbit_log, parse
from app.glean.qbittorrent import is_qbit_log, parse
# ---------------------------------------------------------------------------
# Classic format sample (pre-5.x GUI builds)

185
tests/test_glean_ssh.py Normal file
View file

@ -0,0 +1,185 @@
"""Tests for SSH transport layer (app/glean/ssh.py).
All SSH network I/O is mocked — no real SSH connection required.
"""
from __future__ import annotations
import io
from unittest.mock import MagicMock, patch, call
import pytest
from app.glean.ssh import (
SSHTransport,
SSHConnectionError,
SSHCommandError,
_build_journald_command,
_build_syslog_command,
_build_plaintext_command,
_build_docker_command,
)
# ── Command builders ──────────────────────────────────────────────────────────
class TestBuildJournaldCommand:
    """Command text produced by ``_build_journald_command``."""

    def test_no_args_returns_base_command(self):
        command = _build_journald_command({})
        for fragment in ("journalctl", "-o json"):
            assert fragment in command

    def test_args_list_appended(self):
        extra_args = ["--since", "2 hours ago", "--unit", "sshd"]
        command = _build_journald_command({"args": extra_args})
        # Every configured argument must show up in the built command.
        for fragment in ("--since", "2 hours ago", "--unit", "sshd"):
            assert fragment in command

    def test_unit_shorthand(self):
        command = _build_journald_command({"unit": "docker"})
        # Either the space-separated or the "=" form is accepted.
        accepted_forms = ("--unit docker", "--unit=docker")
        assert any(form in command for form in accepted_forms)
class TestBuildSyslogCommand:
    """Command text produced by ``_build_syslog_command``."""

    def test_returns_cat_command(self):
        command = _build_syslog_command({"path": "/var/log/syslog"})
        for fragment in ("cat", "/var/log/syslog"):
            assert fragment in command

    def test_default_path_when_omitted(self):
        # With no "path" key the command must still point somewhere under /var/log.
        command = _build_syslog_command({})
        for fragment in ("cat", "/var/log"):
            assert fragment in command
class TestBuildPlaintextCommand:
    """Command text produced by ``_build_plaintext_command``."""

    def test_cat_with_path(self):
        remote_path = "/var/log/app/error.log"
        command = _build_plaintext_command({"path": remote_path})
        assert "cat" in command
        assert "/var/log/app/error.log" in command

    def test_raises_without_path(self):
        # A missing "path" is rejected with ValueError or KeyError.
        accepted_errors = (ValueError, KeyError)
        with pytest.raises(accepted_errors):
            _build_plaintext_command({})
class TestBuildDockerCommand:
    """Command(s) produced by ``_build_docker_command``."""

    def test_single_container(self):
        built = _build_docker_command({"containers": ["myapp"]})
        assert "myapp" in built

    def test_multiple_containers_returns_list(self):
        cmds = _build_docker_command({"containers": ["app", "nginx"]})

        def as_text(value):
            # The builder may return one string or an iterable of per-container strings.
            return value if isinstance(value, str) else " ".join(value)

        assert "app" in as_text(cmds)
        assert "nginx" in as_text(cmds)

    def test_raises_without_containers(self):
        # A missing "containers" key is rejected with ValueError or KeyError.
        accepted_errors = (ValueError, KeyError)
        with pytest.raises(accepted_errors):
            _build_docker_command({})
# ── SSHTransport context manager ──────────────────────────────────────────────
def _mock_ssh_client(stdout_lines: list[str] | None = None):
"""Return a mock SSHClient whose exec_command yields the given lines."""
client = MagicMock()
stdout = MagicMock()
stdout.__iter__ = MagicMock(return_value=iter(stdout_lines or []))
stderr = MagicMock()
stderr.read.return_value = b""
client.exec_command.return_value = (MagicMock(), stdout, stderr)
return client
class TestSSHTransportConnect:
    """Connection lifecycle of ``SSHTransport`` with ``paramiko.SSHClient`` mocked out."""

    @staticmethod
    def _key_file(tmp_path):
        """Create a placeholder private-key file under ``tmp_path`` and return its path."""
        key_file = tmp_path / "id_ed25519"
        key_file.write_bytes(b"fake-key")
        return key_file

    def test_connects_with_key_path(self, tmp_path):
        key_file = self._key_file(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            MockClient.return_value = _mock_ssh_client()
            with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)):
                pass
            MockClient.return_value.connect.assert_called_once()
            connect_call = MockClient.return_value.connect.call_args
            # Accept the host either as ``hostname=`` or as the first positional
            # argument. Slicing ``args`` avoids an IndexError when connect() was
            # called with keywords only, so a mismatch fails the assertion
            # instead of erroring out.
            passed_as_keyword = connect_call.kwargs.get("hostname") == "10.0.0.1"
            passed_positionally = connect_call.args[:1] == ("10.0.0.1",)
            assert passed_as_keyword or passed_positionally

    def test_disconnects_on_exit(self, tmp_path):
        key_file = self._key_file(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            mock_client = _mock_ssh_client()
            MockClient.return_value = mock_client
            with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)):
                pass
            mock_client.close.assert_called_once()

    def test_disconnects_on_exception(self, tmp_path):
        key_file = self._key_file(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            mock_client = _mock_ssh_client()
            MockClient.return_value = mock_client
            # The error raised inside the block must propagate ...
            with pytest.raises(RuntimeError):
                with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)):
                    raise RuntimeError("boom")
            # ... and the client must still have been closed.
            mock_client.close.assert_called_once()

    def test_raises_ssh_connection_error_on_auth_failure(self, tmp_path):
        import paramiko

        key_file = self._key_file(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            MockClient.return_value.connect.side_effect = paramiko.AuthenticationException("denied")
            with pytest.raises(SSHConnectionError, match="auth"):
                with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)):
                    pass

    def test_raises_ssh_connection_error_on_no_route(self, tmp_path):
        import paramiko

        key_file = self._key_file(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            MockClient.return_value.connect.side_effect = paramiko.SSHException("no route")
            with pytest.raises(SSHConnectionError):
                with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)):
                    pass
class TestSSHTransportExecStream:
    """Behaviour of ``SSHTransport.exec_stream`` with ``paramiko.SSHClient`` mocked out."""

    @staticmethod
    def _key_file(tmp_path):
        """Create a placeholder private-key file under ``tmp_path`` and return its path."""
        key_file = tmp_path / "id_ed25519"
        key_file.write_bytes(b"fake-key")
        return key_file

    def test_yields_stdout_lines(self, tmp_path):
        key_file = self._key_file(tmp_path)
        lines = ["line one\n", "line two\n", "line three\n"]
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            MockClient.return_value = _mock_ssh_client(lines)
            with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)) as t:
                result = list(t.exec_stream("echo hello"))
        assert result == lines

    def test_raises_ssh_command_error_on_nonzero_exit(self, tmp_path):
        key_file = self._key_file(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            mock_client = _mock_ssh_client([])
            _stdin, stdout, stderr = mock_client.exec_command.return_value
            # Simulate non-zero exit code with an error message on stderr.
            channel = MagicMock()
            channel.recv_exit_status.return_value = 1
            stdout.channel = channel
            stderr.read.return_value = b"command not found"
            MockClient.return_value = mock_client
            with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)) as t:
                with pytest.raises(SSHCommandError, match="command not found"):
                    list(t.exec_stream("notacommand"))

    def test_strips_trailing_newlines(self, tmp_path):
        """Despite the name, this pins that ``exec_stream`` does NOT strip lines.

        Stripping is the parser's job; the transport is expected to pass each
        line through unchanged, as test_yields_stdout_lines also requires.
        """
        key_file = self._key_file(tmp_path)
        lines = [" line with spaces \n"]
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            MockClient.return_value = _mock_ssh_client(lines)
            with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)) as t:
                result = list(t.exec_stream("echo hello"))
        assert len(result) == 1
        # Previously only the count was checked, so the raw-line contract was
        # never verified; compare the content as well.
        assert result == lines

View file

@ -1,7 +1,7 @@
"""Tests for the syslog (RFC 3164) ingestor."""
"""Tests for the syslog (RFC 3164) gleaner."""
from __future__ import annotations
from app.ingest.syslog import is_syslog, parse
from app.glean.syslog import is_syslog, parse
SYSLOG_SAMPLE = """\
May 11 14:23:01 example-node sshd[1234]: Accepted publickey for x from 192.168.1.1 port 54321 ssh2

View file

@ -1,10 +1,10 @@
"""Tests for the Tautulli webhook ingestor."""
"""Tests for the Tautulli webhook gleaner."""
from __future__ import annotations
import pytest
from unittest.mock import patch
from app.ingest.tautulli import is_tautulli_payload, parse_webhook
from app.glean.tautulli import is_tautulli_payload, parse_webhook
# ---------------------------------------------------------------------------
@ -253,7 +253,7 @@ class TestEndpoint:
@pytest.fixture
def client(self, tmp_path):
from fastapi.testclient import TestClient
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
import app.rest as rest_module
db = tmp_path / "test.db"
@ -267,14 +267,14 @@ class TestEndpoint:
def test_missing_action_returns_400(self, client):
resp = client.post(
"/turnstone/api/ingest/tautulli",
"/turnstone/api/glean/tautulli",
json={"session_key": "x"},
)
assert resp.status_code == 400
def test_wrong_token_returns_403(self, tmp_path):
from fastapi.testclient import TestClient
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
import app.rest as rest_module
db = tmp_path / "test.db"
@ -288,7 +288,7 @@ class TestEndpoint:
patch.object(rest_module, "_compiled_patterns", []):
with TestClient(rest_module.app, raise_server_exceptions=True) as c:
resp = c.post(
"/turnstone/api/ingest/tautulli",
"/turnstone/api/glean/tautulli",
json=_ERROR_PAYLOAD,
headers={"X-Tautulli-Token": "wrong"},
)
@ -296,7 +296,7 @@ class TestEndpoint:
def test_valid_payload_returns_200(self, client):
resp = client.post(
"/turnstone/api/ingest/tautulli",
"/turnstone/api/glean/tautulli",
json=_ERROR_PAYLOAD,
)
assert resp.status_code == 200

View file

@ -1,11 +1,11 @@
"""Tests for the Wazuh alert ingestor."""
"""Tests for the Wazuh alert gleaner."""
from __future__ import annotations
import json
from datetime import datetime
from app.ingest.wazuh import is_wazuh_alert, parse
from app.ingest.pipeline import _detect_format
from app.glean.wazuh import is_wazuh_alert, parse
from app.glean.pipeline import _detect_format
_ALERT = {
"timestamp": "2024-01-15T10:23:45.123+0000",

View file

@ -8,7 +8,7 @@ from pathlib import Path
class TestSchema:
def test_blocklist_candidates_table_exists(self, tmp_path):
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
db = tmp_path / "test.db"
ensure_schema(db)
conn = sqlite3.connect(str(db))
@ -16,7 +16,7 @@ class TestSchema:
assert "blocklist_candidates" in tables
def test_blocklist_candidates_columns(self, tmp_path):
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
db = tmp_path / "test.db"
ensure_schema(db)
conn = sqlite3.connect(str(db))
@ -28,7 +28,7 @@ class TestSchema:
}
def test_status_default_is_pending(self, tmp_path):
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
import uuid
db = tmp_path / "test.db"
ensure_schema(db)
@ -89,7 +89,7 @@ class TestTelemetry:
class TestExtraction:
@pytest.fixture
def db(self, tmp_path):
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
p = tmp_path / "test.db"
ensure_schema(p)
return p
@ -195,7 +195,7 @@ class TestExtraction:
class TestCandidateManagement:
@pytest.fixture
def db_with_candidate(self, tmp_path):
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
import sqlite3, uuid
db = tmp_path / "test.db"
ensure_schema(db)

View file

@ -54,7 +54,7 @@ def test_keywords_cleaned_of_extra_spaces():
def test_diagnose_with_explicit_window_sets_time_detected(tmp_path):
from app.ingest.pipeline import ensure_schema
from app.glean.pipeline import ensure_schema
db = tmp_path / "test.db"
ensure_schema(db)
result = diagnose(db, query="plex", since="2026-05-11T14:00:00+00:00", until="2026-05-11T15:00:00+00:00")

View file

@ -104,7 +104,7 @@
<p v-if="severityFilter" class="mb-1">No {{ severityFilter }} entries in this result set.</p>
<template v-else>
<p class="mb-1">No log evidence found for "{{ lastQuery }}"</p>
<p class="text-sm">Check the Sources tab to confirm data is ingested, or try a broader description.</p>
<p class="text-sm">Check the Sources tab to confirm data is gleaned, or try a broader description.</p>
</template>
</div>

View file

@ -10,7 +10,7 @@
class="w-2 h-2 rounded-full flex-shrink-0"
></span>
<span :class="watchActive ? 'text-green-400' : 'text-text-dim'" class="text-xs">
{{ watchActive ? `Live — ${watchSources.length} source${watchSources.length !== 1 ? 's' : ''} watched` : 'Manual ingest mode' }}
{{ watchActive ? `Live — ${watchSources.length} source${watchSources.length !== 1 ? 's' : ''} watched` : 'Manual glean mode' }}
</span>
</div>
@ -20,8 +20,8 @@
class="flex items-center gap-2 rounded border border-surface-border bg-surface-raised px-4 py-2.5 text-xs text-text-dim"
>
<span class="text-sev-warn"></span>
<span v-if="watchActive">Live watch active — last event: <span class="text-text-muted">{{ shortTs(stats.last_ingested) }}</span>. Waiting for new entries to arrive.</span>
<span v-else>Last ingested: <span class="text-text-muted">{{ shortTs(stats.last_ingested) }}</span> 24h counts reflect this window, not today.</span>
<span v-if="watchActive">Live watch active — last event: <span class="text-text-muted">{{ shortTs(stats.last_gleaned) }}</span>. Waiting for new entries to arrive.</span>
<span v-else>Last gleaned: <span class="text-text-muted">{{ shortTs(stats.last_gleaned) }}</span> 24h counts reflect this window, not today.</span>
</div>
</div>
@ -171,7 +171,7 @@ interface StatsResponse {
criticals_24h: number
errors_24h: number
suppressed_criticals: number
last_ingested: string | null
last_gleaned: string | null
source_health: SourceHealth[]
recent_criticals: Array<{
entry_id: string
@ -186,7 +186,7 @@ interface WatchSourceStatus {
source_id: string
type: string
running: boolean
entries_ingested: number
entries_gleaned: number
last_event: string | null
error: string | null
}
@ -211,8 +211,8 @@ const watchActive = computed(() =>
)
const isStale = computed(() => {
if (!stats.value?.last_ingested) return false
const age = Date.now() - new Date(stats.value.last_ingested).getTime()
if (!stats.value?.last_gleaned) return false
const age = Date.now() - new Date(stats.value.last_gleaned).getTime()
return age > 25 * 60 * 60 * 1000 // older than 25h
})

View file

@ -106,7 +106,7 @@
</div>
<div v-else class="text-center">
<p class="text-base mb-1">No results for "{{ store.query }}"</p>
<p class="text-sm">Try broader terms or check the Sources tab to confirm data is ingested.</p>
<p class="text-sm">Try broader terms or check the Sources tab to confirm data is gleaned.</p>
</div>
</div>

View file

@ -3,7 +3,7 @@
<div class="mb-6 flex items-start justify-between gap-4">
<div>
<h1 class="text-text-primary text-xl font-semibold mb-1">Log Sources</h1>
<p class="text-text-dim text-sm">All hosts and services in the ingested corpus.</p>
<p class="text-text-dim text-sm">All hosts and services in the gleaned corpus.</p>
</div>
<label class="btn-secondary text-sm cursor-pointer shrink-0">
<span>Upload log file</span>
@ -21,12 +21,12 @@
<div v-else-if="sources.length === 0" class="text-text-dim py-12 text-center">
<p class="mb-1">No log sources found.</p>
<p class="text-sm">Run the ingest pipeline: <code class="bg-surface-raised px-1 rounded">python scripts/ingest_corpus.py</code></p>
<p class="text-sm">Run the glean pipeline: <code class="bg-surface-raised px-1 rounded">python scripts/glean_corpus.py</code></p>
</div>
<div v-else class="rounded border border-surface-border overflow-hidden">
<div class="overflow-x-auto">
<table class="w-full text-sm min-w-[560px]">
<table class="w-full text-sm min-w-[620px]">
<thead class="bg-surface-raised border-b border-surface-border">
<tr>
<th class="text-left px-4 py-2.5 text-text-dim font-medium text-xs uppercase tracking-wider">Source</th>
@ -40,29 +40,72 @@
<tbody>
<tr
v-for="src in sources"
:key="src.source_id"
:key="src.id"
class="border-b border-surface-border hover:bg-surface-raised transition-colors"
>
<td class="px-4 py-2.5 text-accent">{{ src.source_id }}</td>
<td class="px-4 py-2.5 text-text-muted text-right tabular-nums">{{ src.entry_count.toLocaleString() }}</td>
<!-- Source name + badges -->
<td class="px-4 py-2.5">
<div class="flex flex-wrap items-center gap-1.5">
<span class="text-accent font-mono text-xs">{{ src.id }}</span>
<!-- SSH transport badge -->
<span
v-if="src.transport === 'ssh'"
class="inline-flex items-center gap-1 px-1.5 py-0.5 rounded text-[10px] font-medium
bg-blue-900/30 text-blue-400 border border-blue-800/40"
:title="`SSH: ${src.user}@${src.host}`"
>
<svg class="w-2.5 h-2.5" viewBox="0 0 16 16" fill="currentColor" aria-hidden="true">
<path d="M2 3a1 1 0 011-1h10a1 1 0 011 1v2a1 1 0 01-1 1H3a1 1 0 01-1-1V3zm0 5a1 1 0 011-1h4a1 1 0 110 2H3a1 1 0 01-1-1zm0 4a1 1 0 011-1h2a1 1 0 110 2H3a1 1 0 01-1-1z"/>
</svg>
ssh
</span>
<!-- Glean-type pills for SSH sources -->
<span
v-for="gtype in (src.glean_types ?? [])"
:key="gtype"
class="px-1.5 py-0.5 rounded text-[10px] font-medium
bg-surface-raised text-text-dim border border-surface-border"
>{{ gtype }}</span>
<!-- Upload badge for DB-only sources not in sources.yaml -->
<span
v-if="src.dbOnly"
class="px-1.5 py-0.5 rounded text-[10px] font-medium
bg-surface-raised text-text-dim border border-surface-border"
>uploaded</span>
</div>
<!-- SSH host subtitle -->
<div v-if="src.transport === 'ssh'" class="text-text-dim text-xs mt-0.5 font-mono">
{{ src.user }}@{{ src.host }}
</div>
</td>
<!-- Entry count -->
<td class="px-4 py-2.5 text-text-muted text-right tabular-nums">
{{ src.entry_count.toLocaleString() }}
</td>
<!-- Error count -->
<td class="px-4 py-2.5 text-right tabular-nums">
<span :class="src.error_count > 0 ? 'text-sev-error' : 'text-text-dim'">
{{ src.error_count.toLocaleString() }}
</span>
</td>
<td class="px-4 py-2.5 text-text-dim text-xs">{{ formatTs(src.earliest) }}</td>
<td class="px-4 py-2.5 text-text-dim text-xs">{{ formatTs(src.latest) }}</td>
<!-- Actions -->
<td class="px-4 py-2.5">
<div class="flex items-center justify-end gap-2">
<button
:disabled="busy.has(src.source_id)"
@click="reingest(src.source_id)"
:disabled="busy.has(src.id) || src.dbOnly"
@click="reglean(src.id)"
class="text-text-dim hover:text-accent transition-colors text-xs px-2 py-1 rounded hover:bg-surface disabled:opacity-40"
title="Re-ingest from sources.yaml"
>{{ busy.has(src.source_id) ? '…' : 'reingest' }}</button>
:title="src.dbOnly ? 'Not in sources.yaml — cannot re-glean' : 'Re-glean from sources.yaml'"
>{{ busy.has(src.id) ? '…' : 'reglean' }}</button>
<button
:disabled="busy.has(src.source_id)"
@click="deleteSource(src.source_id)"
:disabled="busy.has(src.id)"
@click="deleteSource(src.id)"
class="text-text-dim hover:text-sev-error transition-colors text-xs px-2 py-1 rounded hover:bg-surface disabled:opacity-40"
title="Delete all entries for this source"
>delete</button>
@ -78,9 +121,36 @@
<script setup lang="ts">
import { ref, onMounted } from 'vue'
import type { LogSource } from '@/stores/search'
const sources = ref<LogSource[]>([])
// Unified source row shown in the table (merges configured + DB-only sources).
// One instance backs one <tr> in the sources table.
interface SourceRow {
// Identifier used as the row key and passed to the re-glean / delete actions.
id: string
// 'ssh' rows get the SSH badge and a `user@host` subtitle in the template.
transport: 'local' | 'ssh'
// SSH-specific
// Rendered together as `user@host` (badge tooltip and subtitle).
host?: string
user?: string
// Rendered as one small pill per entry next to the source name.
glean_types?: string[]
// Local-specific
// NOTE(review): presumably the filesystem path from sources.yaml — not read by the table; confirm against the API.
path?: string
// DB stats (always present, default 0/null)
entry_count: number
// Highlighted with the error colour in the table when greater than 0.
error_count: number
// Timestamps handed to formatTs() for display; null when there is nothing to show.
earliest: string | null
latest: string | null
// True when this source exists in the DB but not in sources.yaml (e.g. uploads)
// Such rows show an "uploaded" badge and have their re-glean button disabled.
dbOnly?: boolean
}
/**
 * One element of the `sources` array returned by `/api/sources/configured`.
 *
 * Identical to {@link SourceRow} minus `dbOnly`, which is only ever set locally
 * for rows derived from the DB listing. Declared as a type alias rather than an
 * empty `interface … extends` so it does not trip the empty-interface lint rules.
 */
type ConfiguredSource = Omit<SourceRow, 'dbOnly'>
// One element of the `sources` array returned by `/api/sources` (per-source DB statistics).
// Entries whose source_id is not covered by a configured source become "dbOnly" table rows.
interface DbSource {
// Matched against configured source ids, either exactly or as `<ssh source id>/…`.
source_id: string
entry_count: number
error_count: number
// Copied verbatim into the table row; null when absent.
earliest: string | null
latest: string | null
}
const sources = ref<SourceRow[]>([])
const loading = ref(true)
const busy = ref(new Set<string>())
const actionMsg = ref('')
@ -90,11 +160,52 @@ const BASE = import.meta.env.BASE_URL.replace(/\/$/, '')
async function loadSources(): Promise<void> {
try {
const res = await fetch(`${BASE}/api/sources`)
if (res.ok) {
const data = await res.json()
sources.value = data.sources
// Primary list: configured sources from sources.yaml (enriched with DB stats).
// This makes SSH sources visible even before their first glean.
const [configuredRes, dbRes] = await Promise.all([
fetch(`${BASE}/api/sources/configured`),
fetch(`${BASE}/api/sources`),
])
const configuredData = configuredRes.ok ? await configuredRes.json() : { sources: [] }
const dbData = dbRes.ok ? await dbRes.json() : { sources: [] }
const configuredSources: ConfiguredSource[] = configuredData.sources ?? []
const dbSources: DbSource[] = dbData.sources ?? []
// Build a set of all IDs represented by configured sources.
// SSH sources own all sub-source IDs like "rack01/journald" too.
const coveredIds = new Set<string>()
for (const s of configuredSources) {
coveredIds.add(s.id)
}
// For SSH sources, also mark DB sub-source IDs (any ID starting with the SSH source's
// id plus "/", e.g. "rack01/journald") as covered so they don't appear as separate
// "uploaded" rows.
for (const s of configuredSources) {
if (s.transport === 'ssh') {
for (const db of dbSources) {
if (db.source_id.startsWith(s.id + '/') || db.source_id === s.id) {
coveredIds.add(db.source_id)
}
}
}
}
// DB-only sources: uploaded files or manually gleaned sources not in sources.yaml.
const dbOnly: SourceRow[] = dbSources
.filter(db => !coveredIds.has(db.source_id))
.map(db => ({
id: db.source_id,
transport: 'local' as const,
entry_count: db.entry_count,
error_count: db.error_count,
earliest: db.earliest,
latest: db.latest,
dbOnly: true,
}))
sources.value = [...configuredSources as SourceRow[], ...dbOnly]
} finally {
loading.value = false
}
@ -118,7 +229,13 @@ async function deleteSource(sourceId: string): Promise<void> {
const data = await res.json()
actionMsg.value = `Deleted ${data.deleted.toLocaleString()} entries for "${sourceId}"`
actionError.value = false
sources.value = sources.value.filter(s => s.source_id !== sourceId)
// Remove DB-only rows; zero-out configured-source stats instead of hiding.
sources.value = sources.value
.filter(s => !(s.id === sourceId && s.dbOnly))
.map(s => s.id === sourceId
? { ...s, entry_count: 0, error_count: 0, earliest: null, latest: null }
: s
)
} else {
const data = await res.json()
actionMsg.value = data.detail ?? 'Delete failed'
@ -129,19 +246,19 @@ async function deleteSource(sourceId: string): Promise<void> {
}
}
async function reingest(sourceId: string): Promise<void> {
async function reglean(sourceId: string): Promise<void> {
setBusy(sourceId, true)
actionMsg.value = ''
actionError.value = false
try {
const res = await fetch(`${BASE}/api/sources/${encodeURIComponent(sourceId)}/ingest`, { method: 'POST' })
const res = await fetch(`${BASE}/api/sources/${encodeURIComponent(sourceId)}/glean`, { method: 'POST' })
const data = await res.json()
if (res.ok) {
actionMsg.value = `Re-ingest complete: ${data.ingested.toLocaleString()} new entries for "${sourceId}"`
actionMsg.value = `Re-glean complete: ${data.gleaned.toLocaleString()} new entries for "${sourceId}"`
actionError.value = false
await loadSources()
} else {
actionMsg.value = data.detail ?? 'Re-ingest failed'
actionMsg.value = data.detail ?? 'Re-glean failed'
actionError.value = true
}
} finally {
@ -156,10 +273,10 @@ async function handleUpload(e: Event): Promise<void> {
actionError.value = false
const form = new FormData()
form.append('file', file)
const res = await fetch(`${BASE}/api/ingest/upload`, { method: 'POST', body: form })
const res = await fetch(`${BASE}/api/glean/upload`, { method: 'POST', body: form })
const data = await res.json()
if (res.ok) {
actionMsg.value = `Uploaded: ${data.ingested.toLocaleString()} entries ingested as "${data.source_id}"`
actionMsg.value = `Uploaded: ${data.gleaned.toLocaleString()} entries gleaned as "${data.source_id}"`
actionError.value = false
await loadSources()
} else {