Merge pull request 'feat(diagnose): 5-stage multi-agent diagnose pipeline (#29)' (#39) from feat/29-multi-agent-diagnose into main
This commit is contained in:
commit
1e93189aa7
76 changed files with 6640 additions and 635 deletions
|
|
@ -23,6 +23,6 @@
|
|||
# Remote endpoint to push diagnostic bundles for escalation.
|
||||
# TURNSTONE_BUNDLE_ENDPOINT=https://example.com/api/bundles
|
||||
|
||||
# --- Periodic batch ingest ---
|
||||
# Seconds between automatic ingest runs from sources.yaml. Set to 0 to disable.
|
||||
# TURNSTONE_INGEST_INTERVAL=900
|
||||
# --- Periodic batch glean ---
|
||||
# Seconds between automatic glean runs from sources.yaml. Set to 0 to disable.
|
||||
# TURNSTONE_GLEAN_INTERVAL=900
|
||||
|
|
|
|||
10
README.md
10
README.md
|
|
@ -28,8 +28,8 @@ Service logs (journald, Docker, syslog, Caddy, Plex, arr stack, qBittorrent, dme
|
|||
|
||||
## Features
|
||||
|
||||
- **Multi-source ingest** — journald, Docker, syslog, Caddy, dmesg, Plex, Servarr (arr stack), qBittorrent, plaintext; paths configured in `patterns/sources.yaml`
|
||||
- **Pattern tagging** — named regex patterns applied at ingest time (`service_restart`, `auth_failure`, `oom`, `segfault`, `disk_full`, `timeout`, …); extend in `patterns/default.yaml`
|
||||
- **Multi-source glean** — journald, Docker, syslog, Caddy, dmesg, Plex, Servarr (arr stack), qBittorrent, plaintext; paths configured in `patterns/sources.yaml`
|
||||
- **Pattern tagging** — named regex patterns applied at glean time (`service_restart`, `auth_failure`, `oom`, `segfault`, `disk_full`, `timeout`, …); extend in `patterns/default.yaml`
|
||||
- **Full-text search** — SQLite FTS5 index across all ingested entries; filter by source, severity, time window
|
||||
- **Natural-language time queries** — "what happened yesterday morning", "show me errors from the last 3 hours"; powered by dateparser
|
||||
- **Incident management** — create, label, and track incidents; attach supporting log entries
|
||||
|
|
@ -101,13 +101,13 @@ sources:
|
|||
path: /var/log/caddy/access.log
|
||||
```
|
||||
|
||||
For `journald` sources, run `scripts/export_journal.sh` on the host before each ingest (e.g. via cron). Missing paths are skipped with a warning — safe to leave entries for services that are temporarily down.
|
||||
For `journald` sources, run `scripts/export_journal.sh` on the host before each glean (e.g. via cron). Missing paths are skipped with a warning — safe to leave entries for services that are temporarily down.
|
||||
|
||||
---
|
||||
|
||||
## Pattern library
|
||||
|
||||
Named patterns in `patterns/default.yaml` are matched against every log entry at ingest time. Matched pattern names are stored and used to boost search relevance for diagnostic queries.
|
||||
Named patterns in `patterns/default.yaml` are matched against every log entry at glean time. Matched pattern names are stored and used to boost search relevance for diagnostic queries.
|
||||
|
||||
```yaml
|
||||
patterns:
|
||||
|
|
@ -157,7 +157,7 @@ Copy `.env.example` to `.env` (or pass as `-e` flags to Docker/Podman). All vari
|
|||
| `TURNSTONE_PATTERNS` | `./patterns` | Pattern directory (default.yaml, sources.yaml, watch.yaml). |
|
||||
| `TURNSTONE_SOURCE_HOST` | `unknown` | Host identifier stamped on ingested entries. |
|
||||
| `TURNSTONE_BUNDLE_ENDPOINT` | — | Remote URL to push diagnostic bundles for escalation. |
|
||||
| `TURNSTONE_INGEST_INTERVAL` | `900` | Seconds between automatic batch ingest runs. Set to `0` to disable. |
|
||||
| `TURNSTONE_GLEAN_INTERVAL` | `900` | Seconds between automatic batch glean runs. Set to `0` to disable. |
|
||||
|
||||
---
|
||||
|
||||
|
|
|
|||
|
|
@ -1,64 +1,81 @@
|
|||
"""Ollama embedding client with sqlite-vec storage — BSL licensed."""
|
||||
"""Context chunk embedding — BSL licensed.
|
||||
|
||||
Thin wrapper around app.services.embeddings that handles the DB I/O for
|
||||
context_chunks. All backend configuration (model, device, backend type) is
|
||||
delegated to the service layer via TURNSTONE_EMBED_* env vars.
|
||||
|
||||
Re-exports EMBEDDING_AVAILABLE so callers that imported it from here continue
|
||||
to work without changes.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
import struct
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
from app.services.embeddings import (
|
||||
EMBEDDING_AVAILABLE, # re-export for backward compat
|
||||
get_embedder,
|
||||
pack_vector,
|
||||
)
|
||||
|
||||
__all__ = ["EMBEDDING_AVAILABLE", "embed_chunks"]
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
EMBEDDING_AVAILABLE: bool = False
|
||||
|
||||
try:
|
||||
import sqlite_vec # type: ignore[import] # noqa: F401
|
||||
EMBEDDING_AVAILABLE = True
|
||||
logger.debug("sqlite-vec loaded — embedding pipeline enabled")
|
||||
except ImportError:
|
||||
logger.debug("sqlite-vec not available — embedding pipeline disabled")
|
||||
|
||||
|
||||
def embed_chunks(
|
||||
db_path: Path,
|
||||
document_id: str,
|
||||
llm_url: str,
|
||||
model: str = "nomic-embed-text",
|
||||
# Legacy params kept for backward compat — ignored when the ST backend is active.
|
||||
llm_url: str = "",
|
||||
model: str = "",
|
||||
timeout: float = 60.0,
|
||||
) -> int:
|
||||
"""Embed all unembedded chunks for a document. Returns count embedded. No-op when EMBEDDING_AVAILABLE is False."""
|
||||
if not EMBEDDING_AVAILABLE:
|
||||
"""Embed all un-embedded chunks for *document_id*.
|
||||
|
||||
Uses the configured embedder (sentence-transformers by default; Ollama when
|
||||
TURNSTONE_EMBED_BACKEND=ollama). Returns the count of newly embedded chunks.
|
||||
Returns 0 silently when no embedder is available.
|
||||
|
||||
The legacy ``llm_url`` and ``model`` parameters are accepted but ignored when
|
||||
the sentence-transformers backend is active — configure via env vars instead.
|
||||
"""
|
||||
embedder = get_embedder()
|
||||
if embedder is None:
|
||||
return 0
|
||||
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.row_factory = sqlite3.Row
|
||||
|
||||
rows = conn.execute(
|
||||
"SELECT id, text FROM context_chunks WHERE document_id = ? AND embedding IS NULL",
|
||||
(document_id,),
|
||||
).fetchall()
|
||||
|
||||
if not rows:
|
||||
conn.close()
|
||||
return 0
|
||||
|
||||
texts = [r["text"] for r in rows]
|
||||
ids = [r["id"] for r in rows]
|
||||
|
||||
count = 0
|
||||
for row in rows:
|
||||
try:
|
||||
resp = httpx.post(
|
||||
f"{llm_url.rstrip('/')}/api/embeddings",
|
||||
json={"model": model, "prompt": row["text"]},
|
||||
timeout=timeout,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
vector: list[float] = resp.json().get("embedding") or []
|
||||
if vector:
|
||||
blob = struct.pack(f"{len(vector)}f", *vector)
|
||||
vectors = embedder.embed_batch(texts)
|
||||
for chunk_id, vec in zip(ids, vectors):
|
||||
blob = pack_vector(vec)
|
||||
conn.execute(
|
||||
"UPDATE context_chunks SET embedding = ? WHERE id = ?",
|
||||
(blob, row["id"]),
|
||||
(blob, chunk_id),
|
||||
)
|
||||
count += 1
|
||||
except Exception as exc:
|
||||
logger.warning("Embedding chunk %s failed: %s", row["id"], exc)
|
||||
|
||||
conn.commit()
|
||||
except Exception as exc:
|
||||
logger.warning("Batch embedding failed for document %s: %s", document_id, exc)
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
logger.debug("Embedded %d chunk(s) for document %s", count, document_id)
|
||||
return count
|
||||
|
|
|
|||
|
|
@ -1,10 +1,30 @@
|
|||
"""Context retrieval — structured keyword lookup (Free) + chunk search — MIT licensed."""
|
||||
"""Context retrieval — structured keyword lookup (Free) + chunk search — MIT licensed.
|
||||
|
||||
Two retrieval modes for context_chunks:
|
||||
Vector search — cosine similarity over stored embeddings (when available)
|
||||
Keyword search — LIKE-based fallback when no embedder is configured
|
||||
|
||||
Both modes are called from retrieve_context(); the best available mode is used
|
||||
automatically so callers need not check EMBEDDING_AVAILABLE themselves.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
from dataclasses import dataclass, field
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
|
||||
from app.services.embeddings import (
|
||||
EMBEDDING_AVAILABLE,
|
||||
cosine_similarity,
|
||||
get_embedder,
|
||||
unpack_vector,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class RetrievedContext:
|
||||
|
|
@ -12,6 +32,8 @@ class RetrievedContext:
|
|||
chunks: list[dict[str, str]] = field(default_factory=list)
|
||||
|
||||
|
||||
# ── Structured fact retrieval (always runs) ───────────────────────────────────
|
||||
|
||||
def get_relevant_facts(db_path: Path, query: str) -> list[dict[str, str]]:
|
||||
"""Keyword match against context_facts. Always runs — Free tier."""
|
||||
try:
|
||||
|
|
@ -42,8 +64,68 @@ def get_relevant_facts(db_path: Path, query: str) -> list[dict[str, str]]:
|
|||
return []
|
||||
|
||||
|
||||
def _search_chunks(db_path: Path, query: str) -> list[dict[str, str]]:
|
||||
"""Keyword search across context_chunks. Fallback when no embeddings."""
|
||||
# ── Chunk retrieval: vector path ──────────────────────────────────────────────
|
||||
|
||||
def _search_chunks_vector(
    db_path: Path,
    query: str,
    top_k: int = 3,
) -> list[dict[str, str]]:
    """Cosine similarity search over embedded context_chunks.

    Embeds *query* with the configured embedder, loads every stored chunk
    embedding into memory, and scores each one in-process.

    Args:
        db_path: SQLite database containing ``context_chunks`` and
            ``context_documents``.
        query: Free-text query to embed and compare against stored chunks.
        top_k: Maximum number of results to return; values <= 0 yield ``[]``.

    Returns:
        Up to *top_k* ``{"text", "filename"}`` dicts ordered by similarity,
        highest first. Returns ``[]`` when no embedder is configured, when the
        query cannot be embedded, or when the database query raises
        ``sqlite3.OperationalError``.
    """
    embedder = get_embedder()
    if embedder is None:
        return []

    try:
        query_vec: np.ndarray = embedder.embed(query)
        model_dim: int = embedder.dim
    except Exception as exc:
        logger.warning("Query embedding failed: %s", exc)
        return []

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            conn.execute("PRAGMA journal_mode=WAL")
            conn.row_factory = sqlite3.Row
            rows = conn.execute(
                "SELECT cc.id, cc.text, cc.embedding, cd.filename"
                " FROM context_chunks cc"
                " JOIN context_documents cd ON cc.document_id = cd.id"
                " WHERE cc.embedding IS NOT NULL"
            ).fetchall()
        finally:
            # Close even when the PRAGMA or SELECT raises, so a failed
            # lookup does not leak the connection.
            conn.close()
    except sqlite3.OperationalError:
        return []

    scored: list[tuple[float, dict[str, str]]] = []
    for row in rows:
        blob: bytes = row["embedding"]
        # Guard against blobs from a different-dimension model
        # (the check assumes 4 bytes per stored component).
        if len(blob) // 4 != model_dim:
            continue
        try:
            chunk_vec = unpack_vector(blob)
            score = cosine_similarity(query_vec, chunk_vec)
            scored.append((score, {"text": row["text"], "filename": row["filename"]}))
        except Exception:
            # A single undecodable or unscorable chunk must not sink the search.
            continue

    scored.sort(key=lambda t: t[0], reverse=True)
    # Clamp so a negative top_k cannot turn into an "all but the last k" slice.
    return [item for _, item in scored[: max(top_k, 0)]]
|
||||
|
||||
|
||||
# ── Chunk retrieval: keyword fallback ─────────────────────────────────────────
|
||||
|
||||
def _search_chunks_keyword(db_path: Path, query: str) -> list[dict[str, str]]:
|
||||
"""LIKE-based keyword search across context_chunks. Fallback when no embedder."""
|
||||
try:
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
|
|
@ -66,16 +148,29 @@ def _search_chunks(db_path: Path, query: str) -> list[dict[str, str]]:
|
|||
return []
|
||||
|
||||
|
||||
# ── Public interface ──────────────────────────────────────────────────────────
|
||||
|
||||
def retrieve_context(db_path: Path, query: str) -> RetrievedContext:
|
||||
"""Retrieve structured facts and relevant chunks for a query."""
|
||||
return RetrievedContext(
|
||||
facts=get_relevant_facts(db_path, query),
|
||||
chunks=_search_chunks(db_path, query),
|
||||
)
|
||||
"""Retrieve structured facts and relevant chunks for a query.
|
||||
|
||||
Chunk retrieval uses vector search when an embedder is available and at
|
||||
least one embedded chunk exists; falls back to keyword search otherwise.
|
||||
"""
|
||||
facts = get_relevant_facts(db_path, query)
|
||||
|
||||
if EMBEDDING_AVAILABLE:
|
||||
chunks = _search_chunks_vector(db_path, query)
|
||||
if not chunks:
|
||||
# Vector search returned nothing (no embedded chunks yet) — fall back.
|
||||
chunks = _search_chunks_keyword(db_path, query)
|
||||
else:
|
||||
chunks = _search_chunks_keyword(db_path, query)
|
||||
|
||||
return RetrievedContext(facts=facts, chunks=chunks)
|
||||
|
||||
|
||||
def format_context_block(ctx: RetrievedContext) -> str | None:
|
||||
"""Format context for injection into LLM prompt. Returns None when empty."""
|
||||
"""Format context for injection into an LLM prompt. Returns None when empty."""
|
||||
lines: list[str] = []
|
||||
if ctx.facts:
|
||||
lines.append("Known environment facts:")
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ from __future__ import annotations
|
|||
import json
|
||||
from typing import Iterator
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
SourceState, apply_patterns, epoch_float_to_iso,
|
||||
make_entry_id, now_iso,
|
||||
)
|
||||
|
|
@ -18,7 +18,7 @@ import re
|
|||
from datetime import datetime, timezone
|
||||
from typing import Iterator
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
|
||||
)
|
||||
from app.services.models import LogPattern, RetrievedEntry
|
||||
|
|
@ -10,7 +10,7 @@ from app.context.chunker import process_upload
|
|||
from app.context.store import add_document, add_fact
|
||||
|
||||
|
||||
def ingest_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]:
|
||||
def glean_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]:
|
||||
"""Process an uploaded file and write to context store. Returns result summary."""
|
||||
doc_type, facts, chunks = process_upload(filename, content)
|
||||
|
||||
|
|
@ -4,7 +4,7 @@ from __future__ import annotations
|
|||
import json
|
||||
from typing import Iterator
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
SourceState, apply_patterns, detect_severity,
|
||||
make_entry_id, now_iso,
|
||||
)
|
||||
|
|
@ -4,7 +4,7 @@ from __future__ import annotations
|
|||
import json
|
||||
from typing import Iterator
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
SourceState, apply_patterns, epoch_micros_to_iso,
|
||||
make_entry_id, now_iso, SYSLOG_PRIORITY,
|
||||
)
|
||||
|
|
@ -1,10 +1,10 @@
|
|||
"""Live MQTT ingest subscriber for Turnstone.
|
||||
"""Live MQTT glean subscriber for Turnstone.
|
||||
|
||||
Reads ``type: mqtt`` entries from sources.yaml and subscribes to each broker
|
||||
in the background. Incoming messages are normalized to RetrievedEntry and
|
||||
written to the Turnstone SQLite database as they arrive.
|
||||
|
||||
This runs as an asyncio task alongside the batch ingest scheduler. It is
|
||||
This runs as an asyncio task alongside the batch glean scheduler. It is
|
||||
started from the FastAPI lifespan in rest.py.
|
||||
|
||||
MQTT source config format in sources.yaml::
|
||||
616
app/glean/pipeline.py
Normal file
616
app/glean/pipeline.py
Normal file
|
|
@ -0,0 +1,616 @@
|
|||
"""Glean pipeline: auto-detect format, parse, write to SQLite."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
import yaml
|
||||
|
||||
from app.glean import caddy, dmesg_log, docker_log, journald, plaintext, plex, qbittorrent, servarr, syslog, wazuh
|
||||
from app.glean.base import _compile, load_patterns, now_iso
|
||||
from app.glean.ssh import (
|
||||
SSHTransport,
|
||||
SSHConnectionError,
|
||||
SSHCommandError,
|
||||
_build_docker_command,
|
||||
_build_journald_command,
|
||||
_build_plaintext_command,
|
||||
_build_syslog_command,
|
||||
)
|
||||
from app.services.models import LogPattern, RetrievedEntry
|
||||
from app.services.search import build_fts_index
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_SCHEMA = """
|
||||
CREATE TABLE IF NOT EXISTS log_entries (
|
||||
id TEXT PRIMARY KEY,
|
||||
source_id TEXT NOT NULL,
|
||||
sequence INTEGER NOT NULL,
|
||||
timestamp_raw TEXT,
|
||||
timestamp_iso TEXT,
|
||||
ingest_time TEXT NOT NULL,
|
||||
severity TEXT,
|
||||
repeat_count INTEGER DEFAULT 1,
|
||||
out_of_order INTEGER DEFAULT 0,
|
||||
matched_patterns TEXT DEFAULT '[]',
|
||||
text TEXT NOT NULL
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_source ON log_entries(source_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_timestamp ON log_entries(timestamp_iso);
|
||||
CREATE INDEX IF NOT EXISTS idx_ts_repeat ON log_entries(timestamp_iso, repeat_count);
|
||||
CREATE INDEX IF NOT EXISTS idx_severity ON log_entries(severity);
|
||||
CREATE INDEX IF NOT EXISTS idx_patterns ON log_entries(matched_patterns);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS incidents (
|
||||
id TEXT PRIMARY KEY,
|
||||
label TEXT NOT NULL,
|
||||
issue_type TEXT NOT NULL DEFAULT '',
|
||||
started_at TEXT,
|
||||
ended_at TEXT,
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
created_at TEXT NOT NULL,
|
||||
severity TEXT NOT NULL DEFAULT 'medium'
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_incidents_time ON incidents(started_at, ended_at);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS received_bundles (
|
||||
id TEXT PRIMARY KEY,
|
||||
source_host TEXT NOT NULL,
|
||||
issue_type TEXT NOT NULL DEFAULT '',
|
||||
label TEXT NOT NULL,
|
||||
severity TEXT NOT NULL DEFAULT 'medium',
|
||||
started_at TEXT,
|
||||
bundled_at TEXT NOT NULL,
|
||||
entry_count INTEGER NOT NULL DEFAULT 0,
|
||||
bundle_json TEXT NOT NULL
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_bundles_type ON received_bundles(issue_type);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS context_facts (
|
||||
id TEXT PRIMARY KEY,
|
||||
category TEXT NOT NULL,
|
||||
key TEXT NOT NULL,
|
||||
value TEXT NOT NULL,
|
||||
source TEXT,
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category);
|
||||
CREATE INDEX IF NOT EXISTS idx_facts_key ON context_facts(key);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS context_documents (
|
||||
id TEXT PRIMARY KEY,
|
||||
filename TEXT NOT NULL,
|
||||
doc_type TEXT NOT NULL,
|
||||
full_text TEXT NOT NULL,
|
||||
file_size INTEGER,
|
||||
uploaded_at TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS context_chunks (
|
||||
id TEXT PRIMARY KEY,
|
||||
document_id TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
|
||||
chunk_index INTEGER NOT NULL,
|
||||
text TEXT NOT NULL,
|
||||
embedding BLOB
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS blocklist_candidates (
|
||||
id TEXT PRIMARY KEY,
|
||||
domain_or_ip TEXT NOT NULL,
|
||||
source_device_ip TEXT,
|
||||
source_device_name TEXT,
|
||||
first_seen TEXT NOT NULL,
|
||||
last_seen TEXT NOT NULL,
|
||||
hit_count INTEGER DEFAULT 1,
|
||||
status TEXT DEFAULT 'pending',
|
||||
pushed_at TEXT,
|
||||
log_evidence TEXT DEFAULT '[]',
|
||||
matched_rule TEXT,
|
||||
llm_score REAL,
|
||||
llm_reason TEXT
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_blocklist_device ON blocklist_candidates(source_device_ip);
|
||||
CREATE INDEX IF NOT EXISTS idx_blocklist_status ON blocklist_candidates(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_blocklist_domain ON blocklist_candidates(domain_or_ip);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS glean_fingerprints (
|
||||
path TEXT PRIMARY KEY,
|
||||
mtime REAL NOT NULL,
|
||||
size INTEGER NOT NULL,
|
||||
gleaned_at TEXT NOT NULL
|
||||
);
|
||||
"""
|
||||
|
||||
|
||||
def ensure_schema(db_path: Path) -> None:
    """Create all tables and apply additive migrations. Safe to call on every startup.

    Runs the ``_SCHEMA`` script (every statement is ``IF NOT EXISTS``) and then
    a list of ``ALTER TABLE ... ADD COLUMN`` migrations against *db_path*.

    Args:
        db_path: Path of the SQLite database file to create or upgrade.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        # Additive column migrations. SQLite has no "ADD COLUMN IF NOT EXISTS":
        # re-running the ALTER raises OperationalError("duplicate column name: ..."),
        # which is the expected outcome on an already-migrated database.
        for stmt in [
            "ALTER TABLE incidents ADD COLUMN issue_type TEXT NOT NULL DEFAULT ''",
        ]:
            try:
                conn.execute(stmt)
            except sqlite3.OperationalError as exc:
                if "duplicate column name" not in str(exc).lower():
                    # Still best-effort (startup must not crash), but an
                    # unexpected failure is no longer invisible.
                    logger.warning("Schema migration %r failed: %s", stmt, exc)
        conn.commit()
    finally:
        # Release the handle even when the schema script or commit raises.
        conn.close()
|
||||
|
||||
|
||||
def _fingerprint(path: Path) -> tuple[float, int]:
|
||||
"""Return (mtime, size) for a file — cheap identity check, no content read needed."""
|
||||
st = path.stat()
|
||||
return st.st_mtime, st.st_size
|
||||
|
||||
|
||||
def _fp_unchanged(conn: sqlite3.Connection, path: Path, mtime: float, size: int) -> bool:
|
||||
"""Return True only when the stored fingerprint exactly matches (mtime, size).
|
||||
|
||||
A smaller size (log rotation) or a larger size (new lines appended) both
|
||||
return False so the caller re-gleams the file.
|
||||
"""
|
||||
row = conn.execute(
|
||||
"SELECT mtime, size FROM glean_fingerprints WHERE path = ?",
|
||||
(str(path),),
|
||||
).fetchone()
|
||||
if row is None:
|
||||
return False
|
||||
return row[0] == mtime and row[1] == size
|
||||
|
||||
|
||||
def _save_fingerprint(
|
||||
conn: sqlite3.Connection,
|
||||
path: Path,
|
||||
mtime: float,
|
||||
size: int,
|
||||
gleaned_at: str,
|
||||
) -> None:
|
||||
"""Upsert the fingerprint for *path* after a successful glean."""
|
||||
conn.execute(
|
||||
"""
|
||||
INSERT OR REPLACE INTO glean_fingerprints (path, mtime, size, gleaned_at)
|
||||
VALUES (?, ?, ?, ?)
|
||||
""",
|
||||
(str(path), mtime, size, gleaned_at),
|
||||
)
|
||||
|
||||
|
||||
def _detect_format(first_line: str) -> str:
|
||||
try:
|
||||
obj = json.loads(first_line)
|
||||
if "__REALTIME_TIMESTAMP" in obj:
|
||||
return "journald"
|
||||
if "SOURCE" in obj and str(obj.get("SOURCE", "")).startswith("docker:"):
|
||||
return "docker"
|
||||
if wazuh.is_wazuh_alert(obj):
|
||||
return "wazuh"
|
||||
if "ts" in obj and ("msg" in obj or "message" in obj or "request" in obj):
|
||||
return "caddy"
|
||||
except (json.JSONDecodeError, AttributeError):
|
||||
pass
|
||||
if plex.is_plex_log(first_line):
|
||||
return "plex"
|
||||
if qbittorrent.is_qbit_log(first_line):
|
||||
return "qbittorrent"
|
||||
if servarr.is_servarr_log(first_line):
|
||||
return "servarr"
|
||||
if dmesg_log.is_dmesg_log(first_line):
|
||||
return "dmesg"
|
||||
if syslog.is_syslog(first_line):
|
||||
return "syslog"
|
||||
return "plaintext"
|
||||
|
||||
|
||||
def _parse_file(
    path: Path,
    compiled: list[tuple[LogPattern, object]],
    ingest_time: str,
    source_id: str | None = None,
) -> Iterator[RetrievedEntry]:
    """Yield parsed entries from *path*, picking the parser from its first line.

    The file is opened in text mode with undecodable bytes replaced. An empty
    file yields nothing. When *source_id* is falsy the file stem is used.
    The first line is consumed for format detection and then handed to the
    chosen parser ahead of the remaining lines.
    """
    effective_id = source_id or path.stem

    with path.open("r", errors="replace") as handle:
        remaining = iter(handle)
        head = next(remaining, None)
        if head is None:
            return

        fmt = _detect_format(head.strip())
        logger.info("Detected format %r for %s", fmt, path.name)

        def replay():
            # Re-attach the line consumed for detection.
            yield head
            yield from remaining

        parser_modules = {
            "journald": journald,
            "wazuh": wazuh,
            "docker": docker_log,
            "caddy": caddy,
            "plex": plex,
            "qbittorrent": qbittorrent,
            "servarr": servarr,
            "dmesg": dmesg_log,
            "syslog": syslog,
        }
        # Anything unrecognised is treated as plaintext.
        chosen = parser_modules.get(fmt, plaintext)
        yield from chosen.parse(replay(), effective_id, compiled, ingest_time)
|
||||
|
||||
|
||||
def _write_batch(conn: sqlite3.Connection, batch: list[RetrievedEntry]) -> None:
|
||||
conn.executemany(
|
||||
"""
|
||||
INSERT OR IGNORE INTO log_entries
|
||||
(id, source_id, sequence, timestamp_raw, timestamp_iso,
|
||||
ingest_time, severity, repeat_count, out_of_order,
|
||||
matched_patterns, text)
|
||||
VALUES (?,?,?,?,?,?,?,?,?,?,?)
|
||||
""",
|
||||
[
|
||||
(
|
||||
e.entry_id, e.source_id, e.sequence,
|
||||
e.timestamp_raw, e.timestamp_iso, e.ingest_time,
|
||||
e.severity, e.repeat_count, int(e.out_of_order),
|
||||
json.dumps(list(e.matched_patterns)), e.text,
|
||||
)
|
||||
for e in batch
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def _glean_files(
    files: list[Path],
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    source_id_map: dict[Path, str] | None = None,
    force: bool = False,
) -> dict[str, int]:
    """Parse *files* and write their entries to the database at *db_path*.

    For each file, the stored (mtime, size) fingerprint is compared first;
    unchanged files are skipped unless *force* is set. Parsed entries are
    written and committed in batches of *batch_size*, after which the file's
    fingerprint is saved. The FTS index is rebuilt once all files are done.

    Args:
        files: Log files to glean, processed in the given order.
        db_path: SQLite database to write to (schema is created if missing).
        pattern_file: Pattern YAML; defaults to ``patterns/default.yaml``.
        batch_size: Number of entries buffered before each write + commit.
        source_id_map: Optional per-file source id; files not in the map use
            their stem.
        force: Bypass the fingerprint check and re-glean every file.

    Returns:
        ``{source_id: count}`` where count is the number of entries submitted
        for insertion (rows ignored as duplicate ids are still counted).
        Skipped files contribute an entry with no added count.
    """
    pattern_file = pattern_file or Path("patterns/default.yaml")
    patterns = load_patterns(pattern_file)
    compiled = _compile(patterns)
    ingest_time = now_iso()
    source_id_map = source_id_map or {}

    stats: dict[str, int] = {}
    skipped: list[str] = []

    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        conn.commit()

        for log_file in files:
            source_id = source_id_map.get(log_file, log_file.stem)

            # Fingerprint check — skip files whose mtime+size haven't changed.
            mtime, size = _fingerprint(log_file)
            if not force and _fp_unchanged(conn, log_file, mtime, size):
                logger.debug("Skipping unchanged file: %s", log_file.name)
                skipped.append(log_file.name)
                # Make sure the source still shows up in the returned stats.
                stats[source_id] = stats.get(source_id, 0)
                continue

            count = 0
            batch: list[RetrievedEntry] = []
            for entry in _parse_file(log_file, compiled, ingest_time, source_id=source_id):
                batch.append(entry)
                if len(batch) >= batch_size:
                    _write_batch(conn, batch)
                    conn.commit()
                    count += len(batch)
                    batch.clear()
            if batch:
                _write_batch(conn, batch)
                conn.commit()
                count += len(batch)

            # Saved only after every batch is committed, so a file that failed
            # part-way is retried on the next run.
            _save_fingerprint(conn, log_file, mtime, size, ingest_time)
            conn.commit()

            stats[source_id] = stats.get(source_id, 0) + count
            logger.info("Gleaned %d entries from %s (source: %s)", count, log_file.name, source_id)
    finally:
        # Release the handle even when stat/parse/write raises mid-run.
        conn.close()

    if skipped:
        logger.info("Skipped %d unchanged file(s): %s", len(skipped), ", ".join(skipped))

    logger.info("Building FTS index...")
    build_fts_index(db_path)
    logger.info("FTS index ready")

    return stats
|
||||
|
||||
|
||||
def _stream_and_write(
    transport: SSHTransport,
    cmd: str,
    parser,
    source_id: str,
    compiled: list[tuple[LogPattern, object]],
    ingest_time: str,
    conn: sqlite3.Connection,
    batch_size: int,
) -> int:
    """Run *cmd* over *transport*, parse its output and persist the entries.

    The command's streamed output is fed to *parser*; entries are buffered and
    written + committed to *conn* every *batch_size* entries, with a final
    flush for the remainder.

    An ``SSHCommandError`` is logged as a warning and not re-raised, so one
    failing command does not abort the other glean items for the host. Entries
    still sitting in the unflushed buffer at that point are not written.

    Returns the number of entries written.
    """
    written = 0
    pending: list[RetrievedEntry] = []
    try:
        entries = parser(transport.exec_stream(cmd), source_id, compiled, ingest_time)
        for entry in entries:
            pending.append(entry)
            if len(pending) < batch_size:
                continue
            # Buffer is full: flush it.
            _write_batch(conn, pending)
            conn.commit()
            written += len(pending)
            pending.clear()
        if pending:
            _write_batch(conn, pending)
            conn.commit()
            written += len(pending)
    except SSHCommandError as exc:
        logger.warning("SSH command failed for source %r (cmd: %s): %s", source_id, cmd, exc)
    logger.info("Gleaned %d entries from SSH source %s", written, source_id)
    return written
|
||||
|
||||
|
||||
def _glean_ssh_source(
    src: dict,  # type: ignore[type-arg]
    compiled: list[tuple[LogPattern, object]],
    ingest_time: str,
    conn: sqlite3.Connection,
    batch_size: int,
) -> dict[str, int]:
    """Glean every item in ``src["glean"]`` over a single SSH connection.

    The transport is opened once and reused for all items, so the handshake is
    paid once per host per run. Each item is gleaned under its own ``id``,
    or ``"<host id>/<type>"`` when it has none. Docker items that expand to
    several commands get one source id per command, suffixed with the
    container name (or the command index when no name is listed).

    Returns ``{source_id: entry_count}``. An ``SSHConnectionError`` is logged
    and whatever stats were collected so far are returned. Items of an unknown
    type are skipped with a warning.
    """
    host_id = src.get("id", src.get("host", "unknown"))
    host = src["host"]
    user = src["user"]
    key_path = str(Path(src["key_path"]).expanduser())
    port = int(src.get("port", 22))
    glean_items: list[dict] = src.get("glean", [])  # type: ignore[type-arg]

    stats: dict[str, int] = {}

    # (type name, command builder, parser) for the one-command-per-item types.
    single_command_types = (
        ("journald", _build_journald_command, journald.parse),
        ("syslog", _build_syslog_command, syslog.parse),
        ("plaintext", _build_plaintext_command, plaintext.parse),
    )

    try:
        with SSHTransport(host=host, user=user, key_path=key_path, port=port) as t:
            for item in glean_items:
                item_type = item.get("type", "plaintext")
                # Per-item source_id — falls back to host_id/type for un-labelled items
                item_id = item.get("id") or f"{host_id}/{item_type}"

                handler = next(
                    (
                        (build_cmd, parse)
                        for name, build_cmd, parse in single_command_types
                        if item_type == name
                    ),
                    None,
                )

                if handler is not None:
                    build_cmd, parse = handler
                    cmd = build_cmd(item)
                    written = _stream_and_write(
                        t, cmd, parse, item_id, compiled, ingest_time, conn, batch_size
                    )
                    stats[item_id] = stats.get(item_id, 0) + written

                elif item_type == "docker":
                    cmds = _build_docker_command(item)
                    if isinstance(cmds, str):
                        cmds = [cmds]
                    containers: list[str] = item.get("containers", [])
                    for idx, cmd in enumerate(cmds):
                        # Use the container name as the final path segment when available
                        container_name = containers[idx] if idx < len(containers) else str(idx)
                        if len(cmds) > 1:
                            container_id = f"{item_id}/{container_name}"
                        else:
                            container_id = item_id
                        written = _stream_and_write(
                            t, cmd, docker_log.parse, container_id,
                            compiled, ingest_time, conn, batch_size,
                        )
                        stats[container_id] = stats.get(container_id, 0) + written

                else:
                    logger.warning(
                        "Unknown SSH glean type %r for source %r — skipping item",
                        item_type, host_id,
                    )

    except SSHConnectionError as exc:
        logger.warning("SSH connection failed for source %r: %s", host_id, exc)

    return stats
|
||||
|
||||
|
||||
def glean_ssh_source(
    src: dict,  # type: ignore[type-arg]
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
) -> dict[str, int]:
    """Glean a single SSH source dict and write results to *db_path*.

    Public wrapper around :func:`_glean_ssh_source` for the REST layer.
    Manages the DB connection, pattern compilation, and FTS rebuild so callers
    don't have to deal with those lifecycle concerns.

    Args:
        src: One SSH source entry; passed through unchanged to
            ``_glean_ssh_source``.
        db_path: SQLite database file to write to.
        pattern_file: Pattern YAML to compile; falls back to
            ``patterns/default.yaml`` when omitted.
        batch_size: Forwarded to ``_glean_ssh_source``.

    Returns stats mapping ``{sub_source_id: entry_count}``.
    """
    effective_pattern_file = pattern_file or Path("patterns/default.yaml")
    compiled = _compile(load_patterns(effective_pattern_file))
    ingest_time = now_iso()

    conn = sqlite3.connect(str(db_path))
    try:
        # Schema setup lives inside the try so the connection is closed even
        # when the PRAGMA or the schema script fails.
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        conn.commit()
        stats = _glean_ssh_source(src, compiled, ingest_time, conn, batch_size)
    finally:
        conn.close()

    # Only reached when the glean completed without raising.
    logger.info("Rebuilding FTS index after SSH source glean...")
    build_fts_index(db_path)
    return stats
|
||||
|
||||
|
||||
def glean_dir(
    corpus_dir: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    force: bool = False,
) -> dict[str, int]:
    """Glean every ``*.jsonl`` and ``*.log`` file found directly in *corpus_dir*.

    The ``.jsonl`` files are processed first (sorted), followed by the
    ``.log`` files (sorted).  Set ``force=True`` to skip the fingerprint
    check and re-glean files even if they are unchanged since the last run.
    """
    jsonl_files = sorted(corpus_dir.glob("*.jsonl"))
    log_files = sorted(corpus_dir.glob("*.log"))
    candidates = [*jsonl_files, *log_files]
    return _glean_files(candidates, db_path, pattern_file, batch_size, force=force)
|
||||
|
||||
|
||||
def glean_file(
    log_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    force: bool = False,
) -> dict[str, int]:
    """Glean one log file, whatever supported format it is in.

    Thin convenience wrapper: the file is handed to ``_glean_files`` as a
    one-element list.  ``force=True`` re-gleans the file even when its
    fingerprint has not changed.
    """
    single_file = [log_file]
    result = _glean_files(single_file, db_path, pattern_file, force=force)
    return result
|
||||
|
||||
|
||||
def glean_sources(
    sources_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    force: bool = False,
) -> dict[str, int]:
    """Glean all sources listed in a sources.yaml config file.

    Supports two source types:

    Local file sources (default):
        sources:
          - id: sonarr
            path: /opt/sonarr/config/logs/sonarr.0.txt

    SSH remote sources (transport: ssh):
        sources:
          - id: rack01
            transport: ssh
            host: 192.168.1.10
            user: admin
            key_path: ~/.ssh/id_ed25519
            glean:
              - type: journald
                args: ["--since", "2 hours ago"]
              - type: syslog
                path: /var/log/syslog
              - type: plaintext
                path: /var/log/app/error.log
              - type: docker
                containers: [myapp, nginx]

    Missing local paths and SSH connection failures are logged as warnings
    so the cron keeps running when a source is temporarily down.  An empty
    config file, an empty ``sources`` key, and a local source without a
    ``path`` are likewise skipped with a warning instead of raising.

    Args:
        sources_file: Path to the sources.yaml file.
        db_path: SQLite database file to write to.
        pattern_file: Pattern YAML; defaults to ``patterns/default.yaml``
            for the SSH part of the run.
        batch_size: Forwarded to the local and SSH glean helpers.
        force: Forwarded to ``_glean_files`` for local sources.

    Returns:
        Mapping of source id to the number of entries gleaned in this run.
    """
    with open(sources_file) as f:
        # safe_load returns None for an empty or comment-only document.
        config = yaml.safe_load(f) or {}

    local_sources: list[dict] = []  # type: ignore[type-arg]
    ssh_sources: list[dict] = []  # type: ignore[type-arg]

    # ``sources:`` with no value parses to None, hence the ``or []``.
    for src in config.get("sources") or []:
        if src.get("transport") == "ssh":
            ssh_sources.append(src)
        else:
            local_sources.append(src)

    # ── Local file sources ─────────────────────────────────────────────────
    files: list[Path] = []
    source_id_map: dict[Path, str] = {}

    for src in local_sources:
        raw_path = src.get("path")
        if not raw_path:
            # One malformed entry must not abort the whole run.
            logger.warning("Source %r has no 'path', skipping", src.get("id", "?"))
            continue
        path = Path(raw_path)
        if not path.exists():
            logger.warning("Source %r not found, skipping: %s", src.get("id", "?"), path)
            continue
        files.append(path)
        if "id" in src:
            source_id_map[path] = src["id"]

    if not files and not ssh_sources:
        logger.warning("No sources found — check sources.yaml paths")
        return {}

    stats: dict[str, int] = {}
    if files:
        stats.update(_glean_files(files, db_path, pattern_file, batch_size, source_id_map, force=force))

    # ── SSH remote sources ─────────────────────────────────────────────────
    if not ssh_sources:
        return stats

    # Compile patterns once, share across all SSH sources in this run.
    effective_pattern_file = pattern_file or Path("patterns/default.yaml")
    compiled = _compile(load_patterns(effective_pattern_file))
    ingest_time = now_iso()

    conn = sqlite3.connect(str(db_path))
    try:
        # Setup is inside the try so the connection is closed if it fails.
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        conn.commit()

        for src in ssh_sources:
            ssh_stats = _glean_ssh_source(src, compiled, ingest_time, conn, batch_size)
            for k, v in ssh_stats.items():
                stats[k] = stats.get(k, 0) + v
    finally:
        conn.close()

    # SSH sources were processed (we returned early otherwise), so rebuild the
    # FTS index.  _glean_files already rebuilds for local sources; running it
    # again here is safe.
    logger.info("Rebuilding FTS index after SSH glean...")
    build_fts_index(db_path)

    return stats
|
||||
|
|
@ -10,7 +10,7 @@ import re
|
|||
from datetime import datetime, timezone
|
||||
from typing import Iterator
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
|
||||
)
|
||||
from app.services.models import LogPattern, RetrievedEntry
|
||||
|
|
@ -12,7 +12,7 @@ import re
|
|||
from datetime import datetime, timezone
|
||||
from typing import Iterator
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
SourceState, apply_patterns, make_entry_id, now_iso,
|
||||
)
|
||||
from app.services.models import LogPattern, RetrievedEntry
|
||||
|
|
@ -18,7 +18,7 @@ import re
|
|||
from datetime import datetime, timezone
|
||||
from typing import Iterator
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
|
||||
)
|
||||
from app.services.models import LogPattern, RetrievedEntry
|
||||
|
|
@ -12,7 +12,7 @@ import re
|
|||
from datetime import datetime, timezone
|
||||
from typing import Iterator
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
|
||||
)
|
||||
from app.services.models import LogPattern, RetrievedEntry
|
||||
225
app/glean/ssh.py
Normal file
225
app/glean/ssh.py
Normal file
|
|
@ -0,0 +1,225 @@
|
|||
"""SSH transport layer for remote log gleaning (issue #22).
|
||||
|
||||
Wraps Paramiko to provide a clean context-manager interface for executing
|
||||
remote commands and streaming their stdout output. All format parsing is
|
||||
delegated to the existing per-format parsers (journald, syslog, plaintext,
|
||||
docker); this module is transport only.
|
||||
|
||||
Key design choices:
|
||||
- Key-based auth only — no password prompts in a daemon context.
|
||||
- exec_stream is a generator; exit-status check fires after all lines are
|
||||
yielded, so callers must drain the iterator (e.g. list()) to trigger it.
|
||||
- Command builders live here because they encode SSH/remote-execution idioms
|
||||
(journalctl flags, docker logs invocation) that the generic parsers don't
|
||||
need to know about.
|
||||
|
||||
Example sources.yaml snippet::
|
||||
|
||||
sources:
|
||||
- id: rack01
|
||||
transport: ssh
|
||||
host: 192.168.1.10
|
||||
user: admin
|
||||
key_path: ~/.ssh/id_ed25519
|
||||
glean:
|
||||
- type: journald
|
||||
args: ["--since", "2 hours ago"]
|
||||
- type: syslog
|
||||
path: /var/log/syslog
|
||||
- type: plaintext
|
||||
path: /var/log/app/error.log
|
||||
- type: docker
|
||||
containers: [myapp, nginx]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import shlex
|
||||
from collections.abc import Iterator
|
||||
from typing import Union
|
||||
|
||||
import paramiko
|
||||
|
||||
|
||||
# Names exported by ``from <this module> import *``.  The command builders are
# underscore-prefixed, so they are only star-exported because they are listed
# here explicitly.
__all__ = [
    "SSHConnectionError",
    "SSHCommandError",
    "SSHTransport",
    "_build_journald_command",
    "_build_syslog_command",
    "_build_plaintext_command",
    "_build_docker_command",
]

# Default syslog path used when none is specified in the source spec.
_SYSLOG_DEFAULT_PATH = "/var/log/syslog"
|
||||
|
||||
|
||||
# ── Custom exceptions ─────────────────────────────────────────────────────────
|
||||
|
||||
class SSHConnectionError(Exception):
    """Signals that no usable SSH session is available.

    Raised when connecting or authenticating to the remote host fails, and
    when a command is attempted on a transport that is not connected.
    """
|
||||
|
||||
|
||||
class SSHCommandError(Exception):
    """Signals that a remote command finished with a non-zero exit status."""
|
||||
|
||||
|
||||
# ── Transport context manager ─────────────────────────────────────────────────
|
||||
|
||||
class SSHTransport:
    """Context manager wrapping a Paramiko SSH connection.

    Opens the connection on ``__enter__`` and closes it on ``__exit__``,
    even if an exception propagates.  Key-based authentication only.

    Usage::

        with SSHTransport(host="10.0.0.1", user="admin",
                          key_path="~/.ssh/id_ed25519") as t:
            for line in t.exec_stream("journalctl -o json --since '1 hour ago'"):
                process(line)
    """

    def __init__(
        self,
        host: str,
        user: str,
        key_path: str,
        port: int = 22,
    ) -> None:
        """Store connection parameters; no network activity happens here."""
        self._host = host
        self._user = user
        self._key_path = key_path
        self._port = port
        # Set by __enter__, cleared by __exit__.
        self._client: paramiko.SSHClient | None = None

    # ── context manager protocol ──────────────────────────────────────────────

    def __enter__(self) -> "SSHTransport":
        """Connect and authenticate.

        Raises:
            SSHConnectionError: on authentication failure, on any other
                Paramiko SSH error, and on OS-level failures (connection
                refused, timeout, DNS failure, unreadable key file).
        """
        client = paramiko.SSHClient()
        # NOTE(security): AutoAddPolicy accepts unknown host keys without
        # verification, which leaves the first connection open to
        # man-in-the-middle attacks.  Kept for backward compatibility;
        # consider loading known_hosts and using RejectPolicy.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            client.connect(
                hostname=self._host,
                username=self._user,
                key_filename=self._key_path,
                port=self._port,
            )
        except paramiko.AuthenticationException as exc:
            client.close()
            raise SSHConnectionError(
                f"SSH auth failed for {self._user}@{self._host}: {exc}"
            ) from exc
        except (paramiko.SSHException, OSError) as exc:
            # connect() reports socket-level problems as OSError rather than
            # SSHException; wrap both so callers that catch
            # SSHConnectionError can skip an unreachable host.
            client.close()
            raise SSHConnectionError(
                f"SSH connection failed to {self._host}: {exc}"
            ) from exc
        self._client = client
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:  # type: ignore[override]
        """Close the connection if one is open; never suppresses exceptions."""
        if self._client is not None:
            self._client.close()
            self._client = None
        # Return None (falsy) so any in-flight exception is not suppressed.

    # ── remote execution ──────────────────────────────────────────────────────

    def exec_stream(self, command: str) -> Iterator[str]:
        """Execute *command* on the remote host and yield stdout lines.

        The exit-status check runs after all stdout lines have been yielded,
        so callers must drain the iterator to trigger it::

            list(transport.exec_stream(cmd))   # raises if exit != 0

        Because this is a generator, nothing (including the "not connected"
        check) runs until the first item is requested.

        Raises:
            SSHConnectionError: if called outside a ``with`` block.
            SSHCommandError: if the remote command exits non-zero.
        """
        if self._client is None:
            raise SSHConnectionError(
                "Not connected — use SSHTransport as a context manager"
            )
        _, stdout, stderr = self._client.exec_command(command)
        # NOTE(review): stderr is only read after stdout is exhausted; a
        # command that writes a lot to stderr may stall — confirm against
        # Paramiko's channel buffering before relying on this for noisy
        # commands.
        for line in stdout:
            yield line
        exit_code = stdout.channel.recv_exit_status()
        # Guard against MagicMock in tests: only treat real integer exit codes.
        if isinstance(exit_code, int) and exit_code != 0:
            error_msg = stderr.read().decode(errors="replace").strip()
            raise SSHCommandError(
                f"Command failed (exit {exit_code}): {error_msg}"
            )
|
||||
|
||||
|
||||
# ── Command builders ──────────────────────────────────────────────────────────
|
||||
|
||||
def _build_journald_command(spec: dict) -> str: # type: ignore[type-arg]
|
||||
"""Build a ``journalctl`` command string from a glean source spec.
|
||||
|
||||
Spec keys:
|
||||
|
||||
- ``args`` — list of extra journalctl arguments appended verbatim.
|
||||
- ``unit`` — shorthand for ``--unit <name>`` (inserted before ``args``).
|
||||
|
||||
Returns a single shell command string.
|
||||
"""
|
||||
parts = ["journalctl", "-o json", "--no-pager"]
|
||||
if "unit" in spec:
|
||||
parts.append(f"--unit {spec['unit']}")
|
||||
if "args" in spec:
|
||||
parts.extend(spec["args"])
|
||||
return " ".join(parts)
|
||||
|
||||
|
||||
def _build_syslog_command(spec: dict) -> str: # type: ignore[type-arg]
|
||||
"""Build a ``cat`` command for a syslog-format log file.
|
||||
|
||||
Spec keys:
|
||||
|
||||
- ``path`` — path to the file (default: ``/var/log/syslog``).
|
||||
|
||||
Returns a single shell command string.
|
||||
"""
|
||||
path = spec.get("path", _SYSLOG_DEFAULT_PATH)
|
||||
return f"cat {shlex.quote(path)}"
|
||||
|
||||
|
||||
def _build_plaintext_command(spec: dict) -> str: # type: ignore[type-arg]
|
||||
"""Build a ``cat`` command for an arbitrary plaintext log file.
|
||||
|
||||
Spec keys:
|
||||
|
||||
- ``path`` — **required** path to the log file.
|
||||
|
||||
Raises:
|
||||
KeyError: if ``path`` is absent from the spec.
|
||||
"""
|
||||
path = spec["path"] # intentional KeyError if missing — callers must supply it
|
||||
return f"cat {shlex.quote(path)}"
|
||||
|
||||
|
||||
def _build_docker_command(
|
||||
spec: dict, # type: ignore[type-arg]
|
||||
) -> Union[str, list[str]]:
|
||||
"""Build ``docker logs`` command(s) for one or more named containers.
|
||||
|
||||
Spec keys:
|
||||
|
||||
- ``containers`` — **required** list of container names or IDs.
|
||||
|
||||
Returns a single command string when there is one container, or a list
|
||||
of command strings when there are multiple (one command per container so
|
||||
each can be streamed independently).
|
||||
|
||||
Raises:
|
||||
KeyError: if ``containers`` is absent from the spec.
|
||||
ValueError: if ``containers`` is an empty list.
|
||||
"""
|
||||
containers = spec["containers"] # intentional KeyError if missing
|
||||
if not containers:
|
||||
raise ValueError("'containers' must be a non-empty list")
|
||||
commands = [f"docker logs {shlex.quote(c)}" for c in containers]
|
||||
return commands[0] if len(commands) == 1 else commands
|
||||
|
|
@ -14,7 +14,7 @@ import re
|
|||
from datetime import datetime, timezone
|
||||
from typing import Iterator
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
|
||||
)
|
||||
from app.services.models import LogPattern, RetrievedEntry
|
||||
|
|
@ -5,7 +5,7 @@ Tautulli sends all template values as strings, so all fields are treated as str.
|
|||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
apply_patterns,
|
||||
epoch_float_to_iso,
|
||||
make_entry_id,
|
||||
|
|
@ -22,7 +22,7 @@ import json
|
|||
from datetime import datetime, timezone
|
||||
from typing import Iterator
|
||||
|
||||
from app.ingest.base import (
|
||||
from app.glean.base import (
|
||||
SourceState, apply_patterns, make_entry_id, now_iso,
|
||||
)
|
||||
from app.services.models import LogPattern, RetrievedEntry
|
||||
|
|
@ -1,328 +0,0 @@
|
|||
"""Ingest pipeline: auto-detect format, parse, write to SQLite."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
import yaml
|
||||
|
||||
from app.ingest import caddy, dmesg_log, docker_log, journald, plaintext, plex, qbittorrent, servarr, syslog, wazuh
|
||||
from app.ingest.base import _compile, load_patterns, now_iso
|
||||
from app.services.models import LogPattern, RetrievedEntry
|
||||
from app.services.search import build_fts_index
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_SCHEMA = """
|
||||
CREATE TABLE IF NOT EXISTS log_entries (
|
||||
id TEXT PRIMARY KEY,
|
||||
source_id TEXT NOT NULL,
|
||||
sequence INTEGER NOT NULL,
|
||||
timestamp_raw TEXT,
|
||||
timestamp_iso TEXT,
|
||||
ingest_time TEXT NOT NULL,
|
||||
severity TEXT,
|
||||
repeat_count INTEGER DEFAULT 1,
|
||||
out_of_order INTEGER DEFAULT 0,
|
||||
matched_patterns TEXT DEFAULT '[]',
|
||||
text TEXT NOT NULL
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_source ON log_entries(source_id);
|
||||
CREATE INDEX IF NOT EXISTS idx_timestamp ON log_entries(timestamp_iso);
|
||||
CREATE INDEX IF NOT EXISTS idx_ts_repeat ON log_entries(timestamp_iso, repeat_count);
|
||||
CREATE INDEX IF NOT EXISTS idx_severity ON log_entries(severity);
|
||||
CREATE INDEX IF NOT EXISTS idx_patterns ON log_entries(matched_patterns);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS incidents (
|
||||
id TEXT PRIMARY KEY,
|
||||
label TEXT NOT NULL,
|
||||
issue_type TEXT NOT NULL DEFAULT '',
|
||||
started_at TEXT,
|
||||
ended_at TEXT,
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
created_at TEXT NOT NULL,
|
||||
severity TEXT NOT NULL DEFAULT 'medium'
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_incidents_time ON incidents(started_at, ended_at);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS received_bundles (
|
||||
id TEXT PRIMARY KEY,
|
||||
source_host TEXT NOT NULL,
|
||||
issue_type TEXT NOT NULL DEFAULT '',
|
||||
label TEXT NOT NULL,
|
||||
severity TEXT NOT NULL DEFAULT 'medium',
|
||||
started_at TEXT,
|
||||
bundled_at TEXT NOT NULL,
|
||||
entry_count INTEGER NOT NULL DEFAULT 0,
|
||||
bundle_json TEXT NOT NULL
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at);
|
||||
CREATE INDEX IF NOT EXISTS idx_bundles_type ON received_bundles(issue_type);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS context_facts (
|
||||
id TEXT PRIMARY KEY,
|
||||
category TEXT NOT NULL,
|
||||
key TEXT NOT NULL,
|
||||
value TEXT NOT NULL,
|
||||
source TEXT,
|
||||
created_at TEXT NOT NULL
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category);
|
||||
CREATE INDEX IF NOT EXISTS idx_facts_key ON context_facts(key);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS context_documents (
|
||||
id TEXT PRIMARY KEY,
|
||||
filename TEXT NOT NULL,
|
||||
doc_type TEXT NOT NULL,
|
||||
full_text TEXT NOT NULL,
|
||||
file_size INTEGER,
|
||||
uploaded_at TEXT NOT NULL
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS context_chunks (
|
||||
id TEXT PRIMARY KEY,
|
||||
document_id TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
|
||||
chunk_index INTEGER NOT NULL,
|
||||
text TEXT NOT NULL,
|
||||
embedding BLOB
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS blocklist_candidates (
|
||||
id TEXT PRIMARY KEY,
|
||||
domain_or_ip TEXT NOT NULL,
|
||||
source_device_ip TEXT,
|
||||
source_device_name TEXT,
|
||||
first_seen TEXT NOT NULL,
|
||||
last_seen TEXT NOT NULL,
|
||||
hit_count INTEGER DEFAULT 1,
|
||||
status TEXT DEFAULT 'pending',
|
||||
pushed_at TEXT,
|
||||
log_evidence TEXT DEFAULT '[]',
|
||||
matched_rule TEXT,
|
||||
llm_score REAL,
|
||||
llm_reason TEXT
|
||||
);
|
||||
CREATE INDEX IF NOT EXISTS idx_blocklist_device ON blocklist_candidates(source_device_ip);
|
||||
CREATE INDEX IF NOT EXISTS idx_blocklist_status ON blocklist_candidates(status);
|
||||
CREATE INDEX IF NOT EXISTS idx_blocklist_domain ON blocklist_candidates(domain_or_ip);
|
||||
"""
|
||||
|
||||
|
||||
def ensure_schema(db_path: Path) -> None:
    """Create all tables and apply additive migrations. Safe to call on every startup.

    Runs the ``_SCHEMA`` script, then each additive ``ALTER TABLE``.  SQLite
    raises ``OperationalError`` ("duplicate column name") when the column is
    already present; only that specific error is ignored.  Any other
    ``OperationalError`` propagates.  The connection is always closed.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        # Additive column migrations — re-running one fails with
        # "duplicate column name", which means it was already applied.
        for stmt in [
            "ALTER TABLE incidents ADD COLUMN issue_type TEXT NOT NULL DEFAULT ''",
        ]:
            try:
                conn.execute(stmt)
            except sqlite3.OperationalError as exc:
                if "duplicate column name" not in str(exc).lower():
                    raise
        conn.commit()
    finally:
        conn.close()
|
||||
|
||||
|
||||
def _detect_format(first_line: str) -> str:
|
||||
try:
|
||||
obj = json.loads(first_line)
|
||||
if "__REALTIME_TIMESTAMP" in obj:
|
||||
return "journald"
|
||||
if "SOURCE" in obj and str(obj.get("SOURCE", "")).startswith("docker:"):
|
||||
return "docker"
|
||||
if wazuh.is_wazuh_alert(obj):
|
||||
return "wazuh"
|
||||
if "ts" in obj and ("msg" in obj or "message" in obj or "request" in obj):
|
||||
return "caddy"
|
||||
except (json.JSONDecodeError, AttributeError):
|
||||
pass
|
||||
if plex.is_plex_log(first_line):
|
||||
return "plex"
|
||||
if qbittorrent.is_qbit_log(first_line):
|
||||
return "qbittorrent"
|
||||
if servarr.is_servarr_log(first_line):
|
||||
return "servarr"
|
||||
if dmesg_log.is_dmesg_log(first_line):
|
||||
return "dmesg"
|
||||
if syslog.is_syslog(first_line):
|
||||
return "syslog"
|
||||
return "plaintext"
|
||||
|
||||
|
||||
def _parse_file(
|
||||
path: Path,
|
||||
compiled: list[tuple[LogPattern, object]],
|
||||
ingest_time: str,
|
||||
source_id: str | None = None,
|
||||
) -> Iterator[RetrievedEntry]:
|
||||
source_id = source_id or path.stem
|
||||
|
||||
with path.open("r", errors="replace") as f:
|
||||
lines = iter(f)
|
||||
try:
|
||||
first = next(lines)
|
||||
except StopIteration:
|
||||
return
|
||||
|
||||
fmt = _detect_format(first.strip())
|
||||
logger.info("Detected format %r for %s", fmt, path.name)
|
||||
|
||||
def all_lines():
|
||||
yield first
|
||||
yield from lines
|
||||
|
||||
if fmt == "journald":
|
||||
yield from journald.parse(all_lines(), source_id, compiled, ingest_time)
|
||||
elif fmt == "wazuh":
|
||||
yield from wazuh.parse(all_lines(), source_id, compiled, ingest_time)
|
||||
elif fmt == "docker":
|
||||
yield from docker_log.parse(all_lines(), source_id, compiled, ingest_time)
|
||||
elif fmt == "caddy":
|
||||
yield from caddy.parse(all_lines(), source_id, compiled, ingest_time)
|
||||
elif fmt == "plex":
|
||||
yield from plex.parse(all_lines(), source_id, compiled, ingest_time)
|
||||
elif fmt == "qbittorrent":
|
||||
yield from qbittorrent.parse(all_lines(), source_id, compiled, ingest_time)
|
||||
elif fmt == "servarr":
|
||||
yield from servarr.parse(all_lines(), source_id, compiled, ingest_time)
|
||||
elif fmt == "dmesg":
|
||||
yield from dmesg_log.parse(all_lines(), source_id, compiled, ingest_time)
|
||||
elif fmt == "syslog":
|
||||
yield from syslog.parse(all_lines(), source_id, compiled, ingest_time)
|
||||
else:
|
||||
yield from plaintext.parse(all_lines(), source_id, compiled, ingest_time)
|
||||
|
||||
|
||||
def _write_batch(conn: sqlite3.Connection, batch: list[RetrievedEntry]) -> None:
|
||||
conn.executemany(
|
||||
"""
|
||||
INSERT OR IGNORE INTO log_entries
|
||||
(id, source_id, sequence, timestamp_raw, timestamp_iso,
|
||||
ingest_time, severity, repeat_count, out_of_order,
|
||||
matched_patterns, text)
|
||||
VALUES (?,?,?,?,?,?,?,?,?,?,?)
|
||||
""",
|
||||
[
|
||||
(
|
||||
e.entry_id, e.source_id, e.sequence,
|
||||
e.timestamp_raw, e.timestamp_iso, e.ingest_time,
|
||||
e.severity, e.repeat_count, int(e.out_of_order),
|
||||
json.dumps(list(e.matched_patterns)), e.text,
|
||||
)
|
||||
for e in batch
|
||||
],
|
||||
)
|
||||
|
||||
|
||||
def _ingest_files(
|
||||
files: list[Path],
|
||||
db_path: Path,
|
||||
pattern_file: Path | None = None,
|
||||
batch_size: int = 1000,
|
||||
source_id_map: dict[Path, str] | None = None,
|
||||
) -> dict[str, int]:
|
||||
pattern_file = pattern_file or Path("patterns/default.yaml")
|
||||
patterns = load_patterns(pattern_file)
|
||||
compiled = _compile(patterns)
|
||||
ingest_time = now_iso()
|
||||
source_id_map = source_id_map or {}
|
||||
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.executescript(_SCHEMA)
|
||||
conn.commit()
|
||||
|
||||
stats: dict[str, int] = {}
|
||||
|
||||
for log_file in files:
|
||||
source_id = source_id_map.get(log_file, log_file.stem)
|
||||
count = 0
|
||||
batch: list[RetrievedEntry] = []
|
||||
for entry in _parse_file(log_file, compiled, ingest_time, source_id=source_id):
|
||||
batch.append(entry)
|
||||
if len(batch) >= batch_size:
|
||||
_write_batch(conn, batch)
|
||||
conn.commit()
|
||||
count += len(batch)
|
||||
batch.clear()
|
||||
if batch:
|
||||
_write_batch(conn, batch)
|
||||
conn.commit()
|
||||
count += len(batch)
|
||||
stats[source_id] = stats.get(source_id, 0) + count
|
||||
logger.info("Ingested %d entries from %s (source: %s)", count, log_file.name, source_id)
|
||||
|
||||
conn.close()
|
||||
|
||||
logger.info("Building FTS index...")
|
||||
build_fts_index(db_path)
|
||||
logger.info("FTS index ready")
|
||||
|
||||
return stats
|
||||
|
||||
|
||||
def ingest(
    corpus_dir: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
) -> dict[str, int]:
    """Ingest every ``*.jsonl`` and ``*.log`` file found directly in *corpus_dir*.

    The ``.jsonl`` files come first (sorted), then the ``.log`` files (sorted).
    """
    jsonl_files = sorted(corpus_dir.glob("*.jsonl"))
    log_files = sorted(corpus_dir.glob("*.log"))
    candidates = [*jsonl_files, *log_files]
    return _ingest_files(candidates, db_path, pattern_file, batch_size)
|
||||
|
||||
|
||||
def ingest_file(
    log_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
) -> dict[str, int]:
    """Ingest one log file, whatever supported format it is in.

    Thin wrapper that passes the file to ``_ingest_files`` as a one-element
    list.
    """
    single_file = [log_file]
    result = _ingest_files(single_file, db_path, pattern_file)
    return result
|
||||
|
||||
|
||||
def ingest_sources(
    sources_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
) -> dict[str, int]:
    """Ingest all sources listed in a sources.yaml config file.

    sources.yaml format:
        sources:
          - id: sonarr
            path: /opt/sonarr/config/logs/sonarr.0.txt
          - id: qbittorrent
            path: /opt/qbittorrent/config/data/logs/qbittorrent.log

    Missing paths are skipped with a warning so the cron keeps running
    when a service is temporarily down.  An empty config file, an empty
    ``sources`` key, and entries without a ``path`` are skipped the same way.

    Returns:
        Mapping of source id to entry count, or ``{}`` when no usable source
        file was found.
    """
    with open(sources_file) as f:
        # safe_load returns None for an empty or comment-only document.
        config = yaml.safe_load(f) or {}

    files: list[Path] = []
    source_id_map: dict[Path, str] = {}

    # ``sources:`` with no value parses to None, hence the ``or []``.
    for src in config.get("sources") or []:
        raw_path = src.get("path")
        if not raw_path:
            # One malformed entry must not abort the whole run.
            logger.warning("Source %r has no 'path', skipping", src.get("id", "?"))
            continue
        path = Path(raw_path)
        if not path.exists():
            logger.warning("Source %r not found, skipping: %s", src.get("id", "?"), path)
            continue
        files.append(path)
        if "id" in src:
            source_id_map[path] = src["id"]

    if not files:
        logger.warning("No source files found — check sources.yaml paths")
        return {}

    return _ingest_files(files, db_path, pattern_file, batch_size, source_id_map)
|
||||
|
|
@ -94,7 +94,7 @@ def search_logs(
|
|||
severity: Filter by level — EMERGENCY, ALERT, CRITICAL, ERROR, WARN, NOTICE, INFO, DEBUG.
|
||||
source: Partial match on source_id. Format is 'corpus:host:service'.
|
||||
Example: 'xanderland:caddy' matches all Caddy entries from xanderland.
|
||||
pattern: Filter by named pattern tag applied at ingest time.
|
||||
pattern: Filter by named pattern tag applied at glean time.
|
||||
Known tags: auth_failure, connection_lost, oom, segfault, disk_full,
|
||||
timeout, caddy_tls_error, caddy_config_error, caddy_auth_error,
|
||||
caddy_upstream_error, service_restart, service_update,
|
||||
|
|
@ -176,7 +176,7 @@ def list_log_sources() -> str:
|
|||
"""
|
||||
sources = list_sources(DB_PATH)
|
||||
if not sources:
|
||||
return "No log sources found. Has the corpus been ingested? Run: python scripts/ingest_corpus.py"
|
||||
return "No log sources found. Has the corpus been gleaned? Run: python scripts/glean_corpus.py"
|
||||
|
||||
lines = [f"Corpus: {DB_PATH}", f"Sources ({len(sources)} total):\n"]
|
||||
for s in sources:
|
||||
|
|
@ -192,7 +192,7 @@ def list_log_sources() -> str:
|
|||
if __name__ == "__main__":
|
||||
if not DB_PATH.exists():
|
||||
logger.error("Database not found: %s", DB_PATH)
|
||||
logger.error("Run: python scripts/ingest_corpus.py <corpus_dir> <db_path>")
|
||||
logger.error("Run: python scripts/glean_corpus.py <corpus_dir> <db_path>")
|
||||
sys.exit(1)
|
||||
logger.info("Starting Turnstone MCP server (DB: %s)", DB_PATH)
|
||||
mcp.run()
|
||||
|
|
|
|||
178
app/rest.py
178
app/rest.py
|
|
@ -27,10 +27,10 @@ from fastapi.responses import FileResponse, RedirectResponse, StreamingResponse
|
|||
from fastapi.staticfiles import StaticFiles
|
||||
from pydantic import BaseModel
|
||||
|
||||
from app.ingest.pipeline import ensure_schema, ingest_file as _ingest_file
|
||||
from app.ingest.base import load_compiled_patterns, now_iso
|
||||
from app.ingest.tautulli import parse_webhook as _parse_tautulli
|
||||
from app.ingest.wazuh import is_wazuh_alert as _is_wazuh_alert, parse as _parse_wazuh
|
||||
from app.glean.pipeline import ensure_schema, glean_file as _glean_file, glean_ssh_source as _glean_ssh_source
|
||||
from app.glean.base import load_compiled_patterns, now_iso
|
||||
from app.glean.tautulli import parse_webhook as _parse_tautulli
|
||||
from app.glean.wazuh import is_wazuh_alert as _is_wazuh_alert, parse as _parse_wazuh
|
||||
from app.services.blocklist import (
|
||||
BlocklistCandidate,
|
||||
get_candidate,
|
||||
|
|
@ -71,11 +71,11 @@ from app.context.store import (
|
|||
delete_document as _delete_document,
|
||||
)
|
||||
from app.context.retriever import retrieve_context as _retrieve_context, format_context_block
|
||||
from app.ingest.doc_upload import ingest_upload as _ingest_upload
|
||||
from app.glean.doc_upload import glean_upload as _glean_upload
|
||||
from app.context.wizard import get_schema as _wizard_schema, advance_step, is_complete, apply_session
|
||||
from app.context.chunker import UnsupportedDocType, FileTooLarge
|
||||
from app.tasks.ingest_scheduler import get_state as _ingest_state, run_once as _run_ingest, scheduler_loop as _scheduler_loop, submit_matched as _submit_matched
|
||||
from app.ingest.mqtt_subscriber import run_mqtt_subscribers as _run_mqtt_subscribers
|
||||
from app.tasks.glean_scheduler import get_state as _glean_state, run_once as _run_glean, scheduler_loop as _scheduler_loop, submit_matched as _submit_matched
|
||||
from app.glean.mqtt_subscriber import run_mqtt_subscribers as _run_mqtt_subscribers
|
||||
|
||||
DB_PATH = Path(os.environ.get("TURNSTONE_DB", Path(__file__).parent.parent / "data" / "turnstone.db"))
|
||||
PREFS_PATH = DB_PATH.parent / "preferences.json"
|
||||
|
|
@ -84,7 +84,7 @@ SOURCE_HOST = os.environ.get("TURNSTONE_SOURCE_HOST", "unknown")
|
|||
BUNDLE_ENDPOINT = os.environ.get("TURNSTONE_BUNDLE_ENDPOINT", "")
|
||||
PATTERN_DIR = Path(os.environ.get("TURNSTONE_PATTERNS", Path(__file__).parent.parent / "patterns"))
|
||||
PATTERN_FILE = PATTERN_DIR / "default.yaml"
|
||||
INGEST_INTERVAL = int(os.environ.get("TURNSTONE_INGEST_INTERVAL", "900"))
|
||||
GLEAN_INTERVAL = int(os.environ.get("TURNSTONE_GLEAN_INTERVAL", "900"))
|
||||
SUBMIT_ENDPOINT = os.environ.get("TURNSTONE_SUBMIT_ENDPOINT", "").rstrip("/")
|
||||
|
||||
# GPU inference server URL.
|
||||
|
|
@ -119,14 +119,14 @@ async def _lifespan(app: FastAPI):
|
|||
|
||||
sources_file = PATTERN_DIR / "sources.yaml"
|
||||
_scheduler_task: asyncio.Task | None = None
|
||||
if INGEST_INTERVAL > 0 and sources_file.exists():
|
||||
if GLEAN_INTERVAL > 0 and sources_file.exists():
|
||||
_scheduler_task = asyncio.create_task(
|
||||
_scheduler_loop(
|
||||
sources_file, DB_PATH, PATTERN_FILE, INGEST_INTERVAL,
|
||||
sources_file, DB_PATH, PATTERN_FILE, GLEAN_INTERVAL,
|
||||
submit_endpoint=SUBMIT_ENDPOINT or None,
|
||||
source_host=SOURCE_HOST,
|
||||
),
|
||||
name="ingest-scheduler",
|
||||
name="glean-scheduler",
|
||||
)
|
||||
|
||||
_mqtt_task: asyncio.Task | None = None
|
||||
|
|
@ -433,6 +433,72 @@ def list_sources() -> dict:
|
|||
return {"sources": _list_sources(DB_PATH)}
|
||||
|
||||
|
||||
@router.get("/api/sources/configured")
def list_configured_sources() -> dict:
    """Return every source in sources.yaml, enriched with DB stats.

    Unlike ``/api/sources`` (which is DB-only), this endpoint reads sources.yaml
    so SSH sources appear even before their first successful glean. DB entry
    counts, error counts, and timestamps are aggregated and merged in.

    For SSH sources, sub-source IDs (e.g. ``rack01/journald``) are summed to
    produce a single aggregate stat row for the top-level host entry.

    Returns:
        ``{"sources": [...]}`` with one dict per configured source. Every dict
        carries ``id``, ``transport``, ``entry_count``, ``error_count``,
        ``earliest`` and ``latest``; non-SSH sources add ``path``, SSH sources
        add ``host``, ``user``, ``glean_types`` and ``glean_items``. The list is
        empty when sources.yaml does not exist.
    """
    sources_file = PATTERN_DIR / "sources.yaml"
    if not sources_file.exists():
        return {"sources": []}

    with open(sources_file) as f:
        config = yaml.safe_load(f) or {}

    # Fetch all DB source stats once; key by source_id for O(1) lookup.
    db_stats: dict[str, dict] = {}
    try:
        for row in _list_sources(DB_PATH):
            db_stats[row["source_id"]] = row
    except Exception:
        # Deliberate best-effort: the DB may not exist on first run, in which
        # case every source simply reports zero counts.
        pass

    result = []
    # ``sources:`` with no value parses to None, so ``.get(key, [])`` is not
    # enough — fall back to an empty list explicitly.
    for src in config.get("sources") or []:
        if not isinstance(src, dict):
            # Malformed entry (e.g. a bare string); nothing to report on.
            continue

        transport = src.get("transport", "local")
        src_id = src.get("id", "")

        entry: dict = {"id": src_id, "transport": transport}

        if transport != "ssh":
            entry["path"] = src.get("path", "")
            db = db_stats.get(src_id, {})
            entry["entry_count"] = db.get("entry_count", 0)
            entry["error_count"] = db.get("error_count", 0)
            entry["earliest"] = db.get("earliest")
            entry["latest"] = db.get("latest")
        else:
            entry["host"] = src.get("host", "")
            entry["user"] = src.get("user", "")
            # Same null-safety as above for an empty ``glean:`` key.
            glean_items: list[dict] = src.get("glean") or []
            entry["glean_types"] = sorted({item.get("type", "plaintext") for item in glean_items})
            entry["glean_items"] = glean_items

            # Aggregate sub-source DB rows that belong to this SSH host.
            # Sub-sources use IDs like "{host_id}/{type}" or "{host_id}/{type}/{container}".
            prefix = src_id + "/"
            matching_rows = [
                v for k, v in db_stats.items()
                if k.startswith(prefix) or k == src_id
            ]
            entry["entry_count"] = sum(r.get("entry_count", 0) for r in matching_rows)
            entry["error_count"] = sum(r.get("error_count", 0) for r in matching_rows)
            earliests = [r["earliest"] for r in matching_rows if r.get("earliest")]
            latests = [r["latest"] for r in matching_rows if r.get("latest")]
            entry["earliest"] = min(earliests) if earliests else None
            entry["latest"] = max(latests) if latests else None

        result.append(entry)

    return {"sources": result}
|
||||
|
||||
|
||||
@router.delete("/api/sources/{source_id}")
|
||||
def delete_source(source_id: str) -> dict:
|
||||
"""Delete all log entries (and FTS index rows) for a given source."""
|
||||
|
|
@ -448,9 +514,22 @@ def delete_source(source_id: str) -> dict:
|
|||
return {"deleted": deleted, "source_id": source_id}
|
||||
|
||||
|
||||
@router.post("/api/sources/{source_id}/ingest")
|
||||
def reingest_source(source_id: str, background_tasks: BackgroundTasks) -> dict:
|
||||
"""Trigger a re-ingest for a configured source from sources.yaml."""
|
||||
@router.post("/api/sources/{source_id}/glean")
|
||||
def reglean_source(
|
||||
source_id: str,
|
||||
background_tasks: BackgroundTasks,
|
||||
force: Annotated[bool, Query(description="Bypass fingerprint check and re-glean even if file is unchanged")] = False,
|
||||
) -> dict:
|
||||
"""Trigger a re-glean for a configured source from sources.yaml.
|
||||
|
||||
Handles both local file sources and SSH remote sources. For SSH sources,
|
||||
the glean runs in the foreground and rebuilds the FTS index before returning
|
||||
(same behaviour as local sources — callers can rely on the count being final
|
||||
when the response arrives).
|
||||
|
||||
Use ``?force=true`` to bypass the fingerprint cache and re-glean the file
|
||||
even if mtime and size appear unchanged since the last run.
|
||||
"""
|
||||
sources_file = PATTERN_DIR / "sources.yaml"
|
||||
if not sources_file.exists():
|
||||
raise HTTPException(status_code=404, detail="sources.yaml not found")
|
||||
|
|
@ -459,21 +538,31 @@ def reingest_source(source_id: str, background_tasks: BackgroundTasks) -> dict:
|
|||
matching = [s for s in config.get("sources", []) if s.get("id") == source_id]
|
||||
if not matching:
|
||||
raise HTTPException(status_code=404, detail=f"Source {source_id!r} not in sources.yaml")
|
||||
src_path = Path(matching[0]["path"])
|
||||
|
||||
src = matching[0]
|
||||
|
||||
if src.get("transport") == "ssh":
|
||||
# SSH sources: open connection, glean all items, rebuild FTS inline.
|
||||
# Fingerprint skipping applies only to local file sources.
|
||||
stats = _glean_ssh_source(src, DB_PATH, PATTERN_FILE)
|
||||
return {"source_id": source_id, "gleaned": sum(stats.values())}
|
||||
|
||||
# Local file source.
|
||||
src_path = Path(src["path"])
|
||||
if not src_path.exists():
|
||||
raise HTTPException(status_code=422, detail=f"Path does not exist: {src_path}")
|
||||
stats = _ingest_file(src_path, DB_PATH, PATTERN_FILE)
|
||||
stats = _glean_file(src_path, DB_PATH, PATTERN_FILE, force=force)
|
||||
background_tasks.add_task(build_fts_index, DB_PATH)
|
||||
return {"source_id": source_id, "ingested": stats.get(source_id, sum(stats.values()))}
|
||||
return {"source_id": source_id, "gleaned": stats.get(source_id, sum(stats.values()))}
|
||||
|
||||
|
||||
@router.post("/api/ingest/upload")
|
||||
async def ingest_upload(
|
||||
@router.post("/api/glean/upload")
|
||||
async def glean_upload(
|
||||
file: UploadFile,
|
||||
source_id: Annotated[str | None, Query(description="Override source ID (defaults to filename)")] = None,
|
||||
background_tasks: BackgroundTasks = None,
|
||||
) -> dict:
|
||||
"""Accept a multipart log file, auto-detect format, ingest into DB."""
|
||||
"""Accept a multipart log file, auto-detect format, glean into DB."""
|
||||
sid = source_id or Path(file.filename or "upload").stem
|
||||
content = await file.read()
|
||||
with tempfile.NamedTemporaryFile(
|
||||
|
|
@ -483,13 +572,13 @@ async def ingest_upload(
|
|||
tmp.write(content)
|
||||
tmp_path = Path(tmp.name)
|
||||
try:
|
||||
stats = _ingest_file(tmp_path, DB_PATH, PATTERN_FILE)
|
||||
stats = _glean_file(tmp_path, DB_PATH, PATTERN_FILE)
|
||||
finally:
|
||||
tmp_path.unlink(missing_ok=True)
|
||||
if background_tasks is not None:
|
||||
background_tasks.add_task(build_fts_index, DB_PATH)
|
||||
total = sum(stats.values())
|
||||
return {"source_id": sid, "ingested": total, "stats": stats}
|
||||
return {"source_id": sid, "gleaned": total, "stats": stats}
|
||||
|
||||
|
||||
class BatchEntry(BaseModel):
|
||||
|
|
@ -506,20 +595,20 @@ class BatchEntry(BaseModel):
|
|||
text: str
|
||||
|
||||
|
||||
class BatchIngestRequest(BaseModel):
|
||||
class BatchGleanRequest(BaseModel):
|
||||
source_host: str = "unknown"
|
||||
entries: list[BatchEntry]
|
||||
|
||||
|
||||
@router.post("/api/ingest/batch")
|
||||
def ingest_batch(payload: BatchIngestRequest, background_tasks: BackgroundTasks) -> dict:
|
||||
@router.post("/api/glean/batch")
|
||||
def glean_batch(payload: BatchGleanRequest, background_tasks: BackgroundTasks) -> dict:
|
||||
"""Accept pre-parsed log entries from a remote Turnstone instance (submission protocol).
|
||||
|
||||
Used by nodes with TURNSTONE_SUBMIT_ENDPOINT configured to push their
|
||||
pattern-matched entries to a central receiving instance.
|
||||
"""
|
||||
if not payload.entries:
|
||||
return {"ingested": 0}
|
||||
return {"gleaned": 0}
|
||||
conn = sqlite3.connect(str(DB_PATH))
|
||||
conn.execute("PRAGMA journal_mode=WAL")
|
||||
conn.executemany(
|
||||
|
|
@ -550,13 +639,13 @@ def ingest_batch(payload: BatchIngestRequest, background_tasks: BackgroundTasks)
|
|||
conn.commit()
|
||||
conn.close()
|
||||
background_tasks.add_task(build_fts_index, DB_PATH)
|
||||
return {"ingested": len(payload.entries), "source_host": payload.source_host}
|
||||
return {"gleaned": len(payload.entries), "source_host": payload.source_host}
|
||||
|
||||
|
||||
@router.get("/api/tasks/ingest/status")
|
||||
def ingest_task_status() -> dict:
|
||||
"""Return the current state of the periodic batch ingest scheduler."""
|
||||
s = _ingest_state()
|
||||
@router.get("/api/tasks/glean/status")
|
||||
def glean_task_status() -> dict:
|
||||
"""Return the current state of the periodic glean scheduler."""
|
||||
s = _glean_state()
|
||||
return {
|
||||
"running": s.running,
|
||||
"run_count": s.run_count,
|
||||
|
|
@ -565,8 +654,8 @@ def ingest_task_status() -> dict:
|
|||
"last_stats": s.last_stats,
|
||||
"last_error": s.last_error,
|
||||
"next_run_at": s.next_run_at,
|
||||
"interval_s": INGEST_INTERVAL,
|
||||
"scheduler_active": INGEST_INTERVAL > 0 and (PATTERN_DIR / "sources.yaml").exists(),
|
||||
"interval_s": GLEAN_INTERVAL,
|
||||
"scheduler_active": GLEAN_INTERVAL > 0 and (PATTERN_DIR / "sources.yaml").exists(),
|
||||
"submit_endpoint": SUBMIT_ENDPOINT or None,
|
||||
"last_submitted_at": s.last_submitted_at,
|
||||
"last_submit_count": s.last_submit_count,
|
||||
|
|
@ -574,21 +663,28 @@ def ingest_task_status() -> dict:
|
|||
}
|
||||
|
||||
|
||||
@router.post("/api/tasks/ingest")
|
||||
async def trigger_ingest() -> dict:
|
||||
"""Manually trigger a batch ingest of all configured sources. No-ops if already running."""
|
||||
@router.post("/api/tasks/glean")
|
||||
async def trigger_glean(
|
||||
force: Annotated[bool, Query(description="Bypass fingerprint check and re-glean all sources")] = False,
|
||||
) -> dict:
|
||||
"""Manually trigger a glean of all configured sources. No-ops if already running.
|
||||
|
||||
Use ``?force=true`` to bypass the fingerprint cache and re-glean every local
|
||||
file source even when mtime and size are unchanged since the last run.
|
||||
"""
|
||||
sources_file = PATTERN_DIR / "sources.yaml"
|
||||
if not sources_file.exists():
|
||||
raise HTTPException(status_code=404, detail="sources.yaml not found — configure log sources first")
|
||||
return await _run_ingest(
|
||||
return await _run_glean(
|
||||
sources_file, DB_PATH, PATTERN_FILE,
|
||||
submit_endpoint=SUBMIT_ENDPOINT or None,
|
||||
source_host=SOURCE_HOST,
|
||||
force=force,
|
||||
)
|
||||
|
||||
|
||||
@router.post("/api/ingest/wazuh/alert")
|
||||
async def ingest_wazuh_alert(
|
||||
@router.post("/api/glean/wazuh/alert")
|
||||
async def glean_wazuh_alert(
|
||||
alert: dict,
|
||||
source_id: Annotated[str | None, Query(description="Source label (defaults to 'wazuh')")] = None,
|
||||
background_tasks: BackgroundTasks = None,
|
||||
|
|
@ -769,8 +865,8 @@ def _tautulli_write_entry(conn: sqlite3.Connection, entry) -> None:
|
|||
)
|
||||
|
||||
|
||||
@router.post("/api/ingest/tautulli")
|
||||
def ingest_tautulli(
|
||||
@router.post("/api/glean/tautulli")
|
||||
def glean_tautulli(
|
||||
payload: dict,
|
||||
request: Request,
|
||||
background_tasks: BackgroundTasks,
|
||||
|
|
|
|||
357
app/services/diagnose/__init__.py
Normal file
357
app/services/diagnose/__init__.py
Normal file
|
|
@ -0,0 +1,357 @@
|
|||
"""Frictionless diagnose service — NL time extraction + layered log search.
|
||||
|
||||
This module is the public interface for the diagnose package.
|
||||
Full implementation lives here so that patch("app.services.diagnose._HAS_DATEPARSER")
|
||||
and patch("app.services.diagnose._search_dates") continue to target the correct
|
||||
namespace, preserving backward compatibility with existing tests.
|
||||
|
||||
The verbatim original is preserved in legacy.py for reference.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import dataclasses
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from collections.abc import AsyncGenerator
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.context.retriever import retrieve_context, format_context_block
|
||||
from app.services.llm import summarize
|
||||
from app.services.search import SearchResult, entries_in_window, search
|
||||
from app.services.diagnose.pipeline import run_pipeline
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
try:
|
||||
from dateparser.search import search_dates as _search_dates # type: ignore[import]
|
||||
|
||||
_HAS_DATEPARSER = True
|
||||
except ImportError:
|
||||
_search_dates = None # type: ignore[assignment]
|
||||
_HAS_DATEPARSER = False
|
||||
|
||||
|
||||
_RELATIVE_RE = re.compile(
|
||||
r"\b(?:last|past)\s+(?:(?P<n>\d+)|(?P<approx>a\s+few|few|couple(?:\s+of)?|several))?\s*(?P<unit>minute|hour|day|week)s?\b",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_RELATIVE_UNITS = {"minute": 1, "hour": 60, "day": 1440, "week": 10080}
|
||||
# Fuzzy quantifiers map to a reasonable span so "last few hours" → 3h window
|
||||
_APPROX_N = 3
|
||||
|
||||
|
||||
def _relative_window(match: re.Match) -> tuple[str, str]:
    """Turn a ``_RELATIVE_RE`` match into a ``(since_iso, until_iso)`` pair.

    The count comes from the explicit number when one was captured, from
    ``_APPROX_N`` when a fuzzy quantifier ("few", "couple of", …) was captured,
    and defaults to 1 otherwise ("last hour"). The window ends now.
    """
    minutes_per_unit = _RELATIVE_UNITS[match.group("unit").lower()]

    explicit_count = match.group("n")
    if explicit_count:
        count = int(explicit_count)
    elif match.group("approx"):
        count = _APPROX_N
    else:
        count = 1

    return _last_n_minutes(count * minutes_per_unit), _now_iso()
|
||||
|
||||
|
||||
def parse_time_window(query: str) -> tuple[str | None, str | None, str]:
    """Pull a time window out of a natural-language query.

    Returns ``(since_iso, until_iso, keywords)``. ``keywords`` is the query with
    the recognised time phrase removed (or the query itself if stripping would
    leave nothing). When no time phrase is recognised the window is the last
    60 minutes and ``keywords`` is the unmodified query.
    """
    # Relative phrases ("last hour", "past 30 minutes") are handled by regex
    # first, because dateparser misreads them as absolute times.
    relative = _RELATIVE_RE.search(query)
    if relative is not None:
        window_start, window_end = _relative_window(relative)
        remainder = query[: relative.start()] + query[relative.end() :]
        remainder = re.sub(r"\s{2,}", " ", remainder).strip()
        return window_start, window_end, remainder or query

    if _HAS_DATEPARSER and _search_dates is not None:
        # Pass the local UTC offset so a bare "3:35 am" is read as local time;
        # PREFER_DATES_FROM=past picks the most recent past occurrence.
        # NOTE(review): the offset is truncated to whole hours, so a +05:30
        # zone is sent as "UTC+5" — confirm whether that is acceptable.
        utc_offset = datetime.now().astimezone().utcoffset()
        offset_seconds = utc_offset.total_seconds() if utc_offset else 0
        whole_hours = int(offset_seconds / 3600)
        sign = "+" if whole_hours >= 0 else ""
        tz_name = f"UTC{sign}{whole_hours}"

        parser_settings = {
            "PREFER_DATES_FROM": "past",
            "TIMEZONE": tz_name,
            "RETURN_AS_TIMEZONE_AWARE": True,
        }
        try:
            found = _search_dates(query, languages=["en"], settings=parser_settings)
        except Exception as exc:
            logger.warning(
                "dateparser failed (%s) on query %r — falling back to 60-min window",
                type(exc).__name__,
                query,
            )
            found = None

        if found:
            phrase, moment = found[0]
            # Normalise to UTC so the ISO strings compare correctly in SQLite.
            if moment.tzinfo is None:
                moment = moment.replace(tzinfo=timezone.utc)
            else:
                moment = moment.astimezone(timezone.utc)
            half_window = timedelta(minutes=30)
            window_start = (moment - half_window).isoformat()
            window_end = (moment + half_window).isoformat()
            remainder = re.sub(r"\s{2,}", " ", query.replace(phrase, " ").strip())
            return window_start, window_end, remainder or query

    return _last_n_minutes(60), _now_iso(), query
|
||||
|
||||
|
||||
def diagnose(
    db_path: Path,
    query: str,
    since: str | None = None,
    until: str | None = None,
    source_filter: str | None = None,
    llm_url: str | None = None,
    llm_model: str | None = None,
    llm_api_key: str | None = None,
) -> dict[str, Any]:
    """Run the layered log search and return summary, reasoning and entries.

    When ``since`` and ``until`` are not both supplied, the window is taken
    from ``parse_time_window(query)`` (explicit values still win). Keyword
    hits and plain in-window entries are merged, de-duplicated by
    ``entry_id``, ordered by ``(timestamp_iso, sequence)`` and capped at 200.
    ``reasoning`` is the ``summarize`` result when both ``llm_url`` and
    ``llm_model`` are given, otherwise ``None``.
    """
    if since is not None and until is not None:
        time_detected = True
        keywords = query
    else:
        parsed_since, parsed_until, keywords = parse_time_window(query)
        since = since or parsed_since
        until = until or parsed_until
        # A stripped time phrase is the signal that the query carried one.
        time_detected = keywords != query

    keyword_hits = search(
        db_path,
        query=keywords,
        since=since,
        until=until,
        source_filter=source_filter,
        limit=150,
        or_mode=True,
    )
    window_hits = entries_in_window(
        db_path,
        since=since,
        until=until,
        source_filter=source_filter,
        limit=50,
        per_source_cap=15,
    )

    # First occurrence wins; dict insertion order keeps keyword hits first.
    unique: dict[str, SearchResult] = {}
    for hit in keyword_hits + window_hits:
        unique.setdefault(hit.entry_id, hit)

    # Entries without a timestamp sort last ("\xff").
    ordered = sorted(
        unique.values(),
        key=lambda hit: (hit.timestamp_iso or "\xff", hit.sequence),
    )
    combined = ordered[:200]

    by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
    by_source: dict[str, int] = {}
    for hit in combined:
        level = (hit.severity or "INFO").upper()
        if level in by_severity:
            by_severity[level] += 1
        by_source[hit.source_id] = by_source.get(hit.source_id, 0) + 1

    reasoning: str | None = None
    if llm_url and llm_model:
        reasoning = summarize(
            query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key
        )

    summary = {
        "total": len(combined),
        "window_start": since,
        "window_end": until,
        "time_detected": time_detected,
        "by_severity": by_severity,
        "by_source": by_source,
    }
    return {"summary": summary, "reasoning": reasoning, "entries": combined}
|
||||
|
||||
|
||||
async def diagnose_stream(
    db_path: Path,
    query: str,
    since: str | None = None,
    until: str | None = None,
    source_filter: str | None = None,
    llm_url: str | None = None,
    llm_model: str | None = None,
    llm_api_key: str | None = None,
) -> AsyncGenerator[dict[str, Any], None]:
    """Async generator yielding SSE event dicts for the diagnose pipeline.

    Yields events in order:
      {"type":"status","message":"…"}      — pipeline progress
      {"type":"context","facts":…,"chunks":…} — retrieved environment context
      {"type":"summary","data":{…}}        — window + severity counts (fast, from DB)
      {"type":"entries","data":[…]}        — log entries (fast, from DB)
      {"type":"reasoning","text":"…"}      — LLM analysis (slow, optional)
      {"type":"done"}

    When ``MULTI_AGENT_ENABLED`` is set, everything after the "entries" event
    is delegated to ``run_pipeline`` and its events are forwarded unchanged.
    Blocking DB/LLM calls are pushed to worker threads via ``asyncio.to_thread``.
    """
    keywords = query.strip()
    # An empty query with a source filter means "show me this source".
    source_browse = not keywords and source_filter is not None

    if source_browse:
        # No keyword — browsing a source directly. Use 24h window; skip FTS entirely.
        yield {"type": "status", "message": f"Loading {source_filter}…"}
        since = since or _last_n_minutes(60 * 24)
        until = until or _now_iso()
        time_detected = False
    else:
        yield {"type": "status", "message": "Parsing time window…"}
        # An explicit since+until pair counts as a detected window.
        time_detected = since is not None and until is not None
        if not time_detected:
            parsed_since, parsed_until, keywords = await asyncio.to_thread(
                parse_time_window, query
            )
            # Caller-supplied bounds take precedence over parsed ones.
            since = since or parsed_since
            until = until or parsed_until
            # parse_time_window returns the query unchanged when it found no phrase.
            time_detected = keywords != query

    yield {"type": "status", "message": "Loading environment context…"}
    ctx = await asyncio.to_thread(lambda: retrieve_context(db_path, query))
    context_block = format_context_block(ctx)
    yield {
        "type": "context",
        "facts": ctx.facts,
        "chunks": ctx.chunks,
    }

    yield {"type": "status", "message": "Searching logs…"}

    if source_browse:
        # Browsing: no keyword search, just the newest window entries for the source.
        keyword_hits: list[SearchResult] = []
        window_hits = await asyncio.to_thread(
            lambda: entries_in_window(
                db_path,
                since,
                until,
                source_filter=source_filter,
                limit=200,
            )
        )
    else:
        # Keyword search and window scan are independent; run them concurrently.
        keyword_hits, window_hits = await asyncio.gather(
            asyncio.to_thread(
                lambda: search(
                    db_path,
                    keywords,
                    source_filter=source_filter,
                    since=since,
                    until=until,
                    limit=150,
                    or_mode=True,
                )
            ),
            asyncio.to_thread(
                lambda: entries_in_window(
                    db_path,
                    since,
                    until,
                    source_filter=source_filter,
                    limit=50,
                    per_source_cap=15,
                )
            ),
        )

    # De-duplicate by entry_id, keeping the first occurrence (keyword hits first).
    seen: set[str] = set()
    merged: list[SearchResult] = []
    for r in keyword_hits + window_hits:
        if r.entry_id not in seen:
            seen.add(r.entry_id)
            merged.append(r)

    # Chronological order; entries without a timestamp sort last. Cap at 200.
    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
        :200
    ]

    # Severities outside these four buckets are not counted.
    by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
    by_source: dict[str, int] = {}
    for r in combined:
        sev = (r.severity or "INFO").upper()
        if sev in by_severity:
            by_severity[sev] += 1
        by_source[r.source_id] = by_source.get(r.source_id, 0) + 1

    yield {
        "type": "summary",
        "data": {
            "total": len(combined),
            "window_start": since,
            "window_end": until,
            "time_detected": time_detected,
            "by_severity": by_severity,
            "by_source": by_source,
        },
    }
    yield {"type": "entries", "data": [dataclasses.asdict(r) for r in combined]}

    if MULTI_AGENT_ENABLED:
        # Hand off to the multi-stage pipeline and relay whatever it emits.
        async for event in run_pipeline(
            db_path=db_path,
            entries=combined,
            ctx=ctx,
            query=query,
            since=since,
            until=until,
            llm_url=llm_url,
            llm_model=llm_model,
            llm_api_key=llm_api_key,
        ):
            yield event
        return  # pipeline emits its own "done" event

    # Single-call LLM path: only when an LLM is configured and there is something to analyse.
    if llm_url and llm_model and combined:
        yield {"type": "status", "message": "Analyzing with LLM…"}
        reasoning = await asyncio.to_thread(
            lambda: summarize(
                query,
                combined,
                llm_url,
                llm_model,
                llm_api_key,
                context_block=context_block,
            )
        )
        if reasoning:
            yield {"type": "reasoning", "text": reasoning}

    yield {"type": "done"}
|
||||
|
||||
|
||||
def _now_iso() -> str:
|
||||
return datetime.now(timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _last_n_minutes(n: int) -> str:
|
||||
return (datetime.now(timezone.utc) - timedelta(minutes=n)).isoformat()
|
||||
|
||||
|
||||
__all__ = [
|
||||
"diagnose",
|
||||
"diagnose_stream",
|
||||
"parse_time_window",
|
||||
]
|
||||
|
||||
# Feature flag: when the TURNSTONE_MULTI_AGENT_DIAGNOSE environment variable is
# "true" (case-insensitive), diagnose_stream() relays events from run_pipeline()
# instead of making the single summarize() call. Read once at import time;
# any other value, or an unset variable, leaves it disabled.
MULTI_AGENT_ENABLED = (
    os.getenv("TURNSTONE_MULTI_AGENT_DIAGNOSE", "false").lower() == "true"
)
|
||||
249
app/services/diagnose/classifier.py
Normal file
249
app/services/diagnose/classifier.py
Normal file
|
|
@ -0,0 +1,249 @@
|
|||
"""Stage 2: Severity Classifier — ML with two fallback levels.
|
||||
|
||||
Classification strategy (in priority order):
|
||||
|
||||
Path A — ML: Hugging Face text-classification pipeline, loaded lazily.
|
||||
Path B — pattern_tags: Map cluster.pattern_tags through the loaded pattern
|
||||
severity dict; pick the highest severity across matching tags.
|
||||
Path C — regex: Call detect_severity() from app.glean.base on the cluster's
|
||||
representative_text.
|
||||
|
||||
Each cluster is classified independently. The ``classifier_used`` field on the
|
||||
returned ``ClassifiedTimeline`` reflects the primary path (the one that governed
|
||||
the overall classification session, not individual cluster fallbacks).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.services.diagnose.models import (
|
||||
ClassifiedTimeline,
|
||||
EventCluster,
|
||||
SeverityLabel,
|
||||
TimelineResult,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Module-level ML singleton — reset to None between tests via the fixture
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_ml_classifier: Any | None = None
|
||||
|
||||
|
||||
def _get_ml_classifier(model_id: str, device: str) -> Any:
    """Return the shared HF text-classification pipeline, building it on first use.

    ``transformers`` is imported lazily so the dependency is only needed when
    the ML path is actually exercised.

    NOTE(review): the cache is a single module-level slot, so once a pipeline
    is loaded later calls return it regardless of ``model_id``/``device``.
    """
    global _ml_classifier  # noqa: PLW0603
    if _ml_classifier is not None:
        return _ml_classifier

    from transformers import pipeline as hf_pipeline  # type: ignore[import-untyped]

    _ml_classifier = hf_pipeline(
        "text-classification", model=model_id, device=device
    )
    return _ml_classifier
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Label mapping
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
_LABEL_MAP: dict[str, SeverityLabel] = {
|
||||
"ERROR": "ERROR",
|
||||
"WARNING": "WARN",
|
||||
"WARN": "WARN",
|
||||
"INFO": "INFO",
|
||||
"DEBUG": "DEBUG",
|
||||
"CRITICAL": "CRITICAL",
|
||||
}
|
||||
|
||||
_CRITICAL_KEYWORDS: frozenset[str] = frozenset(
|
||||
{
|
||||
"panic",
|
||||
"oom",
|
||||
"fatal",
|
||||
"critical",
|
||||
"kernel panic",
|
||||
"out of memory",
|
||||
"segfault",
|
||||
"segmentation fault",
|
||||
}
|
||||
)
|
||||
|
||||
_SEVERITY_ORDER: dict[str | None, int] = {
|
||||
"CRITICAL": 5,
|
||||
"ERROR": 4,
|
||||
"WARN": 3,
|
||||
"WARNING": 3,
|
||||
"INFO": 2,
|
||||
"DEBUG": 1,
|
||||
None: 0,
|
||||
}
|
||||
|
||||
|
||||
def _map_label(label: str, score: float, text: str) -> SeverityLabel:
    """Translate a raw classifier label into a severity, with two adjustments.

    * A very confident ``ERROR`` (score > 0.95) whose text contains one of
      ``_CRITICAL_KEYWORDS`` is promoted to ``CRITICAL``.
    * A weak ``INFO`` (score < 0.4) is demoted to ``DEBUG``.

    Everything else goes through ``_LABEL_MAP``; labels missing from the map
    come back as ``"UNKNOWN"``.
    """
    canonical = label.upper()

    if canonical == "ERROR" and score > 0.95:
        lowered = text.lower()
        for keyword in _CRITICAL_KEYWORDS:
            if keyword in lowered:
                return "CRITICAL"
    elif canonical == "INFO" and score < 0.4:
        return "DEBUG"

    return _LABEL_MAP.get(canonical, "UNKNOWN")  # type: ignore[return-value]
|
||||
|
||||
|
||||
def _highest_from_tags(
    tags: tuple[str, ...], severity_map: dict[str, str]
) -> SeverityLabel | None:
    """Return the highest severity among the tags that appear in ``severity_map``.

    Severity strings are upper-cased (and ``WARNING`` folded into ``WARN``)
    *before* they are ranked, so a pattern file that spells a level in
    lowercase is ranked the same as its upper-case form. Tags that are not in
    the map, or whose severity has no rank in ``_SEVERITY_ORDER``, are ignored.

    Args:
        tags: Pattern tag names attached to a cluster.
        severity_map: Pattern name → severity string.

    Returns:
        The normalised winning severity, or ``None`` when no tag yields a
        ranked severity (callers then fall back to another classifier).
    """
    top_label: str | None = None
    top_rank = 0
    for tag in tags:
        raw = severity_map.get(tag)
        if not raw:
            continue
        label = raw.upper()
        if label == "WARNING":
            label = "WARN"
        rank = _SEVERITY_ORDER.get(label, 0)
        # Strict ">" keeps the first tag on ties and skips unranked (0) values.
        if rank > top_rank:
            top_rank = rank
            top_label = label
    return top_label  # type: ignore[return-value]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# SeverityClassifier
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class SeverityClassifier:
|
||||
"""Classify each EventCluster's severity using ML, patterns, or regex fallback.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
model_id:
|
||||
Hugging Face model identifier. When empty (default), ML is skipped.
|
||||
device:
|
||||
Torch device string passed to the HF pipeline (e.g. ``"cpu"`` or ``"cuda:0"``).
|
||||
pattern_file:
|
||||
Path to the YAML pattern file. When ``None`` the classifier reads
|
||||
``TURNSTONE_PATTERNS`` env var (same logic as ``app/rest.py``).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_id: str = "",
|
||||
device: str = "cpu",
|
||||
pattern_file: Path | None = None,
|
||||
) -> None:
|
||||
self._model_id = model_id
|
||||
self._device = device
|
||||
self._pattern_file: Path | None = pattern_file
|
||||
self._pattern_severity: dict[str, str] = {}
|
||||
self._patterns_loaded = False
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lazy loaders
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _resolve_pattern_file(self) -> Path | None:
|
||||
"""Resolve pattern file from constructor arg or env var."""
|
||||
if self._pattern_file is not None:
|
||||
return self._pattern_file
|
||||
env_dir = os.environ.get("TURNSTONE_PATTERNS")
|
||||
if env_dir:
|
||||
return Path(env_dir) / "default.yaml"
|
||||
return None
|
||||
|
||||
def _ensure_patterns_loaded(self) -> None:
|
||||
"""Populate _pattern_severity from the pattern YAML file (once)."""
|
||||
if self._patterns_loaded:
|
||||
return
|
||||
self._patterns_loaded = True
|
||||
path = self._resolve_pattern_file()
|
||||
if path is None:
|
||||
return
|
||||
from app.glean.base import load_patterns
|
||||
|
||||
patterns = load_patterns(path)
|
||||
self._pattern_severity = {p.name: p.severity for p in patterns}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Per-cluster classification helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _classify_cluster_ml(self, cluster: EventCluster) -> SeverityLabel | None:
|
||||
"""Attempt ML classification. Returns None on any inference failure."""
|
||||
try:
|
||||
pipe = _get_ml_classifier(self._model_id, self._device)
|
||||
results = pipe(cluster.representative_text)
|
||||
if not results:
|
||||
return None
|
||||
hit = results[0]
|
||||
return _map_label(hit["label"], hit["score"], cluster.representative_text)
|
||||
except Exception: # noqa: BLE001
|
||||
logger.warning(
|
||||
"ML inference failed for cluster %s — falling back",
|
||||
cluster.cluster_id,
|
||||
)
|
||||
return None
|
||||
|
||||
def _classify_cluster_pattern_tags(
|
||||
self, cluster: EventCluster
|
||||
) -> SeverityLabel | None:
|
||||
"""Derive severity from the cluster's pattern_tags. Returns None if no match."""
|
||||
return _highest_from_tags(cluster.pattern_tags, self._pattern_severity)
|
||||
|
||||
def _classify_cluster_regex(self, cluster: EventCluster) -> SeverityLabel:
|
||||
"""Classify by scanning representative_text with the severity regex."""
|
||||
from app.glean.base import detect_severity
|
||||
|
||||
raw = detect_severity(cluster.representative_text)
|
||||
if raw is None:
|
||||
return "INFO"
|
||||
return _LABEL_MAP.get(raw.upper(), "INFO") # type: ignore[return-value]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Public API
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def classify(self, timeline: TimelineResult) -> ClassifiedTimeline:
    """Assign a severity to every cluster of *timeline*.

    Each cluster is tried against the strategies in order — ML (when a model
    id is configured), pattern tags (when a pattern table is loaded), then the
    severity regex — and the first non-``None`` answer wins.

    ``classifier_used`` names the highest-priority strategy that was
    *available* for the run; individual clusters may still have been resolved
    by a lower-priority fallback.
    """
    self._ensure_patterns_loaded()

    use_ml = bool(self._model_id)
    use_patterns = bool(self._pattern_severity)

    # Primary strategy for this run, by priority.
    primary: str = "ml" if use_ml else ("pattern_tags" if use_patterns else "regex")

    def _resolve(cluster: EventCluster) -> SeverityLabel:
        """Walk the fallback chain for a single cluster."""
        label: SeverityLabel | None = (
            self._classify_cluster_ml(cluster) if use_ml else None
        )
        if label is None and use_patterns:
            label = self._classify_cluster_pattern_tags(cluster)
        if label is None:
            label = self._classify_cluster_regex(cluster)
        return label

    severities: dict[str, SeverityLabel] = {
        cluster.cluster_id: _resolve(cluster) for cluster in timeline.clusters
    }

    return ClassifiedTimeline(
        timeline=timeline,
        cluster_severities=severities,
        classifier_used=primary,  # type: ignore[arg-type]
        model_id=self._model_id if use_ml else None,
    )
|
||||
216
app/services/diagnose/hypothesizer.py
Normal file
216
app/services/diagnose/hypothesizer.py
Normal file
|
|
@ -0,0 +1,216 @@
|
|||
"""Stage 3: Root-Cause Hypothesizer — LLM + RAG context."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
from uuid import uuid4
|
||||
|
||||
import httpx
|
||||
|
||||
from app.context.retriever import RetrievedContext
|
||||
from app.services.diagnose.models import (
|
||||
ClassifiedTimeline,
|
||||
EventCluster,
|
||||
Hypothesis,
|
||||
SeverityLabel,
|
||||
)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_VALID_SEVERITIES: frozenset[str] = frozenset({"CRITICAL", "ERROR", "WARN", "INFO", "DEBUG"})
|
||||
|
||||
_SYSTEM_PROMPT = (
|
||||
"You are a Linux sysadmin log analyst. Analyze the following clustered log timeline "
|
||||
"and generate 2-4 root cause hypotheses as a JSON array.\n\n"
|
||||
"Each hypothesis must follow this exact JSON schema:\n"
|
||||
'{"title": str (≤80 chars), "description": str (2-4 sentences), '
|
||||
'"confidence": float (0.0-1.0), "severity": str (one of: CRITICAL, ERROR, WARN, INFO), '
|
||||
'"supporting_clusters": [str list of cluster IDs]}\n\n'
|
||||
"Return ONLY a valid JSON array. No prose, no markdown, no explanation outside the JSON."
|
||||
)
|
||||
|
||||
|
||||
def _coerce_float(val: object, default: float) -> float:
|
||||
"""Safely coerce LLM output to float, returning default on failure."""
|
||||
try:
|
||||
return float(val) # type: ignore[arg-type]
|
||||
except (TypeError, ValueError):
|
||||
return default
|
||||
|
||||
|
||||
def _validate_severity(s: str) -> SeverityLabel:
    """Map a raw severity string to a valid SeverityLabel, defaulting to ERROR.

    Matching is case-insensitive and ignores surrounding whitespace, so
    padded model output such as ``" warn "`` is still recognised.
    ``"WARNING"`` is normalised to ``"WARN"``; anything not in
    ``_VALID_SEVERITIES`` becomes ``"ERROR"``.
    """
    label = s.strip().upper()
    if label == "WARNING":
        return "WARN"
    return label if label in _VALID_SEVERITIES else "ERROR"  # type: ignore[return-value]
|
||||
|
||||
|
||||
def _cluster_summary(cluster: EventCluster, severity: str) -> str:
|
||||
"""Build a condensed single-line summary of a cluster for the prompt."""
|
||||
sources = ", ".join(list(cluster.source_ids)[:3])
|
||||
patterns = ", ".join(list(cluster.pattern_tags)[:5])
|
||||
text_preview = cluster.representative_text[:200]
|
||||
summary = (
|
||||
f"[{severity}] {cluster.start_iso or 'unknown'} "
|
||||
f"({sources}) — {text_preview}"
|
||||
)
|
||||
if patterns:
|
||||
summary += f" [patterns: {patterns}]"
|
||||
return summary
|
||||
|
||||
|
||||
def _extract_content(resp_json: dict) -> str | None:
|
||||
"""Pull text content from an OpenAI-compat chat completion response."""
|
||||
choices = resp_json.get("choices") or []
|
||||
if not choices:
|
||||
return None
|
||||
return (choices[0].get("message", {}).get("content") or "").strip() or None
|
||||
|
||||
|
||||
class RootCauseHypothesizer:
    """Generate ranked root-cause hypotheses from a classified log timeline.

    Builds a prompt from per-cluster summaries plus retrieved context chunks,
    sends it to an LLM endpoint and converts the JSON reply into
    ``Hypothesis`` objects. A missing LLM configuration, a failed LLM call or
    an undecodable reply results in an empty list (failures are logged).
    """

    def __init__(self, max_hypotheses: int = 4) -> None:
        """Store the upper bound on hypotheses requested from / kept out of the reply."""
        self._max_hypotheses = max_hypotheses

    def hypothesize(
        self,
        classified: ClassifiedTimeline,
        ctx: RetrievedContext,
        query: str,
        llm_url: str | None = None,
        llm_model: str | None = None,
        llm_api_key: str | None = None,
    ) -> list[Hypothesis]:
        """Generate hypotheses from a classified timeline and RAG context.

        Args:
            classified: Timeline plus the per-cluster severities to show the model.
            ctx: Retrieved context; only the first five chunks are used, each
                truncated to 300 characters.
            query: The user's original question, included verbatim in the prompt.
            llm_url: Base URL of the LLM service.
            llm_model: Model name for the OpenAI-compatible fallback endpoint.
            llm_api_key: Optional bearer token.

        Returns:
            Parsed hypotheses, or an empty list when no LLM is configured,
            there are no clusters to analyse, or the LLM call yields nothing.
        """
        if not llm_url or not llm_model:
            return []

        clusters = classified.timeline.clusters
        if not clusters:
            return []

        # Prefer the classifier's severity; fall back to the cluster's own.
        cluster_block = "\n".join(
            _cluster_summary(c, classified.cluster_severities.get(c.cluster_id, c.severity))
            for c in clusters
        )

        context_parts: list[str] = []
        for chunk in ctx.chunks[:5]:
            filename = chunk.get("filename", "unknown")
            # `or ""` guards against a chunk whose text is present but None.
            text = (chunk.get("text") or "")[:300]
            context_parts.append(f"[{filename}] {text}")
        context_block = "\n".join(context_parts) if context_parts else "(none)"

        user_message = (
            f"Query: {query}\n\n"
            f"Context from runbooks and known patterns:\n{context_block}\n\n"
            f"Log timeline (clustered, {len(clusters)} clusters):\n{cluster_block}\n\n"
            f"Generate up to {self._max_hypotheses} hypotheses. Return JSON array only."
        )

        messages = [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ]

        raw_response = self._call_llm(
            llm_url=llm_url,
            llm_model=llm_model,
            llm_api_key=llm_api_key,
            messages=messages,
        )
        if raw_response is None:
            return []

        return self._parse_response(raw_response)

    def _call_llm(
        self,
        llm_url: str,
        llm_model: str,
        llm_api_key: str | None,
        messages: list[dict],
    ) -> str | None:
        """Send messages to the LLM and return raw text content.

        The task-based endpoint (``/api/inference/task``) is tried first; a
        200 response is final. Any other outcome (404, other status, or an
        exception) falls back to ``/v1/chat/completions`` with the explicit
        model name. Returns ``None`` when the fallback also fails.
        """
        headers = {"Authorization": f"Bearer {llm_api_key}"} if llm_api_key else {}
        base_url = llm_url.rstrip("/")

        # Try cf-orch task-based endpoint first.
        try:
            resp = httpx.post(
                f"{base_url}/api/inference/task",
                json={
                    "product": "turnstone",
                    "task": "log_analysis",
                    "payload": {"messages": messages, "stream": False},
                },
                headers=headers,
                timeout=120.0,
            )
            if resp.status_code == 200:
                return _extract_content(resp.json())
            if resp.status_code != 404:
                resp.raise_for_status()
            logger.debug(
                "No task assignment for turnstone.log_analysis — falling back to direct model"
            )
        except Exception as exc:
            logger.debug("Task endpoint unavailable (%s) — falling back to direct model", exc)

        # Fallback: OpenAI-compat endpoint with explicit model name.
        try:
            resp = httpx.post(
                f"{base_url}/v1/chat/completions",
                json={"model": llm_model, "messages": messages, "stream": False},
                headers=headers,
                timeout=120.0,
            )
            resp.raise_for_status()
            return _extract_content(resp.json())
        except Exception as exc:
            logger.warning(
                "LLM hypothesizer failed (%s): %s", type(exc).__name__, exc
            )
            return None

    @staticmethod
    def _strip_code_fence(text: str) -> str:
        """Remove a surrounding Markdown code fence (```json ... ```), if any."""
        stripped = text.strip()
        if not stripped.startswith("```"):
            return stripped
        # Drop the opening fence line, which may carry a language tag.
        _, _, remainder = stripped.partition("\n")
        remainder = remainder.rstrip()
        if remainder.endswith("```"):
            remainder = remainder[:-3]
        return remainder.strip()

    @staticmethod
    def _normalise_cluster_ids(raw_ids: object) -> tuple[str, ...]:
        """Coerce the model's ``supporting_clusters`` value to a tuple of strings."""
        if isinstance(raw_ids, (list, tuple)):
            return tuple(str(cid) for cid in raw_ids if cid is not None)
        if isinstance(raw_ids, str) and raw_ids:
            # A bare string is one id, not an iterable of characters.
            return (raw_ids,)
        return ()

    def _parse_response(self, raw: str) -> list[Hypothesis]:
        """Parse the LLM JSON response into a list of Hypothesis objects.

        Tolerates a Markdown-fenced reply. Returns an empty list when the
        payload is not valid JSON or not a JSON array. Only the first
        ``max_hypotheses`` array items are considered and non-object items
        among them are skipped. Confidence is forced into ``[0.0, 1.0]``
        (non-finite values become 0.5).
        """
        import math

        try:
            data = json.loads(self._strip_code_fence(raw))
        except json.JSONDecodeError:
            logger.warning(
                "Hypothesizer: invalid JSON from LLM (truncated): %.120s", raw
            )
            return []

        if not isinstance(data, list):
            logger.warning(
                "Hypothesizer: expected JSON array, got %s", type(data).__name__
            )
            return []

        hypotheses: list[Hypothesis] = []
        for item in data[: self._max_hypotheses]:
            if not isinstance(item, dict):
                continue

            confidence = _coerce_float(item.get("confidence"), 0.5)
            if not math.isfinite(confidence):
                confidence = 0.5
            # The prompt asks for 0.0-1.0; models occasionally answer in percent.
            confidence = min(1.0, max(0.0, confidence))

            hypotheses.append(
                Hypothesis(
                    hypothesis_id=str(uuid4()),
                    title=str(item.get("title", "Unknown"))[:80],
                    description=str(item.get("description", "")),
                    confidence=confidence,
                    supporting_cluster_ids=self._normalise_cluster_ids(
                        item.get("supporting_clusters")
                    ),
                    runbook_refs=(),
                    severity=_validate_severity(str(item.get("severity", "ERROR"))),
                )
            )

        return hypotheses
|
||||
|
|
@ -1,4 +1,5 @@
|
|||
"""Frictionless diagnose service — NL time extraction + layered log search."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
|
|
@ -18,6 +19,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
try:
|
||||
from dateparser.search import search_dates as _search_dates # type: ignore[import]
|
||||
|
||||
_HAS_DATEPARSER = True
|
||||
except ImportError:
|
||||
_search_dates = None # type: ignore[assignment]
|
||||
|
|
@ -68,17 +70,25 @@ def parse_time_window(query: str) -> tuple[str | None, str | None, str]:
|
|||
results = _search_dates(
|
||||
query,
|
||||
languages=["en"],
|
||||
settings={"PREFER_DATES_FROM": "past", "TIMEZONE": tz_str, "RETURN_AS_TIMEZONE_AWARE": True},
|
||||
settings={
|
||||
"PREFER_DATES_FROM": "past",
|
||||
"TIMEZONE": tz_str,
|
||||
"RETURN_AS_TIMEZONE_AWARE": True,
|
||||
},
|
||||
)
|
||||
except Exception:
|
||||
logger.warning("dateparser failed on query %r — falling back to 60-min window", query)
|
||||
logger.warning(
|
||||
"dateparser failed on query %r — falling back to 60-min window", query
|
||||
)
|
||||
results = None
|
||||
if results:
|
||||
phrase, dt = results[0]
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
else:
|
||||
dt = dt.astimezone(timezone.utc) # normalise to UTC for SQLite string compare
|
||||
dt = dt.astimezone(
|
||||
timezone.utc
|
||||
) # normalise to UTC for SQLite string compare
|
||||
since = (dt - timedelta(minutes=30)).isoformat()
|
||||
until = (dt + timedelta(minutes=30)).isoformat()
|
||||
keywords = re.sub(r"\s{2,}", " ", query.replace(phrase, " ").strip())
|
||||
|
|
@ -107,8 +117,23 @@ def diagnose(
|
|||
else:
|
||||
keywords = query
|
||||
|
||||
keyword_hits = search(db_path, query=keywords, since=since, until=until, source_filter=source_filter, limit=150, or_mode=True)
|
||||
window_hits = entries_in_window(db_path, since=since, until=until, source_filter=source_filter, limit=50, per_source_cap=15)
|
||||
keyword_hits = search(
|
||||
db_path,
|
||||
query=keywords,
|
||||
since=since,
|
||||
until=until,
|
||||
source_filter=source_filter,
|
||||
limit=150,
|
||||
or_mode=True,
|
||||
)
|
||||
window_hits = entries_in_window(
|
||||
db_path,
|
||||
since=since,
|
||||
until=until,
|
||||
source_filter=source_filter,
|
||||
limit=50,
|
||||
per_source_cap=15,
|
||||
)
|
||||
|
||||
seen: set[str] = set()
|
||||
merged: list[SearchResult] = []
|
||||
|
|
@ -117,7 +142,9 @@ def diagnose(
|
|||
seen.add(r.entry_id)
|
||||
merged.append(r)
|
||||
|
||||
combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[:200]
|
||||
combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
|
||||
:200
|
||||
]
|
||||
|
||||
by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
|
||||
by_source: dict[str, int] = {}
|
||||
|
|
@ -129,7 +156,9 @@ def diagnose(
|
|||
|
||||
reasoning: str | None = None
|
||||
if llm_url and llm_model:
|
||||
reasoning = summarize(query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key)
|
||||
reasoning = summarize(
|
||||
query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key
|
||||
)
|
||||
|
||||
return {
|
||||
"summary": {
|
||||
|
|
@ -177,7 +206,9 @@ async def diagnose_stream(
|
|||
yield {"type": "status", "message": "Parsing time window…"}
|
||||
time_detected = since is not None and until is not None
|
||||
if not time_detected:
|
||||
parsed_since, parsed_until, keywords = await asyncio.to_thread(parse_time_window, query)
|
||||
parsed_since, parsed_until, keywords = await asyncio.to_thread(
|
||||
parse_time_window, query
|
||||
)
|
||||
since = since or parsed_since
|
||||
until = until or parsed_until
|
||||
time_detected = keywords != query
|
||||
|
|
@ -197,23 +228,34 @@ async def diagnose_stream(
|
|||
keyword_hits: list[SearchResult] = []
|
||||
window_hits = await asyncio.to_thread(
|
||||
lambda: entries_in_window(
|
||||
db_path, since, until,
|
||||
source_filter=source_filter, limit=200,
|
||||
db_path,
|
||||
since,
|
||||
until,
|
||||
source_filter=source_filter,
|
||||
limit=200,
|
||||
)
|
||||
)
|
||||
else:
|
||||
keyword_hits, window_hits = await asyncio.gather(
|
||||
asyncio.to_thread(
|
||||
lambda: search(
|
||||
db_path, keywords,
|
||||
source_filter=source_filter, since=since, until=until,
|
||||
limit=150, or_mode=True,
|
||||
db_path,
|
||||
keywords,
|
||||
source_filter=source_filter,
|
||||
since=since,
|
||||
until=until,
|
||||
limit=150,
|
||||
or_mode=True,
|
||||
)
|
||||
),
|
||||
asyncio.to_thread(
|
||||
lambda: entries_in_window(
|
||||
db_path, since, until,
|
||||
source_filter=source_filter, limit=50, per_source_cap=15,
|
||||
db_path,
|
||||
since,
|
||||
until,
|
||||
source_filter=source_filter,
|
||||
limit=50,
|
||||
per_source_cap=15,
|
||||
)
|
||||
),
|
||||
)
|
||||
|
|
@ -225,7 +267,9 @@ async def diagnose_stream(
|
|||
seen.add(r.entry_id)
|
||||
merged.append(r)
|
||||
|
||||
combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[:200]
|
||||
combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
|
||||
:200
|
||||
]
|
||||
|
||||
by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
|
||||
by_source: dict[str, int] = {}
|
||||
|
|
@ -251,7 +295,14 @@ async def diagnose_stream(
|
|||
if llm_url and llm_model and combined:
|
||||
yield {"type": "status", "message": "Analyzing with LLM…"}
|
||||
reasoning = await asyncio.to_thread(
|
||||
lambda: summarize(query, combined, llm_url, llm_model, llm_api_key, context_block=context_block)
|
||||
lambda: summarize(
|
||||
query,
|
||||
combined,
|
||||
llm_url,
|
||||
llm_model,
|
||||
llm_api_key,
|
||||
context_block=context_block,
|
||||
)
|
||||
)
|
||||
if reasoning:
|
||||
yield {"type": "reasoning", "text": reasoning}
|
||||
72
app/services/diagnose/models.py
Normal file
72
app/services/diagnose/models.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
"""Pipeline data types for the multi-agent diagnose pipeline."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
SeverityLabel = Literal["CRITICAL", "ERROR", "WARN", "INFO", "DEBUG", "UNKNOWN"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class EventCluster:
    """Immutable record for one time-correlated group of log entries."""

    # Identifier used as the key in per-cluster severity maps.
    cluster_id: str
    # entry_id references of the member log entries.
    entries: tuple[str, ...]
    # ISO timestamps bounding the cluster; either may be None.
    start_iso: str | None
    end_iso: str | None
    duration_seconds: float
    source_ids: tuple[str, ...]
    pattern_tags: tuple[str, ...]
    severity: SeverityLabel
    burst: bool
    gap_before_seconds: float
    # Text handed to the classifiers and shown (truncated) in LLM prompts.
    representative_text: str
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class TimelineResult:
    """Immutable timeline: the event clusters built from a set of log entries,
    together with aggregate counters describing them."""

    clusters: tuple[EventCluster, ...]
    total_entries: int
    # ISO timestamps for the covered window; either may be None.
    window_start: str | None
    window_end: str | None
    gap_count: int
    burst_count: int
    dominant_sources: tuple[str, ...]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ClassifiedTimeline:
    """A timeline paired with the severity assigned to each of its clusters."""

    timeline: TimelineResult
    # Maps cluster_id -> severity label.
    cluster_severities: dict[str, SeverityLabel]
    # Which classification strategy governed the run.
    classifier_used: Literal["ml", "pattern_tags", "regex"]
    # Model identifier for the "ml" strategy; None otherwise.
    model_id: str | None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Hypothesis:
    """One candidate root cause produced by Stage 3 of the pipeline."""

    hypothesis_id: str
    title: str
    description: str
    confidence: float
    # cluster_id values the hypothesis cites as evidence.
    supporting_cluster_ids: tuple[str, ...]
    runbook_refs: tuple[str, ...]
    severity: SeverityLabel
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RankedHypothesis:
    """A Stage 3 hypothesis plus the Stage 4 false-positive suppression verdict."""

    hypothesis: Hypothesis
    novelty_score: float
    similarity_to_known: float
    # True when the hypothesis is flagged as a likely false positive.
    suppress: bool
    # Explanation for the suppression; None when not suppressed.
    suppression_reason: str | None
|
||||
132
app/services/diagnose/pipeline.py
Normal file
132
app/services/diagnose/pipeline.py
Normal file
|
|
@ -0,0 +1,132 @@
|
|||
"""Multi-agent diagnose pipeline orchestrator — Stage 1–5 wiring."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import dataclasses
|
||||
import logging
|
||||
from collections.abc import AsyncGenerator
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.context.retriever import RetrievedContext
|
||||
from app.services.diagnose.classifier import SeverityClassifier
|
||||
from app.services.diagnose.hypothesizer import RootCauseHypothesizer
|
||||
from app.services.diagnose.suppressor import FalsePositiveSuppressor
|
||||
from app.services.diagnose.synthesizer import SummarySynthesizer
|
||||
from app.services.diagnose.timeline import TimelineReconstructor
|
||||
from app.services.search import SearchResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def run_pipeline(
    db_path: Path,
    entries: list[SearchResult],
    ctx: RetrievedContext,
    query: str,
    since: str | None,  # reserved for future range-filtering in stage queries (#29 follow-up)
    until: str | None,  # reserved for future range-filtering in stage queries (#29 follow-up)
    llm_url: str | None,
    llm_model: str | None,
    llm_api_key: str | None,
) -> AsyncGenerator[dict[str, Any], None]:
    """Drive the five diagnose stages in order, yielding SSE event dicts.

    Each stage runs in a worker thread via ``asyncio.to_thread``:

      1. TimelineReconstructor    — cluster log entries by time
      2. SeverityClassifier       — annotate clusters with severity
      3. RootCauseHypothesizer    — generate hypotheses via LLM
      4. FalsePositiveSuppressor  — rank and suppress known patterns
      5. SummarySynthesizer       — produce a narrative diagnosis

    Event sequence:
      {"type": "status", "message": "Building timeline…"}
      {"type": "pipeline_stage", "stage": 1..4, "name": ..., "message": ...}
      {"type": "hypotheses", "data": [...]}
      {"type": "status", "message": "Synthesizing…"}
      {"type": "reasoning", "text": "..."}   — only when synthesis returns text
      {"type": "done"}

    ``since`` and ``until`` are accepted but not used by this function.
    """

    def stage_event(number: int, name: str, message: str) -> dict[str, Any]:
        """Build one ``pipeline_stage`` progress event."""
        return {
            "type": "pipeline_stage",
            "stage": number,
            "name": name,
            "message": message,
        }

    # Stage 1: Timeline reconstruction
    yield {"type": "status", "message": "Building timeline…"}
    timeline = await asyncio.to_thread(TimelineReconstructor().reconstruct, entries)
    yield stage_event(
        1,
        "timeline",
        f"Built {len(timeline.clusters)} clusters, {timeline.burst_count} bursts",
    )

    # Stage 2: Severity classification
    classified = await asyncio.to_thread(SeverityClassifier().classify, timeline)
    tally: dict[str, int] = {}
    for label in classified.cluster_severities.values():
        tally[label] = tally.get(label, 0) + 1
    tally_text = ", ".join(f"{label}:{count}" for label, count in sorted(tally.items()))
    yield stage_event(
        2, "classifier", f"{classified.classifier_used} classifier: {tally_text}"
    )

    # Stage 3: Root-cause hypotheses
    hypotheses = await asyncio.to_thread(
        RootCauseHypothesizer().hypothesize,
        classified,
        ctx,
        query,
        llm_url,
        llm_model,
        llm_api_key,
    )
    yield stage_event(3, "hypotheses", f"{len(hypotheses)} hypotheses generated")

    # Stage 4: False-positive suppression
    ranked = await asyncio.to_thread(
        FalsePositiveSuppressor().suppress, hypotheses, db_path
    )
    n_suppressed = len([item for item in ranked if item.suppress])
    n_active = len(ranked) - n_suppressed
    yield stage_event(4, "suppressor", f"{n_suppressed} suppressed, {n_active} active")
    yield {
        "type": "hypotheses",
        "data": [dataclasses.asdict(item) for item in ranked],
    }

    # Stage 5: Summary synthesis
    yield {"type": "status", "message": "Synthesizing…"}
    narrative = await asyncio.to_thread(
        SummarySynthesizer().synthesize,
        ranked,
        timeline,
        ctx,
        query,
        llm_url,
        llm_model,
        llm_api_key,
    )
    if narrative:
        yield {"type": "reasoning", "text": narrative}

    yield {"type": "done"}
|
||||
275
app/services/diagnose/suppressor.py
Normal file
275
app/services/diagnose/suppressor.py
Normal file
|
|
@ -0,0 +1,275 @@
|
|||
"""Stage 4: False-Positive Suppressor — embedding cosine similarity.
|
||||
|
||||
Compares each hypothesis against a corpus of resolved incidents using
|
||||
embedding cosine similarity. Hypotheses that closely match a previously
|
||||
resolved incident are suppressed as likely false positives.
|
||||
|
||||
When no embedding model is configured or the service is unavailable, all
|
||||
hypotheses pass through with novelty_score=1.0 (full novelty assumed).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from app.services.diagnose.models import Hypothesis, RankedHypothesis
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Module-level corpus cache: db_path_str -> (corpus_texts, embeddings)
|
||||
# Invalidated when the corpus text list changes between calls.
|
||||
_corpus_cache: dict[str, tuple[list[str], Any]] = {}
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cosine similarity helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
try:
|
||||
import numpy as np
|
||||
|
||||
def _cosine_similarities(
|
||||
query_emb: list[float], corpus_embs: list[list[float]]
|
||||
) -> list[float]:
|
||||
"""Batch cosine similarity of one query embedding against all corpus embeddings."""
|
||||
q = np.array(query_emb, dtype=np.float32)
|
||||
c = np.array(corpus_embs, dtype=np.float32)
|
||||
q_norm = q / (np.linalg.norm(q) + 1e-10)
|
||||
c_norm = c / (np.linalg.norm(c, axis=1, keepdims=True) + 1e-10)
|
||||
return list(c_norm @ q_norm)
|
||||
|
||||
_HAS_NUMPY = True
|
||||
|
||||
except ImportError: # pragma: no cover
|
||||
import math
|
||||
|
||||
_HAS_NUMPY = False
|
||||
|
||||
def _dot(a: list[float], b: list[float]) -> float:
|
||||
return sum(x * y for x, y in zip(a, b))
|
||||
|
||||
def _norm(a: list[float]) -> float:
|
||||
return math.sqrt(sum(x * x for x in a)) + 1e-10
|
||||
|
||||
def _cosine(a: list[float], b: list[float]) -> float:
|
||||
return _dot(a, b) / (_norm(a) * _norm(b))
|
||||
|
||||
def _cosine_similarities(
|
||||
query_emb: list[float], corpus_embs: list[list[float]]
|
||||
) -> list[float]:
|
||||
return [_cosine(query_emb, c) for c in corpus_embs]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DB helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _fetch_resolved_incidents(db_path: Path) -> list[str]:
|
||||
"""Fetch resolved incident texts from SQLite.
|
||||
|
||||
Returns a list of non-empty combined strings for each resolved incident.
|
||||
Returns an empty list on any error (missing table, connection failure, etc.).
|
||||
"""
|
||||
try:
|
||||
with sqlite3.connect(str(db_path)) as conn:
|
||||
cursor = conn.execute(
|
||||
"SELECT label, notes FROM incidents WHERE ended_at IS NOT NULL LIMIT 200"
|
||||
)
|
||||
rows = cursor.fetchall()
|
||||
except sqlite3.OperationalError as exc:
|
||||
logger.warning("Could not query resolved incidents (%s) — treating as empty corpus", exc)
|
||||
return []
|
||||
except sqlite3.Error as exc:
|
||||
# Catches all remaining SQLite-family errors (IntegrityError, DatabaseError, etc.)
|
||||
logger.warning("Unexpected SQLite error fetching resolved incidents (%s) — treating as empty corpus", exc)
|
||||
return []
|
||||
|
||||
texts: list[str] = []
|
||||
for label, notes in rows:
|
||||
label = (label or "").strip()
|
||||
notes = (notes or "").strip()
|
||||
combined = f"{label}. {notes}" if label and notes else (label or notes)
|
||||
if combined:
|
||||
texts.append(combined)
|
||||
return texts
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Public class
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class FalsePositiveSuppressor:
|
||||
"""Stage 4 of the multi-agent diagnose pipeline.
|
||||
|
||||
Uses embedding cosine similarity to detect hypotheses that closely match
|
||||
previously resolved incidents and suppress them as likely false positives.
|
||||
|
||||
When model_id is empty or the embedding service is unavailable, all
|
||||
hypotheses pass through with novelty_score=1.0 (no suppression).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_id: str = "",
|
||||
device: str = "cpu",
|
||||
similarity_threshold: float = 0.85,
|
||||
) -> None:
|
||||
self._model_id = model_id
|
||||
self._device = device
|
||||
# _device stored for future use when get_embedder() supports device selection
|
||||
# Suppress when cosine similarity to a known resolved incident >= threshold.
|
||||
# A threshold of 0.85 means "suppress if 85%+ similar to something already resolved."
|
||||
self._similarity_threshold = similarity_threshold
|
||||
|
||||
def suppress(
|
||||
self,
|
||||
hypotheses: list[Hypothesis],
|
||||
db_path: Path,
|
||||
) -> list[RankedHypothesis]:
|
||||
"""Rank hypotheses by novelty, suppressing those matching resolved incidents.
|
||||
|
||||
Args:
|
||||
hypotheses: Candidate hypotheses from Stage 3.
|
||||
db_path: Path to the Turnstone SQLite database containing incidents.
|
||||
|
||||
Returns:
|
||||
List of RankedHypothesis sorted by (novelty_score * confidence) descending.
|
||||
Non-suppressed hypotheses appear first in practice.
|
||||
"""
|
||||
if not hypotheses:
|
||||
return []
|
||||
|
||||
# No model configured — full passthrough, rank by confidence only.
|
||||
if not self._model_id:
|
||||
return self._passthrough(hypotheses)
|
||||
|
||||
# Attempt to obtain an embedder; fall back to passthrough on failure.
|
||||
embedder = self._load_embedder()
|
||||
if embedder is None:
|
||||
logger.warning(
|
||||
"Embedding service unavailable for model %r — skipping suppression",
|
||||
self._model_id,
|
||||
)
|
||||
return self._passthrough(hypotheses)
|
||||
|
||||
# Fetch corpus texts from DB; fall back to passthrough if corpus is empty.
|
||||
corpus_texts = _fetch_resolved_incidents(db_path)
|
||||
if not corpus_texts:
|
||||
logger.debug("No resolved incidents found — all hypotheses treated as novel")
|
||||
return self._passthrough(hypotheses)
|
||||
|
||||
# Embed corpus (with caching).
|
||||
corpus_embeddings = self._get_corpus_embeddings(embedder, corpus_texts, db_path)
|
||||
|
||||
# Score each hypothesis and sort by novelty * confidence descending.
|
||||
ranked = [
|
||||
self._score_hypothesis(h, embedder, corpus_embeddings)
|
||||
for h in hypotheses
|
||||
]
|
||||
ranked.sort(key=lambda rh: rh.novelty_score * rh.hypothesis.confidence, reverse=True)
|
||||
return ranked
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Private helpers
|
||||
# ------------------------------------------------------------------
|
||||
|
||||
def _score_hypothesis(
|
||||
self,
|
||||
hypothesis: Hypothesis,
|
||||
embedder: Any,
|
||||
corpus_embeddings: list[list[float]],
|
||||
) -> RankedHypothesis:
|
||||
"""Score a single hypothesis against the resolved incident corpus."""
|
||||
try:
|
||||
query_text = f"{hypothesis.title}. {hypothesis.description}"
|
||||
h_emb = embedder.embed(query_text)
|
||||
# Convert numpy array to plain Python list for _cosine_similarities
|
||||
h_emb_list: list[float] = h_emb.tolist() if hasattr(h_emb, "tolist") else list(h_emb)
|
||||
sims = _cosine_similarities(h_emb_list, corpus_embeddings)
|
||||
max_sim = float(max(sims)) if sims else 0.0
|
||||
except Exception as exc:
|
||||
# Broad catch is intentional: catches unknown embedder runtime errors
|
||||
# (e.g. CUDA OOM, backend crashes) so one bad hypothesis never halts the pipeline.
|
||||
logger.warning("Embedding failed for hypothesis %r: %s — treating as novel", hypothesis.title, exc)
|
||||
return RankedHypothesis(
|
||||
hypothesis=hypothesis,
|
||||
novelty_score=1.0,
|
||||
similarity_to_known=0.0,
|
||||
suppress=False,
|
||||
suppression_reason=None,
|
||||
)
|
||||
|
||||
novelty_score = 1.0 - max_sim
|
||||
suppress = bool(max_sim >= self._similarity_threshold)
|
||||
suppression_reason = (
|
||||
f"Similar to resolved incident (similarity {max_sim:.2f})"
|
||||
if suppress
|
||||
else None
|
||||
)
|
||||
return RankedHypothesis(
|
||||
hypothesis=hypothesis,
|
||||
novelty_score=novelty_score,
|
||||
similarity_to_known=max_sim,
|
||||
suppress=suppress,
|
||||
suppression_reason=suppression_reason,
|
||||
)
|
||||
|
||||
def _load_embedder(self) -> Any | None:
|
||||
"""Load the embedding service. Returns None if unavailable."""
|
||||
try:
|
||||
from app.services.embeddings import get_embedder
|
||||
return get_embedder()
|
||||
except Exception as exc:
|
||||
# Broad catch is intentional: get_embedder() may raise on import or
|
||||
# backend init failures from any number of third-party libraries.
|
||||
logger.warning("Failed to import/initialise embedding service: %s", exc)
|
||||
return None
|
||||
|
||||
def _get_corpus_embeddings(
|
||||
self,
|
||||
embedder: Any,
|
||||
corpus_texts: list[str],
|
||||
db_path: Path,
|
||||
) -> list[list[float]]:
|
||||
"""Return cached corpus embeddings, re-embedding if the corpus has changed."""
|
||||
cache_key = str(db_path)
|
||||
cached = _corpus_cache.get(cache_key)
|
||||
|
||||
if cached is not None:
|
||||
cached_texts, cached_embeddings = cached
|
||||
if cached_texts == corpus_texts:
|
||||
return cached_embeddings
|
||||
|
||||
logger.debug("Embedding corpus of %d resolved incidents", len(corpus_texts))
|
||||
try:
|
||||
raw_embeddings = embedder.embed_batch(corpus_texts)
|
||||
# Normalise each embedding to a plain Python list for portability
|
||||
corpus_embeddings: list[list[float]] = [
|
||||
e.tolist() if hasattr(e, "tolist") else list(e)
|
||||
for e in raw_embeddings
|
||||
]
|
||||
except Exception as exc:
|
||||
# Broad catch is intentional: embed_batch() may raise from any backend
|
||||
# (network timeout, CUDA error, etc.) — treat as empty corpus so the
|
||||
# pipeline can continue without suppression.
|
||||
logger.warning("Corpus embedding failed: %s — treating as empty corpus", exc)
|
||||
return []
|
||||
|
||||
_corpus_cache[cache_key] = (corpus_texts, corpus_embeddings)
|
||||
return corpus_embeddings
|
||||
|
||||
def _passthrough(self, hypotheses: list[Hypothesis]) -> list[RankedHypothesis]:
    """Wrap every hypothesis as unsuppressed and order by confidence.

    Each result gets ``novelty_score=1.0``, ``similarity_to_known=0.0``,
    ``suppress=False`` and no suppression reason. The list is ordered by
    ``hypothesis.confidence``, highest first; the sort is stable, so equal
    confidences keep their input order.
    """
    unsuppressed = (
        RankedHypothesis(
            hypothesis=item,
            novelty_score=1.0,
            similarity_to_known=0.0,
            suppress=False,
            suppression_reason=None,
        )
        for item in hypotheses
    )
    return sorted(
        unsuppressed,
        key=lambda entry: entry.hypothesis.confidence,
        reverse=True,
    )
|
||||
210
app/services/diagnose/synthesizer.py
Normal file
210
app/services/diagnose/synthesizer.py
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
"""Stage 5: Summary Synthesizer — deterministic narrative from ranked hypotheses.
|
||||
|
||||
Streaming upgrade (async SSE chunks) is tracked as a follow-up enhancement.
|
||||
This implementation is synchronous to match the rest of the pipeline.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
import httpx
|
||||
|
||||
from app.context.retriever import RetrievedContext
|
||||
from app.services.diagnose.models import RankedHypothesis, TimelineResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_SYSTEM_PROMPT = (
|
||||
"You are a Linux sysadmin diagnosing a system incident. "
|
||||
"Write a concise, actionable incident diagnosis.\n\n"
|
||||
"Format your response exactly as:\n"
|
||||
"1. VERDICT: [CRITICAL|ERROR|WARN|INFO] — <what happened> (<X>% confidence)\n"
|
||||
"2. TIMELINE: <what the logs show in sequence, 2-3 sentences>\n"
|
||||
"3. ROOT CAUSES:\n"
|
||||
" - <hypothesis 1 title> (<confidence>%)\n"
|
||||
" - <hypothesis 2 title> (<confidence>%)\n"
|
||||
"4. RECOMMENDED ACTIONS:\n"
|
||||
" - <action based on hypotheses>\n"
|
||||
"5. INVESTIGATE FURTHER: <open questions, if any>"
|
||||
)
|
||||
|
||||
|
||||
def _extract_content(resp_json: dict) -> str | None:
|
||||
"""Pull text content from an OpenAI-compat chat completion response."""
|
||||
choices = resp_json.get("choices") or []
|
||||
if not choices:
|
||||
return None
|
||||
return (choices[0].get("message", {}).get("content") or "").strip() or None
|
||||
|
||||
|
||||
def _build_hypothesis_block(ranked: list[RankedHypothesis]) -> str:
|
||||
"""Build the hypothesis block for the prompt (non-suppressed only, top 3)."""
|
||||
active = [rh for rh in ranked if not rh.suppress][:3]
|
||||
if not active:
|
||||
return "(none)"
|
||||
lines: list[str] = []
|
||||
for rh in active:
|
||||
h = rh.hypothesis
|
||||
conf_pct = int(h.confidence * 100)
|
||||
similar = (
|
||||
f"Yes — suppressed, {rh.suppression_reason}"
|
||||
if rh.suppress and rh.suppression_reason
|
||||
else "No"
|
||||
)
|
||||
novelty = f"{rh.novelty_score:.2f}"
|
||||
lines.append(
|
||||
f"- [{h.severity}, {conf_pct}%] {h.title}\n"
|
||||
f" Similar resolved incident? {similar} (novelty {novelty})"
|
||||
)
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _build_context_block(ctx: RetrievedContext) -> str:
|
||||
"""Build the runbook context block for the prompt."""
|
||||
parts: list[str] = []
|
||||
for chunk in ctx.chunks[:5]:
|
||||
filename = chunk.get("filename", "unknown")
|
||||
text = chunk.get("text", "")[:300]
|
||||
parts.append(f"[{filename}] {text}")
|
||||
return "\n".join(parts) if parts else "(none)"
|
||||
|
||||
|
||||
def _deterministic_fallback(
|
||||
ranked: list[RankedHypothesis],
|
||||
timeline: TimelineResult,
|
||||
) -> str:
|
||||
"""Build a deterministic fallback text when no LLM is available."""
|
||||
active = [rh for rh in ranked if not rh.suppress][:3]
|
||||
if active:
|
||||
top = active[0]
|
||||
verdict_severity = top.hypothesis.severity
|
||||
verdict_title = top.hypothesis.title
|
||||
verdict_conf = int(top.hypothesis.confidence * 100)
|
||||
elif ranked:
|
||||
top = ranked[0]
|
||||
verdict_severity = top.hypothesis.severity
|
||||
verdict_title = top.hypothesis.title
|
||||
verdict_conf = int(top.hypothesis.confidence * 100)
|
||||
else:
|
||||
verdict_severity = "UNKNOWN"
|
||||
verdict_title = "No hypotheses generated"
|
||||
verdict_conf = 0
|
||||
|
||||
root_causes = ", ".join(
|
||||
rh.hypothesis.title for rh in (active or ranked[:3])
|
||||
) or "None"
|
||||
|
||||
return (
|
||||
f"VERDICT: {verdict_severity} — {verdict_title} ({verdict_conf}% confidence)\n"
|
||||
f"TIMELINE: {timeline.total_entries} entries across {len(timeline.clusters)} clusters.\n"
|
||||
f"ROOT CAUSES: {root_causes}"
|
||||
)
|
||||
|
||||
|
||||
class SummarySynthesizer:
    """Stage 5 of the multi-agent diagnose pipeline.

    Turns ranked hypotheses, the reconstructed timeline and RAG context into a
    human-readable incident narrative. Without a configured LLM, or when the
    LLM call yields nothing, the deterministic fallback text is returned.
    """

    def synthesize(
        self,
        ranked: list[RankedHypothesis],
        timeline: TimelineResult,
        ctx: RetrievedContext,
        query: str,
        llm_url: str | None = None,
        llm_model: str | None = None,
        llm_api_key: str | None = None,
    ) -> str:
        """Return the synthesis text as a single string (synchronous).

        The deterministic fallback is built first and returned when either
        ``llm_url`` or ``llm_model`` is missing, or when the LLM call returns
        no usable text.
        """
        fallback = _deterministic_fallback(ranked, timeline)
        if not (llm_url and llm_model):
            return fallback

        hypothesis_block = _build_hypothesis_block(ranked)
        context_block = _build_context_block(ctx)
        dominant = ", ".join(timeline.dominant_sources[:5]) or "none"

        # Four sections separated by blank lines.
        sections = [
            f"Query: {query}",
            (
                "Timeline summary:\n"
                f"- {len(timeline.clusters)} clusters, "
                f"{timeline.burst_count} bursts, "
                f"{timeline.gap_count} silence gaps\n"
                f"- Primary sources: {dominant}"
            ),
            f"Top hypotheses:\n{hypothesis_block}",
            f"Context from runbooks:\n{context_block}",
        ]
        user_message = "\n\n".join(sections)

        answer = self._call_llm(
            llm_url=llm_url,
            llm_model=llm_model,
            llm_api_key=llm_api_key,
            messages=[
                {"role": "system", "content": _SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ],
        )
        if answer:
            return answer
        return fallback

    def _call_llm(
        self,
        llm_url: str,
        llm_model: str,
        llm_api_key: str | None,
        messages: list[dict],
    ) -> str | None:
        """Send ``messages`` to the LLM and return the raw text content.

        The cf-orch task endpoint is tried first. A 200 response is final,
        even if it carries no content. Any other outcome (404, another
        status, or an exception) falls through to the direct OpenAI-compat
        ``/v1/chat/completions`` endpoint. Returns ``None`` when that call
        fails as well.
        """
        base_url = llm_url.rstrip("/")
        headers: dict[str, str] = {}
        if llm_api_key:
            headers["Authorization"] = f"Bearer {llm_api_key}"

        # Attempt 1: orchestrator task endpoint.
        try:
            task_resp = httpx.post(
                f"{base_url}/api/inference/task",
                json={
                    "product": "turnstone",
                    "task": "log_analysis",
                    "payload": {"messages": messages, "stream": False},
                },
                headers=headers,
                timeout=120.0,
            )
            if task_resp.status_code == 200:
                return _extract_content(task_resp.json())
            if task_resp.status_code != 404:
                task_resp.raise_for_status()
            logger.debug(
                "No task assignment for turnstone.log_analysis — falling back to direct model"
            )
        except Exception as exc:
            logger.debug(
                "Task endpoint unavailable (%s) — falling back to direct model", exc
            )

        # Attempt 2: direct OpenAI-compatible chat completion.
        try:
            chat_resp = httpx.post(
                f"{base_url}/v1/chat/completions",
                json={"model": llm_model, "messages": messages, "stream": False},
                headers=headers,
                timeout=120.0,
            )
            chat_resp.raise_for_status()
            return _extract_content(chat_resp.json())
        except Exception as exc:
            logger.warning(
                "LLM synthesizer failed (%s): %s", type(exc).__name__, exc
            )
            return None
|
||||
272
app/services/diagnose/timeline.py
Normal file
272
app/services/diagnose/timeline.py
Normal file
|
|
@ -0,0 +1,272 @@
|
|||
"""Stage 1: Timeline Reconstructor — pure Python, no ML."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from app.services.diagnose.models import EventCluster, TimelineResult
|
||||
from app.services.search import SearchResult
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_SEVERITY_ORDER: dict[str | None, int] = {
|
||||
"CRITICAL": 5,
|
||||
"ERROR": 4,
|
||||
"WARN": 3,
|
||||
"WARNING": 3,
|
||||
"INFO": 2,
|
||||
"DEBUG": 1,
|
||||
None: 0,
|
||||
}
|
||||
|
||||
|
||||
def _parse_iso(s: str) -> datetime | None:
|
||||
"""Parse ISO 8601 string to UTC-aware datetime. Returns None on parse failure."""
|
||||
try:
|
||||
dt = datetime.fromisoformat(s)
|
||||
except ValueError:
|
||||
logger.warning("Unparseable timestamp in log entry, treating as None: %r", s)
|
||||
return None
|
||||
if dt.tzinfo is None:
|
||||
logger.debug("Naive timestamp treated as UTC: %s", s)
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
return dt.astimezone(timezone.utc)
|
||||
|
||||
|
||||
def _sort_key(e: SearchResult) -> tuple[int, str]:
|
||||
"""Sort key: timestamped entries first (ascending), then None-timestamp entries."""
|
||||
if e.timestamp_iso is None:
|
||||
return (1, "")
|
||||
return (0, e.timestamp_iso)
|
||||
|
||||
|
||||
def _highest_severity(entries: list[SearchResult]) -> str:
    """Return the highest severity label across all entries.

    Labels are ranked with ``_SEVERITY_ORDER``. Labels that are missing
    (``None``) or not present in that table never win, so the result is
    always a known label or ``"UNKNOWN"``, whatever the input order.

    Args:
        entries: Entries whose ``severity`` attribute is inspected.

    Returns:
        The highest-ranked known label, with ``"WARNING"`` normalised to
        ``"WARN"``, or ``"UNKNOWN"`` when no entry carries a known label.
    """
    best: str | None = None
    # Start at 0 (the rank of None and of unrecognised labels) so that only a
    # recognised label, whose rank is at least 1, can be selected.
    best_rank = 0
    for entry in entries:
        sev = entry.severity
        rank = _SEVERITY_ORDER.get(sev, 0)
        if rank > best_rank:
            best_rank = rank
            best = sev

    # SeverityLabel requires a valid literal; fall back to "UNKNOWN" if None
    if best is None:
        return "UNKNOWN"
    # Normalise WARNING -> WARN for the output type
    return "WARN" if best == "WARNING" else best
|
||||
|
||||
|
||||
def _representative_text(entries: list[SearchResult]) -> str:
|
||||
"""Return text of the entry with highest rank; tie-break on longest text."""
|
||||
if not entries:
|
||||
return ""
|
||||
best = max(entries, key=lambda e: (e.rank, len(e.text)))
|
||||
return best.text
|
||||
|
||||
|
||||
def _cluster_id(entry_ids: list[str]) -> str:
|
||||
"""Compute a 12-char hex cluster ID from a sorted list of entry IDs."""
|
||||
payload = ",".join(sorted(entry_ids)).encode()
|
||||
return hashlib.sha1(payload).hexdigest()[:12] # noqa: S324 — not used for security
|
||||
|
||||
|
||||
def _make_event_cluster(
    cluster_entries: list[SearchResult],
    gap_before_seconds: float,
    burst_threshold: int,
    burst_window_seconds: int,
) -> EventCluster:
    """Assemble an ``EventCluster`` describing ``cluster_entries``.

    Start, end and duration come from the parseable timestamps in the cluster.
    When none parse, start and end are ``None`` and the duration is ``0.0``.
    The cluster is flagged as a burst when it holds at least
    ``burst_threshold`` entries and its duration does not exceed
    ``burst_window_seconds``.
    """
    # Collect every timestamp that is present and parses successfully.
    parsed = []
    for item in cluster_entries:
        if item.timestamp_iso is None:
            continue
        moment = _parse_iso(item.timestamp_iso)
        if moment is not None:
            parsed.append(moment)

    if parsed:
        earliest, latest = min(parsed), max(parsed)
        start_iso: str | None = earliest.isoformat()
        end_iso: str | None = latest.isoformat()
        duration_seconds = (latest - earliest).total_seconds()
    else:
        start_iso = None
        end_iso = None
        duration_seconds = 0.0

    ids = [item.entry_id for item in cluster_entries]
    sources = {item.source_id for item in cluster_entries}
    tags = {tag for item in cluster_entries for tag in item.matched_patterns}
    is_burst = (
        len(cluster_entries) >= burst_threshold
        and duration_seconds <= burst_window_seconds
    )

    return EventCluster(
        cluster_id=_cluster_id(ids),
        entries=tuple(ids),
        start_iso=start_iso,
        end_iso=end_iso,
        duration_seconds=duration_seconds,
        source_ids=tuple(sorted(sources)),
        pattern_tags=tuple(sorted(tags)),
        severity=_highest_severity(cluster_entries),  # type: ignore[arg-type]  # SeverityLabel is a Literal; _highest_severity returns a compatible str
        burst=is_burst,
        gap_before_seconds=gap_before_seconds,
        representative_text=_representative_text(cluster_entries),
    )
|
||||
|
||||
|
||||
class TimelineReconstructor:
    """Turn a flat list of log entries into a timeline of event clusters.

    Pure Python with no ML or LLM calls; this is Stage 1 of the multi-agent
    diagnose pipeline.
    """

    def __init__(
        self,
        cluster_window_seconds: int = 30,
        burst_threshold: int = 10,
        burst_window_seconds: int = 5,
        gap_significance_seconds: int = 30,
    ) -> None:
        """Store the clustering, burst and gap thresholds (all in seconds,
        except ``burst_threshold`` which is an entry count)."""
        self._cluster_window = cluster_window_seconds
        self._burst_threshold = burst_threshold
        self._burst_window = burst_window_seconds
        self._gap_significance_seconds: int = gap_significance_seconds

    def _sort_entries(self, entries: list[SearchResult]) -> list[SearchResult]:
        """Return a new list: timestamped entries ascending, then the rest."""
        return sorted(entries, key=_sort_key)

    def _group_into_raw_clusters(
        self, sorted_entries: list[SearchResult]
    ) -> list[list[SearchResult]]:
        """Partition already-sorted entries into time-window groups.

        Each group is anchored at its first parseable timestamp. An entry
        starts a new group when it lies more than the cluster window after
        that anchor. Entries with no timestamp, or an unparseable one, always
        join the group that is currently open.
        """
        groups: list[list[SearchResult]] = []
        bucket: list[SearchResult] = []
        anchor: datetime | None = None

        for entry in sorted_entries:
            raw_ts = entry.timestamp_iso

            if not bucket:
                # Very first entry: open the first group.
                bucket.append(entry)
                if raw_ts is not None:
                    anchor = _parse_iso(raw_ts)
                continue

            moment = _parse_iso(raw_ts) if raw_ts is not None else None
            if moment is None:
                # Missing or malformed timestamp: stays with the open group.
                bucket.append(entry)
                continue

            if anchor is None:
                # The open group had no anchor yet; this entry provides it.
                anchor = moment
                bucket.append(entry)
            elif (moment - anchor).total_seconds() > self._cluster_window:
                groups.append(bucket)
                bucket, anchor = [entry], moment
            else:
                bucket.append(entry)

        if bucket:
            groups.append(bucket)
        return groups

    def _build_cluster(
        self,
        cluster_entries: list[SearchResult],
        prev_end_iso: str | None,
    ) -> EventCluster:
        """Create the ``EventCluster`` for one group.

        The gap before the group is the time between ``prev_end_iso`` and the
        group's earliest parseable timestamp. It is ``0.0`` when either side
        is unavailable.
        """
        gap_before = 0.0
        if prev_end_iso is not None:
            moments = []
            for item in cluster_entries:
                if item.timestamp_iso is None:
                    continue
                parsed = _parse_iso(item.timestamp_iso)
                if parsed is not None:
                    moments.append(parsed)
            if moments:
                group_start = min(moments)
                previous_end = _parse_iso(prev_end_iso)
                if previous_end is not None:
                    gap_before = (group_start - previous_end).total_seconds()

        return _make_event_cluster(
            cluster_entries,
            gap_before_seconds=gap_before,
            burst_threshold=self._burst_threshold,
            burst_window_seconds=self._burst_window,
        )

    def _dominant_sources_tuple(self, entries: list[SearchResult]) -> tuple[str, ...]:
        """List source IDs from most to least frequent.

        Sources with equal counts keep the order in which they first appear.
        """
        tally: dict[str, int] = {}
        for entry in entries:
            tally[entry.source_id] = tally.get(entry.source_id, 0) + 1
        by_count = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
        return tuple(source for source, _count in by_count)

    def reconstruct(self, entries: list[SearchResult]) -> TimelineResult:
        """Produce the structured timeline for ``entries``.

        An empty input yields an empty ``TimelineResult``. Otherwise entries
        are sorted, grouped into clusters, and summarised. ``gap_count``
        counts clusters whose preceding gap exceeds the configured
        significance threshold.
        """
        if not entries:
            return TimelineResult(
                clusters=(),
                total_entries=0,
                window_start=None,
                window_end=None,
                gap_count=0,
                burst_count=0,
                dominant_sources=(),
            )

        groups = self._group_into_raw_clusters(self._sort_entries(entries))

        built: list[EventCluster] = []
        previous_end: str | None = None
        for group in groups:
            cluster = self._build_cluster(group, previous_end)
            built.append(cluster)
            previous_end = cluster.end_iso

        clusters_tuple = tuple(built)
        significant_gaps = [
            c
            for c in clusters_tuple
            if c.gap_before_seconds > self._gap_significance_seconds
        ]
        bursts = [c for c in clusters_tuple if c.burst]

        return TimelineResult(
            clusters=clusters_tuple,
            total_entries=len(entries),
            window_start=clusters_tuple[0].start_iso if clusters_tuple else None,
            window_end=clusters_tuple[-1].end_iso if clusters_tuple else None,
            gap_count=len(significant_gaps),
            burst_count=len(bursts),
            dominant_sources=self._dominant_sources_tuple(entries),
        )
|
||||
229
app/services/embeddings.py
Normal file
229
app/services/embeddings.py
Normal file
|
|
@ -0,0 +1,229 @@
|
|||
"""Configurable embedding service — BSL licensed.
|
||||
|
||||
Backends:
|
||||
sentence_transformers — local in-process inference (default, no server needed)
|
||||
ollama — HTTP to a running Ollama instance
|
||||
|
||||
Configuration (env vars):
|
||||
TURNSTONE_EMBED_BACKEND sentence_transformers | ollama (default: sentence_transformers)
|
||||
TURNSTONE_EMBED_MODEL model name/path (backend-specific default)
|
||||
TURNSTONE_EMBED_DEVICE cpu | cuda (default: cpu; ST backend only)
|
||||
TURNSTONE_LLM_URL Ollama base URL (default: http://localhost:11434)
|
||||
|
||||
When no backend is importable/reachable, EMBEDDING_AVAILABLE stays False and
|
||||
get_embedder() returns None — callers must handle this gracefully.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import struct
|
||||
from typing import Protocol, runtime_checkable
|
||||
|
||||
import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ── Public availability flag ──────────────────────────────────────────────────
|
||||
|
||||
EMBEDDING_AVAILABLE: bool = False
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────────────
|
||||
|
||||
_BACKEND = os.environ.get("TURNSTONE_EMBED_BACKEND", "sentence_transformers").lower()
|
||||
_DEVICE = os.environ.get("TURNSTONE_EMBED_DEVICE", "cpu").lower()
|
||||
_LLM_URL = os.environ.get("TURNSTONE_LLM_URL", "http://localhost:11434")
|
||||
|
||||
# BAAI/bge-small-en-v1.5: 33MB, MIT, 49M downloads/month, 384-dim, 512-token max.
|
||||
# Benchmarked as the best quality-to-size ratio in the field (MTEB 62.17).
|
||||
# all-MiniLM-L6-v2 is a viable lighter alternative (23MB, 256-token max) if
|
||||
# inference speed is the primary constraint.
|
||||
_DEFAULT_MODEL: dict[str, str] = {
|
||||
"sentence_transformers": "BAAI/bge-small-en-v1.5",
|
||||
"ollama": "nomic-embed-text",
|
||||
}
|
||||
_MODEL = os.environ.get(
|
||||
"TURNSTONE_EMBED_MODEL",
|
||||
_DEFAULT_MODEL.get(_BACKEND, "sentence-transformers/all-MiniLM-L6-v2"),
|
||||
)
|
||||
|
||||
|
||||
# ── Protocol ──────────────────────────────────────────────────────────────────
|
||||
|
||||
@runtime_checkable
class Embedder(Protocol):
    """Structural interface that every embedding backend implements."""

    @property
    def dim(self) -> int:
        """Length of the vectors this model produces."""

    @property
    def model_name(self) -> str:
        """Identifier of the underlying model, suitable for display."""

    def embed(self, text: str) -> np.ndarray:
        """Embed one string into a 1-D float32 array of length ``dim``."""

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed several strings, returning one 1-D float32 array per input."""
|
||||
|
||||
|
||||
# ── sentence-transformers backend ─────────────────────────────────────────────
|
||||
|
||||
class SentenceTransformerEmbedder:
    """In-process embedding backend built on the sentence-transformers library.

    Per the original author's note, the model is downloaded from HuggingFace
    on first instantiation and cached at ~/.cache/huggingface/, so later
    starts use the local cache.
    """

    def __init__(self, model_name: str = _MODEL, device: str = _DEVICE) -> None:
        """Load ``model_name`` on ``device`` and measure its output dimension."""
        # Imported here so the dependency is only needed when this backend is used.
        from sentence_transformers import SentenceTransformer  # type: ignore[import]

        logger.info("Loading embedding model %r on device %r ...", model_name, device)
        self._model = SentenceTransformer(model_name, device=device)
        self._model_name = model_name
        # Measure the dimension with a throwaway encode instead of hard-coding it.
        probe = self._model.encode("test")
        self._dim: int = int(probe.shape[0])
        logger.info("Embedding model ready — dim=%d", self._dim)

    @property
    def dim(self) -> int:
        """Embedding dimension measured at construction time."""
        return self._dim

    @property
    def model_name(self) -> str:
        """Name or path the model was loaded from."""
        return self._model_name

    def embed(self, text: str) -> np.ndarray:
        """Encode one string into a normalised float32 vector."""
        return self._model.encode(
            text, convert_to_numpy=True, normalize_embeddings=True
        ).astype(np.float32)

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Encode several strings (batch size 32); an empty input returns ``[]``."""
        if not texts:
            return []
        encoded = self._model.encode(
            texts, convert_to_numpy=True, normalize_embeddings=True, batch_size=32
        )
        return [row.astype(np.float32) for row in encoded]
|
||||
|
||||
|
||||
# ── Ollama backend ────────────────────────────────────────────────────────────
|
||||
|
||||
class OllamaEmbedder:
    """HTTP embedding via a running Ollama instance.

    Requests are POSTed to ``<llm_url>/api/embeddings`` with the model name
    and the text as ``prompt``. The embedding is read from the ``embedding``
    field of the JSON response.
    """

    def __init__(
        self,
        model_name: str = _MODEL,
        llm_url: str = _LLM_URL,
        timeout: float = 30.0,
    ) -> None:
        """Create the HTTP client and probe the model's embedding dimension.

        Args:
            model_name: Model name sent with every request.
            llm_url: Base URL of the Ollama instance.
            timeout: Per-request timeout in seconds for the HTTP client.
        """
        import httpx  # already a project dependency

        self._model_name = model_name
        self._url = f"{llm_url.rstrip('/')}/api/embeddings"
        self._timeout = timeout
        self._client = httpx.Client(timeout=timeout)
        # Probe dimension with a test call
        self._dim = self._probe_dim()

    def _probe_dim(self) -> int:
        """Return the embedding length from a test request.

        Falls back to 768 when the request fails or returns no embedding.
        Without that second check a response lacking ``embedding`` would give
        a dimension of 0.
        """
        try:
            vec = self._raw_embed("probe")
        except Exception as exc:
            logger.warning("Ollama dim probe failed (%s) — defaulting to 768", exc)
            return 768
        if not vec:
            logger.warning("Ollama dim probe returned no embedding — defaulting to 768")
            return 768
        return len(vec)

    def _raw_embed(self, text: str) -> list[float]:
        """POST ``text`` to the embeddings endpoint and return the raw vector.

        Raises on a non-success HTTP status. Returns ``[]`` when the response
        has no ``embedding`` value.
        """
        resp = self._client.post(
            self._url, json={"model": self._model_name, "prompt": text}
        )
        resp.raise_for_status()
        return resp.json().get("embedding") or []

    @property
    def dim(self) -> int:
        """Embedding dimension determined at construction time."""
        return self._dim

    @property
    def model_name(self) -> str:
        """Model name sent with every request."""
        return self._model_name

    def embed(self, text: str) -> np.ndarray:
        """Embed one string and return it as a float32 array."""
        vec = self._raw_embed(text)
        return np.array(vec, dtype=np.float32)

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed each text in turn (one HTTP request per text)."""
        return [self.embed(t) for t in texts]
|
||||
|
||||
|
||||
# ── Singleton factory ─────────────────────────────────────────────────────────
|
||||
|
||||
_embedder: Embedder | None = None
|
||||
|
||||
|
||||
def get_embedder() -> Embedder | None:
    """Return the process-wide embedder, creating it on first use.

    The backend is chosen by ``_BACKEND``. On success the instance is stored
    in ``_embedder`` and ``EMBEDDING_AVAILABLE`` is set to ``True``. On any
    failure, or for an unknown backend name, a warning is logged and ``None``
    is returned. Nothing is stored in that case, so the next call tries again.
    """
    global _embedder, EMBEDDING_AVAILABLE

    if _embedder is None:
        if _BACKEND == "sentence_transformers":
            try:
                _embedder = SentenceTransformerEmbedder(_MODEL, _DEVICE)
                EMBEDDING_AVAILABLE = True
            except ImportError:
                logger.warning(
                    "sentence-transformers not installed — embeddings disabled. "
                    "Install with: pip install sentence-transformers"
                )
            except Exception as exc:
                logger.warning(
                    "Failed to load sentence-transformers model %r: %s", _MODEL, exc
                )
        elif _BACKEND == "ollama":
            try:
                _embedder = OllamaEmbedder(_MODEL, _LLM_URL)
                EMBEDDING_AVAILABLE = True
            except Exception as exc:
                logger.warning("Ollama embedder init failed: %s", exc)
        else:
            logger.warning(
                "Unknown TURNSTONE_EMBED_BACKEND %r — embeddings disabled", _BACKEND
            )

    return _embedder
|
||||
|
||||
|
||||
# ── BLOB serialisation helpers ────────────────────────────────────────────────
|
||||
|
||||
def pack_vector(vec: np.ndarray) -> bytes:
    """Serialise a float32 numpy vector to a SQLite BLOB.

    The values are converted to float32 and written as their raw bytes in
    native byte order, 4 bytes per element. This is the same layout
    ``struct.pack(f"{n}f", ...)`` produces, so existing blobs stay readable.

    Args:
        vec: 1-D vector; any array-like convertible to float32 is accepted.

    Returns:
        ``4 * len(vec)`` bytes.
    """
    # tobytes() copies the buffer directly instead of converting every element
    # to a Python float and passing them all as call arguments.
    return np.asarray(vec, dtype=np.float32).tobytes()
|
||||
|
||||
|
||||
def unpack_vector(blob: bytes) -> np.ndarray:
    """Rebuild a float32 numpy vector from a SQLite BLOB.

    The blob is read as consecutive 4-byte floats in native byte order.
    """
    count = len(blob) // 4  # each float32 occupies 4 bytes
    values = struct.unpack(str(count) + "f", blob)
    return np.array(values, dtype=np.float32)
|
||||
|
||||
|
||||
def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of two vectors.

    The dot product is divided by the product of both norms, so the inputs do
    not have to be normalised beforehand. If either vector has zero norm the
    result is ``0.0``.
    """
    length_a = np.linalg.norm(a)
    length_b = np.linalg.norm(b)
    if length_a == 0.0:
        return 0.0
    if length_b == 0.0:
        return 0.0
    similarity = np.dot(a, b) / (length_a * length_b)
    return float(similarity)
|
||||
|
|
@ -6,7 +6,7 @@ import sqlite3
|
|||
import uuid
|
||||
from pathlib import Path
|
||||
|
||||
from app.ingest.base import now_iso
|
||||
from app.glean.base import now_iso
|
||||
from app.services.models import Incident, ReceivedBundle
|
||||
from app.services.search import SearchResult, entries_in_window, search
|
||||
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ class RetrievedEntry:
|
|||
|
||||
entry_id: str
|
||||
source_id: str # log file path or service name
|
||||
sequence: int # original line number — ingest order, not wall-clock order
|
||||
sequence: int # original line number — glean order, not wall-clock order
|
||||
timestamp_raw: str | None # timestamp as it appeared in the log
|
||||
timestamp_iso: str | None # parsed to ISO 8601 for sorting; None if unparseable
|
||||
ingest_time: str # when Turnstone indexed this entry (wall clock)
|
||||
|
|
@ -25,7 +25,7 @@ class RetrievedEntry:
|
|||
|
||||
@dataclass(frozen=True)
|
||||
class LogPattern:
|
||||
"""A named regex pattern for tagging entries at ingest time."""
|
||||
"""A named regex pattern for tagging entries at glean time."""
|
||||
|
||||
name: str # e.g. "device_disconnect", "auth_failure"
|
||||
pattern: str # regex string
|
||||
|
|
|
|||
|
|
@ -451,9 +451,8 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis
|
|||
else:
|
||||
suppressed += 1
|
||||
|
||||
# When did we last ingest anything?
|
||||
last_row = conn.execute("SELECT MAX(ingest_time) AS t FROM log_entries").fetchone()
|
||||
last_ingested: str | None = last_row["t"] if last_row else None
|
||||
last_gleaned: str | None = last_row["t"] if last_row else None
|
||||
|
||||
conn.close()
|
||||
|
||||
|
|
@ -465,7 +464,7 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis
|
|||
"source_health": source_health,
|
||||
"recent_criticals": recent_criticals,
|
||||
"suppressed_criticals": suppressed,
|
||||
"last_ingested": last_ingested,
|
||||
"last_gleaned": last_gleaned,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,10 +1,10 @@
|
|||
"""Periodic batch ingest scheduler with optional CF submission.
|
||||
"""Periodic batch glean scheduler with optional CF submission.
|
||||
|
||||
Runs ingest_sources on a configurable interval (TURNSTONE_INGEST_INTERVAL env var,
|
||||
Runs glean_sources on a configurable interval (TURNSTONE_GLEAN_INTERVAL env var,
|
||||
default 900s / 15 min). Set to 0 to disable.
|
||||
|
||||
When TURNSTONE_SUBMIT_ENDPOINT is set, pushes pattern-matched entries to a remote
|
||||
Turnstone instance (the CF receiving store) after each ingest run.
|
||||
Turnstone instance (the CF receiving store) after each glean run.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
@ -19,7 +19,7 @@ from typing import Any
|
|||
|
||||
import httpx
|
||||
|
||||
from app.ingest.pipeline import ingest_sources
|
||||
from app.glean.pipeline import glean_sources
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -96,14 +96,14 @@ async def submit_matched(
|
|||
if not entries:
|
||||
return {"ok": True, "submitted": 0, "skipped": True}
|
||||
|
||||
url = f"{submit_endpoint.rstrip('/')}/turnstone/api/ingest/batch"
|
||||
url = f"{submit_endpoint.rstrip('/')}/turnstone/api/glean/batch"
|
||||
payload = {"source_host": source_host, "entries": entries}
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
resp = await client.post(url, json=payload)
|
||||
resp.raise_for_status()
|
||||
result = resp.json()
|
||||
submitted = result.get("ingested", len(entries))
|
||||
submitted = result.get("gleaned", len(entries))
|
||||
_state.last_submitted_at = datetime.now(tz=timezone.utc).isoformat()
|
||||
_state.last_submit_count = submitted
|
||||
_state.last_submit_error = None
|
||||
|
|
@ -121,10 +121,15 @@ async def run_once(
|
|||
pattern_file: Path | None = None,
|
||||
submit_endpoint: str | None = None,
|
||||
source_host: str = "unknown",
|
||||
force: bool = False,
|
||||
) -> dict[str, Any]:
|
||||
"""Ingest all sources once, then submit matched entries if configured."""
|
||||
"""Glean all sources once, then submit matched entries if configured.
|
||||
|
||||
Pass ``force=True`` to bypass fingerprint checks and re-glean all local
|
||||
file sources regardless of whether they appear unchanged.
|
||||
"""
|
||||
if _lock.locked():
|
||||
return {"ok": False, "error": "ingest already running", "skipped": True}
|
||||
return {"ok": False, "error": "glean already running", "skipped": True}
|
||||
|
||||
async with _lock:
|
||||
_state.running = True
|
||||
|
|
@ -133,7 +138,7 @@ async def run_once(
|
|||
loop = asyncio.get_running_loop()
|
||||
stats: dict[str, int] = await loop.run_in_executor(
|
||||
None,
|
||||
lambda: ingest_sources(sources_file, db_path, pattern_file),
|
||||
lambda: glean_sources(sources_file, db_path, pattern_file, force=force),
|
||||
)
|
||||
duration = (datetime.now(tz=timezone.utc) - started).total_seconds()
|
||||
_state.last_run_at = started.isoformat()
|
||||
|
|
@ -141,14 +146,14 @@ async def run_once(
|
|||
_state.last_stats = stats
|
||||
_state.last_error = None
|
||||
_state.run_count += 1
|
||||
logger.info("Batch ingest complete in %.1fs — %s", duration, stats)
|
||||
logger.info("Batch glean complete in %.1fs — %s", duration, stats)
|
||||
except Exception as exc:
|
||||
duration = (datetime.now(tz=timezone.utc) - started).total_seconds()
|
||||
_state.last_run_at = started.isoformat()
|
||||
_state.last_duration_s = round(duration, 2)
|
||||
_state.last_error = str(exc)
|
||||
_state.run_count += 1
|
||||
logger.error("Batch ingest failed: %s", exc)
|
||||
logger.error("Batch glean failed: %s", exc)
|
||||
_state.running = False
|
||||
return {"ok": False, "error": str(exc)}
|
||||
finally:
|
||||
|
|
@ -168,7 +173,7 @@ async def scheduler_loop(
|
|||
submit_endpoint: str | None = None,
|
||||
source_host: str = "unknown",
|
||||
) -> None:
|
||||
"""Run ingest + optional submission every interval_s seconds until cancelled."""
|
||||
"""Run glean + optional submission every interval_s seconds until cancelled."""
|
||||
logger.info("Ingest scheduler started — interval %ds, sources: %s", interval_s, sources_file)
|
||||
if submit_endpoint:
|
||||
logger.info("Submission enabled — endpoint: %s", submit_endpoint)
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
"""Live watch: tail active log sources and ingest entries in near-real-time.
|
||||
"""Live watch: tail active log sources and glean entries in near-real-time.
|
||||
|
||||
Each WatchSource runs a subprocess (journalctl -f, podman/docker logs -f)
|
||||
in a daemon thread and pipes lines through the existing ingestors into SQLite.
|
||||
|
|
@ -18,12 +18,12 @@ from typing import Iterator
|
|||
|
||||
import yaml
|
||||
|
||||
from app.ingest import journald as journald_parser, syslog as syslog_parser
|
||||
from app.ingest import plaintext as plaintext_parser, servarr as servarr_parser, plex as plex_parser
|
||||
from app.ingest import qbittorrent as qbit_parser, caddy as caddy_parser
|
||||
from app.ingest.pipeline import _detect_format
|
||||
from app.ingest.base import _compile, load_patterns, now_iso
|
||||
from app.ingest.pipeline import _write_batch, _SCHEMA
|
||||
from app.glean import journald as journald_parser, syslog as syslog_parser
|
||||
from app.glean import plaintext as plaintext_parser, servarr as servarr_parser, plex as plex_parser
|
||||
from app.glean import qbittorrent as qbit_parser, caddy as caddy_parser
|
||||
from app.glean.pipeline import _detect_format
|
||||
from app.glean.base import _compile, load_patterns, now_iso
|
||||
from app.glean.pipeline import _write_batch, _SCHEMA
|
||||
from app.services.search import build_fts_index
|
||||
from app.services.models import RetrievedEntry
|
||||
|
||||
|
|
@ -85,7 +85,7 @@ class WatchSource:
|
|||
"source_id": self.config.source_id,
|
||||
"type": self.config.source_type,
|
||||
"running": self._thread is not None and self._thread.is_alive(),
|
||||
"entries_ingested": self._entry_count,
|
||||
"entries_gleaned": self._entry_count,
|
||||
"last_event": self._last_event,
|
||||
"error": self._error,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ notification agent:
|
|||
## Webhook URL
|
||||
|
||||
```
|
||||
http://<turnstone-host>:8534/turnstone/api/ingest/tautulli
|
||||
http://<turnstone-host>:8534/turnstone/api/glean/tautulli
|
||||
```
|
||||
|
||||
Replace `<turnstone-host>` with the hostname or IP of the machine running
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
"""Turnstone Harvester — collect logs and ship them to a Turnstone instance.
|
||||
|
||||
Subcommands:
|
||||
push Read sources.yaml, POST each log file to Turnstone /api/ingest/upload
|
||||
push Read sources.yaml, POST each log file to Turnstone /api/glean/upload
|
||||
incident Tag an incident on the remote Turnstone instance
|
||||
|
||||
Usage:
|
||||
|
|
@ -97,8 +97,8 @@ def cmd_push(args: argparse.Namespace) -> int:
|
|||
logger.warning("No sources defined in %s", sources_path)
|
||||
return 0
|
||||
|
||||
upload_url = args.url.rstrip("/") + "/turnstone/api/ingest/upload"
|
||||
total_ingested = 0
|
||||
upload_url = args.url.rstrip("/") + "/turnstone/api/glean/upload"
|
||||
total_gleaned = 0
|
||||
errors = 0
|
||||
|
||||
for src in sources:
|
||||
|
|
@ -110,9 +110,9 @@ def cmd_push(args: argparse.Namespace) -> int:
|
|||
logger.info("Pushing %s (%s) ...", src_id, src_path)
|
||||
try:
|
||||
result = _post_file(upload_url, src_path, src_id)
|
||||
count = result.get("ingested", 0)
|
||||
total_ingested += count
|
||||
logger.info(" %s: %d entries ingested", src_id, count)
|
||||
count = result.get("gleaned", 0)
|
||||
total_gleaned += count
|
||||
logger.info(" %s: %d entries gleaned", src_id, count)
|
||||
except urllib.error.HTTPError as exc:
|
||||
logger.error(" %s: HTTP %d — %s", src_id, exc.code, exc.read().decode(errors="replace"))
|
||||
errors += 1
|
||||
|
|
@ -120,7 +120,7 @@ def cmd_push(args: argparse.Namespace) -> int:
|
|||
logger.error(" %s: %s", src_id, exc)
|
||||
errors += 1
|
||||
|
||||
logger.info("Done. Total ingested: %d entries, errors: %d", total_ingested, errors)
|
||||
logger.info("Done. Total gleaned: %d entries, errors: %d", total_gleaned, errors)
|
||||
return 1 if errors else 0
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -46,6 +46,6 @@ sources:
|
|||
# Wazuh SIEM — alerts.json on the Wazuh manager
|
||||
# Turnstone auto-detects this format; source_id is qualified per agent automatically.
|
||||
# For push-based ingestion from Wazuh custom integrations, use:
|
||||
# POST /api/ingest/wazuh/alert (single alert JSON body)
|
||||
# POST /api/glean/wazuh/alert (single alert JSON body)
|
||||
# - id: wazuh
|
||||
# path: /var/ossec/logs/alerts/alerts.json
|
||||
|
|
|
|||
30
manage.sh
30
manage.sh
|
|
@ -120,9 +120,9 @@ usage() {
|
|||
echo -e " ${GREEN}dev${NC} uvicorn --reload (:${API_PORT}) + Vite HMR (:${VITE_PORT})"
|
||||
echo ""
|
||||
echo " Data:"
|
||||
echo -e " ${GREEN}ingest PATH [DB]${NC} Ingest a log file or corpus directory"
|
||||
echo -e " ${GREEN}ingest-plex [HOST]${NC} Pull Plex log from Cass (or HOST) and ingest"
|
||||
echo -e " ${GREEN}ingest-qbit [HOST]${NC} Pull qBittorrent log locally or from HOST via SSH"
|
||||
echo -e " ${GREEN}glean PATH [DB]${NC} Glean a log file or corpus directory"
|
||||
echo -e " ${GREEN}glean-plex [HOST]${NC} Pull Plex log from Cass (or HOST) and glean"
|
||||
echo -e " ${GREEN}glean-qbit [HOST]${NC} Pull qBittorrent log locally or from HOST via SSH"
|
||||
echo -e " ${GREEN}build-fts${NC} Rebuild the FTS search index"
|
||||
echo ""
|
||||
echo " Tests:"
|
||||
|
|
@ -134,8 +134,8 @@ usage() {
|
|||
echo " Examples:"
|
||||
echo " ./manage.sh start"
|
||||
echo " ./manage.sh dev"
|
||||
echo " ./manage.sh ingest corpus/raw/"
|
||||
echo " ./manage.sh ingest corpus/raw/ data/custom.db"
|
||||
echo " ./manage.sh glean corpus/raw/"
|
||||
echo " ./manage.sh glean corpus/raw/ data/custom.db"
|
||||
echo ""
|
||||
}
|
||||
|
||||
|
|
@ -231,15 +231,15 @@ case "$CMD" in
|
|||
(cd web && npm run dev -- --port "$VITE_PORT")
|
||||
;;
|
||||
|
||||
ingest)
|
||||
glean)
|
||||
if [[ $# -lt 1 ]]; then
|
||||
error "Usage: ./manage.sh ingest <file_or_dir> [DB_PATH]"
|
||||
error "Usage: ./manage.sh glean <file_or_dir> [DB_PATH]"
|
||||
fi
|
||||
info "Ingesting $1 → ${2:-$DB}…"
|
||||
"$PYTHON" scripts/ingest_corpus.py "$1" "${2:-$DB}"
|
||||
info "Gleaning $1 → ${2:-$DB}…"
|
||||
"$PYTHON" scripts/glean_corpus.py "$1" "${2:-$DB}"
|
||||
;;
|
||||
|
||||
ingest-plex)
|
||||
glean-plex)
|
||||
PLEX_HOST="${1:-cass}"
|
||||
PLEX_LOG_DIR="/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs"
|
||||
TMP_DIR="/tmp/turnstone-plex-$$"
|
||||
|
|
@ -264,16 +264,16 @@ case "$CMD" in
|
|||
ssh "$PLEX_HOST" "cat '${remote_path}'" > "$local_path"
|
||||
done
|
||||
|
||||
info "Ingesting ${#REMOTE_LOGS[@]} log file(s) into ${DB}…"
|
||||
info "Gleaning ${#REMOTE_LOGS[@]} log file(s) into ${DB}…"
|
||||
for f in "$TMP_DIR"/*.log; do
|
||||
"$PYTHON" scripts/ingest_corpus.py "$f" "$DB"
|
||||
"$PYTHON" scripts/glean_corpus.py "$f" "$DB"
|
||||
done
|
||||
rm -rf "$TMP_DIR"
|
||||
info "Done. Restarting server…"
|
||||
exec bash "$0" restart
|
||||
;;
|
||||
|
||||
ingest-qbit)
|
||||
glean-qbit)
|
||||
QBIT_HOST="${1:-}"
|
||||
# Default log locations in priority order
|
||||
QBIT_LOG_PATHS=(
|
||||
|
|
@ -316,8 +316,8 @@ case "$CMD" in
|
|||
info " ← ${LOCAL_LOG}"
|
||||
fi
|
||||
|
||||
info "Ingesting into ${DB}…"
|
||||
"$PYTHON" scripts/ingest_corpus.py "${TMP_DIR}"/*.log "$DB"
|
||||
info "Gleaning into ${DB}…"
|
||||
"$PYTHON" scripts/glean_corpus.py "${TMP_DIR}"/*.log "$DB"
|
||||
rm -rf "$TMP_DIR"
|
||||
info "Done. Restarting server…"
|
||||
exec bash "$0" restart
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
# Turnstone pattern library — named regex patterns for log tagging at ingest time.
|
||||
# Turnstone pattern library — named regex patterns for log tagging at glean time.
|
||||
# Each matched pattern name is stored on RetrievedEntry.matched_patterns and
|
||||
# used to boost retrieval relevance for diagnostic queries.
|
||||
#
|
||||
|
|
@ -128,6 +128,21 @@ patterns:
|
|||
severity: ERROR
|
||||
description: NFS mount or RPC timeout
|
||||
|
||||
- name: service_crash_loop
|
||||
pattern: "(restart counter is at [0-9]|start request repeated too quickly|Restart limit hit)"
|
||||
severity: WARN
|
||||
description: systemd service crash-looping — restart counter incrementing or rate-limit hit; check for DNS resolution failures, missing dependencies, or bad config
|
||||
|
||||
- name: pkg_daemon_restart
|
||||
pattern: "(invoke-rc\\.d|Unit process.*(apt-get|dpkg|preinst).*remains running after unit stopped|Stopped.*service.*openssh|Restarting.*OpenBSD Secure Shell)"
|
||||
severity: WARN
|
||||
description: Package manager restarted a system daemon — active SSH or service sessions may have been interrupted
|
||||
|
||||
- name: ssh_forward_conflict
|
||||
pattern: "(channel_setup_fwd_listener_tcpip: cannot listen to port|error: bind.*Address already in use)"
|
||||
severity: WARN
|
||||
description: SSH port-forward conflict — previous session port still bound; stale sessions accumulating or rapid reconnects
|
||||
|
||||
# Add device/service-specific patterns below this line:
|
||||
|
||||
- name: qbit_tracker_error
|
||||
|
|
|
|||
|
|
@ -1,15 +1,15 @@
|
|||
# Turnstone log sources — Heimdall cluster ingest.
|
||||
# Turnstone log sources — Heimdall cluster glean.
|
||||
# Covers: Heimdall (local), Navi, Sif, Cass, Strahl (SSH-collected),
|
||||
# Docker services on Heimdall, and network device syslog.
|
||||
#
|
||||
# Collected by scripts/collect_cluster_logs.sh before each ingest run.
|
||||
# Collected by scripts/collect_cluster_logs.sh before each glean run.
|
||||
# All paths are container-side (/data/ = bind-mount of /devl/turnstone-cluster/data/).
|
||||
#
|
||||
# Cron (collect + ingest, every 15 min):
|
||||
# Cron (collect + glean, every 15 min):
|
||||
# */15 * * * * bash /Library/Development/CircuitForge/turnstone/scripts/collect_cluster_logs.sh && \
|
||||
# docker exec turnstone-cluster python scripts/ingest_corpus.py \
|
||||
# docker exec turnstone-cluster python scripts/glean_corpus.py \
|
||||
# --sources /patterns/sources-cluster.yaml --db /data/turnstone.db \
|
||||
# >> /var/log/turnstone-cluster-ingest.log 2>&1
|
||||
# >> /var/log/turnstone-cluster-glean.log 2>&1
|
||||
|
||||
sources:
|
||||
# ── Heimdall (local) ─────────────────────────────────────────────────────────
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
# Turnstone log sources — edit this file to add or remove services.
|
||||
# NOTE: the system-journal entry requires export_journal.sh to run on the HOST
|
||||
# before the container ingest step. See crontab setup instructions in the README.
|
||||
# Run ingest manually:
|
||||
# sudo podman exec turnstone python scripts/ingest_corpus.py \
|
||||
# before the container glean step. See crontab setup instructions in the README.
|
||||
# Run glean manually:
|
||||
# sudo podman exec turnstone python scripts/glean_corpus.py \
|
||||
# --sources /patterns/sources.yaml --db /data/turnstone.db
|
||||
#
|
||||
# Paths here are container-side paths under the /opt bind mount.
|
||||
|
|
@ -12,7 +12,7 @@
|
|||
sources:
|
||||
# ── System (exported by export_journal.sh on the host) ───────────────────
|
||||
# journal-export.jsonl and dmesg-export.txt are written to /opt/turnstone/data/
|
||||
# by the export script before each ingest run.
|
||||
# by the export script before each glean run.
|
||||
- id: system-journal
|
||||
path: /data/journal-export.jsonl
|
||||
|
||||
|
|
@ -73,7 +73,7 @@ sources:
|
|||
|
||||
# ── MQTT / IoT (live — subscribe mode, no path needed) ───────────────────
|
||||
# Requires: pip install circuitforge-core[mqtt]
|
||||
# These sources are handled by the live MQTT subscriber task (not batch ingest).
|
||||
# These sources are handled by the live MQTT subscriber task (not batch glean).
|
||||
# Uncomment and configure to enable.
|
||||
#
|
||||
# Meshtastic MQTT bridge (node must have MQTT uplink enabled):
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
# podman-standalone.sh — Turnstone rootful Podman setup (no Compose)
|
||||
#
|
||||
# For hosts running system Podman (non-rootless) with systemd.
|
||||
# Turnstone is a diagnostic log intelligence layer — ingest service logs,
|
||||
# Turnstone is a diagnostic log intelligence layer — glean service logs,
|
||||
# search by symptom, and view incidents in a lightweight web UI.
|
||||
#
|
||||
# ── Prerequisites ────────────────────────────────────────────────────────────
|
||||
|
|
@ -28,18 +28,18 @@
|
|||
# sudo systemctl daemon-reload
|
||||
# sudo systemctl enable --now turnstone
|
||||
#
|
||||
# ── Ingesting logs ────────────────────────────────────────────────────────────
|
||||
# ── Gleaning logs ─────────────────────────────────────────────────────────────
|
||||
# All service logs under /opt are accessible inside the container.
|
||||
# Sources are configured in patterns/sources.yaml (bind-mounted at /patterns/).
|
||||
#
|
||||
# To ingest all sources (run manually or via cron):
|
||||
# To glean all sources (run manually or via cron):
|
||||
#
|
||||
# sudo podman exec turnstone python scripts/ingest_corpus.py \
|
||||
# sudo podman exec turnstone python scripts/glean_corpus.py \
|
||||
# --sources /patterns/sources.yaml --db /data/turnstone.db
|
||||
#
|
||||
# Example cron (every 15 minutes, add to root's crontab with: sudo crontab -e):
|
||||
# */15 * * * * podman exec turnstone python scripts/ingest_corpus.py \
|
||||
# --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-ingest.log 2>&1
|
||||
# */15 * * * * podman exec turnstone python scripts/glean_corpus.py \
|
||||
# --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-glean.log 2>&1
|
||||
#
|
||||
# To add a new log source: edit /opt/turnstone/patterns/sources.yaml — no restart needed.
|
||||
#
|
||||
|
|
@ -73,7 +73,7 @@ TZ=America/Los_Angeles
|
|||
#
|
||||
# ── Orchard submission (opt-in telemetry) ────────────────────────────────────
|
||||
# Set TURNSTONE_SUBMIT_ENDPOINT to push pattern-matched log entries to a CF
|
||||
# receiving instance after each ingest run. Only matched entries are sent —
|
||||
# receiving instance after each glean run. Only matched entries are sent —
|
||||
# no raw log content. Used to build Avocet training data.
|
||||
#
|
||||
# export TURNSTONE_SUBMIT_ENDPOINT=https://harvest.circuitforge.tech/xander
|
||||
|
|
@ -142,8 +142,8 @@ echo "Check container health with:"
|
|||
echo " sudo podman ps"
|
||||
echo " sudo podman logs turnstone"
|
||||
echo ""
|
||||
echo "To ingest all sources now:"
|
||||
echo " sudo podman exec turnstone python scripts/ingest_corpus.py \\"
|
||||
echo "To glean all sources now:"
|
||||
echo " sudo podman exec turnstone python scripts/glean_corpus.py \\"
|
||||
echo " --sources /patterns/sources.yaml --db /data/turnstone.db"
|
||||
echo ""
|
||||
echo "To add a new source: edit /opt/turnstone/patterns/sources.yaml — no restart needed."
|
||||
|
|
|
|||
|
|
@ -6,3 +6,4 @@ aiofiles>=23.0.0
|
|||
python-multipart>=0.0.9
|
||||
dateparser>=1.2.0
|
||||
httpx>=0.27.0
|
||||
paramiko
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
"""CLI: build (or update) the FTS5 full-text search index after ingest."""
|
||||
"""CLI: build (or update) the FTS5 full-text search index after glean."""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
|
|
@ -13,7 +13,7 @@ if __name__ == "__main__":
|
|||
|
||||
if not db_path.exists():
|
||||
print(f"ERROR: database not found: {db_path}", file=sys.stderr)
|
||||
print("Run ingest first: python scripts/ingest_corpus.py", file=sys.stderr)
|
||||
print("Run glean first: python scripts/glean_corpus.py", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
print(f"Building FTS index for {db_path} ...")
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ SSH_OPTS="-o ConnectTimeout=5 -o BatchMode=yes -o StrictHostKeyChecking=no"
|
|||
PYTHON=/devl/miniconda3/envs/cf/bin/python
|
||||
# Path to the glean CLI (formerly scripts/ingest_corpus.py). The variable name is kept as INGEST so the ${INGEST} invocations later in this script stay unchanged.
INGEST="${PYTHON} /Library/Development/CircuitForge/turnstone/scripts/glean_corpus.py"
|
||||
DB=/devl/turnstone-cluster/data/turnstone.db
|
||||
LOG=/devl/turnstone-cluster/data/ingest.log
|
||||
LOG=/devl/turnstone-cluster/data/glean.log
|
||||
|
||||
mkdir -p "${DATA_DIR}"
|
||||
|
||||
|
|
@ -141,7 +141,7 @@ fi
|
|||
# Remote journals (explicit source IDs via YAML)
|
||||
${INGEST} --sources /devl/turnstone-cluster/patterns/sources-cluster.yaml --db "${DB}"
|
||||
|
||||
# Docker and Plex logs (source IDs derived from filenames by directory ingest)
|
||||
# Docker and Plex logs (source IDs derived from filenames by directory glean)
|
||||
for dir in "${HEIMDALL_DIR}" "${NAVI_DIR}" "${STRAHL_DIR}" "${PLEX_DIR}"; do
|
||||
[[ -d "${dir}" ]] && ls "${dir}"/*.jsonl "${dir}"/*.log 2>/dev/null | grep -q . && \
|
||||
${INGEST} "${dir}" "${DB}" || true
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
#!/usr/bin/env bash
|
||||
# Export recent system messages to files the Turnstone container can ingest.
|
||||
# Export recent system messages to files the Turnstone container can glean.
|
||||
#
|
||||
# Exports:
|
||||
# journal-export.jsonl — journald (if journalctl is available)
|
||||
|
|
@ -11,11 +11,11 @@
|
|||
# Usage (standalone):
|
||||
# sudo bash /opt/turnstone/scripts/export_journal.sh
|
||||
#
|
||||
# Cron (combined with ingest):
|
||||
# Cron (combined with glean):
|
||||
# */15 * * * * bash /opt/turnstone/scripts/export_journal.sh && \
|
||||
# podman exec turnstone python scripts/glean_corpus.py \
|
||||
# --sources /patterns/sources.yaml --db /data/turnstone.db \
|
||||
# >> /var/log/turnstone-ingest.log 2>&1
|
||||
# >> /var/log/turnstone-glean.log 2>&1
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
"""CLI: ingest a log file or corpus directory into the Turnstone SQLite database.
|
||||
"""CLI: glean a log file or corpus directory into the Turnstone SQLite database.
|
||||
|
||||
Usage:
|
||||
# Single file or directory (legacy)
|
||||
python scripts/ingest_corpus.py <file_or_dir> [db_path]
|
||||
python scripts/glean_corpus.py <file_or_dir> [db_path]
|
||||
|
||||
# Sources config (multi-service)
|
||||
python scripts/ingest_corpus.py --sources <sources.yaml> [--db <db_path>]
|
||||
python scripts/glean_corpus.py --sources <sources.yaml> [--db <db_path>]
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
|
|
@ -17,7 +17,7 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
|||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
from app.ingest.pipeline import ingest, ingest_file, ingest_sources
|
||||
from app.glean.pipeline import glean_dir, glean_file, glean_sources
|
||||
|
||||
|
||||
def _print_stats(stats: dict[str, int]) -> None:
|
||||
|
|
@ -33,33 +33,33 @@ if __name__ == "__main__":
|
|||
if not args:
|
||||
print(
|
||||
"Usage:\n"
|
||||
" ingest_corpus.py <file_or_dir> [db_path]\n"
|
||||
" ingest_corpus.py --sources <sources.yaml> [--db <db_path>]",
|
||||
" glean_corpus.py <file_or_dir> [db_path]\n"
|
||||
" glean_corpus.py --sources <sources.yaml> [--db <db_path>]",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
if args[0] == "--sources":
|
||||
if len(args) < 2:
|
||||
print("Usage: ingest_corpus.py --sources <sources.yaml> [--db <db_path>]", file=sys.stderr)
|
||||
print("Usage: glean_corpus.py --sources <sources.yaml> [--db <db_path>]", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
sources_file = Path(args[1])
|
||||
db_path = Path("data/turnstone.db")
|
||||
if "--db" in args:
|
||||
db_path = Path(args[args.index("--db") + 1])
|
||||
db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
print(f"Ingesting sources from {sources_file} → {db_path}")
|
||||
stats = ingest_sources(sources_file, db_path)
|
||||
print(f"Gleaning sources from {sources_file} → {db_path}")
|
||||
stats = glean_sources(sources_file, db_path)
|
||||
_print_stats(stats)
|
||||
else:
|
||||
target = Path(args[0])
|
||||
db_path = Path(args[1]) if len(args) > 1 else Path("data/turnstone.db")
|
||||
db_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
print(f"Ingesting {target} → {db_path}")
|
||||
print(f"Gleaning {target} → {db_path}")
|
||||
if target.is_file():
|
||||
stats = ingest_file(target, db_path)
|
||||
stats = glean_file(target, db_path)
|
||||
elif target.is_dir():
|
||||
stats = ingest(target, db_path)
|
||||
stats = glean_dir(target, db_path)
|
||||
else:
|
||||
print(f"Error: {target} is not a file or directory", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
|
@ -3,7 +3,7 @@ import sqlite3
|
|||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
from app.ingest.doc_upload import ingest_upload
|
||||
from app.glean.doc_upload import glean_upload
|
||||
from app.context.store import list_facts, list_documents
|
||||
from app.context.chunker import UnsupportedDocType
|
||||
|
||||
|
|
@ -40,7 +40,7 @@ services:
|
|||
ports:
|
||||
- "32400:32400"
|
||||
"""
|
||||
result = ingest_upload(db, "docker-compose.yml", yaml_bytes)
|
||||
result = glean_upload(db, "docker-compose.yml", yaml_bytes)
|
||||
assert result["doc_type"] == "yaml"
|
||||
assert result["facts_written"] >= 1
|
||||
assert result["chunks_written"] >= 1
|
||||
|
|
@ -53,7 +53,7 @@ services:
|
|||
|
||||
def test_ingest_markdown_no_facts(db):
|
||||
md = b"# Runbook\n\nRestart plex with `systemctl restart plex`."
|
||||
result = ingest_upload(db, "runbook.md", md)
|
||||
result = glean_upload(db, "runbook.md", md)
|
||||
assert result["doc_type"] == "markdown"
|
||||
assert result["facts_written"] == 0
|
||||
assert result["chunks_written"] >= 1
|
||||
|
|
@ -61,4 +61,4 @@ def test_ingest_markdown_no_facts(db):
|
|||
|
||||
def test_ingest_raises_on_bad_type(db):
|
||||
with pytest.raises(UnsupportedDocType):
|
||||
ingest_upload(db, "report.pdf", b"data")
|
||||
glean_upload(db, "report.pdf", b"data")
|
||||
|
|
|
|||
|
|
@ -1,13 +1,17 @@
|
|||
"""Tests for app/context/embedder.py — graceful no-op without sqlite-vec."""
|
||||
"""Tests for app/context/embedder.py — delegates to app.services.embeddings."""
|
||||
import sqlite3
|
||||
import struct
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from app.context import embedder as emb_mod
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def db(tmp_path):
|
||||
@pytest.fixture()
|
||||
def db(tmp_path: Path) -> Path:
|
||||
db_path = tmp_path / "t.db"
|
||||
conn = sqlite3.connect(str(db_path))
|
||||
conn.executescript("""
|
||||
|
|
@ -20,34 +24,78 @@ def db(tmp_path):
|
|||
REFERENCES context_documents(id) ON DELETE CASCADE,
|
||||
chunk_index INTEGER NOT NULL, text TEXT NOT NULL, embedding BLOB
|
||||
);
|
||||
INSERT INTO context_documents VALUES ('d1','test.md','markdown','hello',5,'2026-01-01T00:00:00+00:00');
|
||||
INSERT INTO context_documents
|
||||
VALUES ('d1','test.md','markdown','hello',5,'2026-01-01T00:00:00+00:00');
|
||||
INSERT INTO context_chunks VALUES ('c1','d1',0,'hello world',NULL);
|
||||
INSERT INTO context_chunks VALUES ('c2','d1',1,'second chunk',NULL);
|
||||
""")
|
||||
conn.commit()
|
||||
conn.close()
|
||||
return db_path
|
||||
|
||||
|
||||
def test_embed_skipped_when_extension_absent(db):
|
||||
with patch.object(emb_mod, "EMBEDDING_AVAILABLE", False):
|
||||
count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434")
|
||||
def _mock_embedder(dim: int = 3) -> MagicMock:
|
||||
"""Return a mock Embedder that returns constant dim-length vectors."""
|
||||
m = MagicMock()
|
||||
m.dim = dim
|
||||
m.embed_batch.return_value = [np.zeros(dim, dtype=np.float32)] * 10
|
||||
return m
|
||||
|
||||
|
||||
class TestEmbedChunks:
|
||||
def test_returns_zero_when_no_embedder(self, db: Path) -> None:
|
||||
with patch("app.context.embedder.get_embedder", return_value=None):
|
||||
count = emb_mod.embed_chunks(db, "d1")
|
||||
assert count == 0
|
||||
|
||||
|
||||
def test_embed_calls_ollama_when_available(db):
|
||||
import httpx
|
||||
|
||||
class FakeResponse:
|
||||
status_code = 200
|
||||
def raise_for_status(self): pass
|
||||
def json(self): return {"embedding": [0.1, 0.2, 0.3]}
|
||||
|
||||
with patch.object(emb_mod, "EMBEDDING_AVAILABLE", True), \
|
||||
patch("app.context.embedder.httpx.post", return_value=FakeResponse()):
|
||||
count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434")
|
||||
assert count == 1
|
||||
# Verify blob was written
|
||||
def test_returns_zero_when_no_unembedded_chunks(self, db: Path) -> None:
|
||||
# Pre-fill both chunks with a blob
|
||||
blob = struct.pack("3f", 0.1, 0.2, 0.3)
|
||||
conn = sqlite3.connect(str(db))
|
||||
row = conn.execute("SELECT embedding FROM context_chunks WHERE id='c1'").fetchone()
|
||||
conn.execute("UPDATE context_chunks SET embedding=?", (blob,))
|
||||
conn.commit()
|
||||
conn.close()
|
||||
assert row[0] is not None
|
||||
|
||||
embedder = _mock_embedder()
|
||||
with patch("app.context.embedder.get_embedder", return_value=embedder):
|
||||
count = emb_mod.embed_chunks(db, "d1")
|
||||
assert count == 0
|
||||
embedder.embed_batch.assert_not_called()
|
||||
|
||||
def test_embeds_all_null_chunks(self, db: Path) -> None:
|
||||
embedder = _mock_embedder(dim=3)
|
||||
with patch("app.context.embedder.get_embedder", return_value=embedder):
|
||||
count = emb_mod.embed_chunks(db, "d1")
|
||||
assert count == 2 # two chunks in fixture
|
||||
|
||||
def test_blobs_written_to_db(self, db: Path) -> None:
|
||||
vec = np.array([0.1, 0.2, 0.3], dtype=np.float32)
|
||||
embedder = _mock_embedder(dim=3)
|
||||
embedder.embed_batch.return_value = [vec, vec]
|
||||
|
||||
with patch("app.context.embedder.get_embedder", return_value=embedder):
|
||||
emb_mod.embed_chunks(db, "d1")
|
||||
|
||||
conn = sqlite3.connect(str(db))
|
||||
rows = conn.execute(
|
||||
"SELECT embedding FROM context_chunks WHERE document_id='d1'"
|
||||
).fetchall()
|
||||
conn.close()
|
||||
for (blob,) in rows:
|
||||
assert blob is not None
|
||||
unpacked = struct.unpack(f"{len(blob)//4}f", blob)
|
||||
assert len(unpacked) == 3
|
||||
|
||||
def test_legacy_llm_url_param_accepted(self, db: Path) -> None:
|
||||
"""Ensure backward-compat signature still works (llm_url ignored)."""
|
||||
embedder = _mock_embedder()
|
||||
with patch("app.context.embedder.get_embedder", return_value=embedder):
|
||||
count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434", "nomic-embed-text")
|
||||
assert count == 2
|
||||
|
||||
def test_embed_batch_error_returns_zero(self, db: Path) -> None:
|
||||
embedder = _mock_embedder()
|
||||
embedder.embed_batch.side_effect = RuntimeError("model exploded")
|
||||
with patch("app.context.embedder.get_embedder", return_value=embedder):
|
||||
count = emb_mod.embed_chunks(db, "d1")
|
||||
assert count == 0
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@
|
|||
import sqlite3
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
|
||||
|
||||
def test_context_tables_created(tmp_path):
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from unittest.mock import MagicMock, patch
|
|||
@pytest.fixture
|
||||
def client(tmp_path):
|
||||
from fastapi.testclient import TestClient
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
import app.rest as rest_module
|
||||
|
||||
db = tmp_path / "test.db"
|
||||
|
|
@ -25,7 +25,7 @@ def client(tmp_path):
|
|||
@pytest.fixture
|
||||
def client_with_candidate(tmp_path):
|
||||
from fastapi.testclient import TestClient
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
import app.rest as rest_module
|
||||
import sqlite3, uuid
|
||||
|
||||
|
|
|
|||
245
tests/test_diagnose_classifier.py
Normal file
245
tests/test_diagnose_classifier.py
Normal file
|
|
@ -0,0 +1,245 @@
|
|||
"""Tests for app/services/diagnose/classifier.py — SeverityClassifier.
|
||||
|
||||
All ML-path tests mock ``transformers.pipeline`` so no model weights are
|
||||
downloaded during the test suite.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import FrozenInstanceError
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
import app.services.diagnose.classifier as clf_module
|
||||
from app.services.diagnose.classifier import SeverityClassifier
|
||||
from app.services.diagnose.models import ClassifiedTimeline, EventCluster, TimelineResult
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def reset_ml_singleton():
|
||||
"""Ensure the module-level ML singleton is cleared before and after each test."""
|
||||
clf_module._ml_classifier = None
|
||||
yield
|
||||
clf_module._ml_classifier = None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test-object builders
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_cluster(
|
||||
representative_text: str = "test log",
|
||||
pattern_tags: tuple[str, ...] = (),
|
||||
severity: str = "INFO",
|
||||
) -> EventCluster:
|
||||
return EventCluster(
|
||||
cluster_id="abc123",
|
||||
entries=("e1",),
|
||||
start_iso=None,
|
||||
end_iso=None,
|
||||
duration_seconds=0.0,
|
||||
source_ids=("src",),
|
||||
pattern_tags=pattern_tags,
|
||||
severity=severity, # type: ignore[arg-type]
|
||||
burst=False,
|
||||
gap_before_seconds=0.0,
|
||||
representative_text=representative_text,
|
||||
)
|
||||
|
||||
|
||||
def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult:
|
||||
return TimelineResult(
|
||||
clusters=clusters,
|
||||
total_entries=0,
|
||||
window_start=None,
|
||||
window_end=None,
|
||||
gap_count=0,
|
||||
burst_count=0,
|
||||
dominant_sources=(),
|
||||
)
|
||||
|
||||
|
||||
def _mock_hf_pipeline(label: str, score: float) -> MagicMock:
|
||||
"""Return a mock HF pipeline callable that always yields one result."""
|
||||
pipe = MagicMock()
|
||||
pipe.return_value = [{"label": label, "score": score}]
|
||||
return pipe
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Path A — ML classification
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMLPath:
    """Path A: severities come from the (mocked) ML classifier output."""

    # Patch target for the loader that hands the classifier its ML pipeline.
    _ML_LOADER = "app.services.diagnose.classifier._get_ml_classifier"

    def _classify_text(self, label: str, score: float, text: str):
        """Classify a one-cluster timeline while the ML loader returns a canned prediction."""
        fake_pipeline = _mock_hf_pipeline(label, score)
        with patch(self._ML_LOADER, return_value=fake_pipeline):
            classifier = SeverityClassifier(model_id="fake/model")
            return classifier.classify(_make_timeline((_make_cluster(text),)))

    def test_ml_error_maps_to_error(self) -> None:
        """An ERROR prediction at 0.98 is reported as ERROR, attributed to the ML path."""
        outcome = self._classify_text("ERROR", 0.98, "disk error detected")

        assert outcome.cluster_severities["abc123"] == "ERROR"
        assert outcome.classifier_used == "ml"
        assert outcome.model_id == "fake/model"

    def test_ml_critical_promotion(self) -> None:
        """An ERROR prediction at 0.97 on 'kernel panic' text is expected to become CRITICAL."""
        outcome = self._classify_text("ERROR", 0.97, "kernel panic: not syncing VFS")

        assert outcome.cluster_severities["abc123"] == "CRITICAL"

    def test_ml_debug_demotion(self) -> None:
        """An INFO prediction at 0.3 is expected to be demoted to DEBUG."""
        outcome = self._classify_text("INFO", 0.3, "routine ping")

        assert outcome.cluster_severities["abc123"] == "DEBUG"

    def test_ml_warning_maps_to_warn(self) -> None:
        """A WARNING label from the model is expected to be normalised to WARN."""
        outcome = self._classify_text("WARNING", 0.85, "low disk space")

        assert outcome.cluster_severities["abc123"] == "WARN"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Path B — pattern_tags fallback
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestPatternTagsPath:
    """Path B: with no model configured, severity is looked up from the pattern file."""

    def test_pattern_tags_resolve_error_severity(self, tmp_path: Path) -> None:
        """A cluster tagged 'service_crash_loop' takes ERROR from the pattern file."""
        pattern_yaml = tmp_path / "default.yaml"
        # The keys of each pattern must be indented deeper than the list dash;
        # otherwise the document is not valid YAML and the entry cannot be read.
        pattern_yaml.write_text(
            "patterns:\n"
            "  - name: service_crash_loop\n"
            "    pattern: crash\n"
            "    severity: ERROR\n"
            "    description: Service crashed in a loop\n"
        )
        clf = SeverityClassifier(model_id="", pattern_file=pattern_yaml)
        cluster = _make_cluster(
            representative_text="service crashed",
            pattern_tags=("service_crash_loop",),
        )
        result = clf.classify(_make_timeline((cluster,)))

        assert result.cluster_severities["abc123"] == "ERROR"
        assert result.classifier_used == "pattern_tags"
        assert result.model_id is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Path C — regex fallback
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestRegexPath:
    """Path C: neither a model nor a pattern file is supplied, so regex decides."""

    @staticmethod
    def _classify_text(text: str):
        """Classify a single-cluster timeline with a classifier that has no model id."""
        classifier = SeverityClassifier(model_id="")
        return classifier.classify(_make_timeline((_make_cluster(text),)))

    def test_regex_detects_error(self) -> None:
        """'ERROR: disk full' is expected to be classified ERROR via the regex path."""
        outcome = self._classify_text("ERROR: disk full")

        assert outcome.cluster_severities["abc123"] == "ERROR"
        assert outcome.classifier_used == "regex"

    def test_regex_defaults_to_info_when_no_match(self) -> None:
        """Text carrying no severity keyword is expected to fall back to INFO."""
        outcome = self._classify_text("mount: disk mounted successfully")

        assert outcome.cluster_severities["abc123"] == "INFO"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fallback behaviour
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestImportErrorFallback:
    """Behaviour when loading the ML classifier raises ImportError."""

    def test_transformers_import_error_falls_back_to_pattern_tags(
        self, tmp_path: Path
    ) -> None:
        """An ImportError from the ML loader must not crash; pattern_tags supply the severity."""
        pattern_yaml = tmp_path / "default.yaml"
        # The keys of each pattern must be indented deeper than the list dash;
        # otherwise the document is not valid YAML and the entry cannot be read.
        pattern_yaml.write_text(
            "patterns:\n"
            "  - name: auth_failure\n"
            "    pattern: auth\n"
            "    severity: ERROR\n"
            "    description: Auth failure\n"
        )

        # An exception instance as side_effect makes every call to the patched
        # loader raise it, simulating a missing 'transformers' package.
        with patch(
            "app.services.diagnose.classifier._get_ml_classifier",
            side_effect=ImportError("No module named 'transformers'"),
        ):
            clf = SeverityClassifier(model_id="fake/model", pattern_file=pattern_yaml)
            cluster = _make_cluster(
                representative_text="auth failed",
                pattern_tags=("auth_failure",),
            )
            result = clf.classify(_make_timeline((cluster,)))

        # ML was attempted (classifier_used == "ml") but pattern_tags resolved it
        assert result.classifier_used == "ml"
        assert result.cluster_severities["abc123"] == "ERROR"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Edge cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestEdgeCases:
    """Boundary conditions: an empty timeline and immutability of the result."""

    def test_empty_timeline_produces_empty_severities(self) -> None:
        """A timeline without clusters yields an empty severity map and does not raise."""
        outcome = SeverityClassifier(model_id="").classify(_make_timeline())

        assert isinstance(outcome, ClassifiedTimeline)
        assert outcome.cluster_severities == {}
        assert outcome.classifier_used == "regex"

    def test_classified_timeline_is_frozen(self) -> None:
        """Assigning to a field of the result must raise FrozenInstanceError."""
        one_cluster_timeline = _make_timeline((_make_cluster(),))
        outcome = SeverityClassifier(model_id="").classify(one_cluster_timeline)

        with pytest.raises(FrozenInstanceError):
            outcome.classifier_used = "ml"  # type: ignore[misc]
|
||||
486
tests/test_diagnose_hypothesizer.py
Normal file
486
tests/test_diagnose_hypothesizer.py
Normal file
|
|
@ -0,0 +1,486 @@
|
|||
"""Tests for app/services/diagnose/hypothesizer.py — RootCauseHypothesizer.
|
||||
|
||||
All tests use mocking; no real LLM calls are made.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from app.context.retriever import RetrievedContext
|
||||
from app.services.diagnose.hypothesizer import RootCauseHypothesizer
|
||||
from app.services.diagnose.models import (
|
||||
ClassifiedTimeline,
|
||||
EventCluster,
|
||||
Hypothesis,
|
||||
TimelineResult,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixture helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_cluster(
    cluster_id: str = "c1",
    representative_text: str = "kernel: oom-killer invoked",
    severity: str = "ERROR",
    source_ids: tuple[str, ...] = ("syslog",),
    pattern_tags: tuple[str, ...] = ("oom",),
    start_iso: str | None = "2024-01-01T00:00:00+00:00",
) -> EventCluster:
    """Create an EventCluster fixture.

    Only the fields exposed as parameters vary; the rest are fixed:
    a single entry ``"e1"``, no end timestamp, a one-second duration,
    no burst and no preceding gap.
    """
    # Values every fixture cluster shares.
    constants = dict(
        entries=("e1",),
        end_iso=None,
        duration_seconds=1.0,
        burst=False,
        gap_before_seconds=0.0,
    )
    return EventCluster(
        cluster_id=cluster_id,
        start_iso=start_iso,
        source_ids=source_ids,
        pattern_tags=pattern_tags,
        severity=severity,  # type: ignore[arg-type]
        representative_text=representative_text,
        **constants,
    )
|
||||
|
||||
|
||||
def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult:
    """Build a TimelineResult around *clusters*.

    ``total_entries`` is set to the number of clusters; window bounds are
    absent, the gap/burst counters are zero and no dominant sources are listed.
    """
    cluster_count = len(clusters)
    return TimelineResult(
        clusters=clusters,
        total_entries=cluster_count,
        dominant_sources=(),
        burst_count=0,
        gap_count=0,
        window_end=None,
        window_start=None,
    )
|
||||
|
||||
|
||||
def _make_classified(
    clusters: tuple[EventCluster, ...] = (),
    cluster_severities: dict | None = None,
) -> ClassifiedTimeline:
    """Build a ClassifiedTimeline labelled as produced by the 'pattern_tags' path.

    Args:
        clusters: Clusters to wrap in the underlying timeline.
        cluster_severities: Explicit ``cluster_id -> severity`` mapping. When
            omitted (``None``), each cluster's own ``severity`` is used.
    """
    severities = (
        {item.cluster_id: item.severity for item in clusters}
        if cluster_severities is None
        else cluster_severities
    )
    return ClassifiedTimeline(
        timeline=_make_timeline(clusters),
        cluster_severities=severities,
        classifier_used="pattern_tags",
        model_id=None,
    )
|
||||
|
||||
|
||||
def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext:
    """Build a RetrievedContext fixture with no facts.

    Args:
        chunks: Context chunks to include. When omitted (``None``), a single
            runbook chunk is supplied. An explicitly passed empty list is kept
            as-is, so a test can model "no chunks were retrieved".
    """
    # Test for None instead of truthiness: ``chunks or [...]`` would silently
    # replace an intentional empty list with the default chunk.
    if chunks is None:
        chunks = [{"text": "Memory pressure runbook.", "filename": "runbook.md"}]
    return RetrievedContext(
        facts=[],
        chunks=chunks,
    )
|
||||
|
||||
|
||||
def _llm_json_response(items: list[dict[str, Any]]) -> MagicMock:
|
||||
"""Build a mock httpx.Response that returns the given list as JSON."""
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.status_code = 200
|
||||
mock_resp.json.return_value = {
|
||||
"choices": [{"message": {"content": json.dumps(items)}}]
|
||||
}
|
||||
return mock_resp
|
||||
|
||||
|
||||
_SAMPLE_HYPOTHESES = [
|
||||
{
|
||||
"title": "OOM killer terminated critical process",
|
||||
"description": "The kernel invoked the OOM killer due to memory exhaustion. A process was terminated unexpectedly. This caused service disruption.",
|
||||
"confidence": 0.85,
|
||||
"severity": "CRITICAL",
|
||||
"supporting_clusters": ["c1"],
|
||||
},
|
||||
{
|
||||
"title": "Disk I/O saturation",
|
||||
"description": "High disk I/O latency was detected. Write operations stalled causing log backpressure. Check iostat for device utilisation.",
|
||||
"confidence": 0.6,
|
||||
"severity": "ERROR",
|
||||
"supporting_clusters": ["c2"],
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 1: Valid JSON response returns correct Hypothesis objects
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_valid_json_response_returns_hypotheses():
    """A well-formed JSON array from the LLM is turned into Hypothesis objects field by field."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()
    llm_reply = _llm_json_response(_SAMPLE_HYPOTHESES)

    with patch("httpx.post", return_value=llm_reply):
        found = agent.hypothesize(
            timeline,
            context,
            query="why is memory failing?",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(found) == 2

    # First payload: every mapped field is checked.
    oom = found[0]
    assert isinstance(oom, Hypothesis)
    assert oom.title == "OOM killer terminated critical process"
    assert oom.confidence == pytest.approx(0.85)
    assert oom.severity == "CRITICAL"
    assert oom.supporting_cluster_ids == ("c1",)

    # Second payload: spot-check title and severity.
    disk = found[1]
    assert disk.title == "Disk I/O saturation"
    assert disk.severity == "ERROR"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 2: hypothesis_id is a non-empty UUID string on each result
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
_UUID_RE = re.compile(
|
||||
r"^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$"
|
||||
)
|
||||
|
||||
|
||||
def test_hypothesis_id_is_uuid():
    """Every returned Hypothesis has a non-empty, UUID-v4-shaped, unique hypothesis_id."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()
    llm_reply = _llm_json_response(_SAMPLE_HYPOTHESES)

    with patch("httpx.post", return_value=llm_reply):
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(found) == 2

    seen_ids = []
    for hyp in found:
        assert hyp.hypothesis_id, "hypothesis_id must not be empty"
        assert _UUID_RE.match(hyp.hypothesis_id), (
            f"hypothesis_id {hyp.hypothesis_id!r} is not a UUID v4"
        )
        seen_ids.append(hyp.hypothesis_id)

    # No two hypotheses may share an id.
    assert len(set(seen_ids)) == len(seen_ids), "hypothesis_ids must be unique"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 3: Malformed JSON response returns [] with a logged warning
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_malformed_json_returns_empty_and_warns(caplog):
    """Non-JSON LLM content makes hypothesize() return [] and log a message mentioning JSON."""
    import logging

    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()

    # 200 response whose message content cannot be parsed as JSON.
    garbled_reply = MagicMock(status_code=200)
    garbled_reply.json.return_value = {
        "choices": [{"message": {"content": "not valid json"}}]
    }

    with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=garbled_reply):
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert found == []
    # Any message containing "invalid JSON" also contains "JSON", so one
    # substring check covers both phrasings.
    assert any("JSON" in record.message for record in caplog.records)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 4: Non-list JSON (dict) returns []
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_non_list_json_returns_empty(caplog):
    """A JSON object (not an array) from the LLM yields [] and a log line mentioning array/list."""
    import logging

    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()

    # Valid JSON, but a dict where a list of hypotheses is expected.
    object_reply = MagicMock(status_code=200)
    object_reply.json.return_value = {
        "choices": [{"message": {"content": '{"error": "oops"}'}}]
    }

    with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=object_reply):
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert found == []

    def _mentions_collection(record) -> bool:
        lowered = record.message.lower()
        return "array" in lowered or "list" in lowered

    assert any(_mentions_collection(record) for record in caplog.records)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 5: Empty clusters returns [] without any LLM call
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_empty_clusters_returns_empty_no_llm_call():
    """With zero clusters in the timeline, the result is [] and httpx.post is never invoked."""
    empty_timeline = _make_classified(clusters=())
    context = _make_ctx()
    agent = RootCauseHypothesizer()

    with patch("httpx.post") as http_post:
        found = agent.hypothesize(
            empty_timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert found == []
    http_post.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 6: No LLM URL returns [] without any HTTP call
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_no_llm_url_returns_empty_no_http_call():
    """llm_url=None: hypothesize() yields [] and performs no HTTP request."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()

    with patch("httpx.post") as http_post:
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url=None,
            llm_model="llama3",
        )

    assert found == []
    http_post.assert_not_called()
|
||||
|
||||
|
||||
def test_empty_llm_url_returns_empty_no_http_call():
    """llm_url="": hypothesize() yields [] and performs no HTTP request."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()

    with patch("httpx.post") as http_post:
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="",
            llm_model="llama3",
        )

    assert found == []
    http_post.assert_not_called()
|
||||
|
||||
|
||||
def test_no_llm_model_returns_empty_no_http_call():
    """llm_model=None: hypothesize() yields [] and performs no HTTP request."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()

    with patch("httpx.post") as http_post:
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model=None,
        )

    assert found == []
    http_post.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 7: max_hypotheses is respected
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_max_hypotheses_respected():
    """Six items from the LLM with max_hypotheses=3 must be cut down to three results."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer(max_hypotheses=3)

    # Twice as many payloads as the configured cap.
    oversized_payload = []
    for i in range(6):
        oversized_payload.append(
            {
                "title": f"Hypothesis {i}",
                "description": "Some description. A second sentence. Third sentence here.",
                "confidence": 0.5,
                "severity": "ERROR",
                "supporting_clusters": ["c1"],
            }
        )
    llm_reply = _llm_json_response(oversized_payload)

    with patch("httpx.post", return_value=llm_reply):
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(found) == 3
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 8: Severity validation — WARNING → WARN, garbage → ERROR
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_severity_warning_maps_to_warn():
    """A severity of 'WARNING' in the LLM payload must come back as 'WARN'."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()

    payload = {
        "title": "A warning severity hypothesis",
        "description": "Test description. Second sentence. Third.",
        "confidence": 0.7,
        "severity": "WARNING",
        "supporting_clusters": ["c1"],
    }
    llm_reply = _llm_json_response([payload])

    with patch("httpx.post", return_value=llm_reply):
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(found) == 1
    assert found[0].severity == "WARN"
|
||||
|
||||
|
||||
def test_severity_garbage_maps_to_error():
    """A severity string the hypothesizer does not recognise must fall back to 'ERROR'."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()

    payload = {
        "title": "A garbage severity hypothesis",
        "description": "Test description. Second sentence. Third.",
        "confidence": 0.4,
        "severity": "GARBAGE",
        "supporting_clusters": ["c1"],
    }
    llm_reply = _llm_json_response([payload])

    with patch("httpx.post", return_value=llm_reply):
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(found) == 1
    assert found[0].severity == "ERROR"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 9: Confidence field works with string floats from the LLM
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_confidence_string_float_coercion():
    """A numeric confidence delivered as the string "0.8" must surface as the float 0.8."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()

    payload = {
        "title": "String confidence test",
        "description": "Some description. Second sentence. Third.",
        "confidence": "0.8",  # deliberately a string rather than a number
        "severity": "INFO",
        "supporting_clusters": ["c1"],
    }
    llm_reply = _llm_json_response([payload])

    with patch("httpx.post", return_value=llm_reply):
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(found) == 1
    coerced = found[0].confidence
    assert isinstance(coerced, float)
    assert coerced == pytest.approx(0.8)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 10: Non-numeric confidence string falls back to default 0.5
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_non_numeric_confidence_uses_default():
    """A confidence of 'high' must not raise; the result carries the float 0.5 instead."""
    timeline = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    agent = RootCauseHypothesizer()

    payload = {
        "title": "t",
        "description": "d",
        "confidence": "high",
        "severity": "ERROR",
        "supporting_clusters": [],
    }
    llm_reply = _llm_json_response([payload])

    with patch("httpx.post", return_value=llm_reply):
        found = agent.hypothesize(
            timeline,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )

    assert len(found) == 1
    fallback = found[0].confidence
    assert isinstance(fallback, float)
    assert fallback == pytest.approx(0.5)
|
||||
489
tests/test_diagnose_pipeline.py
Normal file
489
tests/test_diagnose_pipeline.py
Normal file
|
|
@ -0,0 +1,489 @@
|
|||
"""Tests for app/services/diagnose/pipeline.py and __init__.py feature flag wiring.
|
||||
|
||||
All tests use mocking; no real LLM, ML, or DB calls are made.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from app.context.retriever import RetrievedContext
|
||||
from app.services.diagnose.models import (
|
||||
ClassifiedTimeline,
|
||||
Hypothesis,
|
||||
RankedHypothesis,
|
||||
TimelineResult,
|
||||
)
|
||||
from app.services.search import SearchResult
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Shared helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_search_result(
    entry_id: str = "e1",
    source_id: str = "syslog",
    timestamp_iso: str | None = "2026-01-01T00:00:00+00:00",
    severity: str | None = "ERROR",
    text: str = "ssh: invalid user",
) -> SearchResult:
    """Create a SearchResult fixture.

    The parameters cover the fields tests vary; the remaining fields are
    fixed: sequence 1, repeat count 1, in order, matched pattern
    ``"ssh_fail"`` and rank 1.0.
    """
    # Values shared by every fixture row.
    fixed = dict(
        sequence=1,
        repeat_count=1,
        out_of_order=False,
        matched_patterns=["ssh_fail"],
        rank=1.0,
    )
    return SearchResult(
        entry_id=entry_id,
        source_id=source_id,
        timestamp_iso=timestamp_iso,
        severity=severity,
        text=text,
        **fixed,
    )
|
||||
|
||||
|
||||
def _make_ctx() -> RetrievedContext:
    """Return a RetrievedContext with neither facts nor chunks."""
    no_facts: list = []
    no_chunks: list = []
    return RetrievedContext(facts=no_facts, chunks=no_chunks)
|
||||
|
||||
|
||||
def _make_timeline(n_clusters: int = 2) -> TimelineResult:
    """Build a TimelineResult fixture covering a one-hour window.

    The result has an empty ``clusters`` tuple, five entries, one burst,
    no gaps and ``"syslog"`` as the only dominant source.

    Note:
        ``n_clusters`` is accepted but not used by this helper; the timeline
        always contains zero clusters.
    """
    window = dict(
        window_start="2026-01-01T00:00:00+00:00",
        window_end="2026-01-01T01:00:00+00:00",
    )
    return TimelineResult(
        clusters=(),
        total_entries=5,
        gap_count=0,
        burst_count=1,
        dominant_sources=("syslog",),
        **window,
    )
|
||||
|
||||
|
||||
def _make_classified(timeline: TimelineResult | None = None) -> ClassifiedTimeline:
    """Wrap a timeline in a ClassifiedTimeline attributed to the 'regex' classifier.

    A falsy *timeline* (including the default ``None``) is replaced by
    ``_make_timeline()``. The severity map is empty and no model id is set.
    """
    return ClassifiedTimeline(
        timeline=timeline or _make_timeline(),
        cluster_severities={},
        classifier_used="regex",
        model_id=None,
    )
|
||||
|
||||
|
||||
def _make_hypothesis(
    hypothesis_id: str = "h1",
    title: str = "SSH flood",
    confidence: float = 0.87,
    severity: str = "CRITICAL",
) -> Hypothesis:
    """Create a Hypothesis fixture.

    The description, the supporting cluster (``"c1"``) and the empty runbook
    references are fixed; id, title, confidence and severity are configurable.
    """
    fixed = dict(
        description="Multiple failed SSH attempts.",
        supporting_cluster_ids=("c1",),
        runbook_refs=(),
    )
    return Hypothesis(
        hypothesis_id=hypothesis_id,
        title=title,
        confidence=confidence,
        severity=severity,  # type: ignore[arg-type]
        **fixed,
    )
|
||||
|
||||
|
||||
def _make_ranked(hypothesis: Hypothesis | None = None, suppress: bool = False) -> RankedHypothesis:
    """Create a RankedHypothesis fixture with novelty 0.95 and similarity 0.05.

    Args:
        hypothesis: Hypothesis to rank; a falsy value is replaced by
            ``_make_hypothesis()``.
        suppress: Value for the ``suppress`` field. When truthy, the
            suppression reason is ``"similar to known"``; otherwise it is None.
    """
    if suppress:
        reason = "similar to known"
    else:
        reason = None
    return RankedHypothesis(
        hypothesis=hypothesis or _make_hypothesis(),
        novelty_score=0.95,
        similarity_to_known=0.05,
        suppress=suppress,
        suppression_reason=reason,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper: collect all events from run_pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
async def _collect_pipeline_events(**kwargs) -> list[dict[str, Any]]:
    """Drain ``run_pipeline(**kwargs)`` and return every yielded event, in order."""
    # Imported at call time, as in the rest of this module's pipeline helpers.
    from app.services.diagnose.pipeline import run_pipeline

    return [emitted async for emitted in run_pipeline(**kwargs)]
|
||||
|
||||
|
||||
def _default_pipeline_kwargs(entries=None, db_path=None) -> dict:
    """Return keyword arguments for ``run_pipeline`` with LLM access disabled.

    Args:
        entries: Search results to feed the pipeline. When omitted (``None``),
            a single default search result is used. An explicitly passed empty
            list is kept, so the "no entries" scenario can be expressed.
        db_path: Database path. When omitted (``None``), ``/tmp/fake.db``.

    Returns:
        A dict with ``db_path``, ``entries``, ``ctx``, ``query``, ``since``,
        ``until`` and the three ``llm_*`` settings (all ``None``).
    """
    # Test for None instead of truthiness: ``entries or [...]`` would silently
    # replace an intentional empty list with the default entry.
    if entries is None:
        entries = [_make_search_result()]
    if db_path is None:
        db_path = Path("/tmp/fake.db")
    return dict(
        db_path=db_path,
        entries=entries,
        ctx=_make_ctx(),
        query="ssh brute force",
        since="2026-01-01T00:00:00+00:00",
        until="2026-01-01T01:00:00+00:00",
        llm_url=None,
        llm_model=None,
        llm_api_key=None,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mock factories for all 5 stage classes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _mock_all_stages(
    hypotheses=None,
    ranked=None,
    synthesis_text="VERDICT: CRITICAL — SSH flood (87% confidence)",
):
    """Create mock replacements for the five pipeline stage classes.

    Args:
        hypotheses: Value returned by the mocked ``hypothesize``; defaults to
            one fixture hypothesis when ``None``.
        ranked: Value returned by the mocked ``suppress``; defaults to one
            fixture ranked hypothesis when ``None``.
        synthesis_text: Value returned by the mocked ``synthesize``.

    Returns:
        A dict mapping each ``app.services.diagnose.pipeline.<Class>`` patch
        target to a MagicMock class whose instance method returns the canned
        stage output.
    """
    timeline = _make_timeline()
    classified = _make_classified(timeline)
    if hypotheses is None:
        hypotheses = [_make_hypothesis()]
    if ranked is None:
        ranked = [_make_ranked()]

    def _stage_class(method_name, canned_output):
        # Mock class: calling it yields an instance whose method returns the canned output.
        stage_cls = MagicMock()
        getattr(stage_cls.return_value, method_name).return_value = canned_output
        return stage_cls

    # (class name, method invoked on the instance, canned result) in pipeline order.
    stage_specs = (
        ("TimelineReconstructor", "reconstruct", timeline),
        ("SeverityClassifier", "classify", classified),
        ("RootCauseHypothesizer", "hypothesize", hypotheses),
        ("FalsePositiveSuppressor", "suppress", ranked),
        ("SummarySynthesizer", "synthesize", synthesis_text),
    )
    return {
        "app.services.diagnose.pipeline." + class_name: _stage_class(method_name, output)
        for class_name, method_name, output in stage_specs
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 1. Feature flag off: legacy summarize() path runs, not run_pipeline
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFeatureFlagOff:
    """diagnose_stream behaviour while MULTI_AGENT_ENABLED is patched to False."""

    @pytest.mark.asyncio
    async def test_legacy_path_when_flag_off(self):
        """With the flag off, run_pipeline is never invoked and the stream ends with 'done'."""
        from app.services import diagnose as diagnose_module

        hits = [_make_search_result()]

        with (
            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", False),
            patch("app.services.diagnose.search", return_value=hits),
            patch("app.services.diagnose.entries_in_window", return_value=[]),
            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
            patch("app.services.diagnose.format_context_block", return_value=None),
            patch("app.services.diagnose.run_pipeline") as pipeline_spy,
            patch("app.services.diagnose.summarize", return_value=None),
        ):
            emitted = [
                evt
                async for evt in diagnose_module.diagnose_stream(
                    db_path=Path("/tmp/fake.db"),
                    query="ssh failures",
                    llm_url=None,
                    llm_model=None,
                )
            ]

        # The multi-agent pipeline must stay untouched on the legacy path.
        pipeline_spy.assert_not_called()

        # The SSE sequence has to contain, and finish with, a done event.
        kinds = [evt["type"] for evt in emitted]
        assert "done" in kinds
        assert kinds[-1] == "done"

    @pytest.mark.asyncio
    async def test_legacy_done_event_is_last(self):
        """Legacy path with no search hits: the final event is exactly {'type': 'done'}."""
        from app.services import diagnose as diagnose_module

        with (
            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", False),
            patch("app.services.diagnose.search", return_value=[]),
            patch("app.services.diagnose.entries_in_window", return_value=[]),
            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
            patch("app.services.diagnose.format_context_block", return_value=None),
        ):
            emitted = [
                evt
                async for evt in diagnose_module.diagnose_stream(
                    db_path=Path("/tmp/fake.db"),
                    query="check logs",
                )
            ]

        assert emitted[-1] == {"type": "done"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 2. Feature flag on, all stages mocked: verify SSE event sequence
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestFeatureFlagOn:
    """SSE event-sequence checks for the pipeline with every stage class mocked."""

    @staticmethod
    def _patch_stages(mocks):
        """Return one context manager that replaces all five pipeline stage classes.

        ``mocks`` is the mapping produced by ``_mock_all_stages()``; the tests
        index it by the fully qualified name of each stage class.
        """
        module = "app.services.diagnose.pipeline"
        stage_names = (
            "TimelineReconstructor",
            "SeverityClassifier",
            "RootCauseHypothesizer",
            "FalsePositiveSuppressor",
            "SummarySynthesizer",
        )
        # patch.multiple patches several attributes of one module in a single call.
        return patch.multiple(
            module,
            **{name: mocks[f"{module}.{name}"] for name in stage_names},
        )

    @pytest.mark.asyncio
    async def test_pipeline_stage_events_in_order(self):
        """pipeline_stage events must be emitted stages 1→2→3→4 in order."""
        mocks = _mock_all_stages()
        kwargs = _default_pipeline_kwargs()
        with self._patch_stages(mocks):
            events = await _collect_pipeline_events(**kwargs)

        stages = [e["stage"] for e in events if e.get("type") == "pipeline_stage"]
        assert stages == [1, 2, 3, 4]

    @pytest.mark.asyncio
    async def test_hypotheses_event_after_stage4(self):
        """hypotheses event must appear after pipeline_stage stage=4."""
        mocks = _mock_all_stages()
        kwargs = _default_pipeline_kwargs()
        with self._patch_stages(mocks):
            events = await _collect_pipeline_events(**kwargs)

        # next() raises StopIteration if either event is missing, failing the test.
        stage4_idx = next(
            i for i, e in enumerate(events)
            if e.get("type") == "pipeline_stage" and e.get("stage") == 4
        )
        hyp_idx = next(i for i, e in enumerate(events) if e.get("type") == "hypotheses")
        assert hyp_idx > stage4_idx

    @pytest.mark.asyncio
    async def test_reasoning_event_emitted(self):
        """reasoning event must be present when synthesizer returns text."""
        mocks = _mock_all_stages(synthesis_text="VERDICT: CRITICAL — SSH flood")
        kwargs = _default_pipeline_kwargs()
        with self._patch_stages(mocks):
            events = await _collect_pipeline_events(**kwargs)

        reasoning_events = [e for e in events if e.get("type") == "reasoning"]
        assert len(reasoning_events) == 1
        assert "VERDICT" in reasoning_events[0]["text"]

    @pytest.mark.asyncio
    async def test_done_event_is_last(self):
        """done must always be the last event in the pipeline sequence."""
        mocks = _mock_all_stages()
        kwargs = _default_pipeline_kwargs()
        with self._patch_stages(mocks):
            events = await _collect_pipeline_events(**kwargs)

        assert events[-1] == {"type": "done"}

    @pytest.mark.asyncio
    async def test_pipeline_wired_from_diagnose_stream(self):
        """diagnose_stream routes through run_pipeline when flag is on."""
        from app.services import diagnose as diagnose_module

        entries = [_make_search_result()]

        async def fake_pipeline(**kwargs):
            # Minimal stand-in for run_pipeline: one status, one stage, then done.
            yield {"type": "status", "message": "Building timeline…"}
            yield {"type": "pipeline_stage", "stage": 1, "name": "timeline", "message": "Built 1 clusters, 0 bursts"}
            yield {"type": "done"}

        with (
            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", True),
            patch("app.services.diagnose.search", return_value=entries),
            patch("app.services.diagnose.entries_in_window", return_value=[]),
            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
            patch("app.services.diagnose.format_context_block", return_value=None),
            patch("app.services.diagnose.run_pipeline", side_effect=fake_pipeline),
        ):
            events = []
            async for event in diagnose_module.diagnose_stream(
                db_path=Path("/tmp/fake.db"),
                query="ssh failures",
            ):
                events.append(event)

        types = [e["type"] for e in events]
        assert "pipeline_stage" in types
        assert types[-1] == "done"
        # Exactly one done event: a second one would suggest the legacy path also
        # ran after the pipeline. summarize() is not patched in this test, so this
        # count is the only guard against that.
        assert types.count("done") == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 3. Empty entries: pipeline completes with done
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEmptyEntries:
    """Pipeline behaviour when it is handed an empty ``entries`` list."""

    @staticmethod
    def _patch_stages(mocks):
        """Return one context manager that replaces all five pipeline stage classes.

        ``mocks`` is the mapping produced by ``_mock_all_stages()``; the tests
        index it by the fully qualified name of each stage class.
        """
        module = "app.services.diagnose.pipeline"
        stage_names = (
            "TimelineReconstructor",
            "SeverityClassifier",
            "RootCauseHypothesizer",
            "FalsePositiveSuppressor",
            "SummarySynthesizer",
        )
        return patch.multiple(
            module,
            **{name: mocks[f"{module}.{name}"] for name in stage_names},
        )

    @pytest.mark.asyncio
    async def test_empty_entries_pipeline_completes(self):
        """Pipeline with entries=[] must still complete and emit done."""
        mocks = _mock_all_stages(hypotheses=[], ranked=[])
        kwargs = _default_pipeline_kwargs(entries=[])
        with self._patch_stages(mocks):
            events = await _collect_pipeline_events(**kwargs)

        types = [e["type"] for e in events]
        assert "done" in types
        assert types[-1] == "done"

    @pytest.mark.asyncio
    async def test_empty_entries_all_stage_events_present(self):
        """Even with empty entries, all 4 pipeline_stage events are emitted."""
        mocks = _mock_all_stages(hypotheses=[], ranked=[])
        kwargs = _default_pipeline_kwargs(entries=[])
        with self._patch_stages(mocks):
            events = await _collect_pipeline_events(**kwargs)

        stage_events = [e for e in events if e.get("type") == "pipeline_stage"]
        assert len(stage_events) == 4
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 4. No LLM: Stage 3 and Stage 5 return empty/fallback; done still emitted
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestNoLLM:
    """Pipeline behaviour when the mocked stages return empty or fallback results."""

    @staticmethod
    def _patch_stages(mocks):
        """Return one context manager that replaces all five pipeline stage classes.

        ``mocks`` is the mapping produced by ``_mock_all_stages()``; the tests
        index it by the fully qualified name of each stage class.
        """
        module = "app.services.diagnose.pipeline"
        stage_names = (
            "TimelineReconstructor",
            "SeverityClassifier",
            "RootCauseHypothesizer",
            "FalsePositiveSuppressor",
            "SummarySynthesizer",
        )
        return patch.multiple(
            module,
            **{name: mocks[f"{module}.{name}"] for name in stage_names},
        )

    @pytest.mark.asyncio
    async def test_no_llm_pipeline_completes_with_done(self):
        """No llm_url/llm_model: pipeline runs all stages and emits done."""
        mocks = _mock_all_stages(hypotheses=[], ranked=[], synthesis_text="VERDICT: UNKNOWN — no hypotheses generated")
        # Relies on _default_pipeline_kwargs() leaving llm_url and llm_model as
        # None (per the original author's note; the helper is defined elsewhere).
        kwargs = _default_pipeline_kwargs()
        with self._patch_stages(mocks):
            events = await _collect_pipeline_events(**kwargs)

        assert events[-1] == {"type": "done"}

    @pytest.mark.asyncio
    async def test_no_llm_no_reasoning_event_when_synthesis_empty(self):
        """When synthesizer returns empty string, no reasoning event is emitted."""
        mocks = _mock_all_stages(synthesis_text="")
        kwargs = _default_pipeline_kwargs()
        with self._patch_stages(mocks):
            events = await _collect_pipeline_events(**kwargs)

        reasoning_events = [e for e in events if e.get("type") == "reasoning"]
        assert len(reasoning_events) == 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# 5. Stage 1 cluster count in pipeline_stage message
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestStage1Message:
    """Content checks for the stage-1 ``pipeline_stage`` event."""

    @pytest.mark.asyncio
    async def test_stage1_message_contains_cluster_count(self):
        """pipeline_stage stage=1 message must report cluster count."""
        timeline = TimelineResult(
            clusters=(),
            total_entries=10,
            window_start=None,
            window_end=None,
            gap_count=0,
            burst_count=3,
            dominant_sources=("syslog",),
        )
        classified = _make_classified(timeline)

        def _stage_class(method_name, result):
            # Mock class whose instances return ``result`` from ``method_name``.
            stage_cls = MagicMock()
            getattr(stage_cls.return_value, method_name).return_value = result
            return stage_cls

        stage_mocks = {
            "TimelineReconstructor": _stage_class("reconstruct", timeline),
            "SeverityClassifier": _stage_class("classify", classified),
            "RootCauseHypothesizer": _stage_class("hypothesize", []),
            "FalsePositiveSuppressor": _stage_class("suppress", []),
            "SummarySynthesizer": _stage_class("synthesize", "VERDICT: INFO — nothing found"),
        }

        kwargs = _default_pipeline_kwargs()
        with patch.multiple("app.services.diagnose.pipeline", **stage_mocks):
            events = await _collect_pipeline_events(**kwargs)

        stage1 = next(
            e for e in events
            if e.get("type") == "pipeline_stage" and e.get("stage") == 1
        )
        message = stage1["message"]
        # 0 clusters (empty tuple), 3 bursts.
        # NOTE(review): these are substring checks, so any "0" or "3" in the
        # message satisfies them — consider asserting the exact wording.
        assert "0" in message  # cluster count
        assert "3" in message  # burst count

    @pytest.mark.asyncio
    async def test_stage1_name_is_timeline(self):
        """pipeline_stage stage=1 must have name='timeline'."""
        mocks = _mock_all_stages()
        kwargs = _default_pipeline_kwargs()

        module = "app.services.diagnose.pipeline"
        stage_names = (
            "TimelineReconstructor",
            "SeverityClassifier",
            "RootCauseHypothesizer",
            "FalsePositiveSuppressor",
            "SummarySynthesizer",
        )
        replacements = {name: mocks[f"{module}.{name}"] for name in stage_names}
        with patch.multiple(module, **replacements):
            events = await _collect_pipeline_events(**kwargs)

        stage1 = next(
            e for e in events
            if e.get("type") == "pipeline_stage" and e.get("stage") == 1
        )
        assert stage1["name"] == "timeline"
|
||||
432
tests/test_diagnose_suppressor.py
Normal file
432
tests/test_diagnose_suppressor.py
Normal file
|
|
@ -0,0 +1,432 @@
|
|||
"""Tests for app/services/diagnose/suppressor.py — FalsePositiveSuppressor.
|
||||
|
||||
All tests use mocking; no real model downloads are made.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
import app.services.diagnose.suppressor as sup_module
|
||||
from app.services.diagnose.models import Hypothesis, RankedHypothesis
|
||||
from app.services.diagnose.suppressor import FalsePositiveSuppressor
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_hypothesis(
    title: str = "Test",
    description: str = "A test hypothesis.",
    confidence: float = 0.8,
    severity: str = "ERROR",
) -> Hypothesis:
    """Build a Hypothesis for tests.

    The id is always ``"test-id"`` and the cluster/runbook references are
    empty tuples; only the four parameters vary between callers.
    """
    hypothesis = Hypothesis(
        hypothesis_id="test-id",
        supporting_cluster_ids=(),
        runbook_refs=(),
        title=title,
        description=description,
        confidence=confidence,
        severity=severity,  # type: ignore[arg-type]
    )
    return hypothesis
|
||||
|
||||
|
||||
def _make_db_with_incidents(incidents: list[tuple[str, str]], db_path: Path) -> Path:
|
||||
"""Create a temporary SQLite database with resolved incidents. Returns the db path."""
|
||||
with sqlite3.connect(str(db_path)) as conn:
|
||||
conn.execute(
|
||||
"CREATE TABLE incidents "
|
||||
"(id INTEGER PRIMARY KEY, label TEXT, notes TEXT, ended_at TEXT)"
|
||||
)
|
||||
for label, notes in incidents:
|
||||
conn.execute(
|
||||
"INSERT INTO incidents (label, notes, ended_at) VALUES (?, ?, ?)",
|
||||
(label, notes, "2024-01-01T00:00:00"),
|
||||
)
|
||||
conn.commit()
|
||||
return db_path
|
||||
|
||||
|
||||
def _make_empty_db(db_path: Path) -> Path:
|
||||
"""Create a temporary SQLite DB with no incidents table."""
|
||||
with sqlite3.connect(str(db_path)) as conn:
|
||||
conn.execute("CREATE TABLE unrelated (id INTEGER PRIMARY KEY)")
|
||||
conn.commit()
|
||||
return db_path
|
||||
|
||||
|
||||
def _make_mock_embedder(
|
||||
embed_return: list[float] | None = None,
|
||||
embed_batch_return: list[list[float]] | None = None,
|
||||
) -> MagicMock:
|
||||
"""Build a mock embedder with controllable embed/embed_batch responses."""
|
||||
embedder = MagicMock()
|
||||
|
||||
# Default: unit vector along first dimension
|
||||
default_vec = [1.0] + [0.0] * 383
|
||||
|
||||
raw_single = embed_return if embed_return is not None else default_vec
|
||||
raw_batch = embed_batch_return if embed_batch_return is not None else [default_vec]
|
||||
|
||||
# Wrap scalars in numpy-like MagicMock with .tolist()
|
||||
def _wrap(vec: list[float]) -> MagicMock:
|
||||
m = MagicMock()
|
||||
m.tolist.return_value = vec
|
||||
return m
|
||||
|
||||
embedder.embed.return_value = _wrap(raw_single)
|
||||
embedder.embed_batch.return_value = [_wrap(v) for v in raw_batch]
|
||||
return embedder
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Autouse fixture: reset module-level cache between tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@pytest.fixture(autouse=True)
def reset_suppressor_cache():
    """Empty the suppressor module's ``_corpus_cache`` before and after each test."""
    sup_module._corpus_cache.clear()  # start every test with an empty cache
    yield
    sup_module._corpus_cache.clear()  # leave nothing cached for the next test
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 1: No model configured — passthrough, ranked by confidence
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_no_model_passthrough_ranked_by_confidence(tmp_path):
    """model_id='' → all novelty_score=1.0, suppress=False, ranked by confidence desc."""
    # Deliberately unsorted input so the ordering assertion is meaningful.
    hypotheses = [
        _make_hypothesis(title="Low", confidence=0.3),
        _make_hypothesis(title="High", confidence=0.9),
        _make_hypothesis(title="Mid", confidence=0.6),
    ]

    db_path = tmp_path / "turnstone.db"
    ranked = FalsePositiveSuppressor(model_id="").suppress(hypotheses, db_path)

    assert len(ranked) == 3
    for item in ranked:
        assert isinstance(item, RankedHypothesis)
    for item in ranked:
        assert item.novelty_score == pytest.approx(1.0)
    for item in ranked:
        assert item.similarity_to_known == pytest.approx(0.0)
    for item in ranked:
        assert item.suppress is False
    for item in ranked:
        assert item.suppression_reason is None

    # Ranked by confidence descending
    confidences = [item.hypothesis.confidence for item in ranked]
    assert confidences == sorted(confidences, reverse=True)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 2: High similarity → suppressed
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_high_similarity_suppresses_hypothesis(tmp_path):
    """Hypothesis with embedding nearly identical to corpus → suppress=True."""
    # Hypothesis and corpus use the same unit vector → cosine similarity = 1.0.
    unit_x = [1.0] + [0.0] * 383
    embedder = _make_mock_embedder(
        embed_return=list(unit_x),
        embed_batch_return=[list(unit_x)],
    )

    incidents = [("OOM killer", "Memory pressure caused OOM kill")]
    db_path = _make_db_with_incidents(incidents, tmp_path / "turnstone.db")
    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)

    with patch.object(suppressor, "_load_embedder", return_value=embedder):
        ranked = suppressor.suppress([_make_hypothesis()], db_path)

    assert len(ranked) == 1
    only = ranked[0]
    assert only.suppress is True
    assert only.suppression_reason is not None
    assert "Similar to resolved incident" in only.suppression_reason
    assert only.similarity_to_known == pytest.approx(1.0, abs=0.01)
    assert only.novelty_score == pytest.approx(0.0, abs=0.01)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 3: Low similarity → not suppressed
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_low_similarity_does_not_suppress(tmp_path):
    """Hypothesis with embedding orthogonal to corpus → suppress=False."""
    # Unit vectors along different axes are orthogonal → similarity = 0.0.
    along_first_axis = [1.0] + [0.0] * 383
    along_second_axis = [0.0, 1.0] + [0.0] * 382
    embedder = _make_mock_embedder(
        embed_return=along_first_axis,
        embed_batch_return=[along_second_axis],
    )

    incidents = [("Disk I/O", "Storage saturation caused latency")]
    db_path = _make_db_with_incidents(incidents, tmp_path / "turnstone.db")
    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)

    with patch.object(suppressor, "_load_embedder", return_value=embedder):
        ranked = suppressor.suppress([_make_hypothesis()], db_path)

    assert len(ranked) == 1
    only = ranked[0]
    assert only.suppress is False
    assert only.suppression_reason is None
    assert only.similarity_to_known == pytest.approx(0.0, abs=0.01)
    assert only.novelty_score == pytest.approx(1.0, abs=0.01)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 3b: Borderline similarity — exactly at threshold vs. just below
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_similarity_threshold_boundary(tmp_path):
    """similarity == threshold is suppressed; similarity just below threshold is not.

    This test locks down the boundary semantics: suppress when max_sim >= threshold,
    not when novelty_score < threshold (the inverted form that was the original bug).
    With threshold=0.85:
      - similarity=0.85 → suppressed (at boundary, inclusive)
      - similarity=0.84 → NOT suppressed (just below)
    """
    db_path = _make_db_with_incidents(
        [("Disk I/O", "Storage saturation caused latency")],
        tmp_path / "turnstone.db",
    )

    # Corpus unit vector along first axis
    corpus_vec = [1.0] + [0.0] * 383

    for sim_value, expected_suppress in [(0.85, True), (0.84, False)]:
        # Build a hypothesis embedding whose cosine similarity to corpus_vec ≈ sim_value.
        # query = [sim, sqrt(1 - sim^2), 0, ...] is a unit vector, so its cosine
        # similarity to corpus_vec equals sim (up to floating-point rounding).
        # NOTE(review): the 0.85 case sits exactly on the threshold; confirm the
        # suppressor's similarity computation cannot land a hair below it.
        # ``math`` comes from the module-level import; no per-iteration import needed.
        hyp_vec = [sim_value, math.sqrt(max(0.0, 1.0 - sim_value ** 2))] + [0.0] * 382

        mock_embedder = _make_mock_embedder(
            embed_return=hyp_vec,
            embed_batch_return=[corpus_vec],
        )

        # Fresh suppressor per case so each run starts from the same configuration.
        suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)

        with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
            results = suppressor.suppress([_make_hypothesis()], db_path)

        assert len(results) == 1
        result = results[0]
        assert result.suppress is expected_suppress, (
            f"similarity={sim_value:.2f}: expected suppress={expected_suppress}, "
            f"got suppress={result.suppress} (similarity_to_known={result.similarity_to_known:.4f})"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 4: Empty hypotheses list returns []
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_empty_hypotheses_returns_empty(tmp_path):
    """suppress([]) → [] regardless of model or db state."""
    # The database file is never created; an empty input list must still work.
    missing_db = tmp_path / "turnstone.db"
    ranked = FalsePositiveSuppressor(model_id="test-model").suppress([], missing_db)
    assert ranked == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 5: Ranking by novelty_score * confidence
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_ranking_by_novelty_times_confidence(tmp_path):
    """Results are sorted by novelty_score * confidence descending."""
    # The corpus is a single unit vector [1, 0, 0, ...]. Each hypothesis gets the
    # unit vector [s, sin(acos(s)), 0, ...], whose cosine similarity to the
    # corpus is s. The expected figures below take novelty = 1 - s:
    #   A: similarity 0.1 → novelty 0.9, confidence 0.5 → score 0.45
    #   B: similarity 0.5 → novelty 0.5, confidence 0.9 → score 0.45 (ties with A)
    #   C: similarity 0.2 → novelty 0.8, confidence 0.9 → score 0.72 (highest)
    # Expected order: C first, then A and B in either order.

    def _vec_with_similarity(similarity):
        return [similarity, math.sin(math.acos(similarity))] + [0.0] * 382

    corpus_vec = [1.0] + [0.0] * 383
    vec_a = _vec_with_similarity(0.1)
    vec_b = _vec_with_similarity(0.5)
    vec_c = _vec_with_similarity(0.2)

    h_a = _make_hypothesis(title="A", confidence=0.5)
    h_b = _make_hypothesis(title="B", confidence=0.9)
    h_c = _make_hypothesis(title="C", confidence=0.9)

    # embed() hands out the vectors in call order (A, B, C), cycling if called
    # more often, so this depends on hypotheses being embedded in input order.
    vecs_in_order = [vec_a, vec_b, vec_c]
    embed_calls = 0

    def side_effect_embed(text: str) -> MagicMock:
        nonlocal embed_calls
        wrapped = MagicMock()
        wrapped.tolist.return_value = vecs_in_order[embed_calls % len(vecs_in_order)]
        embed_calls += 1
        return wrapped

    corpus_array = MagicMock()
    corpus_array.tolist.return_value = corpus_vec
    mock_embedder = MagicMock()
    mock_embedder.embed_batch.return_value = [corpus_array]
    mock_embedder.embed.side_effect = side_effect_embed

    db_path = _make_db_with_incidents(
        [("OOM", "Memory exhaustion")],
        tmp_path / "turnstone.db",
    )
    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)

    with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
        results = suppressor.suppress([h_a, h_b, h_c], db_path)

    assert len(results) == 3
    titles = [r.hypothesis.title for r in results]
    # H_C should be first (highest novelty*confidence score)
    assert titles[0] == "C", f"Expected C first, got {titles}"
    # Verify sort is descending by novelty*confidence
    scores = [r.novelty_score * r.hypothesis.confidence for r in results]
    assert scores == sorted(scores, reverse=True)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 6: DB with no resolved incidents → novelty_score=1.0
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_no_resolved_incidents_in_db_passthrough(tmp_path):
    """When incidents table is empty, all hypotheses get novelty_score=1.0."""
    # The incidents table exists but holds zero rows.
    db_path = _make_db_with_incidents([], tmp_path / "turnstone.db")
    embedder = _make_mock_embedder()
    suppressor = FalsePositiveSuppressor(model_id="test-model")

    with patch.object(suppressor, "_load_embedder", return_value=embedder):
        ranked = suppressor.suppress([_make_hypothesis()], db_path)

    assert len(ranked) == 1
    only = ranked[0]
    assert only.novelty_score == pytest.approx(1.0)
    assert only.suppress is False
    # embed_batch should NOT have been called (empty corpus short-circuits)
    embedder.embed_batch.assert_not_called()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 7: DB query failure → graceful fallback, no crash
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_db_query_failure_graceful_fallback(tmp_path):
    """When the incidents table is missing, suppress() returns passthrough without raising."""
    # This database has no 'incidents' table at all.
    db_path = _make_empty_db(tmp_path / "turnstone.db")
    embedder = _make_mock_embedder()
    suppressor = FalsePositiveSuppressor(model_id="test-model")

    with patch.object(suppressor, "_load_embedder", return_value=embedder):
        ranked = suppressor.suppress([_make_hypothesis()], db_path)

    assert len(ranked) == 1
    only = ranked[0]
    assert only.novelty_score == pytest.approx(1.0)
    assert only.suppress is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 8: Embedding service unavailable (returns None) → graceful fallback
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_embedding_service_unavailable_passthrough(tmp_path):
    """When the suppressor's _load_embedder() returns None, suppress() falls back without crashing."""
    incidents = [("OOM", "Memory pressure")]
    db_path = _make_db_with_incidents(incidents, tmp_path / "turnstone.db")
    suppressor = FalsePositiveSuppressor(model_id="test-model")

    # Simulate an unavailable embedder by making the loader return nothing.
    with patch.object(suppressor, "_load_embedder", return_value=None):
        ranked = suppressor.suppress([_make_hypothesis(confidence=0.7)], db_path)

    assert len(ranked) == 1
    only = ranked[0]
    assert only.novelty_score == pytest.approx(1.0)
    assert only.suppress is False
    assert only.suppression_reason is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test 9: Corpus cache invalidated when corpus changes
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_corpus_cache_invalidated_on_corpus_change(tmp_path):
    """When the corpus changes between calls, embed_batch is called again.

    The first ``suppress()`` call embeds a one-incident corpus. A second
    incident row is then inserted directly into the DB, and the next
    ``suppress()`` call is expected to invoke ``embed_batch`` a second time.
    """
    from contextlib import closing

    # First DB: one incident
    db_path = _make_db_with_incidents(
        [("OOM", "Memory pressure")],
        tmp_path / "turnstone.db",
    )

    corpus_vec_1 = [1.0] + [0.0] * 383
    corpus_vec_2 = [0.0, 1.0] + [0.0] * 382
    hyp_vec = [1.0] + [0.0] * 383

    # embedder will be called twice for embed_batch (different corpus each time)
    mock_embedder = MagicMock()
    single_m = MagicMock()
    single_m.tolist.return_value = hyp_vec

    batch_m1 = MagicMock()
    batch_m1.tolist.return_value = corpus_vec_1
    batch_m2 = MagicMock()
    batch_m2.tolist.return_value = corpus_vec_2

    mock_embedder.embed.return_value = single_m
    # One batch result per expected embed_batch call; a third call would raise StopIteration.
    mock_embedder.embed_batch.side_effect = [[batch_m1], [batch_m2]]

    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)

    with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
        # First call — populates cache
        results_1 = suppressor.suppress([_make_hypothesis()], db_path)
        assert mock_embedder.embed_batch.call_count == 1

        # Mutate the DB to add a second incident (changes corpus).
        # sqlite3's own context manager only commits/rolls back; closing()
        # makes sure the connection itself is released before the second call.
        with closing(sqlite3.connect(str(db_path))) as conn:
            conn.execute(
                "INSERT INTO incidents (label, notes, ended_at) VALUES (?, ?, ?)",
                ("Disk I/O", "Storage saturation", "2024-01-02T00:00:00"),
            )
            conn.commit()

        # Second call — corpus changed, should re-embed
        results_2 = suppressor.suppress([_make_hypothesis()], db_path)
        assert mock_embedder.embed_batch.call_count == 2, (
            "embed_batch should be called again when corpus changes"
        )

    assert len(results_1) == 1
    assert len(results_2) == 1
|
||||
285
tests/test_diagnose_synthesizer.py
Normal file
285
tests/test_diagnose_synthesizer.py
Normal file
|
|
@ -0,0 +1,285 @@
|
|||
"""Tests for app/services/diagnose/synthesizer.py — SummarySynthesizer.
|
||||
|
||||
All tests use mocking; no real LLM calls are made.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
from app.context.retriever import RetrievedContext
|
||||
from app.services.diagnose.models import Hypothesis, RankedHypothesis, TimelineResult
|
||||
from app.services.diagnose.synthesizer import SummarySynthesizer
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixture helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_hypothesis(
    hypothesis_id: str = "h1",
    title: str = "SSH flood from external IPs",
    description: str = "Repeated failed login attempts from multiple IPs.",
    confidence: float = 0.87,
    severity: str = "CRITICAL",
) -> Hypothesis:
    """Build a ``Hypothesis`` for tests; every argument has an SSH-flood themed default.

    The supporting cluster ids are fixed to ``("c1",)`` and no runbook refs are attached.
    """
    fixed_fields = {"supporting_cluster_ids": ("c1",), "runbook_refs": ()}
    return Hypothesis(
        hypothesis_id=hypothesis_id,
        title=title,
        description=description,
        confidence=confidence,
        severity=severity,  # type: ignore[arg-type]
        **fixed_fields,
    )
|
||||
|
||||
|
||||
def _make_ranked(
    hypothesis: Hypothesis | None = None,
    novelty_score: float = 0.95,
    similarity_to_known: float = 0.05,
    suppress: bool = False,
    suppression_reason: str | None = None,
) -> RankedHypothesis:
    """Wrap a hypothesis in a ``RankedHypothesis`` with the given ranking fields.

    A falsy ``hypothesis`` (the default ``None``) is replaced by ``_make_hypothesis()``.
    """
    if not hypothesis:
        hypothesis = _make_hypothesis()
    ranking_fields = dict(
        novelty_score=novelty_score,
        similarity_to_known=similarity_to_known,
        suppress=suppress,
        suppression_reason=suppression_reason,
    )
    return RankedHypothesis(hypothesis=hypothesis, **ranking_fields)
|
||||
|
||||
|
||||
def _make_timeline(
    total_entries: int = 42,
    n_clusters: int = 3,
) -> TimelineResult:
    """Build a ``TimelineResult`` over a fixed one-hour window with an empty cluster tuple.

    ``n_clusters`` is accepted but never read: ``clusters`` is always ``()``.
    """
    window_start, window_end = (
        "2026-01-01T00:00:00+00:00",
        "2026-01-01T01:00:00+00:00",
    )
    return TimelineResult(
        clusters=(),
        total_entries=total_entries,
        window_start=window_start,
        window_end=window_end,
        gap_count=1,
        burst_count=2,
        dominant_sources=("syslog", "auth"),
    )
|
||||
|
||||
|
||||
def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext:
    """Build a ``RetrievedContext`` with one host fact and the given (or a default runbook) chunks.

    A falsy ``chunks`` value — ``None`` or an empty list — selects the default chunk.
    """
    host_fact = {"category": "network", "key": "host", "value": "heimdall", "source": "facts"}
    default_chunks = [{"filename": "runbook.md", "text": "Restart sshd if flooded"}]
    return RetrievedContext(
        facts=[host_fact],
        chunks=chunks if chunks else default_chunks,
    )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestSynthesizerWithHypotheses:
    """When hypotheses are supplied, the synthesized text must contain VERDICT."""

    @staticmethod
    def _canned_llm_response(content: str) -> MagicMock:
        """Fake HTTP 200 response whose chat-completion payload carries ``content``."""
        response = MagicMock()
        response.status_code = 200
        response.json.return_value = {"choices": [{"message": {"content": content}}]}
        return response

    def test_returns_verdict_string_with_llm(self):
        synth = SummarySynthesizer()
        call_kwargs = dict(
            ranked=[_make_ranked()],
            timeline=_make_timeline(),
            ctx=_make_ctx(),
            query="ssh brute force",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
        fake_reply = self._canned_llm_response(
            "VERDICT: CRITICAL — SSH flood (87% confidence)\nTIMELINE: lots of hits."
        )

        # httpx.post is stubbed, so no real LLM call is made.
        with patch("httpx.post", return_value=fake_reply):
            summary = synth.synthesize(**call_kwargs)

        assert "VERDICT" in summary

    def test_returns_nonempty_string(self):
        synth = SummarySynthesizer()
        call_kwargs = dict(
            ranked=[_make_ranked()],
            timeline=_make_timeline(),
            ctx=_make_ctx(),
            query="why is auth failing",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
        fake_reply = self._canned_llm_response(
            "VERDICT: CRITICAL — SSH flood (87% confidence)"
        )

        with patch("httpx.post", return_value=fake_reply):
            summary = synth.synthesize(**call_kwargs)

        assert isinstance(summary, str)
        assert len(summary) > 0
|
||||
|
||||
|
||||
class TestSynthesizerSuppressedHypotheses:
    """Hypotheses flagged ``suppress=True`` must not reach the LLM prompt."""

    def test_suppressed_hypotheses_excluded_from_prompt(self):
        wazuh_hypothesis = _make_hypothesis(
            hypothesis_id="h2",
            title="Wazuh alert processing backlog",
            severity="ERROR",
            confidence=0.72,
        )
        suppressed = _make_ranked(
            hypothesis=wazuh_hypothesis,
            suppress=True,
            suppression_reason="similar to 2025-04 SSH incident",
            novelty_score=0.1,
        )
        ssh_hypothesis = _make_hypothesis(
            hypothesis_id="h1",
            title="SSH flood from external IPs",
            severity="CRITICAL",
            confidence=0.87,
        )
        active = _make_ranked(
            hypothesis=ssh_hypothesis,
            suppress=False,
            novelty_score=0.95,
        )

        sent_messages: list = []

        def fake_post(url, json=None, headers=None, timeout=None):
            # Accept both request shapes: messages nested under "payload" or at top level.
            body = json or {}
            if "payload" in body:
                carrier = body["payload"]
            elif "messages" in body:
                carrier = body
            else:
                carrier = {}
            sent_messages.extend(carrier.get("messages", []))

            reply = MagicMock()
            reply.status_code = 200
            reply.json.return_value = {
                "choices": [{"message": {"content": "VERDICT: CRITICAL — SSH flood"}}]
            }
            return reply

        synth = SummarySynthesizer()
        with patch("httpx.post", side_effect=fake_post):
            synth.synthesize(
                ranked=[active, suppressed],
                timeline=_make_timeline(),
                ctx=_make_ctx(),
                query="auth failures",
                llm_url="http://localhost:11434",
                llm_model="llama3",
            )

        # Take the first user-role message that was sent; empty string if none was captured.
        user_content = ""
        for message in sent_messages:
            if message.get("role") == "user":
                user_content = message["content"]
                break

        # The active hypothesis title must be in the prompt ...
        assert "SSH flood from external IPs" in user_content
        # ... while the suppressed hypothesis title must not appear at all.
        assert "Wazuh alert processing backlog" not in user_content
|
||||
|
||||
|
||||
class TestSynthesizerNoLLM:
    """Without a usable LLM the synthesizer must still return non-empty fallback text."""

    def test_no_llm_url_returns_fallback(self):
        synth = SummarySynthesizer()

        # Neither llm_url nor llm_model is passed.
        summary = synth.synthesize(
            ranked=[_make_ranked()],
            timeline=_make_timeline(),
            ctx=_make_ctx(),
            query="disk errors",
        )

        assert isinstance(summary, str)
        assert len(summary) > 0
        assert "VERDICT" in summary

    def test_no_llm_model_returns_fallback(self):
        synth = SummarySynthesizer()
        hypotheses = [_make_ranked()]

        # A URL alone (llm_model omitted) is not enough to reach the LLM.
        summary = synth.synthesize(
            ranked=hypotheses,
            timeline=_make_timeline(),
            ctx=_make_ctx(),
            query="oom killer",
            llm_url="http://localhost:11434",
        )

        assert "VERDICT" in summary
        assert "SSH flood from external IPs" in summary

    def test_llm_failure_returns_fallback(self):
        synth = SummarySynthesizer()
        hypotheses = [_make_ranked()]
        refused = ConnectionError("refused")

        # Every httpx.post call raises, simulating an unreachable LLM endpoint.
        with patch("httpx.post", side_effect=refused):
            summary = synth.synthesize(
                ranked=hypotheses,
                timeline=_make_timeline(),
                ctx=_make_ctx(),
                query="why is disk full",
                llm_url="http://localhost:11434",
                llm_model="llama3",
            )

        assert "VERDICT" in summary
        assert len(summary) > 0
|
||||
|
||||
|
||||
class TestSynthesizerEmptyRanked:
    """An empty ranked list must still yield text instead of raising."""

    def test_empty_ranked_no_llm_returns_fallback(self):
        summary = SummarySynthesizer().synthesize(
            ranked=[],
            timeline=_make_timeline(),
            ctx=_make_ctx(),
            query="check everything",
        )

        assert isinstance(summary, str)
        assert len(summary) > 0
        assert "VERDICT" in summary

    def test_empty_ranked_with_llm_returns_fallback_or_llm_text(self):
        """With no hypotheses but an LLM configured, some non-empty string is returned."""
        synth = SummarySynthesizer()

        fake_reply = MagicMock()
        fake_reply.status_code = 200
        fake_reply.json.return_value = {
            "choices": [
                {"message": {"content": "VERDICT: UNKNOWN — no hypotheses generated"}}
            ]
        }

        with patch("httpx.post", return_value=fake_reply):
            summary = synth.synthesize(
                ranked=[],
                timeline=_make_timeline(),
                ctx=_make_ctx(),
                query="nothing found",
                llm_url="http://localhost:11434",
                llm_model="llama3",
            )

        # Either the LLM text or the fallback is acceptable; only type and non-emptiness are pinned.
        assert isinstance(summary, str)
        assert len(summary) > 0
|
||||
234
tests/test_diagnose_timeline.py
Normal file
234
tests/test_diagnose_timeline.py
Normal file
|
|
@ -0,0 +1,234 @@
|
|||
"""Tests for app/services/diagnose/timeline.py — TimelineReconstructor."""
|
||||
from __future__ import annotations
|
||||
|
||||
from app.services.diagnose.timeline import TimelineReconstructor
|
||||
from app.services.diagnose.models import TimelineResult
|
||||
from app.services.search import SearchResult
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_entry(
    entry_id: str = "e1",
    source_id: str = "src-a",
    timestamp_iso: str | None = "2026-01-01T00:00:00+00:00",
    severity: str | None = "INFO",
    rank: float = 0.0,
    text: str = "log line",
    matched_patterns: list[str] | None = None,
    sequence: int = 1,
) -> SearchResult:
    """Build a ``SearchResult`` for tests.

    ``repeat_count`` is always 1 and ``out_of_order`` always False; a falsy
    ``matched_patterns`` becomes a fresh empty list.
    """
    patterns = matched_patterns if matched_patterns else []
    return SearchResult(
        entry_id=entry_id,
        source_id=source_id,
        sequence=sequence,
        timestamp_iso=timestamp_iso,
        severity=severity,
        repeat_count=1,
        out_of_order=False,
        matched_patterns=patterns,
        text=text,
        rank=rank,
    )
|
||||
|
||||
|
||||
def _ts(offset_seconds: int) -> str:
|
||||
"""Return an ISO timestamp offset_seconds after 2026-01-01T00:00:00+00:00."""
|
||||
from datetime import datetime, timezone, timedelta
|
||||
base = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
|
||||
dt = base + timedelta(seconds=offset_seconds)
|
||||
return dt.isoformat()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test cases
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestEmptyInput:
    """Reconstructing from no entries yields an all-empty ``TimelineResult``."""

    def test_empty_returns_empty_timeline(self):
        timeline = TimelineReconstructor().reconstruct([])
        expected = TimelineResult(
            clusters=(),
            total_entries=0,
            gap_count=0,
            burst_count=0,
            window_start=None,
            window_end=None,
            dominant_sources=(),
        )
        assert timeline == expected
|
||||
|
||||
|
||||
class TestSingleEntry:
    """A lone entry forms exactly one cluster with no gap and no burst."""

    def test_single_entry_one_cluster(self):
        only_entry = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
        timeline = TimelineReconstructor().reconstruct([only_entry])

        assert len(timeline.clusters) == 1
        first = timeline.clusters[0]
        assert first.gap_before_seconds == 0.0
        assert first.burst is False
        assert timeline.total_entries == 1
|
||||
|
||||
|
||||
class TestClusteringWithinWindow:
    """Entries closer together than ``cluster_window_seconds`` share a cluster."""

    def test_two_entries_10s_apart_same_cluster(self):
        reconstructor = TimelineReconstructor(cluster_window_seconds=30)
        first = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
        second = _make_entry(entry_id="e2", timestamp_iso=_ts(10))

        timeline = reconstructor.reconstruct([first, second])

        assert len(timeline.clusters) == 1
        assert len(timeline.clusters[0].entries) == 2
|
||||
|
||||
|
||||
class TestClusteringOutsideWindow:
    """Entries further apart than ``cluster_window_seconds`` land in separate clusters."""

    @staticmethod
    def _reconstruct_60s_apart():
        """Reconstruct a timeline from two entries 60 s apart using a 30 s cluster window."""
        reconstructor = TimelineReconstructor(cluster_window_seconds=30)
        pair = [
            _make_entry(entry_id="e1", timestamp_iso=_ts(0)),
            _make_entry(entry_id="e2", timestamp_iso=_ts(60)),
        ]
        return reconstructor.reconstruct(pair)

    def test_two_entries_60s_apart_two_clusters(self):
        timeline = self._reconstruct_60s_apart()
        assert len(timeline.clusters) == 2
        later = timeline.clusters[1]
        assert later.gap_before_seconds >= 60.0

    def test_gap_count_correct_for_60s_gap(self):
        timeline = self._reconstruct_60s_apart()
        assert timeline.gap_count == 1
|
||||
|
||||
|
||||
class TestBurst:
    """A dense run of entries exceeding ``burst_threshold`` is flagged as a burst."""

    def test_15_entries_within_3s_is_burst(self):
        reconstructor = TimelineReconstructor(
            cluster_window_seconds=30,
            burst_threshold=10,
            burst_window_seconds=5,
        )
        # Offsets cycle through 0, 1, 2 seconds, so all 15 timestamps fall
        # well inside burst_window_seconds=5.
        entries = []
        for i in range(15):
            entries.append(
                _make_entry(entry_id=f"e{i}", timestamp_iso=_ts(i % 3), sequence=i)
            )

        timeline = reconstructor.reconstruct(entries)

        # Everything is expected in a single cluster, and that cluster is a burst.
        assert len(timeline.clusters) == 1
        assert timeline.clusters[0].burst is True
        assert timeline.burst_count == 1
|
||||
|
||||
|
||||
class TestNullTimestamps:
    """Entries with ``timestamp_iso=None`` must be handled without raising."""

    def test_null_timestamp_joins_current_cluster(self):
        reconstructor = TimelineReconstructor(cluster_window_seconds=30)
        timed = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
        untimed = _make_entry(entry_id="e2", timestamp_iso=None)

        # Must not raise; the untimed entry is expected in the already-open cluster.
        timeline = reconstructor.reconstruct([timed, untimed])

        assert len(timeline.clusters) == 1
        assert "e2" in timeline.clusters[0].entries

    def test_null_timestamp_does_not_start_new_cluster(self):
        reconstructor = TimelineReconstructor(cluster_window_seconds=30)
        sequence_of_entries = [
            _make_entry(entry_id="e1", timestamp_iso=_ts(0)),
            _make_entry(entry_id="e2", timestamp_iso=None),
            _make_entry(entry_id="e3", timestamp_iso=_ts(5)),
        ]

        timeline = reconstructor.reconstruct(sequence_of_entries)

        # e3 is 5 s after e1 (inside the 30 s window), so all three share one cluster.
        assert len(timeline.clusters) == 1

    def test_all_null_timestamps_one_cluster_no_crash(self):
        untimed_entries = [
            _make_entry(entry_id="e1", timestamp_iso=None),
            _make_entry(entry_id="e2", timestamp_iso=None),
        ]

        timeline = TimelineReconstructor().reconstruct(untimed_entries)

        assert len(timeline.clusters) == 1
        sole = timeline.clusters[0]
        # With no timestamps anywhere, neither the cluster nor the window has bounds.
        assert sole.start_iso is None
        assert sole.end_iso is None
        assert timeline.window_start is None
        assert timeline.window_end is None
|
||||
|
||||
|
||||
class TestDominantSources:
    """``dominant_sources`` lists source ids by descending entry count."""

    def test_dominant_sources_ordered_by_count_descending(self):
        # One entry from src-a followed by three from src-b, one second apart.
        source_ids = ["src-a", "src-b", "src-b", "src-b"]
        entries = [
            _make_entry(entry_id=f"e{offset + 1}", source_id=source, timestamp_iso=_ts(offset))
            for offset, source in enumerate(source_ids)
        ]

        timeline = TimelineReconstructor().reconstruct(entries)

        assert timeline.dominant_sources[0] == "src-b"
        assert timeline.dominant_sources[1] == "src-a"
|
||||
|
||||
|
||||
class TestRepresentativeText:
    """Selection of a cluster's ``representative_text``."""

    def test_representative_text_uses_highest_rank(self):
        weaker = _make_entry(entry_id="e1", timestamp_iso=_ts(0), rank=-5.0, text="low score")
        stronger = _make_entry(entry_id="e2", timestamp_iso=_ts(1), rank=-1.0, text="high score")

        timeline = TimelineReconstructor().reconstruct([weaker, stronger])

        # rank -1.0 is greater than -5.0, so its text is expected to be chosen.
        assert timeline.clusters[0].representative_text == "high score"

    def test_representative_text_tiebreak_on_longest_text(self):
        brief = _make_entry(entry_id="e1", timestamp_iso=_ts(0), rank=0.0, text="short")
        wordy = _make_entry(
            entry_id="e2", timestamp_iso=_ts(1), rank=0.0, text="much longer text here"
        )

        timeline = TimelineReconstructor().reconstruct([brief, wordy])

        # Equal ranks: the longer text is expected to win.
        assert timeline.clusters[0].representative_text == "much longer text here"
|
||||
|
||||
|
||||
class TestClusterId:
    """Shape of the generated ``cluster_id``."""

    def test_cluster_id_is_12_char_hex(self):
        hex_digits = set("0123456789abcdef")
        seed = _make_entry(entry_id="abc123", timestamp_iso=_ts(0))

        timeline = TimelineReconstructor().reconstruct([seed])
        identifier = timeline.clusters[0].cluster_id

        assert len(identifier) == 12
        # Every character must be a lowercase hexadecimal digit.
        assert all(ch in hex_digits for ch in identifier)
|
||||
|
||||
|
||||
class TestSeverity:
    """A cluster reports the most severe level among its entries."""

    def test_critical_wins_over_error(self):
        levels = ["ERROR", "CRITICAL", "INFO"]
        entries = [
            _make_entry(entry_id=f"e{offset + 1}", timestamp_iso=_ts(offset), severity=level)
            for offset, level in enumerate(levels)
        ]

        timeline = TimelineReconstructor().reconstruct(entries)

        assert timeline.clusters[0].severity == "CRITICAL"
|
||||
|
||||
|
||||
class TestPatternTags:
    """A cluster's ``pattern_tags`` include the patterns of every member entry."""

    def test_pattern_tags_union_across_entries(self):
        oom_entry = _make_entry(
            entry_id="e1", timestamp_iso=_ts(0), matched_patterns=["oom-killer"]
        )
        disk_entry = _make_entry(
            entry_id="e2", timestamp_iso=_ts(1), matched_patterns=["disk-full"]
        )

        timeline = TimelineReconstructor().reconstruct([oom_entry, disk_entry])
        collected = set(timeline.clusters[0].pattern_tags)

        assert "oom-killer" in collected
        assert "disk-full" in collected
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
"""Tests for the dmesg log ingestor."""
|
||||
"""Tests for the dmesg log gleaner."""
|
||||
from __future__ import annotations
|
||||
|
||||
from app.ingest.dmesg_log import is_dmesg_log, parse
|
||||
from app.glean.dmesg_log import is_dmesg_log, parse
|
||||
|
||||
RELATIVE_SAMPLE = """\
|
||||
[ 0.000000] Linux version 6.8.0-65-generic
|
||||
236
tests/test_glean_fingerprint.py
Normal file
236
tests/test_glean_fingerprint.py
Normal file
|
|
@ -0,0 +1,236 @@
|
|||
"""Tests for fingerprint-based incremental glean skipping (issue #30).
|
||||
|
||||
Verifies that _glean_files() (and its public wrappers) skip local files whose
|
||||
mtime+size fingerprint has not changed since the last glean, and that force=True
|
||||
bypasses that check.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from app.glean.pipeline import (
|
||||
_fingerprint,
|
||||
_fp_unchanged,
|
||||
_save_fingerprint,
|
||||
ensure_schema,
|
||||
glean_dir,
|
||||
glean_file,
|
||||
)
|
||||
from app.glean.base import now_iso
|
||||
|
||||
|
||||
# ── Fixtures ──────────────────────────────────────────────────────────────────
|
||||
|
||||
@pytest.fixture()
def db_path(tmp_path: Path) -> Path:
    """Path to ``test.db`` inside ``tmp_path``, after ``ensure_schema`` has been run on it."""
    database = tmp_path / "test.db"
    ensure_schema(database)
    return database
|
||||
|
||||
|
||||
@pytest.fixture()
def log_file(tmp_path: Path) -> Path:
    """Path to ``test.log`` inside ``tmp_path``, holding a single syslog-style line."""
    target = tmp_path / "test.log"
    target.write_text("May 24 10:00:00 heimdall kernel: test message\n")
    return target
|
||||
|
||||
|
||||
# ── Unit: fingerprint helpers ──────────────────────────────────────────────────
|
||||
|
||||
class TestFingerprintHelpers:
    """Unit tests for ``_fingerprint``, ``_fp_unchanged`` and ``_save_fingerprint``."""

    @pytest.fixture()
    def conn(self, db_path: Path):
        """Yield an open connection to the test DB and close it during teardown.

        Closing in fixture teardown (instead of at the end of each test body)
        releases the SQLite handle even when an assertion in the test fails.
        """
        connection = sqlite3.connect(str(db_path))
        yield connection
        connection.close()

    def test_fingerprint_returns_mtime_and_size(self, log_file: Path) -> None:
        mtime, size = _fingerprint(log_file)
        st = log_file.stat()
        assert mtime == st.st_mtime
        assert size == st.st_size

    def test_fp_unchanged_returns_false_when_no_record(self, conn, log_file: Path) -> None:
        mtime, size = _fingerprint(log_file)
        # Nothing has been saved for this path yet.
        assert _fp_unchanged(conn, log_file, mtime, size) is False

    def test_fp_unchanged_returns_true_after_save(self, conn, log_file: Path) -> None:
        mtime, size = _fingerprint(log_file)
        _save_fingerprint(conn, log_file, mtime, size, now_iso())
        conn.commit()
        assert _fp_unchanged(conn, log_file, mtime, size) is True

    def test_fp_unchanged_returns_false_on_size_change(self, conn, log_file: Path) -> None:
        mtime, size = _fingerprint(log_file)
        _save_fingerprint(conn, log_file, mtime, size, now_iso())
        conn.commit()
        # Simulate size change (new content appended)
        assert _fp_unchanged(conn, log_file, mtime, size + 1) is False

    def test_fp_unchanged_returns_false_on_mtime_change(self, conn, log_file: Path) -> None:
        mtime, size = _fingerprint(log_file)
        _save_fingerprint(conn, log_file, mtime, size, now_iso())
        conn.commit()
        assert _fp_unchanged(conn, log_file, mtime + 1.0, size) is False

    def test_save_fingerprint_upserts(self, conn, log_file: Path) -> None:
        """Second save with different values replaces the first (UPSERT semantics)."""
        _save_fingerprint(conn, log_file, 1000.0, 100, "2026-01-01T00:00:00Z")
        conn.commit()
        _save_fingerprint(conn, log_file, 2000.0, 200, "2026-01-02T00:00:00Z")
        conn.commit()
        row = conn.execute(
            "SELECT mtime, size FROM glean_fingerprints WHERE path = ?",
            (str(log_file),),
        ).fetchone()
        # Only the values from the second save may remain.
        assert row == (2000.0, 200)
|
||||
|
||||
|
||||
# ── Integration: glean_file skipping ─────────────────────────────────────────
|
||||
|
||||
class TestGleanFileFingerprint:
    """Integration tests for fingerprint-based skipping in ``glean_file``."""

    @staticmethod
    def _stored_fingerprint(db_path: Path, path: Path):
        """Return the ``(mtime, size, gleaned_at)`` row stored for ``path``, or None.

        The connection is closed in ``finally`` so a failing query cannot leak it.
        """
        conn = sqlite3.connect(str(db_path))
        try:
            return conn.execute(
                "SELECT mtime, size, gleaned_at FROM glean_fingerprints WHERE path = ?",
                (str(path),),
            ).fetchone()
        finally:
            conn.close()

    def test_first_glean_writes_fingerprint(self, db_path: Path, log_file: Path) -> None:
        glean_file(log_file, db_path)
        row = self._stored_fingerprint(db_path, log_file)
        assert row is not None
        mtime, size = _fingerprint(log_file)
        assert row[:2] == (mtime, size)

    def test_second_glean_skips_unchanged_file(self, db_path: Path, log_file: Path) -> None:
        stats_first = glean_file(log_file, db_path)
        count_first = sum(stats_first.values())

        # Re-glean without touching the file — should produce 0 new entries.
        stats_second = glean_file(log_file, db_path)
        count_second = sum(stats_second.values())

        assert count_first >= 1, "First glean should find at least one entry"
        assert count_second == 0, "Second glean should skip unchanged file"

    def test_second_glean_runs_when_file_grows(self, db_path: Path, log_file: Path) -> None:
        glean_file(log_file, db_path)

        # Append a new line and update mtime by rewriting.
        original = log_file.read_text()
        log_file.write_text(original + "May 24 10:01:00 heimdall kernel: second message\n")

        stats_second = glean_file(log_file, db_path)
        # INSERT OR IGNORE means the original entry won't re-count, but the
        # appended line is new, so at least one entry must be reported.
        # (The previous ``>= 0`` check could never fail.)
        assert sum(stats_second.values()) >= 1, "Grown file should be re-gleaned, not skipped"

        # Confirm fingerprint updated to new size.
        row = self._stored_fingerprint(db_path, log_file)
        assert row is not None
        assert row[1] == log_file.stat().st_size

    def test_force_bypasses_fingerprint(self, db_path: Path, log_file: Path) -> None:
        glean_file(log_file, db_path)

        # Without force: skipped.
        stats_no_force = glean_file(log_file, db_path)
        assert sum(stats_no_force.values()) == 0

        # With force: glean runs (INSERT OR IGNORE means count may be 0, but
        # we verify the fingerprint was re-saved with a fresh gleaned_at).
        ts_before = self._stored_fingerprint(db_path, log_file)[2]

        # NOTE(review): assumes gleaned_at has sub-10ms resolution — confirm against now_iso().
        time.sleep(0.01)  # ensure gleaned_at advances
        glean_file(log_file, db_path, force=True)

        ts_after = self._stored_fingerprint(db_path, log_file)[2]

        assert ts_after > ts_before, "force=True should update gleaned_at timestamp"
|
||||
|
||||
|
||||
# ── Integration: glean_dir skipping ──────────────────────────────────────────
|
||||
|
||||
class TestGleanDirFingerprint:
    """Integration tests for fingerprint-based skipping in ``glean_dir``."""

    def test_glean_dir_skips_unchanged_on_second_run(self, db_path: Path, tmp_path: Path) -> None:
        # NOTE(review): the db_path fixture also lives under tmp_path — presumably glean_dir ignores it; confirm.
        contents = {
            "a.log": "May 24 10:00:00 heimdall kernel: msg one\n",
            "b.log": "May 24 10:00:00 heimdall kernel: msg two\n",
        }
        for filename, text in contents.items():
            (tmp_path / filename).write_text(text)

        glean_dir(tmp_path, db_path)

        rerun_stats = glean_dir(tmp_path, db_path)
        assert sum(rerun_stats.values()) == 0, "Both unchanged files should be skipped"

    def test_glean_dir_force_reruns_all(self, db_path: Path, tmp_path: Path) -> None:
        log1 = tmp_path / "a.log"
        log1.write_text("May 24 10:00:00 heimdall kernel: msg one\n")

        def read_gleaned_at():
            """Fetch the stored gleaned_at value for log1 over a short-lived connection."""
            conn = sqlite3.connect(str(db_path))
            value = conn.execute(
                "SELECT gleaned_at FROM glean_fingerprints WHERE path = ?",
                (str(log1),),
            ).fetchone()[0]
            conn.close()
            return value

        glean_dir(tmp_path, db_path)

        # force=True: runs even though nothing changed; INSERT OR IGNORE keeps DB clean.
        stamp_before = read_gleaned_at()

        time.sleep(0.01)
        glean_dir(tmp_path, db_path, force=True)

        stamp_after = read_gleaned_at()

        assert stamp_after > stamp_before
|
||||
|
||||
|
||||
# ── Schema: ensure fingerprints table created ─────────────────────────────────
|
||||
|
||||
class TestEnsureSchema:
    """``ensure_schema`` creates the fingerprints table and tolerates being re-run."""

    def test_fingerprints_table_exists_after_ensure_schema(self, tmp_path: Path) -> None:
        database = tmp_path / "fresh.db"
        ensure_schema(database)

        conn = sqlite3.connect(str(database))
        rows = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table'"
        ).fetchall()
        conn.close()

        table_names = {name for (name,) in rows}
        assert "glean_fingerprints" in table_names

    def test_ensure_schema_idempotent(self, tmp_path: Path) -> None:
        """Running ensure_schema twice against one DB file must not raise."""
        database = tmp_path / "fresh.db"
        for _ in range(2):
            ensure_schema(database)
|
||||
444
tests/test_glean_pipeline_ssh.py
Normal file
444
tests/test_glean_pipeline_ssh.py
Normal file
|
|
@ -0,0 +1,444 @@
|
|||
"""Tests for SSH source handling in app/glean/pipeline.py.
|
||||
|
||||
Verifies that glean_sources() correctly:
|
||||
- Dispatches SSH sources to SSHTransport (local sources unchanged)
|
||||
- Routes each glean-type to the right command builder + parser
|
||||
- Writes parsed entries to SQLite
|
||||
- Gracefully skips sources on SSHConnectionError or SSHCommandError
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from app.glean.pipeline import glean_sources, ensure_schema
|
||||
from app.glean.ssh import SSHConnectionError, SSHCommandError
|
||||
|
||||
|
||||
# ── Shared fixtures ───────────────────────────────────────────────────────────
|
||||
|
||||
# Sample journald-style record, serialised as one newline-terminated JSON line.
_JOURNALD_RECORD = {
    "__REALTIME_TIMESTAMP": "1747000000000000",
    "PRIORITY": "3",
    "MESSAGE": "SSH brute force detected from 192.168.1.99",
    "SYSLOG_IDENTIFIER": "sshd",
    "_HOSTNAME": "rack01",
}
JOURNALD_LINE = f"{json.dumps(_JOURNALD_RECORD)}\n"

# Sample syslog-format line.
SYSLOG_LINE = (
    "May 20 22:00:00 rack01 sshd[1234]: "
    "Failed password for invalid user admin\n"
)

# Sample free-form application log line.
PLAINTEXT_LINE = (
    "2026-05-20 22:00:00 ERROR "
    "app crashed with exit code 1\n"
)

# Sample Docker log line (timestamp, stream, flag, message).
DOCKER_LINE = (
    "2026-05-20T22:00:00.000000000Z stderr F "
    "container startup failed\n"
)
|
||||
|
||||
|
||||
def _ssh_sources_yaml(sources: list[dict]) -> str:
    """Serialise ``sources`` to YAML text under a top-level ``sources`` key."""
    document = {"sources": sources}
    return yaml.dump(document)
|
||||
|
||||
|
||||
def _mock_transport(lines: list[str] | None = None):
|
||||
"""Return a mock SSHTransport context manager whose exec_stream yields given lines."""
|
||||
mock_t = MagicMock()
|
||||
mock_t.exec_stream.return_value = iter(lines or [])
|
||||
return mock_t
|
||||
|
||||
|
||||
def _patch_transport(mock_t):
    """Start patching ``app.glean.pipeline.SSHTransport`` so entering it yields *mock_t*.

    Returns ``(patcher, mocked_class)``. The patcher is already started, so
    the caller is responsible for calling ``patcher.stop()``.
    """
    patcher = patch("app.glean.pipeline.SSHTransport")
    transport_cls = patcher.start()
    context = transport_cls.return_value
    context.__enter__.return_value = mock_t
    context.__exit__.return_value = None
    return patcher, transport_cls
|
||||
|
||||
|
||||
def _entry_count(db_path: Path) -> int:
|
||||
conn = sqlite3.connect(db_path)
|
||||
n = conn.execute("SELECT COUNT(*) FROM log_entries").fetchone()[0]
|
||||
conn.close()
|
||||
return n
|
||||
|
||||
|
||||
# ── journald type ─────────────────────────────────────────────────────────────
|
||||
|
||||
class TestSSHJournaldGlean:
    """SSH sources with ``journald`` glean items: DB writes and command construction."""

    @staticmethod
    def _write_sources(tmp_path, glean_item):
        """Write sources.yaml for host ``rack01`` with one glean item.

        Returns ``(sources_file, db_path)``.
        """
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        source = {
            "id": "rack01",
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": [glean_item],
        }
        sources_file.write_text(_ssh_sources_yaml([source]))
        return sources_file, db_path

    def test_journald_entries_written_to_db(self, tmp_path):
        sources_file, db_path = self._write_sources(tmp_path, {"type": "journald"})

        transport = _mock_transport([JOURNALD_LINE])
        patcher, _ = _patch_transport(transport)
        try:
            stats = glean_sources(sources_file, db_path)
        finally:
            patcher.stop()

        assert _entry_count(db_path) >= 1
        assert any("rack01" in key for key in stats)

    def test_journald_args_passed_to_command_builder(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path, {"type": "journald", "args": ["--since", "1 hour ago"]}
        )

        transport = _mock_transport([JOURNALD_LINE])
        patcher, _ = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()

        # The configured args must show up in the command handed to exec_stream.
        command = transport.exec_stream.call_args[0][0]
        assert "--since" in command
        assert "1 hour ago" in command

    def test_journald_unit_shorthand(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path, {"type": "journald", "unit": "sshd"}
        )

        transport = _mock_transport([])
        patcher, _ = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()

        command = transport.exec_stream.call_args[0][0]
        assert "sshd" in command
|
||||
|
||||
|
||||
# ── syslog type ───────────────────────────────────────────────────────────────
|
||||
|
||||
class TestSSHSyslogGlean:
    """SSH sources with ``syslog`` glean items: DB writes and command construction."""

    @staticmethod
    def _write_sources(tmp_path, source_id, log_path):
        """Write sources.yaml with one SSH source gleaning syslog from *log_path*.

        Returns ``(sources_file, db_path)``.
        """
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        sources_file.write_text(_ssh_sources_yaml([{
            "id": source_id,
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": [{"type": "syslog", "path": log_path}],
        }]))
        return sources_file, db_path

    def test_syslog_entries_written_to_db(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path, "rack01-syslog", "/var/log/syslog"
        )

        mock_t = _mock_transport([SYSLOG_LINE])
        p, _ = _patch_transport(mock_t)
        try:
            # Only the DB side effect is checked here; the returned stats are not needed.
            glean_sources(sources_file, db_path)
        finally:
            p.stop()

        assert _entry_count(db_path) >= 1

    def test_syslog_command_contains_path(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path, "rack01", "/var/log/auth.log"
        )

        mock_t = _mock_transport([])
        p, _ = _patch_transport(mock_t)
        try:
            glean_sources(sources_file, db_path)
        finally:
            p.stop()

        # The configured remote path must appear in the issued command.
        issued = mock_t.exec_stream.call_args[0][0]
        assert "/var/log/auth.log" in issued
|
||||
|
||||
|
||||
# ── plaintext type ────────────────────────────────────────────────────────────
|
||||
|
||||
class TestSSHPlaintextGlean:
    """SSH sources with ``plaintext`` glean items: DB writes and command construction."""

    @staticmethod
    def _write_sources(tmp_path, source_id, log_path):
        """Write sources.yaml with one SSH source gleaning plaintext from *log_path*.

        Returns ``(sources_file, db_path)``.
        """
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        sources_file.write_text(_ssh_sources_yaml([{
            "id": source_id,
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": [{"type": "plaintext", "path": log_path}],
        }]))
        return sources_file, db_path

    def test_plaintext_entries_written_to_db(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path, "rack01-app", "/var/log/app/error.log"
        )

        mock_t = _mock_transport([PLAINTEXT_LINE])
        p, _ = _patch_transport(mock_t)
        try:
            # Only the DB side effect is checked here; the returned stats are not needed.
            glean_sources(sources_file, db_path)
        finally:
            p.stop()

        assert _entry_count(db_path) >= 1

    def test_plaintext_command_contains_path(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path, "rack01", "/opt/myapp/app.log"
        )

        mock_t = _mock_transport([])
        p, _ = _patch_transport(mock_t)
        try:
            glean_sources(sources_file, db_path)
        finally:
            p.stop()

        # The configured remote path must appear in the issued command.
        issued = mock_t.exec_stream.call_args[0][0]
        assert "/opt/myapp/app.log" in issued
|
||||
|
||||
|
||||
# ── docker type ───────────────────────────────────────────────────────────────
|
||||
|
||||
class TestSSHDockerGlean:
    """SSH sources with ``docker`` glean items: one remote command per container."""

    @staticmethod
    def _write_sources(tmp_path, containers):
        """Write sources.yaml for host ``rack01`` gleaning the given containers.

        Returns ``(sources_file, db_path)``.
        """
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        source = {
            "id": "rack01",
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": [{"type": "docker", "containers": containers}],
        }
        sources_file.write_text(_ssh_sources_yaml([source]))
        return sources_file, db_path

    def test_docker_single_container_command_issued(self, tmp_path):
        sources_file, db_path = self._write_sources(tmp_path, ["myapp"])

        transport = _mock_transport([DOCKER_LINE])
        patcher, _ = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()

        command = transport.exec_stream.call_args[0][0]
        assert "myapp" in command

    def test_docker_multiple_containers_exec_per_container(self, tmp_path):
        sources_file, db_path = self._write_sources(tmp_path, ["app", "nginx"])

        transport = MagicMock()
        transport.exec_stream.return_value = iter([])
        patcher, _ = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()

        # Expect exactly one exec_stream invocation for each configured container.
        assert transport.exec_stream.call_count == 2
        issued = [recorded[0][0] for recorded in transport.exec_stream.call_args_list]
        all_cmds = " ".join(issued)
        assert "app" in all_cmds
        assert "nginx" in all_cmds
|
||||
|
||||
|
||||
# ── error handling ────────────────────────────────────────────────────────────
|
||||
|
||||
class TestSSHGleanErrorHandling:
    """glean_sources() must skip failing SSH sources/items instead of raising."""

    @staticmethod
    def _write_sources(tmp_path, glean_items, source_id="rack01", host="192.168.1.10"):
        """Write sources.yaml with one SSH source carrying *glean_items*.

        Returns ``(sources_file, db_path)``.
        """
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        sources_file.write_text(_ssh_sources_yaml([{
            "id": source_id,
            "transport": "ssh",
            "host": host,
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": glean_items,
        }]))
        return sources_file, db_path

    def test_connection_error_skips_source_returns_empty_stats(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path,
            [{"type": "journald"}],
            source_id="unreachable",
            host="192.168.99.99",
        )

        with patch("app.glean.pipeline.SSHTransport") as MockClass:
            # Entering the transport context fails, as an unreachable host would.
            MockClass.return_value.__enter__.side_effect = SSHConnectionError("no route")
            MockClass.return_value.__exit__.return_value = None
            stats = glean_sources(sources_file, db_path)

        assert _entry_count(db_path) == 0
        # Stats for the source should either be absent or zero
        assert all(v == 0 for v in stats.values())

    def test_command_error_skips_item_continues_next(self, tmp_path):
        # Two glean items: first raises SSHCommandError, second yields a valid line
        sources_file, db_path = self._write_sources(tmp_path, [
            {"type": "journald"},
            {"type": "syslog", "path": "/var/log/syslog"},
        ])

        mock_t = MagicMock()
        # side_effect list: exception instances are raised; other values are returned
        mock_t.exec_stream.side_effect = [
            SSHCommandError("journalctl: command not found"),  # raised on first call
            iter([SYSLOG_LINE]),                                # returned on second call
        ]

        p, _ = _patch_transport(mock_t)
        try:
            # Should not raise — bad item is skipped, good item is processed
            glean_sources(sources_file, db_path)
        finally:
            p.stop()

        # The syslog line should have been written
        assert _entry_count(db_path) >= 1

    def test_unknown_glean_type_skipped(self, tmp_path):
        sources_file, db_path = self._write_sources(
            tmp_path,
            [{"type": "mqtt"}],  # not a valid remote type
        )

        mock_t = _mock_transport([])
        p, _ = _patch_transport(mock_t)
        try:
            glean_sources(sources_file, db_path)  # must not raise
        finally:
            p.stop()

        assert _entry_count(db_path) == 0
|
||||
|
||||
|
||||
# ── mixed local + SSH sources ─────────────────────────────────────────────────
|
||||
|
||||
class TestMixedLocalAndSSH:
    """sources.yaml files that combine local path sources with SSH sources."""

    @staticmethod
    def _local_log(tmp_path):
        """Create a local log file holding one syslog line and return its path."""
        log_file = tmp_path / "local.log"
        log_file.write_text(SYSLOG_LINE)
        return log_file

    def test_local_and_ssh_both_processed(self, tmp_path):
        # Local syslog file
        local_log = self._local_log(tmp_path)

        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        local_source = {"id": "local-syslog", "path": str(local_log)}
        remote_source = {
            "id": "remote01",
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": [{"type": "syslog", "path": "/var/log/syslog"}],
        }
        sources_file.write_text(_ssh_sources_yaml([local_source, remote_source]))

        transport = _mock_transport([SYSLOG_LINE])
        patcher, _ = _patch_transport(transport)
        try:
            stats = glean_sources(sources_file, db_path)
        finally:
            patcher.stop()

        # Each of the two sources is expected to have contributed entries.
        assert _entry_count(db_path) >= 2
        assert "local-syslog" in stats
        assert any("remote01" in key for key in stats)

    def test_local_only_sources_never_calls_ssh(self, tmp_path):
        local_log = self._local_log(tmp_path)

        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        sources_file.write_text(
            _ssh_sources_yaml([{"id": "local", "path": str(local_log)}])
        )

        with patch("app.glean.pipeline.SSHTransport") as MockClass:
            glean_sources(sources_file, db_path)
            MockClass.assert_not_called()
|
||||
|
||||
|
||||
# ── multiple glean items per SSH source ───────────────────────────────────────
|
||||
|
||||
class TestMultipleGleanItemsPerHost:
    """Several glean items configured on a single SSH host."""

    def test_one_connection_multiple_commands(self, tmp_path):
        """A host's glean items all run over one SSHTransport instance."""
        glean_items = [
            {"type": "journald"},
            {"type": "syslog", "path": "/var/log/syslog"},
            {"type": "plaintext", "path": "/var/log/app.log"},
        ]
        sources_file = tmp_path / "sources.yaml"
        db_path = tmp_path / "test.db"
        sources_file.write_text(_ssh_sources_yaml([{
            "id": "rack01",
            "transport": "ssh",
            "host": "192.168.1.10",
            "user": "admin",
            "key_path": "~/.ssh/id_ed25519",
            "glean": glean_items,
        }]))

        transport = _mock_transport([])
        patcher, transport_cls = _patch_transport(transport)
        try:
            glean_sources(sources_file, db_path)
        finally:
            patcher.stop()

        # One host, so SSHTransport() must have been constructed exactly once ...
        transport_cls.assert_called_once()
        # ... while exec_stream runs once for each of the three glean items.
        assert transport.exec_stream.call_count == 3
|
||||
|
|
@ -1,9 +1,9 @@
|
|||
"""Tests for the qBittorrent log ingestor."""
|
||||
"""Tests for the qBittorrent log gleaner."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from app.ingest.qbittorrent import is_qbit_log, parse
|
||||
from app.glean.qbittorrent import is_qbit_log, parse
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Classic format sample (pre-5.x GUI builds)
|
||||
185
tests/test_glean_ssh.py
Normal file
185
tests/test_glean_ssh.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
"""Tests for SSH transport layer (app/glean/ssh.py).
|
||||
|
||||
All SSH network I/O is mocked — no real SSH connection required.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
from unittest.mock import MagicMock, patch, call
|
||||
|
||||
import pytest
|
||||
|
||||
from app.glean.ssh import (
|
||||
SSHTransport,
|
||||
SSHConnectionError,
|
||||
SSHCommandError,
|
||||
_build_journald_command,
|
||||
_build_syslog_command,
|
||||
_build_plaintext_command,
|
||||
_build_docker_command,
|
||||
)
|
||||
|
||||
|
||||
# ── Command builders ──────────────────────────────────────────────────────────
|
||||
|
||||
class TestBuildJournaldCommand:
    """Command strings produced by ``_build_journald_command``."""

    def test_no_args_returns_base_command(self):
        command = _build_journald_command({})
        for fragment in ("journalctl", "-o json"):
            assert fragment in command

    def test_args_list_appended(self):
        extra_args = ["--since", "2 hours ago", "--unit", "sshd"]
        command = _build_journald_command({"args": extra_args})
        # Each configured argument has to appear somewhere in the command.
        for fragment in ("--since", "2 hours ago", "--unit", "sshd"):
            assert fragment in command

    def test_unit_shorthand(self):
        command = _build_journald_command({"unit": "docker"})
        # Both the space-separated and the "=" spelling are accepted.
        assert any(form in command for form in ("--unit docker", "--unit=docker"))
|
||||
|
||||
|
||||
class TestBuildSyslogCommand:
    """Command strings produced by ``_build_syslog_command``."""

    def test_returns_cat_command(self):
        command = _build_syslog_command({"path": "/var/log/syslog"})
        for fragment in ("cat", "/var/log/syslog"):
            assert fragment in command

    def test_default_path_when_omitted(self):
        # With no path configured, the command must still point under /var/log.
        command = _build_syslog_command({})
        for fragment in ("cat", "/var/log"):
            assert fragment in command
|
||||
|
||||
|
||||
class TestBuildPlaintextCommand:
    """Command strings produced by ``_build_plaintext_command``."""

    def test_cat_with_path(self):
        target = "/var/log/app/error.log"
        command = _build_plaintext_command({"path": target})
        assert "cat" in command
        assert target in command

    def test_raises_without_path(self):
        # A missing path may surface as either ValueError or KeyError.
        accepted_errors = (ValueError, KeyError)
        with pytest.raises(accepted_errors):
            _build_plaintext_command({})
|
||||
|
||||
|
||||
class TestBuildDockerCommand:
    """Commands produced by ``_build_docker_command``."""

    def test_single_container(self):
        command = _build_docker_command({"containers": ["myapp"]})
        assert "myapp" in command

    def test_multiple_containers_returns_list(self):
        cmds = _build_docker_command({"containers": ["app", "nginx"]})
        # The builder may return one joined string or one command per container;
        # flatten either shape into a single string before checking.
        if isinstance(cmds, str):
            flattened = cmds
        else:
            flattened = " ".join(cmds)
        assert "app" in flattened
        assert "nginx" in flattened

    def test_raises_without_containers(self):
        # A missing container list may surface as either ValueError or KeyError.
        accepted_errors = (ValueError, KeyError)
        with pytest.raises(accepted_errors):
            _build_docker_command({})
|
||||
|
||||
|
||||
# ── SSHTransport context manager ──────────────────────────────────────────────
|
||||
|
||||
def _mock_ssh_client(stdout_lines: list[str] | None = None):
|
||||
"""Return a mock SSHClient whose exec_command yields the given lines."""
|
||||
client = MagicMock()
|
||||
stdout = MagicMock()
|
||||
stdout.__iter__ = MagicMock(return_value=iter(stdout_lines or []))
|
||||
stderr = MagicMock()
|
||||
stderr.read.return_value = b""
|
||||
client.exec_command.return_value = (MagicMock(), stdout, stderr)
|
||||
return client
|
||||
|
||||
|
||||
class TestSSHTransportConnect:
    """Connection lifecycle of the ``SSHTransport`` context manager (paramiko mocked)."""

    @staticmethod
    def _key_path(tmp_path):
        """Create a placeholder private-key file and return its path as a string."""
        key_file = tmp_path / "id_ed25519"
        key_file.write_bytes(b"fake-key")
        return str(key_file)

    def test_connects_with_key_path(self, tmp_path):
        key_path = self._key_path(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            MockClient.return_value = _mock_ssh_client()
            with SSHTransport(host="10.0.0.1", user="admin", key_path=key_path):
                pass
            MockClient.return_value.connect.assert_called_once()
            connect_call = MockClient.return_value.connect.call_args
            # Accept the host as keyword or first positional argument. Slicing
            # (rather than indexing) keeps this a clean assertion failure, not
            # an IndexError, when connect() received no positional arguments.
            assert (
                connect_call.kwargs.get("hostname") == "10.0.0.1"
                or connect_call.args[:1] == ("10.0.0.1",)
            )

    def test_disconnects_on_exit(self, tmp_path):
        key_path = self._key_path(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            mock_client = _mock_ssh_client()
            MockClient.return_value = mock_client
            with SSHTransport(host="10.0.0.1", user="admin", key_path=key_path):
                pass
            mock_client.close.assert_called_once()

    def test_disconnects_on_exception(self, tmp_path):
        key_path = self._key_path(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            mock_client = _mock_ssh_client()
            MockClient.return_value = mock_client
            # The error raised inside the block must propagate, and the client
            # must still be closed.
            with pytest.raises(RuntimeError):
                with SSHTransport(host="10.0.0.1", user="admin", key_path=key_path):
                    raise RuntimeError("boom")
            mock_client.close.assert_called_once()

    def test_raises_ssh_connection_error_on_auth_failure(self, tmp_path):
        import paramiko
        key_path = self._key_path(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            MockClient.return_value.connect.side_effect = paramiko.AuthenticationException("denied")
            with pytest.raises(SSHConnectionError, match="auth"):
                with SSHTransport(host="10.0.0.1", user="admin", key_path=key_path):
                    pass

    def test_raises_ssh_connection_error_on_no_route(self, tmp_path):
        import paramiko
        key_path = self._key_path(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            MockClient.return_value.connect.side_effect = paramiko.SSHException("no route")
            with pytest.raises(SSHConnectionError):
                with SSHTransport(host="10.0.0.1", user="admin", key_path=key_path):
                    pass
|
||||
|
||||
|
||||
class TestSSHTransportExecStream:
    """Behaviour of ``SSHTransport.exec_stream`` against a mocked paramiko client."""

    @staticmethod
    def _key_path(tmp_path):
        """Create a placeholder private-key file and return its path as a string."""
        key_file = tmp_path / "id_ed25519"
        key_file.write_bytes(b"fake-key")
        return str(key_file)

    def test_yields_stdout_lines(self, tmp_path):
        key_path = self._key_path(tmp_path)
        lines = ["line one\n", "line two\n", "line three\n"]
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            MockClient.return_value = _mock_ssh_client(lines)
            with SSHTransport(host="10.0.0.1", user="admin", key_path=key_path) as t:
                result = list(t.exec_stream("echo hello"))
        assert result == lines

    def test_raises_ssh_command_error_on_nonzero_exit(self, tmp_path):
        key_path = self._key_path(tmp_path)
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            mock_client = _mock_ssh_client([])
            # Simulate non-zero exit code
            channel = MagicMock()
            channel.recv_exit_status.return_value = 1
            _, stdout, stderr = mock_client.exec_command.return_value
            stdout.channel = channel
            stderr.read.return_value = b"command not found"
            MockClient.return_value = mock_client
            with SSHTransport(host="10.0.0.1", user="admin", key_path=key_path) as t:
                with pytest.raises(SSHCommandError, match="command not found"):
                    list(t.exec_stream("notacommand"))

    def test_strips_trailing_newlines(self, tmp_path):
        """Despite the name, exec_stream must hand lines through untouched."""
        key_path = self._key_path(tmp_path)
        lines = ["  line with spaces  \n"]
        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
            MockClient.return_value = _mock_ssh_client(lines)
            with SSHTransport(host="10.0.0.1", user="admin", key_path=key_path) as t:
                # exec_stream should yield the raw lines; stripping is parser's job
                result = list(t.exec_stream("echo hello"))
        # Check the content, not just the count, so a transport that trims
        # whitespace is caught (matches test_yields_stdout_lines).
        assert result == lines
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
"""Tests for the syslog (RFC 3164) ingestor."""
|
||||
"""Tests for the syslog (RFC 3164) gleaner."""
|
||||
from __future__ import annotations
|
||||
|
||||
from app.ingest.syslog import is_syslog, parse
|
||||
from app.glean.syslog import is_syslog, parse
|
||||
|
||||
SYSLOG_SAMPLE = """\
|
||||
May 11 14:23:01 xanderland sshd[1234]: Accepted publickey for x from 192.168.1.1 port 54321 ssh2
|
||||
|
|
@ -1,10 +1,10 @@
|
|||
"""Tests for the Tautulli webhook ingestor."""
|
||||
"""Tests for the Tautulli webhook gleaner."""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
from unittest.mock import patch
|
||||
|
||||
from app.ingest.tautulli import is_tautulli_payload, parse_webhook
|
||||
from app.glean.tautulli import is_tautulli_payload, parse_webhook
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -253,7 +253,7 @@ class TestEndpoint:
|
|||
@pytest.fixture
|
||||
def client(self, tmp_path):
|
||||
from fastapi.testclient import TestClient
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
import app.rest as rest_module
|
||||
|
||||
db = tmp_path / "test.db"
|
||||
|
|
@ -267,14 +267,14 @@ class TestEndpoint:
|
|||
|
||||
def test_missing_action_returns_400(self, client):
|
||||
resp = client.post(
|
||||
"/turnstone/api/ingest/tautulli",
|
||||
"/turnstone/api/glean/tautulli",
|
||||
json={"session_key": "x"},
|
||||
)
|
||||
assert resp.status_code == 400
|
||||
|
||||
def test_wrong_token_returns_403(self, tmp_path):
|
||||
from fastapi.testclient import TestClient
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
import app.rest as rest_module
|
||||
|
||||
db = tmp_path / "test.db"
|
||||
|
|
@ -288,7 +288,7 @@ class TestEndpoint:
|
|||
patch.object(rest_module, "_compiled_patterns", []):
|
||||
with TestClient(rest_module.app, raise_server_exceptions=True) as c:
|
||||
resp = c.post(
|
||||
"/turnstone/api/ingest/tautulli",
|
||||
"/turnstone/api/glean/tautulli",
|
||||
json=_ERROR_PAYLOAD,
|
||||
headers={"X-Tautulli-Token": "wrong"},
|
||||
)
|
||||
|
|
@ -296,7 +296,7 @@ class TestEndpoint:
|
|||
|
||||
def test_valid_payload_returns_200(self, client):
|
||||
resp = client.post(
|
||||
"/turnstone/api/ingest/tautulli",
|
||||
"/turnstone/api/glean/tautulli",
|
||||
json=_ERROR_PAYLOAD,
|
||||
)
|
||||
assert resp.status_code == 200
|
||||
|
|
@ -1,11 +1,11 @@
|
|||
"""Tests for the Wazuh alert ingestor."""
|
||||
"""Tests for the Wazuh alert gleaner."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
from app.ingest.wazuh import is_wazuh_alert, parse
|
||||
from app.ingest.pipeline import _detect_format
|
||||
from app.glean.wazuh import is_wazuh_alert, parse
|
||||
from app.glean.pipeline import _detect_format
|
||||
|
||||
_ALERT = {
|
||||
"timestamp": "2024-01-15T10:23:45.123+0000",
|
||||
|
|
@ -8,7 +8,7 @@ from pathlib import Path
|
|||
|
||||
class TestSchema:
|
||||
def test_blocklist_candidates_table_exists(self, tmp_path):
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
db = tmp_path / "test.db"
|
||||
ensure_schema(db)
|
||||
conn = sqlite3.connect(str(db))
|
||||
|
|
@ -16,7 +16,7 @@ class TestSchema:
|
|||
assert "blocklist_candidates" in tables
|
||||
|
||||
def test_blocklist_candidates_columns(self, tmp_path):
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
db = tmp_path / "test.db"
|
||||
ensure_schema(db)
|
||||
conn = sqlite3.connect(str(db))
|
||||
|
|
@ -28,7 +28,7 @@ class TestSchema:
|
|||
}
|
||||
|
||||
def test_status_default_is_pending(self, tmp_path):
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
import uuid
|
||||
db = tmp_path / "test.db"
|
||||
ensure_schema(db)
|
||||
|
|
@ -89,7 +89,7 @@ class TestTelemetry:
|
|||
class TestExtraction:
|
||||
@pytest.fixture
|
||||
def db(self, tmp_path):
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
p = tmp_path / "test.db"
|
||||
ensure_schema(p)
|
||||
return p
|
||||
|
|
@ -195,7 +195,7 @@ class TestExtraction:
|
|||
class TestCandidateManagement:
|
||||
@pytest.fixture
|
||||
def db_with_candidate(self, tmp_path):
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
import sqlite3, uuid
|
||||
db = tmp_path / "test.db"
|
||||
ensure_schema(db)
|
||||
|
|
|
|||
|
|
@ -54,7 +54,7 @@ def test_keywords_cleaned_of_extra_spaces():
|
|||
|
||||
|
||||
def test_diagnose_with_explicit_window_sets_time_detected(tmp_path):
|
||||
from app.ingest.pipeline import ensure_schema
|
||||
from app.glean.pipeline import ensure_schema
|
||||
db = tmp_path / "test.db"
|
||||
ensure_schema(db)
|
||||
result = diagnose(db, query="plex", since="2026-05-11T14:00:00+00:00", until="2026-05-11T15:00:00+00:00")
|
||||
|
|
|
|||
|
|
@ -104,7 +104,7 @@
|
|||
<p v-if="severityFilter" class="mb-1">No {{ severityFilter }} entries in this result set.</p>
|
||||
<template v-else>
|
||||
<p class="mb-1">No log evidence found for "{{ lastQuery }}"</p>
|
||||
<p class="text-sm">Check the Sources tab to confirm data is ingested, or try a broader description.</p>
|
||||
<p class="text-sm">Check the Sources tab to confirm data is gleaned, or try a broader description.</p>
|
||||
</template>
|
||||
</div>
|
||||
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@
|
|||
class="w-2 h-2 rounded-full flex-shrink-0"
|
||||
></span>
|
||||
<span :class="watchActive ? 'text-green-400' : 'text-text-dim'" class="text-xs">
|
||||
{{ watchActive ? `Live — ${watchSources.length} source${watchSources.length !== 1 ? 's' : ''} watched` : 'Manual ingest mode' }}
|
||||
{{ watchActive ? `Live — ${watchSources.length} source${watchSources.length !== 1 ? 's' : ''} watched` : 'Manual glean mode' }}
|
||||
</span>
|
||||
</div>
|
||||
|
||||
|
|
@ -20,8 +20,8 @@
|
|||
class="flex items-center gap-2 rounded border border-surface-border bg-surface-raised px-4 py-2.5 text-xs text-text-dim"
|
||||
>
|
||||
<span class="text-sev-warn">⚠</span>
|
||||
<span v-if="watchActive">Live watch active — last event: <span class="text-text-muted">{{ shortTs(stats.last_ingested) }}</span>. Waiting for new entries to arrive.</span>
|
||||
<span v-else>Last ingested: <span class="text-text-muted">{{ shortTs(stats.last_ingested) }}</span> — 24h counts reflect this window, not today.</span>
|
||||
<span v-if="watchActive">Live watch active — last event: <span class="text-text-muted">{{ shortTs(stats.last_gleaned) }}</span>. Waiting for new entries to arrive.</span>
|
||||
<span v-else>Last gleaned: <span class="text-text-muted">{{ shortTs(stats.last_gleaned) }}</span> — 24h counts reflect this window, not today.</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
@ -171,7 +171,7 @@ interface StatsResponse {
|
|||
criticals_24h: number
|
||||
errors_24h: number
|
||||
suppressed_criticals: number
|
||||
last_ingested: string | null
|
||||
last_gleaned: string | null
|
||||
source_health: SourceHealth[]
|
||||
recent_criticals: Array<{
|
||||
entry_id: string
|
||||
|
|
@ -186,7 +186,7 @@ interface WatchSourceStatus {
|
|||
source_id: string
|
||||
type: string
|
||||
running: boolean
|
||||
entries_ingested: number
|
||||
entries_gleaned: number
|
||||
last_event: string | null
|
||||
error: string | null
|
||||
}
|
||||
|
|
@ -211,8 +211,8 @@ const watchActive = computed(() =>
|
|||
)
|
||||
|
||||
const isStale = computed(() => {
|
||||
if (!stats.value?.last_ingested) return false
|
||||
const age = Date.now() - new Date(stats.value.last_ingested).getTime()
|
||||
if (!stats.value?.last_gleaned) return false
|
||||
const age = Date.now() - new Date(stats.value.last_gleaned).getTime()
|
||||
return age > 25 * 60 * 60 * 1000 // older than 25h
|
||||
})
|
||||
|
||||
|
|
|
|||
|
|
@ -106,7 +106,7 @@
|
|||
</div>
|
||||
<div v-else class="text-center">
|
||||
<p class="text-base mb-1">No results for "{{ store.query }}"</p>
|
||||
<p class="text-sm">Try broader terms or check the Sources tab to confirm data is ingested.</p>
|
||||
<p class="text-sm">Try broader terms or check the Sources tab to confirm data is gleaned.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
<div class="mb-6 flex items-start justify-between gap-4">
|
||||
<div>
|
||||
<h1 class="text-text-primary text-xl font-semibold mb-1">Log Sources</h1>
|
||||
<p class="text-text-dim text-sm">All hosts and services in the ingested corpus.</p>
|
||||
<p class="text-text-dim text-sm">All hosts and services in the gleaned corpus.</p>
|
||||
</div>
|
||||
<label class="btn-secondary text-sm cursor-pointer shrink-0">
|
||||
<span>Upload log file</span>
|
||||
|
|
@ -21,12 +21,12 @@
|
|||
|
||||
<div v-else-if="sources.length === 0" class="text-text-dim py-12 text-center">
|
||||
<p class="mb-1">No log sources found.</p>
|
||||
<p class="text-sm">Run the ingest pipeline: <code class="bg-surface-raised px-1 rounded">python scripts/ingest_corpus.py</code></p>
|
||||
<p class="text-sm">Run the glean pipeline: <code class="bg-surface-raised px-1 rounded">python scripts/glean_corpus.py</code></p>
|
||||
</div>
|
||||
|
||||
<div v-else class="rounded border border-surface-border overflow-hidden">
|
||||
<div class="overflow-x-auto">
|
||||
<table class="w-full text-sm min-w-[560px]">
|
||||
<table class="w-full text-sm min-w-[620px]">
|
||||
<thead class="bg-surface-raised border-b border-surface-border">
|
||||
<tr>
|
||||
<th class="text-left px-4 py-2.5 text-text-dim font-medium text-xs uppercase tracking-wider">Source</th>
|
||||
|
|
@ -40,29 +40,72 @@
|
|||
<tbody>
|
||||
<tr
|
||||
v-for="src in sources"
|
||||
:key="src.source_id"
|
||||
:key="src.id"
|
||||
class="border-b border-surface-border hover:bg-surface-raised transition-colors"
|
||||
>
|
||||
<td class="px-4 py-2.5 text-accent">{{ src.source_id }}</td>
|
||||
<td class="px-4 py-2.5 text-text-muted text-right tabular-nums">{{ src.entry_count.toLocaleString() }}</td>
|
||||
<!-- Source name + badges -->
|
||||
<td class="px-4 py-2.5">
|
||||
<div class="flex flex-wrap items-center gap-1.5">
|
||||
<span class="text-accent font-mono text-xs">{{ src.id }}</span>
|
||||
<!-- SSH transport badge -->
|
||||
<span
|
||||
v-if="src.transport === 'ssh'"
|
||||
class="inline-flex items-center gap-1 px-1.5 py-0.5 rounded text-[10px] font-medium
|
||||
bg-blue-900/30 text-blue-400 border border-blue-800/40"
|
||||
:title="`SSH: ${src.user}@${src.host}`"
|
||||
>
|
||||
<svg class="w-2.5 h-2.5" viewBox="0 0 16 16" fill="currentColor" aria-hidden="true">
|
||||
<path d="M2 3a1 1 0 011-1h10a1 1 0 011 1v2a1 1 0 01-1 1H3a1 1 0 01-1-1V3zm0 5a1 1 0 011-1h4a1 1 0 110 2H3a1 1 0 01-1-1zm0 4a1 1 0 011-1h2a1 1 0 110 2H3a1 1 0 01-1-1z"/>
|
||||
</svg>
|
||||
ssh
|
||||
</span>
|
||||
<!-- Glean-type pills for SSH sources -->
|
||||
<span
|
||||
v-for="gtype in (src.glean_types ?? [])"
|
||||
:key="gtype"
|
||||
class="px-1.5 py-0.5 rounded text-[10px] font-medium
|
||||
bg-surface-raised text-text-dim border border-surface-border"
|
||||
>{{ gtype }}</span>
|
||||
<!-- Upload badge for DB-only sources not in sources.yaml -->
|
||||
<span
|
||||
v-if="src.dbOnly"
|
||||
class="px-1.5 py-0.5 rounded text-[10px] font-medium
|
||||
bg-surface-raised text-text-dim border border-surface-border"
|
||||
>uploaded</span>
|
||||
</div>
|
||||
<!-- SSH host subtitle -->
|
||||
<div v-if="src.transport === 'ssh'" class="text-text-dim text-xs mt-0.5 font-mono">
|
||||
{{ src.user }}@{{ src.host }}
|
||||
</div>
|
||||
</td>
|
||||
|
||||
<!-- Entry count -->
|
||||
<td class="px-4 py-2.5 text-text-muted text-right tabular-nums">
|
||||
{{ src.entry_count.toLocaleString() }}
|
||||
</td>
|
||||
|
||||
<!-- Error count -->
|
||||
<td class="px-4 py-2.5 text-right tabular-nums">
|
||||
<span :class="src.error_count > 0 ? 'text-sev-error' : 'text-text-dim'">
|
||||
{{ src.error_count.toLocaleString() }}
|
||||
</span>
|
||||
</td>
|
||||
|
||||
<td class="px-4 py-2.5 text-text-dim text-xs">{{ formatTs(src.earliest) }}</td>
|
||||
<td class="px-4 py-2.5 text-text-dim text-xs">{{ formatTs(src.latest) }}</td>
|
||||
|
||||
<!-- Actions -->
|
||||
<td class="px-4 py-2.5">
|
||||
<div class="flex items-center justify-end gap-2">
|
||||
<button
|
||||
:disabled="busy.has(src.source_id)"
|
||||
@click="reingest(src.source_id)"
|
||||
:disabled="busy.has(src.id) || src.dbOnly"
|
||||
@click="reglean(src.id)"
|
||||
class="text-text-dim hover:text-accent transition-colors text-xs px-2 py-1 rounded hover:bg-surface disabled:opacity-40"
|
||||
title="Re-ingest from sources.yaml"
|
||||
>{{ busy.has(src.source_id) ? '…' : 'reingest' }}</button>
|
||||
:title="src.dbOnly ? 'Not in sources.yaml — cannot re-glean' : 'Re-glean from sources.yaml'"
|
||||
>{{ busy.has(src.id) ? '…' : 'reglean' }}</button>
|
||||
<button
|
||||
:disabled="busy.has(src.source_id)"
|
||||
@click="deleteSource(src.source_id)"
|
||||
:disabled="busy.has(src.id)"
|
||||
@click="deleteSource(src.id)"
|
||||
class="text-text-dim hover:text-sev-error transition-colors text-xs px-2 py-1 rounded hover:bg-surface disabled:opacity-40"
|
||||
title="Delete all entries for this source"
|
||||
>delete</button>
|
||||
|
|
@ -78,9 +121,36 @@
|
|||
|
||||
<script setup lang="ts">
|
||||
import { ref, onMounted } from 'vue'
|
||||
import type { LogSource } from '@/stores/search'
|
||||
|
||||
const sources = ref<LogSource[]>([])
|
||||
// Unified source row shown in the table (merges configured + DB-only sources).
|
||||
// One row of the sources table. Rows come either from the configured-sources
// response or are synthesized from DB-only sources when the list is loaded.
interface SourceRow {
|
||||
// Row identifier: used as the v-for key and passed to reglean/deleteSource.
id: string
|
||||
// 'ssh' rows render the SSH badge and the user@host subtitle;
// DB-only rows are always created with 'local'.
transport: 'local' | 'ssh'
|
||||
// SSH-specific
|
||||
// host and user are displayed together as user@host (badge tooltip and subtitle).
host?: string
|
||||
user?: string
|
||||
// Rendered as one pill per entry; treated as an empty list when absent.
glean_types?: string[]
|
||||
// Local-specific
|
||||
path?: string
|
||||
// DB stats (always present, default 0/null)
|
||||
// Both counts are rendered via toLocaleString(); error_count > 0 switches the
// cell to the error colour.
entry_count: number
|
||||
error_count: number
|
||||
// earliest/latest are passed to formatTs for display; reset to null when the
// entries of a configured source are deleted.
earliest: string | null
|
||||
latest: string | null
|
||||
// True when this source exists in the DB but not in sources.yaml (e.g. uploads)
|
||||
// When set, the row shows the "uploaded" badge, its reglean button is disabled,
// and a successful delete removes the row instead of zeroing its stats.
dbOnly?: boolean
|
||||
}
|
||||
|
||||
// Element type assumed for the `sources` array returned by
// GET /api/sources/configured: a SourceRow without the dbOnly flag.
interface ConfiguredSource extends Omit<SourceRow, 'dbOnly'> {}
|
||||
// Element type assumed for the `sources` array returned by GET /api/sources
// (per-source statistics read from the database).
interface DbSource {
|
||||
// Becomes SourceRow.id for DB-only rows. IDs equal to an SSH source's id, or
// starting with "<ssh id>/", are treated as covered by that configured source.
source_id: string
|
||||
entry_count: number
|
||||
error_count: number
|
||||
earliest: string | null
|
||||
latest: string | null
|
||||
}
|
||||
|
||||
const sources = ref<SourceRow[]>([])
|
||||
const loading = ref(true)
|
||||
const busy = ref(new Set<string>())
|
||||
const actionMsg = ref('')
|
||||
|
|
@ -90,11 +160,52 @@ const BASE = import.meta.env.BASE_URL.replace(/\/$/, '')
|
|||
|
||||
async function loadSources(): Promise<void> {
|
||||
try {
|
||||
const res = await fetch(`${BASE}/api/sources`)
|
||||
if (res.ok) {
|
||||
const data = await res.json()
|
||||
sources.value = data.sources
|
||||
// Primary list: configured sources from sources.yaml (enriched with DB stats).
|
||||
// This makes SSH sources visible even before their first glean.
|
||||
const [configuredRes, dbRes] = await Promise.all([
|
||||
fetch(`${BASE}/api/sources/configured`),
|
||||
fetch(`${BASE}/api/sources`),
|
||||
])
|
||||
|
||||
const configuredData = configuredRes.ok ? await configuredRes.json() : { sources: [] }
|
||||
const dbData = dbRes.ok ? await dbRes.json() : { sources: [] }
|
||||
|
||||
const configuredSources: ConfiguredSource[] = configuredData.sources ?? []
|
||||
const dbSources: DbSource[] = dbData.sources ?? []
|
||||
|
||||
// Build a set of all IDs represented by configured sources.
|
||||
// SSH sources own all sub-source IDs like "rack01/journald" too.
|
||||
const coveredIds = new Set<string>()
|
||||
for (const s of configuredSources) {
|
||||
coveredIds.add(s.id)
|
||||
}
|
||||
|
||||
// For SSH sources, also mark sub-source IDs (rack01/…) as covered so they
|
||||
// don't appear as separate "uploaded" rows.
|
||||
for (const s of configuredSources) {
|
||||
if (s.transport === 'ssh') {
|
||||
for (const db of dbSources) {
|
||||
if (db.source_id.startsWith(s.id + '/') || db.source_id === s.id) {
|
||||
coveredIds.add(db.source_id)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// DB-only sources: uploaded files or manually gleaned sources not in sources.yaml.
|
||||
const dbOnly: SourceRow[] = dbSources
|
||||
.filter(db => !coveredIds.has(db.source_id))
|
||||
.map(db => ({
|
||||
id: db.source_id,
|
||||
transport: 'local' as const,
|
||||
entry_count: db.entry_count,
|
||||
error_count: db.error_count,
|
||||
earliest: db.earliest,
|
||||
latest: db.latest,
|
||||
dbOnly: true,
|
||||
}))
|
||||
|
||||
sources.value = [...configuredSources as SourceRow[], ...dbOnly]
|
||||
} finally {
|
||||
loading.value = false
|
||||
}
|
||||
|
|
@ -118,7 +229,13 @@ async function deleteSource(sourceId: string): Promise<void> {
|
|||
const data = await res.json()
|
||||
actionMsg.value = `Deleted ${data.deleted.toLocaleString()} entries for "${sourceId}"`
|
||||
actionError.value = false
|
||||
sources.value = sources.value.filter(s => s.source_id !== sourceId)
|
||||
// Remove DB-only rows; zero-out configured-source stats instead of hiding.
|
||||
sources.value = sources.value
|
||||
.filter(s => !(s.id === sourceId && s.dbOnly))
|
||||
.map(s => s.id === sourceId
|
||||
? { ...s, entry_count: 0, error_count: 0, earliest: null, latest: null }
|
||||
: s
|
||||
)
|
||||
} else {
|
||||
const data = await res.json()
|
||||
actionMsg.value = data.detail ?? 'Delete failed'
|
||||
|
|
@ -129,19 +246,19 @@ async function deleteSource(sourceId: string): Promise<void> {
|
|||
}
|
||||
}
|
||||
|
||||
async function reingest(sourceId: string): Promise<void> {
|
||||
async function reglean(sourceId: string): Promise<void> {
|
||||
setBusy(sourceId, true)
|
||||
actionMsg.value = ''
|
||||
actionError.value = false
|
||||
try {
|
||||
const res = await fetch(`${BASE}/api/sources/${encodeURIComponent(sourceId)}/ingest`, { method: 'POST' })
|
||||
const res = await fetch(`${BASE}/api/sources/${encodeURIComponent(sourceId)}/glean`, { method: 'POST' })
|
||||
const data = await res.json()
|
||||
if (res.ok) {
|
||||
actionMsg.value = `Re-ingest complete: ${data.ingested.toLocaleString()} new entries for "${sourceId}"`
|
||||
actionMsg.value = `Re-glean complete: ${data.gleaned.toLocaleString()} new entries for "${sourceId}"`
|
||||
actionError.value = false
|
||||
await loadSources()
|
||||
} else {
|
||||
actionMsg.value = data.detail ?? 'Re-ingest failed'
|
||||
actionMsg.value = data.detail ?? 'Re-glean failed'
|
||||
actionError.value = true
|
||||
}
|
||||
} finally {
|
||||
|
|
@ -156,10 +273,10 @@ async function handleUpload(e: Event): Promise<void> {
|
|||
actionError.value = false
|
||||
const form = new FormData()
|
||||
form.append('file', file)
|
||||
const res = await fetch(`${BASE}/api/ingest/upload`, { method: 'POST', body: form })
|
||||
const res = await fetch(`${BASE}/api/glean/upload`, { method: 'POST', body: form })
|
||||
const data = await res.json()
|
||||
if (res.ok) {
|
||||
actionMsg.value = `Uploaded: ${data.ingested.toLocaleString()} entries ingested as "${data.source_id}"`
|
||||
actionMsg.value = `Uploaded: ${data.gleaned.toLocaleString()} entries gleaned as "${data.source_id}"`
|
||||
actionError.value = false
|
||||
await loadSources()
|
||||
} else {
|
||||
|
|
|
|||
Loading…
Reference in a new issue