Merge pull request 'feat(diagnose): 5-stage multi-agent diagnose pipeline (#29)' (#39) from feat/29-multi-agent-diagnose into main

2026-05-25 19:59:34 -07:00 · 2026-05-25 19:59:34 -07:00 · f302f27350
commit f302f27350
parent 63c742a708 25b7ae340b
76 changed files with 6640 additions and 635 deletions
--- a/.env.example
+++ b/.env.example
@ -23,6 +23,6 @@
 # Remote endpoint to push diagnostic bundles for escalation.
 # TURNSTONE_BUNDLE_ENDPOINT=https://example.com/api/bundles

-# --- Periodic batch ingest ---
-# Seconds between automatic ingest runs from sources.yaml. Set to 0 to disable.
-# TURNSTONE_INGEST_INTERVAL=900
+# --- Periodic batch glean ---
+# Seconds between automatic glean runs from sources.yaml. Set to 0 to disable.
+# TURNSTONE_GLEAN_INTERVAL=900
--- a/README.md
+++ b/README.md
@ -28,8 +28,8 @@ Service logs (journald, Docker, syslog, Caddy, Plex, arr stack, qBittorrent, dme

 ## Features

- **Multi-source ingest** — journald, Docker, syslog, Caddy, dmesg, Plex, Servarr (arr stack), qBittorrent, plaintext; paths configured in `patterns/sources.yaml`
- **Pattern tagging** — named regex patterns applied at ingest time (`service_restart`, `auth_failure`, `oom`, `segfault`, `disk_full`, `timeout`, …); extend in `patterns/default.yaml`
+- **Multi-source glean** — journald, Docker, syslog, Caddy, dmesg, Plex, Servarr (arr stack), qBittorrent, plaintext; paths configured in `patterns/sources.yaml`
+- **Pattern tagging** — named regex patterns applied at glean time (`service_restart`, `auth_failure`, `oom`, `segfault`, `disk_full`, `timeout`, …); extend in `patterns/default.yaml`
 - **Full-text search** — SQLite FTS5 index across all ingested entries; filter by source, severity, time window
 - **Natural-language time queries** — "what happened yesterday morning", "show me errors from the last 3 hours"; powered by dateparser
 - **Incident management** — create, label, and track incidents; attach supporting log entries
@ -101,13 +101,13 @@ sources:
    path: /var/log/caddy/access.log
 ```

-For `journald` sources, run `scripts/export_journal.sh` on the host before each ingest (e.g. via cron). Missing paths are skipped with a warning — safe to leave entries for services that are temporarily down.
+For `journald` sources, run `scripts/export_journal.sh` on the host before each glean (e.g. via cron). Missing paths are skipped with a warning — safe to leave entries for services that are temporarily down.

 ---

 ## Pattern library

-Named patterns in `patterns/default.yaml` are matched against every log entry at ingest time. Matched pattern names are stored and used to boost search relevance for diagnostic queries.
+Named patterns in `patterns/default.yaml` are matched against every log entry at glean time. Matched pattern names are stored and used to boost search relevance for diagnostic queries.

 ```yaml
 patterns:
@ -157,7 +157,7 @@ Copy `.env.example` to `.env` (or pass as `-e` flags to Docker/Podman). All vari
 | `TURNSTONE_PATTERNS` | `./patterns` | Pattern directory (default.yaml, sources.yaml, watch.yaml). |
 | `TURNSTONE_SOURCE_HOST` | `unknown` | Host identifier stamped on ingested entries. |
 | `TURNSTONE_BUNDLE_ENDPOINT` | — | Remote URL to push diagnostic bundles for escalation. |
-| `TURNSTONE_INGEST_INTERVAL` | `900` | Seconds between automatic batch ingest runs. Set to `0` to disable. |
+| `TURNSTONE_GLEAN_INTERVAL` | `900` | Seconds between automatic batch glean runs. Set to `0` to disable. |

 ---

--- a/app/context/embedder.py
+++ b/app/context/embedder.py
@ -1,64 +1,81 @@
-"""Ollama embedding client with sqlite-vec storage — BSL licensed."""
+"""Context chunk embedding — BSL licensed.
+
+Thin wrapper around app.services.embeddings that handles the DB I/O for
+context_chunks.  All backend configuration (model, device, backend type) is
+delegated to the service layer via TURNSTONE_EMBED_* env vars.
+
+Re-exports EMBEDDING_AVAILABLE so callers that imported it from here continue
+to work without changes.
+"""
 from __future__ import annotations

 import logging
 import sqlite3
-import struct
 from pathlib import Path

-import httpx
+from app.services.embeddings import (
+    EMBEDDING_AVAILABLE,  # re-export for backward compat
+    get_embedder,
+    pack_vector,
+)
+
+__all__ = ["EMBEDDING_AVAILABLE", "embed_chunks"]

 logger = logging.getLogger(__name__)

-EMBEDDING_AVAILABLE: bool = False
-
-try:
-    import sqlite_vec  # type: ignore[import]  # noqa: F401
-    EMBEDDING_AVAILABLE = True
-    logger.debug("sqlite-vec loaded — embedding pipeline enabled")
-except ImportError:
-    logger.debug("sqlite-vec not available — embedding pipeline disabled")
-

 def embed_chunks(
    db_path: Path,
    document_id: str,
-    llm_url: str,
-    model: str = "nomic-embed-text",
+    # Legacy params kept for backward compat — accepted but no longer read by this function.
+    llm_url: str = "",
+    model: str = "",
    timeout: float = 60.0,
 ) -> int:
-    """Embed all unembedded chunks for a document. Returns count embedded. No-op when EMBEDDING_AVAILABLE is False."""
-    if not EMBEDDING_AVAILABLE:
+    """Embed all un-embedded chunks for *document_id*.
+
+    Uses the configured embedder (sentence-transformers by default; Ollama when
+    TURNSTONE_EMBED_BACKEND=ollama).  Returns the count of newly embedded chunks.
+    Returns 0 silently when no embedder is available.
+
+    The legacy ``llm_url``, ``model`` and ``timeout`` parameters are accepted but no
+    longer read by this function — configure the backend via env vars instead.
+    """
+    embedder = get_embedder()
+    if embedder is None:
        return 0

    conn = sqlite3.connect(str(db_path))
    conn.execute("PRAGMA journal_mode=WAL")
    conn.row_factory = sqlite3.Row
+
    rows = conn.execute(
        "SELECT id, text FROM context_chunks WHERE document_id = ? AND embedding IS NULL",
        (document_id,),
    ).fetchall()

+    if not rows:
+        conn.close()
+        return 0
+
+    texts = [r["text"] for r in rows]
+    ids   = [r["id"]   for r in rows]
+
    count = 0
-    for row in rows:
    try:
-            resp = httpx.post(
-                f"{llm_url.rstrip('/')}/api/embeddings",
-                json={"model": model, "prompt": row["text"]},
-                timeout=timeout,
-            )
-            resp.raise_for_status()
-            vector: list[float] = resp.json().get("embedding") or []
-            if vector:
-                blob = struct.pack(f"{len(vector)}f", *vector)
+        vectors = embedder.embed_batch(texts)
+        for chunk_id, vec in zip(ids, vectors):
+            blob = pack_vector(vec)
            conn.execute(
                "UPDATE context_chunks SET embedding = ? WHERE id = ?",
-                    (blob, row["id"]),
+                (blob, chunk_id),
            )
            count += 1
-        except Exception as exc:
-            logger.warning("Embedding chunk %s failed: %s", row["id"], exc)
-
        conn.commit()
+    except Exception as exc:
+        logger.warning("Batch embedding failed for document %s: %s", document_id, exc)
+    finally:
        conn.close()
+
+    logger.debug("Embedded %d chunk(s) for document %s", count, document_id)
    return count
--- a/app/context/retriever.py
+++ b/app/context/retriever.py
@ -1,10 +1,30 @@
-"""Context retrieval — structured keyword lookup (Free) + chunk search — MIT licensed."""
+"""Context retrieval — structured keyword lookup (Free) + chunk search — MIT licensed.
+
+Two retrieval modes for context_chunks:
+  Vector search  — cosine similarity over stored embeddings (when available)
+  Keyword search — LIKE-based fallback when no embedder is configured
+
+Both modes are called from retrieve_context(); the best available mode is used
+automatically so callers need not check EMBEDDING_AVAILABLE themselves.
+"""
 from __future__ import annotations

+import logging
 import sqlite3
 from dataclasses import dataclass, field
 from pathlib import Path

+import numpy as np
+
+from app.services.embeddings import (
+    EMBEDDING_AVAILABLE,
+    cosine_similarity,
+    get_embedder,
+    unpack_vector,
+)
+
+logger = logging.getLogger(__name__)
+

@dataclass
 class RetrievedContext:
@ -12,6 +32,8 @@ class RetrievedContext:
    chunks: list[dict[str, str]] = field(default_factory=list)


+# ── Structured fact retrieval (always runs) ───────────────────────────────────
+
 def get_relevant_facts(db_path: Path, query: str) -> list[dict[str, str]]:
    """Keyword match against context_facts. Always runs — Free tier."""
    try:
@ -42,8 +64,68 @@ def get_relevant_facts(db_path: Path, query: str) -> list[dict[str, str]]:
        return []


-def _search_chunks(db_path: Path, query: str) -> list[dict[str, str]]:
-    """Keyword search across context_chunks. Fallback when no embeddings."""
+# ── Chunk retrieval: vector path ──────────────────────────────────────────────
+
+def _search_chunks_vector(
+    db_path: Path,
+    query: str,
+    top_k: int = 3,
+) -> list[dict[str, str]]:
+    """Cosine similarity search over embedded context_chunks.
+
+    Embeds *query* with the configured embedder, loads every stored chunk
+    embedding into memory and scores each one in-process.  A chunk is skipped
+    when its BLOB length does not correspond to the current model dimension
+    (4 bytes per component) or when unpacking/scoring it raises.
+
+    Returns at most *top_k* ``{"text": ..., "filename": ...}`` dicts ordered by
+    similarity descending.  Returns ``[]`` when no embedder is available, when
+    the query cannot be embedded, or when the database read raises
+    ``sqlite3.OperationalError``.
+    """
+    embedder = get_embedder()
+    if embedder is None:
+        return []
+
+    try:
+        query_vec: np.ndarray = embedder.embed(query)
+        model_dim: int = embedder.dim
+    except Exception as exc:
+        logger.warning("Query embedding failed: %s", exc)
+        return []
+
+    try:
+        conn = sqlite3.connect(str(db_path))
+        try:
+            conn.execute("PRAGMA journal_mode=WAL")
+            conn.row_factory = sqlite3.Row
+            rows = conn.execute(
+                "SELECT cc.id, cc.text, cc.embedding, cd.filename"
+                " FROM context_chunks cc"
+                " JOIN context_documents cd ON cc.document_id = cd.id"
+                " WHERE cc.embedding IS NOT NULL"
+            ).fetchall()
+        finally:
+            # Close on the failure path too — e.g. when the tables do not
+            # exist yet the SELECT raises and the handle would otherwise leak.
+            conn.close()
+    except sqlite3.OperationalError:
+        return []
+
+    scored: list[tuple[float, dict[str, str]]] = []
+    for row in rows:
+        blob: bytes = row["embedding"]
+        # Guard against blobs from a different-dimension model
+        if len(blob) // 4 != model_dim:
+            continue
+        try:
+            chunk_vec = unpack_vector(blob)
+            score = cosine_similarity(query_vec, chunk_vec)
+            scored.append((score, {"text": row["text"], "filename": row["filename"]}))
+        except Exception:
+            # Best-effort: one undecodable chunk must not fail the whole search.
+            continue
+
+    scored.sort(key=lambda t: t[0], reverse=True)
+    return [item for _, item in scored[:top_k]]
+
+
+# ── Chunk retrieval: keyword fallback ─────────────────────────────────────────
+
+def _search_chunks_keyword(db_path: Path, query: str) -> list[dict[str, str]]:
+    """LIKE-based keyword search across context_chunks. Fallback when no embedder."""
    try:
        conn = sqlite3.connect(str(db_path))
        conn.execute("PRAGMA journal_mode=WAL")
@ -66,16 +148,29 @@ def _search_chunks(db_path: Path, query: str) -> list[dict[str, str]]:
        return []


+# ── Public interface ──────────────────────────────────────────────────────────
+
 def retrieve_context(db_path: Path, query: str) -> RetrievedContext:
-    """Retrieve structured facts and relevant chunks for a query."""
-    return RetrievedContext(
-        facts=get_relevant_facts(db_path, query),
-        chunks=_search_chunks(db_path, query),
-    )
+    """Retrieve structured facts and relevant chunks for a query.
+
+    Chunk retrieval uses vector search when an embedder is available and at
+    least one embedded chunk exists; falls back to keyword search otherwise.
+    """
+    facts = get_relevant_facts(db_path, query)
+
+    if EMBEDDING_AVAILABLE:
+        chunks = _search_chunks_vector(db_path, query)
+        if not chunks:
+            # Vector search returned nothing (no embedded chunks, no embedder, or a failed query embedding/DB read) — fall back.
+            chunks = _search_chunks_keyword(db_path, query)
+    else:
+        chunks = _search_chunks_keyword(db_path, query)
+
+    return RetrievedContext(facts=facts, chunks=chunks)


 def format_context_block(ctx: RetrievedContext) -> str | None:
-    """Format context for injection into LLM prompt. Returns None when empty."""
+    """Format context for injection into an LLM prompt. Returns None when empty."""
    lines: list[str] = []
    if ctx.facts:
        lines.append("Known environment facts:")
--- a/app/ingest/init.py
+++ b/app/ingest/init.py
--- a/app/ingest/base.py
+++ b/app/ingest/base.py
--- a/app/ingest/caddy.py
+++ b/app/ingest/caddy.py
@ -4,7 +4,7 @@ from __future__ import annotations
 import json
 from typing import Iterator

-from app.ingest.base import (
+from app.glean.base import (
    SourceState, apply_patterns, epoch_float_to_iso,
    make_entry_id, now_iso,
 )
--- a/app/ingest/dmesg_log.py
+++ b/app/ingest/dmesg_log.py
@ -18,7 +18,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator

-from app.ingest.base import (
+from app.glean.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/doc_upload.py
+++ b/app/ingest/doc_upload.py
@ -10,7 +10,7 @@ from app.context.chunker import process_upload
 from app.context.store import add_document, add_fact


-def ingest_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]:
+def glean_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]:
    """Process an uploaded file and write to context store. Returns result summary."""
    doc_type, facts, chunks = process_upload(filename, content)

--- a/app/ingest/docker_log.py
+++ b/app/ingest/docker_log.py
@ -4,7 +4,7 @@ from __future__ import annotations
 import json
 from typing import Iterator

-from app.ingest.base import (
+from app.glean.base import (
    SourceState, apply_patterns, detect_severity,
    make_entry_id, now_iso,
 )
--- a/app/ingest/journald.py
+++ b/app/ingest/journald.py
@ -4,7 +4,7 @@ from __future__ import annotations
 import json
 from typing import Iterator

-from app.ingest.base import (
+from app.glean.base import (
    SourceState, apply_patterns, epoch_micros_to_iso,
    make_entry_id, now_iso, SYSLOG_PRIORITY,
 )
--- a/app/ingest/mqtt_subscriber.py
+++ b/app/ingest/mqtt_subscriber.py
@ -1,10 +1,10 @@
-"""Live MQTT ingest subscriber for Turnstone.
+"""Live MQTT glean subscriber for Turnstone.

 Reads ``type: mqtt`` entries from sources.yaml and subscribes to each broker
 in the background. Incoming messages are normalized to RetrievedEntry and
 written to the Turnstone SQLite database as they arrive.

-This runs as an asyncio task alongside the batch ingest scheduler. It is
+This runs as an asyncio task alongside the batch glean scheduler. It is
 started from the FastAPI lifespan in rest.py.

 MQTT source config format in sources.yaml::
--- a/app/glean/pipeline.py
+++ b/app/glean/pipeline.py
@ -0,0 +1,616 @@
+"""Glean pipeline: auto-detect format, parse, write to SQLite."""
+from __future__ import annotations
+
+import json
+import logging
+import re
+import sqlite3
+from pathlib import Path
+from typing import Iterator
+
+import yaml
+
+from app.glean import caddy, dmesg_log, docker_log, journald, plaintext, plex, qbittorrent, servarr, syslog, wazuh
+from app.glean.base import _compile, load_patterns, now_iso
+from app.glean.ssh import (
+    SSHTransport,
+    SSHConnectionError,
+    SSHCommandError,
+    _build_docker_command,
+    _build_journald_command,
+    _build_plaintext_command,
+    _build_syslog_command,
+)
+from app.services.models import LogPattern, RetrievedEntry
+from app.services.search import build_fts_index
+
+logger = logging.getLogger(__name__)
+
+_SCHEMA = """
+CREATE TABLE IF NOT EXISTS log_entries (
+    id              TEXT PRIMARY KEY,
+    source_id       TEXT NOT NULL,
+    sequence        INTEGER NOT NULL,
+    timestamp_raw   TEXT,
+    timestamp_iso   TEXT,
+    ingest_time     TEXT NOT NULL,
+    severity        TEXT,
+    repeat_count    INTEGER DEFAULT 1,
+    out_of_order    INTEGER DEFAULT 0,
+    matched_patterns TEXT DEFAULT '[]',
+    text            TEXT NOT NULL
+);
+CREATE INDEX IF NOT EXISTS idx_source      ON log_entries(source_id);
+CREATE INDEX IF NOT EXISTS idx_timestamp   ON log_entries(timestamp_iso);
+CREATE INDEX IF NOT EXISTS idx_ts_repeat   ON log_entries(timestamp_iso, repeat_count);
+CREATE INDEX IF NOT EXISTS idx_severity    ON log_entries(severity);
+CREATE INDEX IF NOT EXISTS idx_patterns    ON log_entries(matched_patterns);
+
+CREATE TABLE IF NOT EXISTS incidents (
+    id          TEXT PRIMARY KEY,
+    label       TEXT NOT NULL,
+    issue_type  TEXT NOT NULL DEFAULT '',
+    started_at  TEXT,
+    ended_at    TEXT,
+    notes       TEXT NOT NULL DEFAULT '',
+    created_at  TEXT NOT NULL,
+    severity    TEXT NOT NULL DEFAULT 'medium'
+);
+CREATE INDEX IF NOT EXISTS idx_incidents_time ON incidents(started_at, ended_at);
+
+CREATE TABLE IF NOT EXISTS received_bundles (
+    id          TEXT PRIMARY KEY,
+    source_host TEXT NOT NULL,
+    issue_type  TEXT NOT NULL DEFAULT '',
+    label       TEXT NOT NULL,
+    severity    TEXT NOT NULL DEFAULT 'medium',
+    started_at  TEXT,
+    bundled_at  TEXT NOT NULL,
+    entry_count INTEGER NOT NULL DEFAULT 0,
+    bundle_json TEXT NOT NULL
+);
+CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at);
+CREATE INDEX IF NOT EXISTS idx_bundles_type    ON received_bundles(issue_type);
+
+CREATE TABLE IF NOT EXISTS context_facts (
+    id           TEXT PRIMARY KEY,
+    category     TEXT NOT NULL,
+    key          TEXT NOT NULL,
+    value        TEXT NOT NULL,
+    source       TEXT,
+    created_at   TEXT NOT NULL
+);
+CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category);
+CREATE INDEX IF NOT EXISTS idx_facts_key      ON context_facts(key);
+
+CREATE TABLE IF NOT EXISTS context_documents (
+    id           TEXT PRIMARY KEY,
+    filename     TEXT NOT NULL,
+    doc_type     TEXT NOT NULL,
+    full_text    TEXT NOT NULL,
+    file_size    INTEGER,
+    uploaded_at  TEXT NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS context_chunks (
+    id           TEXT PRIMARY KEY,
+    document_id  TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
+    chunk_index  INTEGER NOT NULL,
+    text         TEXT NOT NULL,
+    embedding    BLOB
+);
+CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id);
+
+CREATE TABLE IF NOT EXISTS blocklist_candidates (
+    id                 TEXT PRIMARY KEY,
+    domain_or_ip       TEXT NOT NULL,
+    source_device_ip   TEXT,
+    source_device_name TEXT,
+    first_seen         TEXT NOT NULL,
+    last_seen          TEXT NOT NULL,
+    hit_count          INTEGER DEFAULT 1,
+    status             TEXT DEFAULT 'pending',
+    pushed_at          TEXT,
+    log_evidence       TEXT DEFAULT '[]',
+    matched_rule       TEXT,
+    llm_score          REAL,
+    llm_reason         TEXT
+);
+CREATE INDEX IF NOT EXISTS idx_blocklist_device ON blocklist_candidates(source_device_ip);
+CREATE INDEX IF NOT EXISTS idx_blocklist_status ON blocklist_candidates(status);
+CREATE INDEX IF NOT EXISTS idx_blocklist_domain ON blocklist_candidates(domain_or_ip);
+
+CREATE TABLE IF NOT EXISTS glean_fingerprints (
+    path       TEXT PRIMARY KEY,
+    mtime      REAL NOT NULL,
+    size       INTEGER NOT NULL,
+    gleaned_at TEXT NOT NULL
+);
+"""
+
+
+def ensure_schema(db_path: Path) -> None:
+    """Create all tables and apply additive migrations. Safe to call on every startup.
+
+    Runs the ``_SCHEMA`` script (every statement in it is ``IF NOT EXISTS``)
+    and then each additive ``ALTER TABLE`` migration.  SQLite has no
+    ``ADD COLUMN IF NOT EXISTS``: re-running a migration raises
+    ``sqlite3.OperationalError`` ("duplicate column name"), which is the
+    expected already-applied case and is ignored.  Any other OperationalError
+    is logged as a warning and the remaining migrations still run.
+    """
+    conn = sqlite3.connect(str(db_path))
+    try:
+        conn.execute("PRAGMA journal_mode=WAL")
+        conn.executescript(_SCHEMA)
+        # Additive column migrations — each is attempted on every startup.
+        for stmt in [
+            "ALTER TABLE incidents ADD COLUMN issue_type TEXT NOT NULL DEFAULT ''",
+        ]:
+            try:
+                conn.execute(stmt)
+            except sqlite3.OperationalError as exc:
+                # Only "column already exists" is benign; surface anything else
+                # without aborting startup.
+                if "duplicate column name" not in str(exc).lower():
+                    logger.warning("Schema migration failed (%s): %s", stmt, exc)
+        conn.commit()
+    finally:
+        # Release the handle even when the schema script or commit raises.
+        conn.close()
+
+
+def _fingerprint(path: Path) -> tuple[float, int]:
+    """Stat *path* and return its ``(mtime, size)`` pair.
+
+    Only file metadata is consulted; the file contents are never read.
+    """
+    info = path.stat()
+    return (info.st_mtime, info.st_size)
+
+
+def _fp_unchanged(conn: sqlite3.Connection, path: Path, mtime: float, size: int) -> bool:
+    """Report whether *path* still has the fingerprint recorded in the database.
+
+    True only when a ``glean_fingerprints`` row exists for ``str(path)`` and
+    both its mtime and size equal the supplied values.  A missing row, a
+    smaller size (log rotation) or a larger size (new lines appended) all
+    return False so the caller re-gleans the file.
+    """
+    stored = conn.execute(
+        "SELECT mtime, size FROM glean_fingerprints WHERE path = ?",
+        (str(path),),
+    ).fetchone()
+    if stored is None:
+        return False
+    stored_mtime, stored_size = stored
+    return stored_mtime == mtime and stored_size == size
+
+
+def _save_fingerprint(
+    conn: sqlite3.Connection,
+    path: Path,
+    mtime: float,
+    size: int,
+    gleaned_at: str,
+) -> None:
+    """Record (or overwrite) the fingerprint row for *path*.
+
+    The row is keyed by ``str(path)``.  The statement is executed on *conn*
+    but not committed here — committing is left to the caller.
+    """
+    upsert_sql = """
+        INSERT OR REPLACE INTO glean_fingerprints (path, mtime, size, gleaned_at)
+        VALUES (?, ?, ?, ?)
+        """
+    record = (str(path), mtime, size, gleaned_at)
+    conn.execute(upsert_sql, record)
+
+
+def _detect_format(first_line: str) -> str:
+    """Guess a log file's format from its first line.
+
+    JSON object lines are checked first, in this order: journald
+    (``__REALTIME_TIMESTAMP`` key), docker (``SOURCE`` starting with
+    ``docker:``), wazuh, then caddy (``ts`` plus ``msg``/``message``/
+    ``request``).  Anything else falls through to the text detectors — plex,
+    qbittorrent, servarr, dmesg, syslog — and finally ``"plaintext"``.
+
+    Returns one of the format names above as a string.
+    """
+    try:
+        obj = json.loads(first_line)
+        # json.loads also accepts bare scalars and arrays ("42", "null",
+        # "true", "[]").  A key-membership test on those raises TypeError, so
+        # only inspect keys when the line really is a JSON object.
+        if isinstance(obj, dict):
+            if "__REALTIME_TIMESTAMP" in obj:
+                return "journald"
+            if str(obj.get("SOURCE", "")).startswith("docker:"):
+                return "docker"
+            if wazuh.is_wazuh_alert(obj):
+                return "wazuh"
+            if "ts" in obj and ("msg" in obj or "message" in obj or "request" in obj):
+                return "caddy"
+    except (json.JSONDecodeError, AttributeError):
+        # Not JSON (or an unexpected shape) — fall through to text detectors.
+        pass
+    if plex.is_plex_log(first_line):
+        return "plex"
+    if qbittorrent.is_qbit_log(first_line):
+        return "qbittorrent"
+    if servarr.is_servarr_log(first_line):
+        return "servarr"
+    if dmesg_log.is_dmesg_log(first_line):
+        return "dmesg"
+    if syslog.is_syslog(first_line):
+        return "syslog"
+    return "plaintext"
+
+
+def _parse_file(
+    path: Path,
+    compiled: list[tuple[LogPattern, object]],
+    ingest_time: str,
+    source_id: str | None = None,
+) -> Iterator[RetrievedEntry]:
+    """Yield parsed entries from *path* using the parser matching its format.
+
+    The format is detected from the stripped first line; that line is then
+    fed back to the parser ahead of the rest of the file.  An empty file
+    yields nothing.  *source_id* defaults to the file stem.
+    """
+    source_id = source_id or path.stem
+
+    # Every detected format maps to a module exposing ``parse``; any format
+    # name not listed here is handled by the plaintext parser.
+    parser_modules = {
+        "journald": journald,
+        "wazuh": wazuh,
+        "docker": docker_log,
+        "caddy": caddy,
+        "plex": plex,
+        "qbittorrent": qbittorrent,
+        "servarr": servarr,
+        "dmesg": dmesg_log,
+        "syslog": syslog,
+    }
+
+    with path.open("r", errors="replace") as handle:
+        remaining = iter(handle)
+        head = next(remaining, None)
+        if head is None:
+            return
+
+        fmt = _detect_format(head.strip())
+        logger.info("Detected format %r for %s", fmt, path.name)
+
+        def replay():
+            # Re-attach the line consumed for format detection.
+            yield head
+            yield from remaining
+
+        module = parser_modules.get(fmt, plaintext)
+        yield from module.parse(replay(), source_id, compiled, ingest_time)
+
+
+def _write_batch(conn: sqlite3.Connection, batch: list[RetrievedEntry]) -> None:
+    """Insert *batch* into ``log_entries``, ignoring rows whose id already exists.
+
+    ``matched_patterns`` is stored as a JSON array string and ``out_of_order``
+    as an integer.  Nothing is committed here.
+    """
+    insert_sql = """
+        INSERT OR IGNORE INTO log_entries
+          (id, source_id, sequence, timestamp_raw, timestamp_iso,
+           ingest_time, severity, repeat_count, out_of_order,
+           matched_patterns, text)
+        VALUES (?,?,?,?,?,?,?,?,?,?,?)
+        """
+    rows = []
+    for entry in batch:
+        rows.append(
+            (
+                entry.entry_id,
+                entry.source_id,
+                entry.sequence,
+                entry.timestamp_raw,
+                entry.timestamp_iso,
+                entry.ingest_time,
+                entry.severity,
+                entry.repeat_count,
+                int(entry.out_of_order),
+                json.dumps(list(entry.matched_patterns)),
+                entry.text,
+            )
+        )
+    conn.executemany(insert_sql, rows)
+
+
+def _glean_files(
+    files: list[Path],
+    db_path: Path,
+    pattern_file: Path | None = None,
+    batch_size: int = 1000,
+    source_id_map: dict[Path, str] | None = None,
+    force: bool = False,
+) -> dict[str, int]:
+    """Parse *files* and write their entries to the SQLite DB at *db_path*.
+
+    Each file's ``(mtime, size)`` fingerprint is compared with the stored one;
+    unchanged files are skipped unless *force* is true.  Entries are written
+    in batches of *batch_size*, committing after each batch, and the
+    fingerprint is saved once the file has been fully processed.  After all
+    files are handled the FTS index is rebuilt.
+
+    *pattern_file* defaults to ``patterns/default.yaml``.  *source_id_map*
+    optionally maps a file path to its source id; files not in the map use
+    their stem.
+
+    Returns ``{source_id: count}`` where count is the number of entries
+    submitted for insert (rows with an existing id are ignored by the INSERT);
+    skipped files contribute 0.  Exceptions from stat, parsing or the database
+    propagate to the caller; the connection is closed either way.
+    """
+    pattern_file = pattern_file or Path("patterns/default.yaml")
+    patterns = load_patterns(pattern_file)
+    compiled = _compile(patterns)
+    ingest_time = now_iso()
+    source_id_map = source_id_map or {}
+
+    stats: dict[str, int] = {}
+    skipped: list[str] = []
+
+    conn = sqlite3.connect(str(db_path))
+    try:
+        conn.execute("PRAGMA journal_mode=WAL")
+        conn.executescript(_SCHEMA)
+        conn.commit()
+
+        for log_file in files:
+            source_id = source_id_map.get(log_file, log_file.stem)
+
+            # Fingerprint check — skip files whose mtime+size haven't changed.
+            # Taken before parsing, so a file that grows mid-parse is seen as
+            # changed on the next run.
+            mtime, size = _fingerprint(log_file)
+            if not force and _fp_unchanged(conn, log_file, mtime, size):
+                logger.debug("Skipping unchanged file: %s", log_file.name)
+                skipped.append(log_file.name)
+                # Make sure the source still appears in the returned stats.
+                stats.setdefault(source_id, 0)
+                continue
+
+            count = 0
+            batch: list[RetrievedEntry] = []
+            for entry in _parse_file(log_file, compiled, ingest_time, source_id=source_id):
+                batch.append(entry)
+                if len(batch) >= batch_size:
+                    _write_batch(conn, batch)
+                    conn.commit()
+                    count += len(batch)
+                    batch.clear()
+            if batch:
+                _write_batch(conn, batch)
+                conn.commit()
+                count += len(batch)
+
+            _save_fingerprint(conn, log_file, mtime, size, ingest_time)
+            conn.commit()
+
+            stats[source_id] = stats.get(source_id, 0) + count
+            logger.info("Gleaned %d entries from %s (source: %s)", count, log_file.name, source_id)
+    finally:
+        # Previously the handle leaked whenever stat/parse/write raised.
+        conn.close()
+
+    if skipped:
+        logger.info("Skipped %d unchanged file(s): %s", len(skipped), ", ".join(skipped))
+
+    logger.info("Building FTS index...")
+    build_fts_index(db_path)
+    logger.info("FTS index ready")
+
+    return stats
+
+
+def _stream_and_write(
+    transport: SSHTransport,
+    cmd: str,
+    parser,
+    source_id: str,
+    compiled: list[tuple[LogPattern, object]],
+    ingest_time: str,
+    conn: sqlite3.Connection,
+    batch_size: int,
+) -> int:
+    """Run *cmd* over *transport*, parse its output and persist the entries.
+
+    Entries are written to *conn* in batches of *batch_size*, committing after
+    each batch.  An SSHCommandError is logged as a warning rather than raised,
+    so one failing command does not abort the other glean items for the host;
+    entries still pending in the current batch at that point are not written.
+
+    Returns the number of entries written.
+    """
+    written = 0
+    pending: list[RetrievedEntry] = []
+    try:
+        entries = parser(transport.exec_stream(cmd), source_id, compiled, ingest_time)
+        for entry in entries:
+            pending.append(entry)
+            if len(pending) < batch_size:
+                continue
+            _write_batch(conn, pending)
+            conn.commit()
+            written += len(pending)
+            pending.clear()
+        if pending:
+            _write_batch(conn, pending)
+            conn.commit()
+            written += len(pending)
+    except SSHCommandError as exc:
+        logger.warning("SSH command failed for source %r (cmd: %s): %s", source_id, cmd, exc)
+    logger.info("Gleaned %d entries from SSH source %s", written, source_id)
+    return written
+
+
+def _glean_ssh_source(
+    src: dict,  # type: ignore[type-arg]
+    compiled: list[tuple[LogPattern, object]],
+    ingest_time: str,
+    conn: sqlite3.Connection,
+    batch_size: int,
+) -> dict[str, int]:
+    """Open one SSHTransport connection for *src* and glean all its glean items.
+
+    One SSH connection is shared across all items in the ``glean:`` list so
+    the handshake overhead is paid only once per host per glean run.
+
+    *src* must contain ``host``, ``user`` and ``key_path`` (KeyError
+    otherwise); ``port`` defaults to 22.  Each item's source id is its ``id``
+    or ``<host_id>/<type>``; docker items that expand to several commands get
+    one id per container (``<item_id>/<container>``).
+
+    Returns a stats dict mapping ``{source_id: entry_count}`` for each item.
+    Gracefully skips the entire source on SSHConnectionError; unknown item
+    types are logged and skipped.
+    """
+    host_id = src.get("id", src.get("host", "unknown"))
+    host = src["host"]
+    user = src["user"]
+    key_path = str(Path(src["key_path"]).expanduser())
+    port = int(src.get("port", 22))
+    # ``.get("glean", [])`` returns None when the key is present with a null
+    # value (presumably what an empty ``glean:`` YAML key loads as — confirm
+    # against the loader); treat that as "no items" rather than raising TypeError.
+    glean_items: list[dict] = src.get("glean") or []  # type: ignore[type-arg]
+
+    # Item types that map to exactly one remote command: (command builder, parser).
+    single_command = {
+        "journald": (_build_journald_command, journald.parse),
+        "syslog": (_build_syslog_command, syslog.parse),
+        "plaintext": (_build_plaintext_command, plaintext.parse),
+    }
+
+    stats: dict[str, int] = {}
+
+    try:
+        with SSHTransport(host=host, user=user, key_path=key_path, port=port) as t:
+            for item in glean_items:
+                item_type = item.get("type", "plaintext")
+                # Per-item source_id — falls back to host_id/type for un-labelled items
+                item_id = item.get("id") or f"{host_id}/{item_type}"
+
+                # isinstance guard keeps a malformed (unhashable) type on the
+                # "unknown type" path instead of failing the dict lookup.
+                handler = single_command.get(item_type) if isinstance(item_type, str) else None
+
+                if handler is not None:
+                    build_cmd, parser = handler
+                    count = _stream_and_write(
+                        t, build_cmd(item), parser, item_id, compiled, ingest_time, conn, batch_size
+                    )
+                    stats[item_id] = stats.get(item_id, 0) + count
+
+                elif item_type == "docker":
+                    cmds = _build_docker_command(item)
+                    if isinstance(cmds, str):
+                        cmds = [cmds]
+                    # Same null-tolerance as ``glean`` above.
+                    containers: list[str] = item.get("containers") or []
+                    for i, cmd in enumerate(cmds):
+                        # Use the container name as the final path segment when available
+                        container_name = containers[i] if i < len(containers) else str(i)
+                        container_id = f"{item_id}/{container_name}" if len(cmds) > 1 else item_id
+                        count = _stream_and_write(
+                            t, cmd, docker_log.parse, container_id,
+                            compiled, ingest_time, conn, batch_size,
+                        )
+                        stats[container_id] = stats.get(container_id, 0) + count
+
+                else:
+                    logger.warning(
+                        "Unknown SSH glean type %r for source %r — skipping item",
+                        item_type, host_id,
+                    )
+
+    except SSHConnectionError as exc:
+        logger.warning("SSH connection failed for source %r: %s", host_id, exc)
+
+    return stats
+
+
+def glean_ssh_source(
+    src: dict,  # type: ignore[type-arg]
+    db_path: Path,
+    pattern_file: Path | None = None,
+    batch_size: int = 1000,
+) -> dict[str, int]:
+    """Glean a single SSH source dict and write results to *db_path*.
+
+    Public wrapper around :func:`_glean_ssh_source` for the REST layer.
+    Manages the DB connection, pattern compilation, and FTS rebuild so callers
+    don't have to deal with those lifecycle concerns.  *pattern_file* defaults
+    to ``patterns/default.yaml``.
+
+    Returns stats mapping ``{sub_source_id: entry_count}``.  If schema setup
+    or the glean raises, the connection is still closed, the FTS rebuild is
+    skipped and the exception propagates.
+    """
+    effective_pattern_file = pattern_file or Path("patterns/default.yaml")
+    compiled = _compile(load_patterns(effective_pattern_file))
+    ingest_time = now_iso()
+
+    conn = sqlite3.connect(str(db_path))
+    try:
+        # Schema setup sits inside the try so a failure here (e.g. a locked
+        # database) cannot leak the connection.
+        conn.execute("PRAGMA journal_mode=WAL")
+        conn.executescript(_SCHEMA)
+        conn.commit()
+        stats = _glean_ssh_source(src, compiled, ingest_time, conn, batch_size)
+    finally:
+        conn.close()
+
+    logger.info("Rebuilding FTS index after SSH source glean...")
+    build_fts_index(db_path)
+    return stats
+
+
+def glean_dir(
+    corpus_dir: Path,
+    db_path: Path,
+    pattern_file: Path | None = None,
+    batch_size: int = 1000,
+    force: bool = False,
+) -> dict[str, int]:
+    """Glean every ``*.jsonl`` and ``*.log`` file directly inside *corpus_dir*.
+
+    Files are handed over in a fixed order: all ``.jsonl`` files sorted by
+    path, then all ``.log`` files sorted by path.  Sub-directories are not
+    searched.
+
+    Pass ``force=True`` to bypass fingerprint checks and re-glean all files
+    regardless of whether they have changed since the last run.
+    """
+    candidates: list[Path] = []
+    for name_glob in ("*.jsonl", "*.log"):
+        candidates.extend(sorted(corpus_dir.glob(name_glob)))
+    return _glean_files(candidates, db_path, pattern_file, batch_size, force=force)
+
+
+def glean_file(
+    log_file: Path,
+    db_path: Path,
+    pattern_file: Path | None = None,
+    force: bool = False,
+) -> dict[str, int]:
+    """Glean one log file, whatever supported format it is in.
+
+    Thin convenience wrapper: the file is passed to the shared file-glean
+    routine as a one-element list.
+
+    Pass ``force=True`` to re-glean even when the file fingerprint is unchanged.
+    """
+    single_target = [log_file]
+    return _glean_files(single_target, db_path, pattern_file, force=force)
+
+
+def glean_sources(
+    sources_file: Path,
+    db_path: Path,
+    pattern_file: Path | None = None,
+    batch_size: int = 1000,
+    force: bool = False,
+) -> dict[str, int]:
+    """Glean all sources listed in a sources.yaml config file.
+
+    Supports two source types:
+
+    Local file sources (default):
+        sources:
+          - id: sonarr
+            path: /opt/sonarr/config/logs/sonarr.0.txt
+
+    SSH remote sources (transport: ssh):
+        sources:
+          - id: rack01
+            transport: ssh
+            host: 192.168.1.10
+            user: admin
+            key_path: ~/.ssh/id_ed25519
+            glean:
+              - type: journald
+                args: ["--since", "2 hours ago"]
+              - type: syslog
+                path: /var/log/syslog
+              - type: plaintext
+                path: /var/log/app/error.log
+              - type: docker
+                containers: [myapp, nginx]
+
+    Missing local paths and SSH connection failures are logged as warnings
+    so the cron keeps running when a source is temporarily down.  An empty
+    config file, an empty ``sources`` key, and local entries without a
+    ``path`` are tolerated the same way instead of aborting the run.
+
+    Args:
+        sources_file: Path to the sources.yaml file.
+        db_path: SQLite database file to write into.
+        pattern_file: Pattern definitions; ``patterns/default.yaml``
+            (relative to the working directory) when omitted.
+        batch_size: Forwarded to the local-file and SSH glean routines.
+        force: Forwarded to the local-file glean only; it is not passed to
+            the SSH glean.
+
+    Returns:
+        Mapping of source id to entry count.  Counts from SSH sources are
+        added into the same dict as the local-file counts.
+    """
+    with open(sources_file) as f:
+        # An empty YAML document loads as None; treat it as "no sources".
+        config = yaml.safe_load(f) or {}
+
+    local_sources: list[dict] = []  # type: ignore[type-arg]
+    ssh_sources: list[dict] = []    # type: ignore[type-arg]
+
+    # ``or []`` also covers an explicit ``sources:`` key with no value.
+    for src in config.get("sources") or []:
+        if src.get("transport") == "ssh":
+            ssh_sources.append(src)
+        else:
+            local_sources.append(src)
+
+    # ── Local file sources ─────────────────────────────────────────────────
+    files: list[Path] = []
+    source_id_map: dict[Path, str] = {}
+
+    for src in local_sources:
+        raw_path = src.get("path")
+        if not raw_path:
+            # A malformed entry must not take down every other source.
+            logger.warning("Source %r has no 'path', skipping", src.get("id", "?"))
+            continue
+        path = Path(raw_path)
+        if not path.exists():
+            logger.warning("Source %r not found, skipping: %s", src.get("id", "?"), path)
+            continue
+        files.append(path)
+        if "id" in src:
+            source_id_map[path] = src["id"]
+
+    if not files and not ssh_sources:
+        logger.warning("No sources found — check sources.yaml paths")
+        return {}
+
+    stats: dict[str, int] = {}
+    if files:
+        stats.update(_glean_files(files, db_path, pattern_file, batch_size, source_id_map, force=force))
+
+    # ── SSH remote sources ─────────────────────────────────────────────────
+    if not ssh_sources:
+        return stats
+
+    # Compile patterns once, share across all SSH sources in this run.
+    effective_pattern_file = pattern_file or Path("patterns/default.yaml")
+    compiled = _compile(load_patterns(effective_pattern_file))
+    ingest_time = now_iso()
+
+    conn = sqlite3.connect(str(db_path))
+    try:
+        # Schema setup lives inside the try so the connection is closed even
+        # when the PRAGMA or the schema script fails.
+        conn.execute("PRAGMA journal_mode=WAL")
+        conn.executescript(_SCHEMA)
+        conn.commit()
+        for src in ssh_sources:
+            ssh_stats = _glean_ssh_source(src, compiled, ingest_time, conn, batch_size)
+            for k, v in ssh_stats.items():
+                stats[k] = stats.get(k, 0) + v
+    finally:
+        conn.close()
+
+    # ssh_sources is non-empty here (see the early return above), so the
+    # rebuild is unconditional.  _glean_files already rebuilds when local
+    # sources are present; calling it again after both ran is safe.
+    logger.info("Rebuilding FTS index after SSH glean...")
+    build_fts_index(db_path)
+
+    return stats
--- a/app/ingest/plaintext.py
+++ b/app/ingest/plaintext.py
@ -10,7 +10,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator

-from app.ingest.base import (
+from app.glean.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/plex.py
+++ b/app/ingest/plex.py
@ -12,7 +12,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator

-from app.ingest.base import (
+from app.glean.base import (
    SourceState, apply_patterns, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/qbittorrent.py
+++ b/app/ingest/qbittorrent.py
@ -18,7 +18,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator

-from app.ingest.base import (
+from app.glean.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/servarr.py
+++ b/app/ingest/servarr.py
@ -12,7 +12,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator

-from app.ingest.base import (
+from app.glean.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/glean/ssh.py
+++ b/app/glean/ssh.py
@ -0,0 +1,225 @@
+"""SSH transport layer for remote log gleaning (issue #22).
+
+Wraps Paramiko to provide a clean context-manager interface for executing
+remote commands and streaming their stdout output.  All format parsing is
+delegated to the existing per-format parsers (journald, syslog, plaintext,
+docker); this module is transport only.
+
+Key design choices:
+- Key-based auth only — no password prompts in a daemon context.
+- exec_stream is a generator; exit-status check fires after all lines are
+  yielded, so callers must drain the iterator (e.g. list()) to trigger it.
+- Command builders live here because they encode SSH/remote-execution idioms
+  (journalctl flags, docker logs invocation) that the generic parsers don't
+  need to know about.
+
+Example sources.yaml snippet::
+
+    sources:
+      - id: rack01
+        transport: ssh
+        host: 192.168.1.10
+        user: admin
+        key_path: ~/.ssh/id_ed25519
+        glean:
+          - type: journald
+            args: ["--since", "2 hours ago"]
+          - type: syslog
+            path: /var/log/syslog
+          - type: plaintext
+            path: /var/log/app/error.log
+          - type: docker
+            containers: [myapp, nginx]
+"""
+from __future__ import annotations
+
+import shlex
+from collections.abc import Iterator
+from typing import Union
+
+import paramiko
+
+
+# Star-import surface of this module.  The underscore-prefixed command
+# builders are listed explicitly: ``from ... import *`` skips private names
+# only when ``__all__`` is absent, so naming them here exports them.
+__all__ = [
+    "SSHConnectionError",
+    "SSHCommandError",
+    "SSHTransport",
+    "_build_journald_command",
+    "_build_syslog_command",
+    "_build_plaintext_command",
+    "_build_docker_command",
+]
+
+# Default syslog path used when none is specified in the source spec.
+_SYSLOG_DEFAULT_PATH = "/var/log/syslog"
+
+
+# ── Custom exceptions ─────────────────────────────────────────────────────────
+
+class SSHConnectionError(Exception):
+    """Raised when no usable SSH session is available.
+
+    Covers failures to establish or authenticate the connection, and use of
+    ``SSHTransport.exec_stream`` while the transport is not connected.
+    """
+
+
+class SSHCommandError(Exception):
+    """Raised when a remote command exits with a non-zero status code.
+
+    The message carries the exit code and the command's captured stderr.
+    """
+
+
+# ── Transport context manager ─────────────────────────────────────────────────
+
+class SSHTransport:
+    """Context manager wrapping a Paramiko SSH connection.
+
+    Opens the connection on ``__enter__`` and closes it on ``__exit__``,
+    even if an exception propagates.  Key-based authentication only.
+
+    Every failure to open the session — authentication, SSH protocol, or a
+    socket-level error such as a refused connection, timeout or DNS failure —
+    is raised as :class:`SSHConnectionError`, so one ``except`` clause is
+    enough for a caller to skip an unreachable host.
+
+    Args:
+        host: Hostname or IP address to connect to.
+        user: Remote login name.
+        key_path: Private key file, passed to Paramiko as ``key_filename``.
+        port: TCP port of the SSH server (default 22).
+
+    Usage::
+
+        with SSHTransport(host="10.0.0.1", user="admin",
+                          key_path="~/.ssh/id_ed25519") as t:
+            for line in t.exec_stream("journalctl -o json --since '1 hour ago'"):
+                process(line)
+    """
+
+    def __init__(
+        self,
+        host: str,
+        user: str,
+        key_path: str,
+        port: int = 22,
+    ) -> None:
+        self._host = host
+        self._user = user
+        self._key_path = key_path
+        self._port = port
+        # Populated by __enter__, reset to None by __exit__.
+        self._client: paramiko.SSHClient | None = None
+
+    # ── context manager protocol ──────────────────────────────────────────────
+
+    def __enter__(self) -> "SSHTransport":
+        """Connect and authenticate, then return ``self``.
+
+        Raises:
+            SSHConnectionError: if authentication fails or the connection
+                cannot be established for any other reason.
+        """
+        client = paramiko.SSHClient()
+        # NOTE(security): AutoAddPolicy accepts unknown host keys without
+        # verification, so a spoofed host goes undetected.  Left unchanged
+        # here; consider loading known_hosts and rejecting unknown keys.
+        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+        try:
+            client.connect(
+                hostname=self._host,
+                username=self._user,
+                key_filename=self._key_path,
+                port=self._port,
+            )
+        except paramiko.AuthenticationException as exc:
+            client.close()
+            raise SSHConnectionError(
+                f"SSH auth failed for {self._user}@{self._host}: {exc}"
+            ) from exc
+        except (paramiko.SSHException, OSError) as exc:
+            # OSError covers socket-level failures (refused, unreachable,
+            # timed out, DNS lookup) that are not SSHException subclasses.
+            # Without it a host that is simply down would escape the
+            # caller's ``except SSHConnectionError`` and leak the client.
+            client.close()
+            raise SSHConnectionError(
+                f"SSH connection failed to {self._host}: {exc}"
+            ) from exc
+        self._client = client
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb) -> None:  # type: ignore[override]
+        """Close the connection if one is open; never suppress exceptions."""
+        if self._client is not None:
+            self._client.close()
+            self._client = None
+        # Return None (falsy) so any in-flight exception is not suppressed.
+
+    # ── remote execution ──────────────────────────────────────────────────────
+
+    def exec_stream(self, command: str) -> Iterator[str]:
+        """Execute *command* on the remote host and yield stdout lines.
+
+        This is a generator: nothing runs until the first item is requested,
+        so the errors below surface during iteration, not at call time.
+
+        The exit-status check runs after all stdout lines have been yielded,
+        so callers must drain the iterator to trigger it::
+
+            list(transport.exec_stream(cmd))   # raises if exit != 0
+
+        Raises:
+            SSHConnectionError: if called outside a ``with`` block.
+            SSHCommandError: if the remote command exits non-zero.
+        """
+        if self._client is None:
+            raise SSHConnectionError(
+                "Not connected — use SSHTransport as a context manager"
+            )
+        _, stdout, stderr = self._client.exec_command(command)
+        for line in stdout:
+            yield line
+        exit_code = stdout.channel.recv_exit_status()
+        # Guard against MagicMock in tests: only treat real integer exit codes.
+        if isinstance(exit_code, int) and exit_code != 0:
+            # stderr is read only on failure, to build the error message.
+            error_msg = stderr.read().decode(errors="replace").strip()
+            raise SSHCommandError(
+                f"Command failed (exit {exit_code}): {error_msg}"
+            )
+
+
+# ── Command builders ──────────────────────────────────────────────────────────
+
+def _build_journald_command(spec: dict) -> str:  # type: ignore[type-arg]
+    """Build a ``journalctl`` command string from a glean source spec.
+
+    Spec keys:
+
+    - ``unit`` — shorthand for ``--unit <name>`` (inserted before ``args``).
+    - ``args`` — list of extra journalctl arguments, appended in order.
+
+    Every value taken from the spec is shell-quoted, matching the other
+    command builders.  An argument containing whitespace (``"2 hours ago"``)
+    therefore stays a single word, and shell metacharacters in the config
+    are passed through literally instead of being interpreted.  Plain flags
+    and unit names need no quoting and appear unchanged.
+
+    Returns a single shell command string.
+    """
+    parts = ["journalctl", "-o json", "--no-pager"]
+    unit = spec.get("unit")
+    if unit:
+        parts.append(f"--unit {shlex.quote(str(unit))}")
+    # ``or ()`` tolerates ``args:`` with no value; str() tolerates YAML
+    # scalars that are not strings, e.g. the 100 in ``["-n", 100]``.
+    parts.extend(shlex.quote(str(arg)) for arg in spec.get("args") or ())
+    return " ".join(parts)
+
+
+def _build_syslog_command(spec: dict) -> str:  # type: ignore[type-arg]
+    """Return the shell command that dumps a syslog-format file to stdout.
+
+    Spec keys:
+
+    - ``path`` — file to read; ``/var/log/syslog`` is used when absent.
+
+    The path is shell-quoted before being placed in the command, and the
+    result is a single shell command string.
+    """
+    target = spec.get("path", _SYSLOG_DEFAULT_PATH)
+    quoted_target = shlex.quote(target)
+    return "cat " + quoted_target
+
+
+def _build_plaintext_command(spec: dict) -> str:  # type: ignore[type-arg]
+    """Return the shell command that dumps a plaintext log file to stdout.
+
+    Spec keys:
+
+    - ``path`` — **required** path to the log file; it is shell-quoted
+      before being placed in the command.
+
+    Raises:
+        KeyError: if ``path`` is absent from the spec.
+    """
+    # Subscript rather than .get(): a missing path must fail loudly.
+    quoted_path = shlex.quote(spec["path"])
+    return "cat " + quoted_path
+
+
+def _build_docker_command(
+    spec: dict,  # type: ignore[type-arg]
+) -> Union[str, list[str]]:
+    """Build ``docker logs`` command(s) for one or more named containers.
+
+    Spec keys:
+
+    - ``containers`` — **required** list of container names or IDs.
+
+    Each command ends in ``2>&1``.  ``docker logs`` replays what the
+    container wrote to stderr on its own stderr, while the SSH transport in
+    this module yields stdout lines only, so without the redirect those
+    lines would never be gleaned.  Trade-off: docker's own error text (for
+    example an unknown container) then also arrives on stdout.
+
+    Returns a single command string when there is one container, or a list
+    of command strings when there are multiple (one command per container so
+    each can be streamed independently).
+
+    Raises:
+        KeyError: if ``containers`` is absent from the spec.
+        ValueError: if ``containers`` is an empty list.
+    """
+    containers = spec["containers"]  # intentional KeyError if missing
+    if not containers:
+        raise ValueError("'containers' must be a non-empty list")
+    # str() first: YAML parses an all-digit container ID as an int, which
+    # shlex.quote would reject.
+    commands = [f"docker logs {shlex.quote(str(c))} 2>&1" for c in containers]
+    return commands[0] if len(commands) == 1 else commands
--- a/app/ingest/syslog.py
+++ b/app/ingest/syslog.py
@ -14,7 +14,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator

-from app.ingest.base import (
+from app.glean.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/tautulli.py
+++ b/app/ingest/tautulli.py
@ -5,7 +5,7 @@ Tautulli sends all template values as strings, so all fields are treated as str.
 """
 from __future__ import annotations

-from app.ingest.base import (
+from app.glean.base import (
    apply_patterns,
    epoch_float_to_iso,
    make_entry_id,
--- a/app/ingest/wazuh.py
+++ b/app/ingest/wazuh.py
@ -22,7 +22,7 @@ import json
 from datetime import datetime, timezone
 from typing import Iterator

-from app.ingest.base import (
+from app.glean.base import (
    SourceState, apply_patterns, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/pipeline.py
+++ b/app/ingest/pipeline.py
@ -1,328 +0,0 @@
-"""Ingest pipeline: auto-detect format, parse, write to SQLite."""
-from __future__ import annotations
-
-import json
-import logging
-import re
-import sqlite3
-from pathlib import Path
-from typing import Iterator
-
-import yaml
-
-from app.ingest import caddy, dmesg_log, docker_log, journald, plaintext, plex, qbittorrent, servarr, syslog, wazuh
-from app.ingest.base import _compile, load_patterns, now_iso
-from app.services.models import LogPattern, RetrievedEntry
-from app.services.search import build_fts_index
-
-logger = logging.getLogger(__name__)
-
-_SCHEMA = """
-CREATE TABLE IF NOT EXISTS log_entries (
-    id              TEXT PRIMARY KEY,
-    source_id       TEXT NOT NULL,
-    sequence        INTEGER NOT NULL,
-    timestamp_raw   TEXT,
-    timestamp_iso   TEXT,
-    ingest_time     TEXT NOT NULL,
-    severity        TEXT,
-    repeat_count    INTEGER DEFAULT 1,
-    out_of_order    INTEGER DEFAULT 0,
-    matched_patterns TEXT DEFAULT '[]',
-    text            TEXT NOT NULL
-);
-CREATE INDEX IF NOT EXISTS idx_source      ON log_entries(source_id);
-CREATE INDEX IF NOT EXISTS idx_timestamp   ON log_entries(timestamp_iso);
-CREATE INDEX IF NOT EXISTS idx_ts_repeat   ON log_entries(timestamp_iso, repeat_count);
-CREATE INDEX IF NOT EXISTS idx_severity    ON log_entries(severity);
-CREATE INDEX IF NOT EXISTS idx_patterns    ON log_entries(matched_patterns);
-
-CREATE TABLE IF NOT EXISTS incidents (
-    id          TEXT PRIMARY KEY,
-    label       TEXT NOT NULL,
-    issue_type  TEXT NOT NULL DEFAULT '',
-    started_at  TEXT,
-    ended_at    TEXT,
-    notes       TEXT NOT NULL DEFAULT '',
-    created_at  TEXT NOT NULL,
-    severity    TEXT NOT NULL DEFAULT 'medium'
-);
-CREATE INDEX IF NOT EXISTS idx_incidents_time ON incidents(started_at, ended_at);
-
-CREATE TABLE IF NOT EXISTS received_bundles (
-    id          TEXT PRIMARY KEY,
-    source_host TEXT NOT NULL,
-    issue_type  TEXT NOT NULL DEFAULT '',
-    label       TEXT NOT NULL,
-    severity    TEXT NOT NULL DEFAULT 'medium',
-    started_at  TEXT,
-    bundled_at  TEXT NOT NULL,
-    entry_count INTEGER NOT NULL DEFAULT 0,
-    bundle_json TEXT NOT NULL
-);
-CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at);
-CREATE INDEX IF NOT EXISTS idx_bundles_type    ON received_bundles(issue_type);
-
-CREATE TABLE IF NOT EXISTS context_facts (
-    id           TEXT PRIMARY KEY,
-    category     TEXT NOT NULL,
-    key          TEXT NOT NULL,
-    value        TEXT NOT NULL,
-    source       TEXT,
-    created_at   TEXT NOT NULL
-);
-CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category);
-CREATE INDEX IF NOT EXISTS idx_facts_key      ON context_facts(key);
-
-CREATE TABLE IF NOT EXISTS context_documents (
-    id           TEXT PRIMARY KEY,
-    filename     TEXT NOT NULL,
-    doc_type     TEXT NOT NULL,
-    full_text    TEXT NOT NULL,
-    file_size    INTEGER,
-    uploaded_at  TEXT NOT NULL
-);
-
-CREATE TABLE IF NOT EXISTS context_chunks (
-    id           TEXT PRIMARY KEY,
-    document_id  TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
-    chunk_index  INTEGER NOT NULL,
-    text         TEXT NOT NULL,
-    embedding    BLOB
-);
-CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id);
-
-CREATE TABLE IF NOT EXISTS blocklist_candidates (
-    id                 TEXT PRIMARY KEY,
-    domain_or_ip       TEXT NOT NULL,
-    source_device_ip   TEXT,
-    source_device_name TEXT,
-    first_seen         TEXT NOT NULL,
-    last_seen          TEXT NOT NULL,
-    hit_count          INTEGER DEFAULT 1,
-    status             TEXT DEFAULT 'pending',
-    pushed_at          TEXT,
-    log_evidence       TEXT DEFAULT '[]',
-    matched_rule       TEXT,
-    llm_score          REAL,
-    llm_reason         TEXT
-);
-CREATE INDEX IF NOT EXISTS idx_blocklist_device ON blocklist_candidates(source_device_ip);
-CREATE INDEX IF NOT EXISTS idx_blocklist_status ON blocklist_candidates(status);
-CREATE INDEX IF NOT EXISTS idx_blocklist_domain ON blocklist_candidates(domain_or_ip);
-"""
-
-
-def ensure_schema(db_path: Path) -> None:
-    """Create all tables and apply additive migrations. Safe to call on every startup."""
-    conn = sqlite3.connect(str(db_path))
-    conn.execute("PRAGMA journal_mode=WAL")
-    conn.executescript(_SCHEMA)
-    # Additive column migrations — ALTER TABLE silently skips if column exists
-    for stmt in [
-        "ALTER TABLE incidents ADD COLUMN issue_type TEXT NOT NULL DEFAULT ''",
-    ]:
-        try:
-            conn.execute(stmt)
-        except sqlite3.OperationalError:
-            pass
-    conn.commit()
-    conn.close()
-
-
-def _detect_format(first_line: str) -> str:
-    try:
-        obj = json.loads(first_line)
-        if "__REALTIME_TIMESTAMP" in obj:
-            return "journald"
-        if "SOURCE" in obj and str(obj.get("SOURCE", "")).startswith("docker:"):
-            return "docker"
-        if wazuh.is_wazuh_alert(obj):
-            return "wazuh"
-        if "ts" in obj and ("msg" in obj or "message" in obj or "request" in obj):
-            return "caddy"
-    except (json.JSONDecodeError, AttributeError):
-        pass
-    if plex.is_plex_log(first_line):
-        return "plex"
-    if qbittorrent.is_qbit_log(first_line):
-        return "qbittorrent"
-    if servarr.is_servarr_log(first_line):
-        return "servarr"
-    if dmesg_log.is_dmesg_log(first_line):
-        return "dmesg"
-    if syslog.is_syslog(first_line):
-        return "syslog"
-    return "plaintext"
-
-
-def _parse_file(
-    path: Path,
-    compiled: list[tuple[LogPattern, object]],
-    ingest_time: str,
-    source_id: str | None = None,
-) -> Iterator[RetrievedEntry]:
-    source_id = source_id or path.stem
-
-    with path.open("r", errors="replace") as f:
-        lines = iter(f)
-        try:
-            first = next(lines)
-        except StopIteration:
-            return
-
-        fmt = _detect_format(first.strip())
-        logger.info("Detected format %r for %s", fmt, path.name)
-
-        def all_lines():
-            yield first
-            yield from lines
-
-        if fmt == "journald":
-            yield from journald.parse(all_lines(), source_id, compiled, ingest_time)
-        elif fmt == "wazuh":
-            yield from wazuh.parse(all_lines(), source_id, compiled, ingest_time)
-        elif fmt == "docker":
-            yield from docker_log.parse(all_lines(), source_id, compiled, ingest_time)
-        elif fmt == "caddy":
-            yield from caddy.parse(all_lines(), source_id, compiled, ingest_time)
-        elif fmt == "plex":
-            yield from plex.parse(all_lines(), source_id, compiled, ingest_time)
-        elif fmt == "qbittorrent":
-            yield from qbittorrent.parse(all_lines(), source_id, compiled, ingest_time)
-        elif fmt == "servarr":
-            yield from servarr.parse(all_lines(), source_id, compiled, ingest_time)
-        elif fmt == "dmesg":
-            yield from dmesg_log.parse(all_lines(), source_id, compiled, ingest_time)
-        elif fmt == "syslog":
-            yield from syslog.parse(all_lines(), source_id, compiled, ingest_time)
-        else:
-            yield from plaintext.parse(all_lines(), source_id, compiled, ingest_time)
-
-
-def _write_batch(conn: sqlite3.Connection, batch: list[RetrievedEntry]) -> None:
-    conn.executemany(
-        """
-        INSERT OR IGNORE INTO log_entries
-          (id, source_id, sequence, timestamp_raw, timestamp_iso,
-           ingest_time, severity, repeat_count, out_of_order,
-           matched_patterns, text)
-        VALUES (?,?,?,?,?,?,?,?,?,?,?)
-        """,
-        [
-            (
-                e.entry_id, e.source_id, e.sequence,
-                e.timestamp_raw, e.timestamp_iso, e.ingest_time,
-                e.severity, e.repeat_count, int(e.out_of_order),
-                json.dumps(list(e.matched_patterns)), e.text,
-            )
-            for e in batch
-        ],
-    )
-
-
-def _ingest_files(
-    files: list[Path],
-    db_path: Path,
-    pattern_file: Path | None = None,
-    batch_size: int = 1000,
-    source_id_map: dict[Path, str] | None = None,
-) -> dict[str, int]:
-    pattern_file = pattern_file or Path("patterns/default.yaml")
-    patterns = load_patterns(pattern_file)
-    compiled = _compile(patterns)
-    ingest_time = now_iso()
-    source_id_map = source_id_map or {}
-
-    conn = sqlite3.connect(str(db_path))
-    conn.execute("PRAGMA journal_mode=WAL")
-    conn.executescript(_SCHEMA)
-    conn.commit()
-
-    stats: dict[str, int] = {}
-
-    for log_file in files:
-        source_id = source_id_map.get(log_file, log_file.stem)
-        count = 0
-        batch: list[RetrievedEntry] = []
-        for entry in _parse_file(log_file, compiled, ingest_time, source_id=source_id):
-            batch.append(entry)
-            if len(batch) >= batch_size:
-                _write_batch(conn, batch)
-                conn.commit()
-                count += len(batch)
-                batch.clear()
-        if batch:
-            _write_batch(conn, batch)
-            conn.commit()
-            count += len(batch)
-        stats[source_id] = stats.get(source_id, 0) + count
-        logger.info("Ingested %d entries from %s (source: %s)", count, log_file.name, source_id)
-
-    conn.close()
-
-    logger.info("Building FTS index...")
-    build_fts_index(db_path)
-    logger.info("FTS index ready")
-
-    return stats
-
-
-def ingest(
-    corpus_dir: Path,
-    db_path: Path,
-    pattern_file: Path | None = None,
-    batch_size: int = 1000,
-) -> dict[str, int]:
-    """Ingest all .jsonl and .log files from a corpus directory."""
-    files = sorted(corpus_dir.glob("*.jsonl")) + sorted(corpus_dir.glob("*.log"))
-    return _ingest_files(files, db_path, pattern_file, batch_size)
-
-
-def ingest_file(
-    log_file: Path,
-    db_path: Path,
-    pattern_file: Path | None = None,
-) -> dict[str, int]:
-    """Ingest a single log file (any supported format)."""
-    return _ingest_files([log_file], db_path, pattern_file)
-
-
-def ingest_sources(
-    sources_file: Path,
-    db_path: Path,
-    pattern_file: Path | None = None,
-    batch_size: int = 1000,
-) -> dict[str, int]:
-    """Ingest all sources listed in a sources.yaml config file.
-
-    sources.yaml format:
-        sources:
-          - id: sonarr
-            path: /opt/sonarr/config/logs/sonarr.0.txt
-          - id: qbittorrent
-            path: /opt/qbittorrent/config/data/logs/qbittorrent.log
-
-    Missing paths are skipped with a warning so the cron keeps running
-    when a service is temporarily down.
-    """
-    with open(sources_file) as f:
-        config = yaml.safe_load(f)
-
-    files: list[Path] = []
-    source_id_map: dict[Path, str] = {}
-
-    for src in config.get("sources", []):
-        path = Path(src["path"])
-        if not path.exists():
-            logger.warning("Source %r not found, skipping: %s", src.get("id", "?"), path)
-            continue
-        files.append(path)
-        if "id" in src:
-            source_id_map[path] = src["id"]
-
-    if not files:
-        logger.warning("No source files found — check sources.yaml paths")
-        return {}
-
-    return _ingest_files(files, db_path, pattern_file, batch_size, source_id_map)
--- a/app/mcp_server.py
+++ b/app/mcp_server.py
@ -94,7 +94,7 @@ def search_logs(
        severity: Filter by level — EMERGENCY, ALERT, CRITICAL, ERROR, WARN, NOTICE, INFO, DEBUG.
        source: Partial match on source_id. Format is 'corpus:host:service'.
                Example: 'example-node:caddy' matches all Caddy entries from example-node.
-        pattern: Filter by named pattern tag applied at ingest time.
+        pattern: Filter by named pattern tag applied at glean time.
                 Known tags: auth_failure, connection_lost, oom, segfault, disk_full,
                 timeout, caddy_tls_error, caddy_config_error, caddy_auth_error,
                 caddy_upstream_error, service_restart, service_update,
@ -176,7 +176,7 @@ def list_log_sources() -> str:
    """
    sources = list_sources(DB_PATH)
    if not sources:
-        return "No log sources found. Has the corpus been ingested? Run: python scripts/ingest_corpus.py"
+        return "No log sources found. Has the corpus been gleaned? Run: python scripts/glean_corpus.py"

    lines = [f"Corpus: {DB_PATH}", f"Sources ({len(sources)} total):\n"]
    for s in sources:
@ -192,7 +192,7 @@ def list_log_sources() -> str:
 if __name__ == "__main__":
    if not DB_PATH.exists():
        logger.error("Database not found: %s", DB_PATH)
-        logger.error("Run: python scripts/ingest_corpus.py <corpus_dir> <db_path>")
+        logger.error("Run: python scripts/glean_corpus.py <corpus_dir> <db_path>")
        sys.exit(1)
    logger.info("Starting Turnstone MCP server (DB: %s)", DB_PATH)
    mcp.run()
--- a/app/rest.py
+++ b/app/rest.py
@ -27,10 +27,10 @@ from fastapi.responses import FileResponse, RedirectResponse, StreamingResponse
 from fastapi.staticfiles import StaticFiles
 from pydantic import BaseModel

-from app.ingest.pipeline import ensure_schema, ingest_file as _ingest_file
-from app.ingest.base import load_compiled_patterns, now_iso
-from app.ingest.tautulli import parse_webhook as _parse_tautulli
-from app.ingest.wazuh import is_wazuh_alert as _is_wazuh_alert, parse as _parse_wazuh
+from app.glean.pipeline import ensure_schema, glean_file as _glean_file, glean_ssh_source as _glean_ssh_source
+from app.glean.base import load_compiled_patterns, now_iso
+from app.glean.tautulli import parse_webhook as _parse_tautulli
+from app.glean.wazuh import is_wazuh_alert as _is_wazuh_alert, parse as _parse_wazuh
 from app.services.blocklist import (
    BlocklistCandidate,
    get_candidate,
@ -71,11 +71,11 @@ from app.context.store import (
    delete_document as _delete_document,
 )
 from app.context.retriever import retrieve_context as _retrieve_context, format_context_block
-from app.ingest.doc_upload import ingest_upload as _ingest_upload
+from app.glean.doc_upload import glean_upload as _glean_upload
 from app.context.wizard import get_schema as _wizard_schema, advance_step, is_complete, apply_session
 from app.context.chunker import UnsupportedDocType, FileTooLarge
-from app.tasks.ingest_scheduler import get_state as _ingest_state, run_once as _run_ingest, scheduler_loop as _scheduler_loop, submit_matched as _submit_matched
-from app.ingest.mqtt_subscriber import run_mqtt_subscribers as _run_mqtt_subscribers
+from app.tasks.glean_scheduler import get_state as _glean_state, run_once as _run_glean, scheduler_loop as _scheduler_loop, submit_matched as _submit_matched
+from app.glean.mqtt_subscriber import run_mqtt_subscribers as _run_mqtt_subscribers

 DB_PATH = Path(os.environ.get("TURNSTONE_DB", Path(__file__).parent.parent / "data" / "turnstone.db"))
 PREFS_PATH = DB_PATH.parent / "preferences.json"
@ -84,7 +84,7 @@ SOURCE_HOST = os.environ.get("TURNSTONE_SOURCE_HOST", "unknown")
 BUNDLE_ENDPOINT = os.environ.get("TURNSTONE_BUNDLE_ENDPOINT", "")
 PATTERN_DIR = Path(os.environ.get("TURNSTONE_PATTERNS", Path(__file__).parent.parent / "patterns"))
 PATTERN_FILE = PATTERN_DIR / "default.yaml"
-INGEST_INTERVAL = int(os.environ.get("TURNSTONE_INGEST_INTERVAL", "900"))
+GLEAN_INTERVAL = int(os.environ.get("TURNSTONE_GLEAN_INTERVAL", "900"))
 SUBMIT_ENDPOINT = os.environ.get("TURNSTONE_SUBMIT_ENDPOINT", "").rstrip("/")

 # GPU inference server URL.
@ -119,14 +119,14 @@ async def _lifespan(app: FastAPI):

    sources_file = PATTERN_DIR / "sources.yaml"
    _scheduler_task: asyncio.Task | None = None
-    if INGEST_INTERVAL > 0 and sources_file.exists():
+    if GLEAN_INTERVAL > 0 and sources_file.exists():
        _scheduler_task = asyncio.create_task(
            _scheduler_loop(
-                sources_file, DB_PATH, PATTERN_FILE, INGEST_INTERVAL,
+                sources_file, DB_PATH, PATTERN_FILE, GLEAN_INTERVAL,
                submit_endpoint=SUBMIT_ENDPOINT or None,
                source_host=SOURCE_HOST,
            ),
-            name="ingest-scheduler",
+            name="glean-scheduler",
        )

    _mqtt_task: asyncio.Task | None = None
@ -433,6 +433,72 @@ def list_sources() -> dict:
    return {"sources": _list_sources(DB_PATH)}


+@router.get("/api/sources/configured")
+def list_configured_sources() -> dict:
+    """Return every source in sources.yaml, enriched with DB stats.
+
+    Unlike ``/api/sources`` (which is DB-only), this endpoint reads sources.yaml
+    so SSH sources appear even before their first successful glean.  DB entry
+    counts, error counts, and timestamps are aggregated and merged in.
+
+    For SSH sources, sub-source IDs (e.g. ``rack01/journald``) are summed to
+    produce a single aggregate stat row for the top-level host entry.
+
+    Returns ``{"sources": [...]}``.  The list is empty when sources.yaml is
+    missing, empty, or does not contain a ``sources`` list; entries that are
+    not mappings are skipped.
+    """
+    sources_file = PATTERN_DIR / "sources.yaml"
+    if not sources_file.exists():
+        return {"sources": []}
+
+    with open(sources_file, encoding="utf-8") as f:
+        config = yaml.safe_load(f) or {}
+
+    # A bare ``sources:`` key parses to None and a malformed file may parse to a
+    # non-mapping; treat both as "nothing configured" instead of raising.
+    configured = config.get("sources") if isinstance(config, dict) else None
+    if not isinstance(configured, list):
+        configured = []
+
+    # Fetch all DB source stats once; key by source_id for O(1) lookup.
+    db_stats: dict[str, dict] = {}
+    try:
+        for row in _list_sources(DB_PATH):
+            db_stats[row["source_id"]] = row
+    except Exception:  # noqa: BLE001 - best-effort: DB may not exist on first run
+        import logging
+
+        # Keep the endpoint usable without a DB, but leave a trace for debugging.
+        logging.getLogger(__name__).debug(
+            "Could not read source stats from %s; reporting zero counts",
+            DB_PATH,
+            exc_info=True,
+        )
+
+    result = []
+    for src in configured:
+        if not isinstance(src, dict):
+            continue  # malformed entry: skip it rather than fail the whole listing
+        transport = src.get("transport", "local")
+        src_id = src.get("id", "")
+
+        entry: dict = {"id": src_id, "transport": transport}
+
+        if transport != "ssh":
+            entry["path"] = src.get("path", "")
+            db = db_stats.get(src_id, {})
+            entry["entry_count"] = db.get("entry_count", 0)
+            entry["error_count"] = db.get("error_count", 0)
+            entry["earliest"] = db.get("earliest")
+            entry["latest"] = db.get("latest")
+        else:
+            entry["host"] = src.get("host", "")
+            entry["user"] = src.get("user", "")
+            # ``glean:`` with no value parses to None; normalise to an empty list.
+            glean_items: list[dict] = src.get("glean") or []
+            entry["glean_types"] = sorted({item.get("type", "plaintext") for item in glean_items})
+            entry["glean_items"] = glean_items
+
+            # Aggregate sub-source DB rows that belong to this SSH host.
+            # Sub-sources use IDs like "{host_id}/{type}" or "{host_id}/{type}/{container}".
+            prefix = f"{src_id}/"
+            matching_rows = [
+                v for k, v in db_stats.items()
+                if k.startswith(prefix) or k == src_id
+            ]
+            entry["entry_count"] = sum(r.get("entry_count", 0) for r in matching_rows)
+            entry["error_count"] = sum(r.get("error_count", 0) for r in matching_rows)
+            earliests = [r["earliest"] for r in matching_rows if r.get("earliest")]
+            latests = [r["latest"] for r in matching_rows if r.get("latest")]
+            entry["earliest"] = min(earliests) if earliests else None
+            entry["latest"] = max(latests) if latests else None
+
+        result.append(entry)
+
+    return {"sources": result}
+
+
@router.delete("/api/sources/{source_id}")
 def delete_source(source_id: str) -> dict:
    """Delete all log entries (and FTS index rows) for a given source."""
@ -448,9 +514,22 @@ def delete_source(source_id: str) -> dict:
    return {"deleted": deleted, "source_id": source_id}


-@router.post("/api/sources/{source_id}/ingest")
-def reingest_source(source_id: str, background_tasks: BackgroundTasks) -> dict:
-    """Trigger a re-ingest for a configured source from sources.yaml."""
+@router.post("/api/sources/{source_id}/glean")
+def reglean_source(
+    source_id: str,
+    background_tasks: BackgroundTasks,
+    force: Annotated[bool, Query(description="Bypass fingerprint check and re-glean even if file is unchanged")] = False,
+) -> dict:
+    """Trigger a re-glean for a configured source from sources.yaml.
+
+    Handles both local file sources and SSH remote sources.  For SSH sources,
+    the glean runs in the foreground and rebuilds the FTS index before returning
+    (same behaviour as local sources — callers can rely on the count being final
+    when the response arrives).
+
+    Use ``?force=true`` to bypass the fingerprint cache and re-glean the file
+    even if mtime and size appear unchanged since the last run.
+    """
    sources_file = PATTERN_DIR / "sources.yaml"
    if not sources_file.exists():
        raise HTTPException(status_code=404, detail="sources.yaml not found")
@ -459,21 +538,31 @@ def reingest_source(source_id: str, background_tasks: BackgroundTasks) -> dict:
    matching = [s for s in config.get("sources", []) if s.get("id") == source_id]
    if not matching:
        raise HTTPException(status_code=404, detail=f"Source {source_id!r} not in sources.yaml")
-    src_path = Path(matching[0]["path"])
+
+    src = matching[0]
+
+    if src.get("transport") == "ssh":
+        # SSH sources: open connection, glean all items, rebuild FTS inline.
+        # Fingerprint skipping applies only to local file sources.
+        stats = _glean_ssh_source(src, DB_PATH, PATTERN_FILE)
+        return {"source_id": source_id, "gleaned": sum(stats.values())}
+
+    # Local file source.
+    src_path = Path(src["path"])
    if not src_path.exists():
        raise HTTPException(status_code=422, detail=f"Path does not exist: {src_path}")
-    stats = _ingest_file(src_path, DB_PATH, PATTERN_FILE)
+    stats = _glean_file(src_path, DB_PATH, PATTERN_FILE, force=force)
    background_tasks.add_task(build_fts_index, DB_PATH)
-    return {"source_id": source_id, "ingested": stats.get(source_id, sum(stats.values()))}
+    return {"source_id": source_id, "gleaned": stats.get(source_id, sum(stats.values()))}


-@router.post("/api/ingest/upload")
-async def ingest_upload(
+@router.post("/api/glean/upload")
+async def glean_upload(
    file: UploadFile,
    source_id: Annotated[str | None, Query(description="Override source ID (defaults to filename)")] = None,
    background_tasks: BackgroundTasks = None,
 ) -> dict:
-    """Accept a multipart log file, auto-detect format, ingest into DB."""
+    """Accept a multipart log file, auto-detect format, glean into DB."""
    sid = source_id or Path(file.filename or "upload").stem
    content = await file.read()
    with tempfile.NamedTemporaryFile(
@ -483,13 +572,13 @@ async def ingest_upload(
        tmp.write(content)
        tmp_path = Path(tmp.name)
    try:
-        stats = _ingest_file(tmp_path, DB_PATH, PATTERN_FILE)
+        stats = _glean_file(tmp_path, DB_PATH, PATTERN_FILE)
    finally:
        tmp_path.unlink(missing_ok=True)
    if background_tasks is not None:
        background_tasks.add_task(build_fts_index, DB_PATH)
    total = sum(stats.values())
-    return {"source_id": sid, "ingested": total, "stats": stats}
+    return {"source_id": sid, "gleaned": total, "stats": stats}


 class BatchEntry(BaseModel):
@ -506,20 +595,20 @@ class BatchEntry(BaseModel):
    text: str


-class BatchIngestRequest(BaseModel):
+class BatchGleanRequest(BaseModel):
    source_host: str = "unknown"
    entries: list[BatchEntry]


-@router.post("/api/ingest/batch")
-def ingest_batch(payload: BatchIngestRequest, background_tasks: BackgroundTasks) -> dict:
+@router.post("/api/glean/batch")
+def glean_batch(payload: BatchGleanRequest, background_tasks: BackgroundTasks) -> dict:
    """Accept pre-parsed log entries from a remote Turnstone instance (submission protocol).

    Used by nodes with TURNSTONE_SUBMIT_ENDPOINT configured to push their
    pattern-matched entries to a central receiving instance.
    """
    if not payload.entries:
-        return {"ingested": 0}
+        return {"gleaned": 0}
    conn = sqlite3.connect(str(DB_PATH))
    conn.execute("PRAGMA journal_mode=WAL")
    conn.executemany(
@ -550,13 +639,13 @@ def ingest_batch(payload: BatchIngestRequest, background_tasks: BackgroundTasks)
    conn.commit()
    conn.close()
    background_tasks.add_task(build_fts_index, DB_PATH)
-    return {"ingested": len(payload.entries), "source_host": payload.source_host}
+    return {"gleaned": len(payload.entries), "source_host": payload.source_host}


-@router.get("/api/tasks/ingest/status")
-def ingest_task_status() -> dict:
-    """Return the current state of the periodic batch ingest scheduler."""
-    s = _ingest_state()
+@router.get("/api/tasks/glean/status")
+def glean_task_status() -> dict:
+    """Return the current state of the periodic glean scheduler."""
+    s = _glean_state()
    return {
        "running": s.running,
        "run_count": s.run_count,
@ -565,8 +654,8 @@ def ingest_task_status() -> dict:
        "last_stats": s.last_stats,
        "last_error": s.last_error,
        "next_run_at": s.next_run_at,
-        "interval_s": INGEST_INTERVAL,
-        "scheduler_active": INGEST_INTERVAL > 0 and (PATTERN_DIR / "sources.yaml").exists(),
+        "interval_s": GLEAN_INTERVAL,
+        "scheduler_active": GLEAN_INTERVAL > 0 and (PATTERN_DIR / "sources.yaml").exists(),
        "submit_endpoint": SUBMIT_ENDPOINT or None,
        "last_submitted_at": s.last_submitted_at,
        "last_submit_count": s.last_submit_count,
@ -574,21 +663,28 @@ def ingest_task_status() -> dict:
    }


-@router.post("/api/tasks/ingest")
-async def trigger_ingest() -> dict:
-    """Manually trigger a batch ingest of all configured sources. No-ops if already running."""
+@router.post("/api/tasks/glean")
+async def trigger_glean(
+    force: Annotated[bool, Query(description="Bypass fingerprint check and re-glean all sources")] = False,
+) -> dict:
+    """Manually trigger a glean of all configured sources. No-ops if already running.
+
+    Use ``?force=true`` to bypass the fingerprint cache and re-glean every local
+    file source even when mtime and size are unchanged since the last run.
+    """
    sources_file = PATTERN_DIR / "sources.yaml"
    if not sources_file.exists():
        raise HTTPException(status_code=404, detail="sources.yaml not found — configure log sources first")
-    return await _run_ingest(
+    return await _run_glean(
        sources_file, DB_PATH, PATTERN_FILE,
        submit_endpoint=SUBMIT_ENDPOINT or None,
        source_host=SOURCE_HOST,
+        force=force,
    )


-@router.post("/api/ingest/wazuh/alert")
-async def ingest_wazuh_alert(
+@router.post("/api/glean/wazuh/alert")
+async def glean_wazuh_alert(
    alert: dict,
    source_id: Annotated[str | None, Query(description="Source label (defaults to 'wazuh')")] = None,
    background_tasks: BackgroundTasks = None,
@ -769,8 +865,8 @@ def _tautulli_write_entry(conn: sqlite3.Connection, entry) -> None:
    )


-@router.post("/api/ingest/tautulli")
-def ingest_tautulli(
+@router.post("/api/glean/tautulli")
+def glean_tautulli(
    payload: dict,
    request: Request,
    background_tasks: BackgroundTasks,
--- a/app/services/diagnose/init.py
+++ b/app/services/diagnose/init.py
@ -0,0 +1,357 @@
+"""Frictionless diagnose service — NL time extraction + layered log search.
+
+This module is the public interface for the diagnose package.
+Full implementation lives here so that patch("app.services.diagnose._HAS_DATEPARSER")
+and patch("app.services.diagnose._search_dates") continue to target the correct
+namespace, preserving backward compatibility with existing tests.
+
+The verbatim original is preserved in legacy.py for reference.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import dataclasses
+import logging
+import os
+import re
+from collections.abc import AsyncGenerator
+from datetime import datetime, timedelta, timezone
+from pathlib import Path
+from typing import Any
+
+from app.context.retriever import retrieve_context, format_context_block
+from app.services.llm import summarize
+from app.services.search import SearchResult, entries_in_window, search
+from app.services.diagnose.pipeline import run_pipeline
+
+logger = logging.getLogger(__name__)
+
+try:
+    from dateparser.search import search_dates as _search_dates  # type: ignore[import]
+
+    _HAS_DATEPARSER = True
+except ImportError:
+    _search_dates = None  # type: ignore[assignment]
+    _HAS_DATEPARSER = False
+
+
+_RELATIVE_RE = re.compile(
+    r"\b(?:last|past)\s+(?:(?P<n>\d+)|(?P<approx>a\s+few|few|couple(?:\s+of)?|several))?\s*(?P<unit>minute|hour|day|week)s?\b",
+    re.IGNORECASE,
+)
+_RELATIVE_UNITS = {"minute": 1, "hour": 60, "day": 1440, "week": 10080}
+# Fuzzy quantifiers map to a reasonable span so "last few hours" → 3h window
+_APPROX_N = 3
+
+
+def _relative_window(match: re.Match) -> tuple[str, str]:
+    """Translate a ``_RELATIVE_RE`` match into a ``(since_iso, until_iso)`` pair.
+
+    The count is the explicit number when one was captured, ``_APPROX_N`` for a
+    fuzzy quantifier ("few", "couple of", ...), and 1 for a bare phrase such as
+    "last hour".  The window always ends at the current time.
+    """
+    explicit = match.group("n")
+    if explicit:
+        count = int(explicit)
+    elif match.group("approx"):
+        count = _APPROX_N
+    else:
+        count = 1
+    span_minutes = count * _RELATIVE_UNITS[match.group("unit").lower()]
+    since_iso = _last_n_minutes(span_minutes)
+    until_iso = _now_iso()
+    return since_iso, until_iso
+
+
+def parse_time_window(query: str) -> tuple[str | None, str | None, str]:
+    """Extract a time window from a natural-language query string.
+
+    Returns (since_iso, until_iso, keywords) where keywords is the query with
+    the matched time phrase stripped. Falls back to last-60-min window.
+
+    Resolution order:
+      1. A relative phrase matched by ``_RELATIVE_RE`` ("last 3 hours") yields a
+         window that ends now.
+      2. Otherwise, if dateparser is importable, the first date it finds becomes
+         a window of 30 minutes either side of that instant, expressed in UTC.
+      3. Otherwise the window is the last 60 minutes and the query is returned
+         unchanged as the keywords.
+
+    If stripping the time phrase leaves nothing, the original query is returned
+    as the keywords.
+    """
+    # Handle relative expressions first ("last hour", "past 30 minutes", etc.)
+    # dateparser misinterprets these as absolute times.
+    m = _RELATIVE_RE.search(query)
+    if m:
+        since, until = _relative_window(m)
+        # Cut the matched phrase out and collapse the whitespace it leaves behind.
+        keywords = re.sub(r"\s{2,}", " ", query[: m.start()] + query[m.end() :]).strip()
+        return since, until, keywords or query
+
+    if _HAS_DATEPARSER and _search_dates is not None:
+        # Tell dateparser what timezone the user is in so "3:35 am" means local time.
+        # PREFER_DAY_OF_MONTH is unused here but PREFER_DATES_FROM=past ensures
+        # "3:35 am" resolves to the most recent past occurrence, not a future one.
+        local_offset = datetime.now().astimezone().utcoffset()
+        # NOTE(review): int() truncates toward zero, so a fractional offset such
+        # as +05:30 is passed to dateparser as "UTC+5" — confirm this is acceptable.
+        offset_h = int((local_offset.total_seconds() if local_offset else 0) / 3600)
+        tz_str = f"UTC{'+' if offset_h >= 0 else ''}{offset_h}"
+        try:
+            results = _search_dates(
+                query,
+                languages=["en"],
+                settings={
+                    "PREFER_DATES_FROM": "past",
+                    "TIMEZONE": tz_str,
+                    "RETURN_AS_TIMEZONE_AWARE": True,
+                },
+            )
+        except Exception as e:
+            # Any dateparser failure is non-fatal: log it and use the default window.
+            logger.warning(
+                "dateparser failed (%s) on query %r — falling back to 60-min window",
+                type(e).__name__,
+                query,
+            )
+            results = None
+        if results:
+            # Only the first detected date phrase is used.
+            phrase, dt = results[0]
+            if dt.tzinfo is None:
+                dt = dt.replace(tzinfo=timezone.utc)
+            else:
+                dt = dt.astimezone(
+                    timezone.utc
+                )  # normalise to UTC for SQLite string compare
+            since = (dt - timedelta(minutes=30)).isoformat()
+            until = (dt + timedelta(minutes=30)).isoformat()
+            keywords = re.sub(r"\s{2,}", " ", query.replace(phrase, " ").strip())
+            return since, until, keywords or query
+
+    # No time phrase recognised (or dateparser unavailable/failed): default window.
+    return _last_n_minutes(60), _now_iso(), query
+
+
+def diagnose(
+    db_path: Path,
+    query: str,
+    since: str | None = None,
+    until: str | None = None,
+    source_filter: str | None = None,
+    llm_url: str | None = None,
+    llm_model: str | None = None,
+    llm_api_key: str | None = None,
+) -> dict[str, Any]:
+    """Search logs for *query* and return a summary, optional reasoning, and entries.
+
+    When both *since* and *until* are supplied they are used verbatim and the
+    whole query is treated as keywords.  Otherwise the window is parsed from the
+    query text, with any explicitly supplied bound taking precedence.
+
+    Keyword hits and a per-source sample of the window are merged, de-duplicated
+    by ``entry_id``, sorted by timestamp then sequence, and capped at 200 entries.
+    The LLM summary is only requested when both *llm_url* and *llm_model* are set.
+    """
+    if since is not None and until is not None:
+        time_detected = True
+        keywords = query
+    else:
+        parsed_since, parsed_until, keywords = parse_time_window(query)
+        since = since or parsed_since
+        until = until or parsed_until
+        # A changed keyword string means a time phrase was found and stripped.
+        time_detected = keywords != query
+
+    keyword_hits = search(
+        db_path,
+        query=keywords,
+        since=since,
+        until=until,
+        source_filter=source_filter,
+        limit=150,
+        or_mode=True,
+    )
+    window_hits = entries_in_window(
+        db_path,
+        since=since,
+        until=until,
+        source_filter=source_filter,
+        limit=50,
+        per_source_cap=15,
+    )
+
+    # First occurrence wins; dict insertion order keeps keyword hits ahead of
+    # window hits, exactly as a seen-set plus list would.
+    unique: dict[str, SearchResult] = {}
+    for hit in keyword_hits + window_hits:
+        unique.setdefault(hit.entry_id, hit)
+
+    # Entries without a timestamp sort last via the "\xff" placeholder.
+    ordered = sorted(
+        unique.values(), key=lambda hit: (hit.timestamp_iso or "\xff", hit.sequence)
+    )
+    combined = ordered[:200]
+
+    by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
+    by_source: dict[str, int] = {}
+    for hit in combined:
+        level = (hit.severity or "INFO").upper()
+        if level in by_severity:
+            by_severity[level] += 1
+        by_source[hit.source_id] = 1 + by_source.get(hit.source_id, 0)
+
+    reasoning: str | None = (
+        summarize(
+            query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key
+        )
+        if llm_url and llm_model
+        else None
+    )
+
+    summary = {
+        "total": len(combined),
+        "window_start": since,
+        "window_end": until,
+        "time_detected": time_detected,
+        "by_severity": by_severity,
+        "by_source": by_source,
+    }
+    return {"summary": summary, "reasoning": reasoning, "entries": combined}
+
+
+async def diagnose_stream(
+    db_path: Path,
+    query: str,
+    since: str | None = None,
+    until: str | None = None,
+    source_filter: str | None = None,
+    llm_url: str | None = None,
+    llm_model: str | None = None,
+    llm_api_key: str | None = None,
+) -> AsyncGenerator[dict[str, Any], None]:
+    """Async generator yielding SSE event dicts for the diagnose pipeline.
+
+    Yields events in order:
+      {"type":"status","message":"…"}  — pipeline progress
+      {"type":"context","facts":…,"chunks":…} — retrieved environment context
+      {"type":"summary","data":{…}}    — window + severity counts (fast, from DB)
+      {"type":"entries","data":[…]}    — log entries (fast, from DB)
+      {"type":"reasoning","text":"…"}  — LLM analysis (slow, optional)
+      {"type":"done"}
+
+    An empty query combined with a ``source_filter`` is treated as "browse this
+    source": keyword search is skipped and the window defaults to the last 24h.
+
+    When ``MULTI_AGENT_ENABLED`` is true, everything after the ``entries`` event
+    is delegated to ``run_pipeline`` and this generator returns without yielding
+    its own ``done`` event.
+    """
+    keywords = query.strip()
+    source_browse = not keywords and source_filter is not None
+
+    if source_browse:
+        # No keyword — browsing a source directly. Use 24h window; skip FTS entirely.
+        yield {"type": "status", "message": f"Loading {source_filter}…"}
+        since = since or _last_n_minutes(60 * 24)
+        until = until or _now_iso()
+        time_detected = False
+    else:
+        yield {"type": "status", "message": "Parsing time window…"}
+        time_detected = since is not None and until is not None
+        if not time_detected:
+            # Parsing runs in a worker thread so it does not block the event loop.
+            parsed_since, parsed_until, keywords = await asyncio.to_thread(
+                parse_time_window, query
+            )
+            # Explicitly supplied bounds win over parsed ones.
+            since = since or parsed_since
+            until = until or parsed_until
+            time_detected = keywords != query
+
+    yield {"type": "status", "message": "Loading environment context…"}
+    ctx = await asyncio.to_thread(lambda: retrieve_context(db_path, query))
+    context_block = format_context_block(ctx)
+    yield {
+        "type": "context",
+        "facts": ctx.facts,
+        "chunks": ctx.chunks,
+    }
+
+    yield {"type": "status", "message": "Searching logs…"}
+
+    if source_browse:
+        keyword_hits: list[SearchResult] = []
+        window_hits = await asyncio.to_thread(
+            lambda: entries_in_window(
+                db_path,
+                since,
+                until,
+                source_filter=source_filter,
+                limit=200,
+            )
+        )
+    else:
+        # Keyword search and window sampling are independent; run them concurrently.
+        keyword_hits, window_hits = await asyncio.gather(
+            asyncio.to_thread(
+                lambda: search(
+                    db_path,
+                    keywords,
+                    source_filter=source_filter,
+                    since=since,
+                    until=until,
+                    limit=150,
+                    or_mode=True,
+                )
+            ),
+            asyncio.to_thread(
+                lambda: entries_in_window(
+                    db_path,
+                    since,
+                    until,
+                    source_filter=source_filter,
+                    limit=50,
+                    per_source_cap=15,
+                )
+            ),
+        )
+
+    # De-duplicate by entry_id, keeping the first occurrence (keyword hits first).
+    seen: set[str] = set()
+    merged: list[SearchResult] = []
+    for r in keyword_hits + window_hits:
+        if r.entry_id not in seen:
+            seen.add(r.entry_id)
+            merged.append(r)
+
+    # Sort by timestamp then sequence; entries without a timestamp sort last
+    # via the "\xff" placeholder. Cap the result at 200 entries.
+    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
+        :200
+    ]
+
+    # Severities outside the four tracked buckets are not counted in by_severity.
+    by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
+    by_source: dict[str, int] = {}
+    for r in combined:
+        sev = (r.severity or "INFO").upper()
+        if sev in by_severity:
+            by_severity[sev] += 1
+        by_source[r.source_id] = by_source.get(r.source_id, 0) + 1
+
+    yield {
+        "type": "summary",
+        "data": {
+            "total": len(combined),
+            "window_start": since,
+            "window_end": until,
+            "time_detected": time_detected,
+            "by_severity": by_severity,
+            "by_source": by_source,
+        },
+    }
+    yield {"type": "entries", "data": [dataclasses.asdict(r) for r in combined]}
+
+    # MULTI_AGENT_ENABLED is assigned at the bottom of this module; it is looked
+    # up when the generator runs, so the late definition is fine.
+    if MULTI_AGENT_ENABLED:
+        async for event in run_pipeline(
+            db_path=db_path,
+            entries=combined,
+            ctx=ctx,
+            query=query,
+            since=since,
+            until=until,
+            llm_url=llm_url,
+            llm_model=llm_model,
+            llm_api_key=llm_api_key,
+        ):
+            yield event
+        return  # pipeline emits its own "done" event
+
+    # Single-shot LLM summary: only when an LLM is configured and there is
+    # at least one entry to analyse.
+    if llm_url and llm_model and combined:
+        yield {"type": "status", "message": "Analyzing with LLM…"}
+        reasoning = await asyncio.to_thread(
+            lambda: summarize(
+                query,
+                combined,
+                llm_url,
+                llm_model,
+                llm_api_key,
+                context_block=context_block,
+            )
+        )
+        if reasoning:
+            yield {"type": "reasoning", "text": reasoning}
+
+    yield {"type": "done"}
+
+
+def _now_iso() -> str:
+    """Return the current UTC time as a timezone-aware ISO-8601 string."""
+    current = datetime.now(timezone.utc)
+    return current.isoformat()
+
+
+def _last_n_minutes(n: int) -> str:
+    """Return the ISO-8601 UTC timestamp for the instant *n* minutes before now."""
+    cutoff = datetime.now(timezone.utc) - timedelta(minutes=n)
+    return cutoff.isoformat()
+
+
+__all__ = [
+    "diagnose",
+    "diagnose_stream",
+    "parse_time_window",
+]
+
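+# Feature flag (Task 6): when TURNSTONE_MULTI_AGENT_DIAGNOSE is "true"
+# (case-insensitive), diagnose_stream() hands analysis to run_pipeline().
+# Unset or any other value leaves it disabled.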
+MULTI_AGENT_ENABLED = (
+    os.getenv("TURNSTONE_MULTI_AGENT_DIAGNOSE", "false").lower() == "true"
+)
--- a/app/services/diagnose/classifier.py
+++ b/app/services/diagnose/classifier.py
@ -0,0 +1,249 @@
+"""Stage 2: Severity Classifier — ML with two fallback levels.
+
+Classification strategy (in priority order):
+
+  Path A — ML: Hugging Face text-classification pipeline, loaded lazily.
+  Path B — pattern_tags: Map cluster.pattern_tags through the loaded pattern
+            severity dict; pick the highest severity across matching tags.
+  Path C — regex: Call detect_severity() from app.glean.base on the cluster's
+            representative_text.
+
+Each cluster is classified independently. The ``classifier_used`` field on the
+returned ``ClassifiedTimeline`` reflects the primary path (the one that governed
+the overall classification session, not individual cluster fallbacks).
+"""
+from __future__ import annotations
+
+import logging
+import os
+from pathlib import Path
+from typing import Any
+
+from app.services.diagnose.models import (
+    ClassifiedTimeline,
+    EventCluster,
+    SeverityLabel,
+    TimelineResult,
+)
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Module-level ML singleton — reset to None between tests via the fixture
+# ---------------------------------------------------------------------------
+
+_ml_classifier: Any | None = None
+
+
+def _get_ml_classifier(model_id: str, device: str) -> Any:
+    """Return the module-wide HF text-classification pipeline, creating it lazily.
+
+    Only the first call builds the pipeline; later calls return the cached
+    object and ignore their ``model_id``/``device`` arguments.
+    """
+    global _ml_classifier  # noqa: PLW0603
+    if _ml_classifier is not None:
+        return _ml_classifier
+
+    # Imported here so transformers is only required when this is actually called.
+    from transformers import pipeline as hf_pipeline  # type: ignore[import-untyped]
+
+    _ml_classifier = hf_pipeline("text-classification", model=model_id, device=device)
+    return _ml_classifier
+
+
+# ---------------------------------------------------------------------------
+# Label mapping
+# ---------------------------------------------------------------------------
+
+_LABEL_MAP: dict[str, SeverityLabel] = {
+    "ERROR": "ERROR",
+    "WARNING": "WARN",
+    "WARN": "WARN",
+    "INFO": "INFO",
+    "DEBUG": "DEBUG",
+    "CRITICAL": "CRITICAL",
+}
+
+_CRITICAL_KEYWORDS: frozenset[str] = frozenset(
+    {
+        "panic",
+        "oom",
+        "fatal",
+        "critical",
+        "kernel panic",
+        "out of memory",
+        "segfault",
+        "segmentation fault",
+    }
+)
+
+_SEVERITY_ORDER: dict[str | None, int] = {
+    "CRITICAL": 5,
+    "ERROR": 4,
+    "WARN": 3,
+    "WARNING": 3,
+    "INFO": 2,
+    "DEBUG": 1,
+    None: 0,
+}
+
+
+def _map_label(label: str, score: float, text: str) -> SeverityLabel:
+    """Map a raw classifier label to a severity, applying the promote/demote shim.
+
+    A very confident ERROR (score above 0.95) whose text contains one of
+    ``_CRITICAL_KEYWORDS`` is promoted to CRITICAL; a low-confidence INFO
+    (score below 0.4) is demoted to DEBUG.  Anything else goes through
+    ``_LABEL_MAP``, with unrecognised labels reported as ``"UNKNOWN"``.
+    """
+    normalised = label.upper()
+    if normalised == "ERROR":
+        if score > 0.95:
+            haystack = text.lower()
+            if any(keyword in haystack for keyword in _CRITICAL_KEYWORDS):
+                return "CRITICAL"
+    elif normalised == "INFO" and score < 0.4:
+        return "DEBUG"
+    return _LABEL_MAP.get(normalised, "UNKNOWN")  # type: ignore[return-value]
+
+
+def _highest_from_tags(
+    tags: tuple[str, ...], severity_map: dict[str, str]
+) -> SeverityLabel | None:
+    """Return the highest severity among *tags* that appear in *severity_map*.
+
+    Severity strings are stripped and upper-cased *before* ranking, so a pattern
+    file that spells severities in lower or mixed case ("critical", "Warning")
+    ranks the same as the canonical upper-case form.  Tags that are absent from
+    *severity_map* (or whose severity is missing/empty) are ignored, so the
+    result does not depend on tag order.  ``WARNING`` is normalised to ``WARN``.
+
+    Returns ``None`` when no tag yields a usable severity.
+    """
+    best: str | None = None
+    best_rank = -1
+    for tag in tags:
+        raw = severity_map.get(tag)
+        if not isinstance(raw, str):
+            continue  # tag unknown to the pattern file, or severity not set
+        candidate = raw.strip().upper()
+        if not candidate:
+            continue
+        # Severities missing from _SEVERITY_ORDER rank 0: they beat "no match"
+        # but lose to any recognised level.
+        rank = _SEVERITY_ORDER.get(candidate, 0)
+        if rank > best_rank:
+            best_rank = rank
+            best = candidate
+    if best is None:
+        return None
+    return "WARN" if best == "WARNING" else best  # type: ignore[return-value]
+
+
+# ---------------------------------------------------------------------------
+# SeverityClassifier
+# ---------------------------------------------------------------------------
+
+
+class SeverityClassifier:
+    """Classify each EventCluster's severity using ML, patterns, or regex fallback.
+
+    Parameters
+    ----------
+    model_id:
+        Hugging Face model identifier. When empty (default), ML is skipped.
+    device:
+        Torch device string passed to the HF pipeline (e.g. ``"cpu"`` or ``"cuda:0"``).
+    pattern_file:
+        Path to the YAML pattern file. When ``None`` the classifier treats the
+        ``TURNSTONE_PATTERNS`` env var as a directory and loads ``default.yaml``
+        from it. If that variable is unset too, no patterns are loaded and the
+        pattern_tags path is skipped.
+    """
+
+    def __init__(
+        self,
+        model_id: str = "",
+        device: str = "cpu",
+        pattern_file: Path | None = None,
+    ) -> None:
+        self._model_id = model_id
+        self._device = device
+        self._pattern_file: Path | None = pattern_file
+        # pattern name -> severity string, filled by _ensure_patterns_loaded().
+        self._pattern_severity: dict[str, str] = {}
+        self._patterns_loaded = False
+
+    # ------------------------------------------------------------------
+    # Lazy loaders
+    # ------------------------------------------------------------------
+
+    def _resolve_pattern_file(self) -> Path | None:
+        """Resolve pattern file from constructor arg or env var."""
+        if self._pattern_file is not None:
+            return self._pattern_file
+        env_dir = os.environ.get("TURNSTONE_PATTERNS")
+        if env_dir:
+            return Path(env_dir) / "default.yaml"
+        return None
+
+    def _ensure_patterns_loaded(self) -> None:
+        """Populate _pattern_severity from the pattern YAML file (once)."""
+        if self._patterns_loaded:
+            return
+        # Marked before loading, so a failed load is not retried on later calls.
+        self._patterns_loaded = True
+        path = self._resolve_pattern_file()
+        if path is None:
+            return
+        from app.glean.base import load_patterns
+
+        patterns = load_patterns(path)
+        self._pattern_severity = {p.name: p.severity for p in patterns}
+
+    # ------------------------------------------------------------------
+    # Per-cluster classification helpers
+    # ------------------------------------------------------------------
+
+    def _classify_cluster_ml(self, cluster: EventCluster) -> SeverityLabel | None:
+        """Attempt ML classification. Returns None on any inference failure."""
+        try:
+            pipe = _get_ml_classifier(self._model_id, self._device)
+            results = pipe(cluster.representative_text)
+            if not results:
+                return None
+            hit = results[0]
+            return _map_label(hit["label"], hit["score"], cluster.representative_text)
+        except Exception:  # noqa: BLE001
+            # exc_info keeps the root cause (missing dependency, bad model id,
+            # unexpected result shape, ...) visible instead of discarding it.
+            logger.warning(
+                "ML inference failed for cluster %s — falling back",
+                cluster.cluster_id,
+                exc_info=True,
+            )
+            return None
+
+    def _classify_cluster_pattern_tags(
+        self, cluster: EventCluster
+    ) -> SeverityLabel | None:
+        """Derive severity from the cluster's pattern_tags. Returns None if no match."""
+        return _highest_from_tags(cluster.pattern_tags, self._pattern_severity)
+
+    def _classify_cluster_regex(self, cluster: EventCluster) -> SeverityLabel:
+        """Classify by scanning representative_text with the severity regex.
+
+        Falls back to ``"INFO"`` when no severity is detected or the detected
+        value is not a key of ``_LABEL_MAP``.
+        """
+        from app.glean.base import detect_severity
+
+        raw = detect_severity(cluster.representative_text)
+        if raw is None:
+            return "INFO"
+        return _LABEL_MAP.get(raw.upper(), "INFO")  # type: ignore[return-value]
+
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+
+    def classify(self, timeline: TimelineResult) -> ClassifiedTimeline:
+        """Classify every cluster in *timeline* and return a ``ClassifiedTimeline``.
+
+        Each cluster tries ML (when a model id is configured), then pattern tags
+        (when patterns were loaded), then the regex scan, stopping at the first
+        path that yields a severity.  ``classifier_used`` names the highest
+        priority path available for the session, not per-cluster fallbacks.
+        """
+        self._ensure_patterns_loaded()
+
+        # Determine which primary path governs this session
+        ml_available = bool(self._model_id)
+        patterns_available = bool(self._pattern_severity)
+
+        if ml_available:
+            classifier_used: str = "ml"
+        elif patterns_available:
+            classifier_used = "pattern_tags"
+        else:
+            classifier_used = "regex"
+
+        cluster_severities: dict[str, SeverityLabel] = {}
+
+        for cluster in timeline.clusters:
+            severity: SeverityLabel | None = None
+
+            if ml_available:
+                severity = self._classify_cluster_ml(cluster)
+
+            if severity is None and patterns_available:
+                severity = self._classify_cluster_pattern_tags(cluster)
+
+            if severity is None:
+                # The regex path always yields a label, so severity is never None.
+                severity = self._classify_cluster_regex(cluster)
+
+            cluster_severities[cluster.cluster_id] = severity
+
+        return ClassifiedTimeline(
+            timeline=timeline,
+            cluster_severities=cluster_severities,
+            classifier_used=classifier_used,  # type: ignore[arg-type]
+            model_id=self._model_id if ml_available else None,
+        )
--- a/app/services/diagnose/hypothesizer.py
+++ b/app/services/diagnose/hypothesizer.py
@ -0,0 +1,216 @@
+"""Stage 3: Root-Cause Hypothesizer — LLM + RAG context."""
+from __future__ import annotations
+
+import json
+import logging
+from uuid import uuid4
+
+import httpx
+
+from app.context.retriever import RetrievedContext
+from app.services.diagnose.models import (
+    ClassifiedTimeline,
+    EventCluster,
+    Hypothesis,
+    SeverityLabel,
+)
+
+logger = logging.getLogger(__name__)
+
+# Severity strings that ``_validate_severity`` hands back unchanged. Apart
+# from the "WARNING" -> "WARN" alias it applies first, any value outside this
+# set is replaced with "ERROR".
+_VALID_SEVERITIES: frozenset[str] = frozenset({"CRITICAL", "ERROR", "WARN", "INFO", "DEBUG"})
+
+# System message placed ahead of the user prompt in
+# ``RootCauseHypothesizer.hypothesize``. The keys it asks the model to emit
+# (title, description, confidence, severity, supporting_clusters) are the
+# same keys ``_parse_response`` reads back out of the reply.
+_SYSTEM_PROMPT = (
+    "You are a Linux sysadmin log analyst. Analyze the following clustered log timeline "
+    "and generate 2-4 root cause hypotheses as a JSON array.\n\n"
+    "Each hypothesis must follow this exact JSON schema:\n"
+    '{"title": str (≤80 chars), "description": str (2-4 sentences), '
+    '"confidence": float (0.0-1.0), "severity": str (one of: CRITICAL, ERROR, WARN, INFO), '
+    '"supporting_clusters": [str list of cluster IDs]}\n\n'
+    "Return ONLY a valid JSON array. No prose, no markdown, no explanation outside the JSON."
+)
+
+
+def _coerce_float(val: object, default: float) -> float:
+    """Coerce an untrusted LLM-supplied value to a finite float.
+
+    Args:
+        val: Value taken from the parsed LLM JSON (any type).
+        default: Returned whenever *val* cannot be used.
+
+    Returns:
+        ``float(val)`` when the conversion succeeds and the result is finite;
+        otherwise *default*.
+    """
+    import math  # local import: nothing else in this module needs math
+
+    try:
+        result = float(val)  # type: ignore[arg-type]
+    except (TypeError, ValueError, OverflowError):
+        # OverflowError: json.loads yields arbitrary-precision ints, and
+        # float() raises on one too large to represent instead of giving inf.
+        return default
+    # json.loads accepts the NaN / Infinity literals; neither is a usable
+    # score (NaN also breaks sort keys), so treat them as invalid input.
+    return result if math.isfinite(result) else default
+
+
+def _validate_severity(s: str) -> SeverityLabel:
+    """Map a raw severity string to a valid SeverityLabel.
+
+    Matching is case-insensitive and ignores surrounding whitespace, so model
+    output such as ``" warn "`` is not silently escalated. ``"WARNING"`` is
+    accepted as an alias for ``"WARN"``.
+
+    Args:
+        s: Severity text as produced by the LLM.
+
+    Returns:
+        The normalised label when it is one of ``_VALID_SEVERITIES``;
+        otherwise ``"ERROR"``.
+    """
+    upper = s.strip().upper()
+    if upper == "WARNING":
+        return "WARN"
+    return upper if upper in _VALID_SEVERITIES else "ERROR"  # type: ignore[return-value]
+
+
+def _cluster_summary(cluster: EventCluster, severity: str) -> str:
+    """Render one cluster as a single line for the LLM prompt.
+
+    Layout: ``[SEVERITY] <start or 'unknown'> (<up to 3 sources>) — <first
+    200 characters of the representative text>``, with `` [patterns: ...]``
+    (up to 5 tags) appended only when the cluster has pattern tags.
+    """
+    shown_sources = ", ".join(tuple(cluster.source_ids)[:3])
+    shown_tags = ", ".join(tuple(cluster.pattern_tags)[:5])
+    started = cluster.start_iso or "unknown"
+    excerpt = cluster.representative_text[:200]
+
+    pieces = [f"[{severity}] {started} ({shown_sources}) — {excerpt}"]
+    if shown_tags:
+        pieces.append(f" [patterns: {shown_tags}]")
+    return "".join(pieces)
+
+
+def _extract_content(resp_json: dict) -> str | None:
+    """Return the first choice's message text from a chat-completion reply.
+
+    Args:
+        resp_json: Decoded JSON body in the OpenAI-compatible shape
+            ``{"choices": [{"message": {"content": ...}}]}``.
+
+    Returns:
+        The stripped content string, or ``None`` when there are no choices,
+        the message is missing or null, or the content is empty or not a
+        string.
+    """
+    choices = resp_json.get("choices") or []
+    if not choices:
+        return None
+    # ``"message": null`` is present-but-None, so a ``.get(..., {})`` default
+    # would not apply; ``or {}`` covers both the missing and the null case.
+    message = choices[0].get("message") or {}
+    content = message.get("content")
+    if not isinstance(content, str):
+        return None
+    return content.strip() or None
+
+
+class RootCauseHypothesizer:
+    """Generate ranked root-cause hypotheses from a classified log timeline.
+
+    The timeline and retrieved context are condensed into a prompt; the reply
+    is expected to be a JSON array and is parsed defensively, because nothing
+    the model returns is guaranteed to match the requested schema.
+    """
+
+    def __init__(self, max_hypotheses: int = 4) -> None:
+        """Store the cap on how many hypotheses are requested and kept."""
+        self._max_hypotheses = max_hypotheses
+
+    def hypothesize(
+        self,
+        classified: ClassifiedTimeline,
+        ctx: RetrievedContext,
+        query: str,
+        llm_url: str | None = None,
+        llm_model: str | None = None,
+        llm_api_key: str | None = None,
+    ) -> list[Hypothesis]:
+        """Generate hypotheses from a classified timeline and RAG context.
+
+        Args:
+            classified: Timeline plus per-cluster severities from Stage 2.
+            ctx: Retrieved context; only the first five chunks are used, each
+                truncated to 300 characters.
+            query: The user's original question, echoed into the prompt.
+            llm_url: Base URL of the LLM service.
+            llm_model: Model name for the direct chat-completions fallback.
+            llm_api_key: Optional bearer token.
+
+        Returns:
+            Parsed hypotheses, or an empty list when no LLM is configured,
+            there are no clusters to analyse, or the LLM call fails.
+        """
+        if not llm_url or not llm_model:
+            return []
+
+        clusters = classified.timeline.clusters
+        if not clusters:
+            return []
+
+        # Prefer the Stage 2 label; fall back to the cluster's own severity.
+        cluster_lines = [
+            _cluster_summary(c, classified.cluster_severities.get(c.cluster_id, c.severity))
+            for c in clusters
+        ]
+        cluster_block = "\n".join(cluster_lines)
+
+        context_parts: list[str] = []
+        for chunk in ctx.chunks[:5]:
+            filename = chunk.get("filename", "unknown")
+            # ``or ""`` guards against a chunk whose text is present but null.
+            text = (chunk.get("text") or "")[:300]
+            context_parts.append(f"[{filename}] {text}")
+        context_block = "\n".join(context_parts) if context_parts else "(none)"
+
+        user_message = (
+            f"Query: {query}\n\n"
+            f"Context from runbooks and known patterns:\n{context_block}\n\n"
+            f"Log timeline (clustered, {len(clusters)} clusters):\n{cluster_block}\n\n"
+            f"Generate up to {self._max_hypotheses} hypotheses. Return JSON array only."
+        )
+
+        messages = [
+            {"role": "system", "content": _SYSTEM_PROMPT},
+            {"role": "user", "content": user_message},
+        ]
+
+        raw_response = self._call_llm(
+            llm_url=llm_url,
+            llm_model=llm_model,
+            llm_api_key=llm_api_key,
+            messages=messages,
+        )
+        if raw_response is None:
+            return []
+
+        return self._parse_response(raw_response)
+
+    def _call_llm(
+        self,
+        llm_url: str,
+        llm_model: str,
+        llm_api_key: str | None,
+        messages: list[dict],
+    ) -> str | None:
+        """Send messages to the LLM and return raw text content.
+
+        The task-based endpoint is tried first; on a 404, any other error
+        status, or a transport failure the OpenAI-compatible
+        ``/v1/chat/completions`` endpoint is used instead. A 200 from the task
+        endpoint is final, even if no content could be extracted from it.
+
+        Returns:
+            The reply text, or ``None`` when neither endpoint produced any.
+        """
+        headers = {"Authorization": f"Bearer {llm_api_key}"} if llm_api_key else {}
+
+        # Try cf-orch task-based endpoint first.
+        task_url = f"{llm_url.rstrip('/')}/api/inference/task"
+        try:
+            resp = httpx.post(
+                task_url,
+                json={
+                    "product": "turnstone",
+                    "task": "log_analysis",
+                    "payload": {"messages": messages, "stream": False},
+                },
+                headers=headers,
+                timeout=120.0,
+            )
+            if resp.status_code == 200:
+                return _extract_content(resp.json())
+            if resp.status_code != 404:
+                # Raised here only to be caught below and trigger the fallback.
+                resp.raise_for_status()
+            logger.debug(
+                "No task assignment for turnstone.log_analysis — falling back to direct model"
+            )
+        except Exception as exc:
+            logger.debug("Task endpoint unavailable (%s) — falling back to direct model", exc)
+
+        # Fallback: OpenAI-compat endpoint with explicit model name.
+        try:
+            resp = httpx.post(
+                f"{llm_url.rstrip('/')}/v1/chat/completions",
+                json={"model": llm_model, "messages": messages, "stream": False},
+                headers=headers,
+                timeout=120.0,
+            )
+            resp.raise_for_status()
+            return _extract_content(resp.json())
+        except Exception as exc:
+            logger.warning(
+                "LLM hypothesizer failed (%s): %s", type(exc).__name__, exc
+            )
+            return None
+
+    def _parse_response(self, raw: str) -> list[Hypothesis]:
+        """Parse the LLM JSON response into a list of Hypothesis objects.
+
+        Tolerates a Markdown code fence around the array, skips non-object
+        items, clamps confidence into [0.0, 1.0] and keeps at most
+        ``max_hypotheses`` items.
+
+        Returns:
+            The parsed hypotheses; an empty list when the reply is not valid
+            JSON or is not a JSON array.
+        """
+        text = raw.strip()
+        if text.startswith("```"):
+            # Models often wrap the array in ```json ... ``` despite being told
+            # not to; drop the opening fence line and the closing fence.
+            first_line, newline, rest = text.partition("\n")
+            text = rest if newline else first_line.strip("`")
+            text = text.rsplit("```", 1)[0].strip()
+
+        try:
+            data = json.loads(text)
+        except json.JSONDecodeError:
+            logger.warning(
+                "Hypothesizer: invalid JSON from LLM (truncated): %.120s", raw
+            )
+            return []
+
+        if not isinstance(data, list):
+            logger.warning(
+                "Hypothesizer: expected JSON array, got %s", type(data).__name__
+            )
+            return []
+
+        hypotheses: list[Hypothesis] = []
+        for item in data[: self._max_hypotheses]:
+            if not isinstance(item, dict):
+                continue
+
+            # Only a real JSON array is accepted: tuple("c1") would split a
+            # bare string into characters and tuple(3) would raise.
+            raw_ids = item.get("supporting_clusters")
+            cluster_ids = (
+                tuple(str(cid) for cid in raw_ids) if isinstance(raw_ids, list) else ()
+            )
+
+            # The prompt asks for 0.0-1.0; enforce it rather than trust it.
+            confidence = min(1.0, max(0.0, _coerce_float(item.get("confidence"), 0.5)))
+
+            hypotheses.append(
+                Hypothesis(
+                    hypothesis_id=str(uuid4()),
+                    title=str(item.get("title", "Unknown"))[:80],
+                    description=str(item.get("description", "")),
+                    confidence=confidence,
+                    supporting_cluster_ids=cluster_ids,
+                    runbook_refs=(),
+                    severity=_validate_severity(str(item.get("severity", "ERROR"))),
+                )
+            )
+
+        return hypotheses
--- a/app/services/diagnose/legacy.py
+++ b/app/services/diagnose/legacy.py
@ -1,4 +1,5 @@
 """Frictionless diagnose service — NL time extraction + layered log search."""
+
 from __future__ import annotations

 import asyncio
@ -18,6 +19,7 @@ logger = logging.getLogger(__name__)

 try:
    from dateparser.search import search_dates as _search_dates  # type: ignore[import]
+
    _HAS_DATEPARSER = True
 except ImportError:
    _search_dates = None  # type: ignore[assignment]
@ -68,17 +70,25 @@ def parse_time_window(query: str) -> tuple[str | None, str | None, str]:
            results = _search_dates(
                query,
                languages=["en"],
-                settings={"PREFER_DATES_FROM": "past", "TIMEZONE": tz_str, "RETURN_AS_TIMEZONE_AWARE": True},
+                settings={
+                    "PREFER_DATES_FROM": "past",
+                    "TIMEZONE": tz_str,
+                    "RETURN_AS_TIMEZONE_AWARE": True,
+                },
            )
        except Exception:
-            logger.warning("dateparser failed on query %r — falling back to 60-min window", query)
+            logger.warning(
+                "dateparser failed on query %r — falling back to 60-min window", query
+            )
            results = None
        if results:
            phrase, dt = results[0]
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            else:
-                dt = dt.astimezone(timezone.utc)  # normalise to UTC for SQLite string compare
+                dt = dt.astimezone(
+                    timezone.utc
+                )  # normalise to UTC for SQLite string compare
            since = (dt - timedelta(minutes=30)).isoformat()
            until = (dt + timedelta(minutes=30)).isoformat()
            keywords = re.sub(r"\s{2,}", " ", query.replace(phrase, " ").strip())
@ -107,8 +117,23 @@ def diagnose(
    else:
        keywords = query

-    keyword_hits = search(db_path, query=keywords, since=since, until=until, source_filter=source_filter, limit=150, or_mode=True)
-    window_hits = entries_in_window(db_path, since=since, until=until, source_filter=source_filter, limit=50, per_source_cap=15)
+    keyword_hits = search(
+        db_path,
+        query=keywords,
+        since=since,
+        until=until,
+        source_filter=source_filter,
+        limit=150,
+        or_mode=True,
+    )
+    window_hits = entries_in_window(
+        db_path,
+        since=since,
+        until=until,
+        source_filter=source_filter,
+        limit=50,
+        per_source_cap=15,
+    )

    seen: set[str] = set()
    merged: list[SearchResult] = []
@ -117,7 +142,9 @@ def diagnose(
            seen.add(r.entry_id)
            merged.append(r)

-    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[:200]
+    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
+        :200
+    ]

    by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
    by_source: dict[str, int] = {}
@ -129,7 +156,9 @@ def diagnose(

    reasoning: str | None = None
    if llm_url and llm_model:
-        reasoning = summarize(query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key)
+        reasoning = summarize(
+            query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key
+        )

    return {
        "summary": {
@ -177,7 +206,9 @@ async def diagnose_stream(
        yield {"type": "status", "message": "Parsing time window…"}
        time_detected = since is not None and until is not None
        if not time_detected:
-            parsed_since, parsed_until, keywords = await asyncio.to_thread(parse_time_window, query)
+            parsed_since, parsed_until, keywords = await asyncio.to_thread(
+                parse_time_window, query
+            )
            since = since or parsed_since
            until = until or parsed_until
            time_detected = keywords != query
@ -197,23 +228,34 @@ async def diagnose_stream(
        keyword_hits: list[SearchResult] = []
        window_hits = await asyncio.to_thread(
            lambda: entries_in_window(
-                db_path, since, until,
-                source_filter=source_filter, limit=200,
+                db_path,
+                since,
+                until,
+                source_filter=source_filter,
+                limit=200,
            )
        )
    else:
        keyword_hits, window_hits = await asyncio.gather(
            asyncio.to_thread(
                lambda: search(
-                    db_path, keywords,
-                    source_filter=source_filter, since=since, until=until,
-                    limit=150, or_mode=True,
+                    db_path,
+                    keywords,
+                    source_filter=source_filter,
+                    since=since,
+                    until=until,
+                    limit=150,
+                    or_mode=True,
                )
            ),
            asyncio.to_thread(
                lambda: entries_in_window(
-                    db_path, since, until,
-                    source_filter=source_filter, limit=50, per_source_cap=15,
+                    db_path,
+                    since,
+                    until,
+                    source_filter=source_filter,
+                    limit=50,
+                    per_source_cap=15,
                )
            ),
        )
@ -225,7 +267,9 @@ async def diagnose_stream(
            seen.add(r.entry_id)
            merged.append(r)

-    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[:200]
+    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
+        :200
+    ]

    by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
    by_source: dict[str, int] = {}
@ -251,7 +295,14 @@ async def diagnose_stream(
    if llm_url and llm_model and combined:
        yield {"type": "status", "message": "Analyzing with LLM…"}
        reasoning = await asyncio.to_thread(
-            lambda: summarize(query, combined, llm_url, llm_model, llm_api_key, context_block=context_block)
+            lambda: summarize(
+                query,
+                combined,
+                llm_url,
+                llm_model,
+                llm_api_key,
+                context_block=context_block,
+            )
        )
        if reasoning:
            yield {"type": "reasoning", "text": reasoning}
--- a/app/services/diagnose/models.py
+++ b/app/services/diagnose/models.py
@ -0,0 +1,72 @@
+"""Pipeline data types for the multi-agent diagnose pipeline."""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import Literal
+
+# Closed set of severity labels used by the pipeline data types in this module.
+SeverityLabel = Literal["CRITICAL", "ERROR", "WARN", "INFO", "DEBUG", "UNKNOWN"]
+
+
+@dataclass(frozen=True)
+class EventCluster:
+    """A time-correlated group of log entries within the timeline.
+
+    Immutable record; instances are carried in ``TimelineResult.clusters``.
+    """
+
+    # Identifier used as the key in ClassifiedTimeline.cluster_severities.
+    cluster_id: str
+    entries: tuple[str, ...]  # entry_id refs
+    # Start/end timestamps as strings; None when unknown (a missing start is
+    # rendered as "unknown" in the hypothesizer prompt).
+    start_iso: str | None
+    end_iso: str | None
+    duration_seconds: float
+    source_ids: tuple[str, ...]
+    pattern_tags: tuple[str, ...]
+    # Cluster's own severity; used as a fallback when Stage 2 has no label
+    # for this cluster_id.
+    severity: SeverityLabel
+    # NOTE(review): presumably flags a burst of closely spaced entries and the
+    # quiet period preceding the cluster — confirm against timeline.py.
+    burst: bool
+    gap_before_seconds: float
+    # Text scanned by the regex classifier and excerpted into LLM prompts.
+    representative_text: str
+
+@dataclass(frozen=True)
+class TimelineResult:
+    """Structured timeline of event clusters built from log entries.
+
+    Produced by Stage 1 and consumed by the classifier and synthesizer stages.
+    """
+
+    # Clusters that make up the timeline; iterated by the classifier and the
+    # hypothesizer.
+    clusters: tuple[EventCluster, ...]
+    total_entries: int
+    # Bounds of the covered window as strings; None when not known.
+    window_start: str | None
+    window_end: str | None
+    gap_count: int
+    # Reported in the Stage 1 progress message ("... N bursts").
+    burst_count: int
+    dominant_sources: tuple[str, ...]
+
+@dataclass(frozen=True)
+class ClassifiedTimeline:
+    """Timeline annotated with a classifier-assigned severity per cluster.
+
+    The label source is not necessarily ML: see ``classifier_used``.
+    """
+
+    timeline: TimelineResult
+    # Maps EventCluster.cluster_id -> severity label chosen by Stage 2.
+    cluster_severities: dict[str, SeverityLabel]
+    # Which classification path governed the run.
+    classifier_used: Literal["ml", "pattern_tags", "regex"]
+    # Set by the classifier only when the ML path was available; otherwise None.
+    model_id: str | None
+
+
+@dataclass(frozen=True)
+class Hypothesis:
+    """A root-cause hypothesis generated by Stage 3."""
+
+    # Stage 3 assigns a fresh uuid4 string per hypothesis.
+    hypothesis_id: str
+    # Stage 3 truncates the title to 80 characters.
+    title: str
+    description: str
+    # Model-reported confidence; the prompt asks for a value in 0.0-1.0.
+    confidence: float
+    # Cluster IDs the model cited as evidence.
+    supporting_cluster_ids: tuple[str, ...]
+    # Stage 3 currently always passes an empty tuple here.
+    runbook_refs: tuple[str, ...]
+    severity: SeverityLabel
+
+
+@dataclass(frozen=True)
+class RankedHypothesis:
+    """A hypothesis enriched by Stage 4 false-positive suppression."""
+
+    hypothesis: Hypothesis
+    # Stage 4 computes this as 1 - similarity_to_known; 1.0 when no
+    # comparison could be made.
+    novelty_score: float
+    # Highest cosine similarity against the resolved-incident corpus; 0.0 when
+    # no comparison could be made.
+    similarity_to_known: float
+    # True when similarity_to_known reached the suppressor's threshold.
+    suppress: bool
+    # Human-readable explanation; None unless suppress is True.
+    suppression_reason: str | None
--- a/app/services/diagnose/pipeline.py
+++ b/app/services/diagnose/pipeline.py
@ -0,0 +1,132 @@
+"""Multi-agent diagnose pipeline orchestrator — Stage 1–5 wiring."""
+
+from __future__ import annotations
+
+import asyncio
+import dataclasses
+import logging
+from collections.abc import AsyncGenerator
+from pathlib import Path
+from typing import Any
+
+from app.context.retriever import RetrievedContext
+from app.services.diagnose.classifier import SeverityClassifier
+from app.services.diagnose.hypothesizer import RootCauseHypothesizer
+from app.services.diagnose.suppressor import FalsePositiveSuppressor
+from app.services.diagnose.synthesizer import SummarySynthesizer
+from app.services.diagnose.timeline import TimelineReconstructor
+from app.services.search import SearchResult
+
+logger = logging.getLogger(__name__)
+
+
+async def run_pipeline(
+    db_path: Path,
+    entries: list[SearchResult],
+    ctx: RetrievedContext,
+    query: str,
+    since: str | None,   # reserved for future range-filtering in stage queries (#29 follow-up)
+    until: str | None,   # reserved for future range-filtering in stage queries (#29 follow-up)
+    llm_url: str | None,
+    llm_model: str | None,
+    llm_api_key: str | None,
+) -> AsyncGenerator[dict[str, Any], None]:
+    """Drive the five diagnose stages in order, yielding SSE event dicts.
+
+    Each stage's blocking work is pushed to a worker thread with
+    ``asyncio.to_thread``; a progress event is emitted after it returns.
+
+    Stages:
+      1. TimelineReconstructor   — cluster log entries by time
+      2. SeverityClassifier      — annotate clusters with severity
+      3. RootCauseHypothesizer   — generate hypotheses via LLM
+      4. FalsePositiveSuppressor — rank and suppress known patterns
+      5. SummarySynthesizer      — produce a narrative diagnosis
+
+    Event order:
+      status("Building timeline…"), pipeline_stage 1, 2, 3, 4,
+      hypotheses (list of ranked hypotheses as dicts),
+      status("Synthesizing…"), reasoning (only if synthesis returned text),
+      done.
+    """
+
+    def _status(message: str) -> dict[str, Any]:
+        """Build a free-form status event."""
+        return {"type": "status", "message": message}
+
+    def _stage_event(stage: int, name: str, message: str) -> dict[str, Any]:
+        """Build a numbered ``pipeline_stage`` progress event."""
+        return {
+            "type": "pipeline_stage",
+            "stage": stage,
+            "name": name,
+            "message": message,
+        }
+
+    # ---- Stage 1: timeline reconstruction --------------------------------
+    yield _status("Building timeline…")
+    reconstructor = TimelineReconstructor()
+    timeline = await asyncio.to_thread(reconstructor.reconstruct, entries)
+    cluster_total = len(timeline.clusters)
+    burst_total = timeline.burst_count
+    yield _stage_event(
+        1, "timeline", f"Built {cluster_total} clusters, {burst_total} bursts"
+    )
+
+    # ---- Stage 2: severity classification --------------------------------
+    classifier = SeverityClassifier()
+    classified = await asyncio.to_thread(classifier.classify, timeline)
+    labels = list(classified.cluster_severities.values())
+    # Per-label tally, rendered in sorted label order.
+    tally = ", ".join(
+        f"{label}:{labels.count(label)}" for label in sorted(set(labels))
+    )
+    yield _stage_event(
+        2, "classifier", f"{classified.classifier_used} classifier: {tally}"
+    )
+
+    # ---- Stage 3: root-cause hypotheses -----------------------------------
+    hypothesizer = RootCauseHypothesizer()
+    hypotheses = await asyncio.to_thread(
+        hypothesizer.hypothesize,
+        classified,
+        ctx,
+        query,
+        llm_url,
+        llm_model,
+        llm_api_key,
+    )
+    yield _stage_event(3, "hypotheses", f"{len(hypotheses)} hypotheses generated")
+
+    # ---- Stage 4: false-positive suppression ------------------------------
+    suppressor = FalsePositiveSuppressor()
+    ranked = await asyncio.to_thread(suppressor.suppress, hypotheses, db_path)
+    suppressed_total = len([item for item in ranked if item.suppress])
+    active_total = len(ranked) - suppressed_total
+    yield _stage_event(
+        4, "suppressor", f"{suppressed_total} suppressed, {active_total} active"
+    )
+    yield {"type": "hypotheses", "data": list(map(dataclasses.asdict, ranked))}
+
+    # ---- Stage 5: summary synthesis ---------------------------------------
+    yield _status("Synthesizing…")
+    synthesizer = SummarySynthesizer()
+    narrative = await asyncio.to_thread(
+        synthesizer.synthesize,
+        ranked,
+        timeline,
+        ctx,
+        query,
+        llm_url,
+        llm_model,
+        llm_api_key,
+    )
+    if narrative:
+        yield {"type": "reasoning", "text": narrative}
+
+    yield {"type": "done"}
--- a/app/services/diagnose/suppressor.py
+++ b/app/services/diagnose/suppressor.py
@ -0,0 +1,275 @@
+"""Stage 4: False-Positive Suppressor — embedding cosine similarity.
+
+Compares each hypothesis against a corpus of resolved incidents using
+embedding cosine similarity. Hypotheses that closely match a previously
+resolved incident are suppressed as likely false positives.
+
+When no embedding model is configured or the service is unavailable, all
+hypotheses pass through with novelty_score=1.0 (full novelty assumed).
+"""
+from __future__ import annotations
+
+import logging
+import sqlite3
+from pathlib import Path
+from typing import Any
+
+from app.services.diagnose.models import Hypothesis, RankedHypothesis
+
+logger = logging.getLogger(__name__)
+
+# Module-level corpus cache: db_path_str -> (corpus_texts, embeddings)
+# Invalidated when the corpus text list changes between calls.
+_corpus_cache: dict[str, tuple[list[str], Any]] = {}
+
+# ---------------------------------------------------------------------------
+# Cosine similarity helpers
+# ---------------------------------------------------------------------------
+
+try:
+    import numpy as np
+
+    def _cosine_similarities(
+        query_emb: list[float], corpus_embs: list[list[float]]
+    ) -> list[float]:
+        """Batch cosine similarity of one query embedding against all corpus embeddings.
+
+        Args:
+            query_emb: Embedding of the query text.
+            corpus_embs: One embedding per corpus item, same length as *query_emb*.
+
+        Returns:
+            One plain Python float per corpus item, in corpus order; an empty
+            list when the corpus is empty.
+        """
+        # An empty corpus becomes a 1-D array, on which the axis=1 norm below
+        # would raise; return early to match the pure-Python fallback.
+        if not corpus_embs:
+            return []
+        q = np.array(query_emb, dtype=np.float32)
+        c = np.array(corpus_embs, dtype=np.float32)
+        # The epsilon keeps an all-zero vector from dividing by zero.
+        q_norm = q / (np.linalg.norm(q) + 1e-10)
+        c_norm = c / (np.linalg.norm(c, axis=1, keepdims=True) + 1e-10)
+        # tolist() yields Python floats rather than np.float32 scalars, so the
+        # result honours the annotation and is JSON-serialisable.
+        return (c_norm @ q_norm).tolist()
+
+    _HAS_NUMPY = True
+
+except ImportError:  # pragma: no cover
+    import math
+
+    _HAS_NUMPY = False
+
+    def _dot(a: list[float], b: list[float]) -> float:
+        """Dot product over the pairs produced by zip(a, b)."""
+        return sum(x * y for x, y in zip(a, b))
+
+    def _norm(a: list[float]) -> float:
+        """Euclidean norm plus a small epsilon so it is never zero."""
+        return math.sqrt(sum(x * x for x in a)) + 1e-10
+
+    def _cosine(a: list[float], b: list[float]) -> float:
+        """Cosine similarity of two vectors."""
+        return _dot(a, b) / (_norm(a) * _norm(b))
+
+    def _cosine_similarities(
+        query_emb: list[float], corpus_embs: list[list[float]]
+    ) -> list[float]:
+        """Pure-Python equivalent of the numpy implementation above."""
+        return [_cosine(query_emb, c) for c in corpus_embs]
+
+
+# ---------------------------------------------------------------------------
+# DB helpers
+# ---------------------------------------------------------------------------
+
+def _fetch_resolved_incidents(db_path: Path) -> list[str]:
+    """Fetch resolved incident texts from SQLite.
+
+    An incident counts as resolved when its ``ended_at`` is not NULL. At most
+    200 rows are read.
+
+    Args:
+        db_path: Path to the SQLite database holding the ``incidents`` table.
+
+    Returns:
+        One non-empty string per resolved incident: ``"<label>. <notes>"``
+        when both are present, otherwise whichever one is. Returns an empty
+        list on any SQLite error (missing table, connection failure, etc.).
+    """
+    from contextlib import closing  # local import: only needed here
+
+    try:
+        # sqlite3's own ``with conn:`` only wraps a transaction and leaves the
+        # connection open; closing() guarantees the handle is released.
+        with closing(sqlite3.connect(str(db_path))) as conn:
+            cursor = conn.execute(
+                "SELECT label, notes FROM incidents WHERE ended_at IS NOT NULL LIMIT 200"
+            )
+            rows = cursor.fetchall()
+    except sqlite3.OperationalError as exc:
+        logger.warning("Could not query resolved incidents (%s) — treating as empty corpus", exc)
+        return []
+    except sqlite3.Error as exc:
+        # Catches all remaining SQLite-family errors (IntegrityError, DatabaseError, etc.)
+        logger.warning("Unexpected SQLite error fetching resolved incidents (%s) — treating as empty corpus", exc)
+        return []
+
+    texts: list[str] = []
+    for label, notes in rows:
+        label = (label or "").strip()
+        notes = (notes or "").strip()
+        combined = f"{label}. {notes}" if label and notes else (label or notes)
+        if combined:
+            texts.append(combined)
+    return texts
+
+
+# ---------------------------------------------------------------------------
+# Public class
+# ---------------------------------------------------------------------------
+
+class FalsePositiveSuppressor:
+    """Stage 4 of the multi-agent diagnose pipeline.
+
+    Uses embedding cosine similarity to detect hypotheses that closely match
+    previously resolved incidents and suppress them as likely false positives.
+
+    When model_id is empty, the embedding service is unavailable, there are no
+    resolved incidents, or the corpus cannot be embedded, all hypotheses pass
+    through with novelty_score=1.0 (no suppression).
+    """
+
+    def __init__(
+        self,
+        model_id: str = "",
+        device: str = "cpu",
+        similarity_threshold: float = 0.85,
+    ) -> None:
+        """Configure the suppressor.
+
+        Args:
+            model_id: Embedding model identifier; empty disables suppression.
+            device: Stored but not yet used (see note below).
+            similarity_threshold: Cosine similarity at or above which a
+                hypothesis is suppressed.
+        """
+        self._model_id = model_id
+        self._device = device
+        # _device stored for future use when get_embedder() supports device selection
+        # Suppress when cosine similarity to a known resolved incident >= threshold.
+        # A threshold of 0.85 means "suppress if 85%+ similar to something already resolved."
+        self._similarity_threshold = similarity_threshold
+
+    def suppress(
+        self,
+        hypotheses: list[Hypothesis],
+        db_path: Path,
+    ) -> list[RankedHypothesis]:
+        """Rank hypotheses by novelty, suppressing those matching resolved incidents.
+
+        Args:
+            hypotheses: Candidate hypotheses from Stage 3.
+            db_path: Path to the Turnstone SQLite database containing incidents.
+
+        Returns:
+            List of RankedHypothesis sorted by (novelty_score * confidence) descending.
+            Non-suppressed hypotheses appear first in practice.
+        """
+        if not hypotheses:
+            return []
+
+        # No model configured — full passthrough, rank by confidence only.
+        if not self._model_id:
+            return self._passthrough(hypotheses)
+
+        # Attempt to obtain an embedder; fall back to passthrough on failure.
+        embedder = self._load_embedder()
+        if embedder is None:
+            logger.warning(
+                "Embedding service unavailable for model %r — skipping suppression",
+                self._model_id,
+            )
+            return self._passthrough(hypotheses)
+
+        # Fetch corpus texts from DB; fall back to passthrough if corpus is empty.
+        corpus_texts = _fetch_resolved_incidents(db_path)
+        if not corpus_texts:
+            logger.debug("No resolved incidents found — all hypotheses treated as novel")
+            return self._passthrough(hypotheses)
+
+        # Embed corpus (with caching).
+        corpus_embeddings = self._get_corpus_embeddings(embedder, corpus_texts, db_path)
+        if not corpus_embeddings:
+            # Corpus embedding failed (already logged). With nothing to compare
+            # against, scoring would embed every hypothesis for no benefit and
+            # log a misleading per-hypothesis failure, so pass through instead.
+            return self._passthrough(hypotheses)
+
+        # Score each hypothesis and sort by novelty * confidence descending.
+        ranked = [
+            self._score_hypothesis(h, embedder, corpus_embeddings)
+            for h in hypotheses
+        ]
+        ranked.sort(key=lambda rh: rh.novelty_score * rh.hypothesis.confidence, reverse=True)
+        return ranked
+
+    # ------------------------------------------------------------------
+    # Private helpers
+    # ------------------------------------------------------------------
+
+    def _score_hypothesis(
+        self,
+        hypothesis: Hypothesis,
+        embedder: Any,
+        corpus_embeddings: list[list[float]],
+    ) -> RankedHypothesis:
+        """Score a single hypothesis against the resolved incident corpus.
+
+        The hypothesis title and description are embedded together and
+        compared with every corpus embedding; the highest similarity decides
+        both the novelty score and whether the hypothesis is suppressed.
+        """
+        try:
+            query_text = f"{hypothesis.title}. {hypothesis.description}"
+            h_emb = embedder.embed(query_text)
+            # Convert numpy array to plain Python list for _cosine_similarities
+            h_emb_list: list[float] = h_emb.tolist() if hasattr(h_emb, "tolist") else list(h_emb)
+            sims = _cosine_similarities(h_emb_list, corpus_embeddings)
+            max_sim = float(max(sims)) if sims else 0.0
+        except Exception as exc:
+            # Broad catch is intentional: catches unknown embedder runtime errors
+            # (e.g. CUDA OOM, backend crashes) so one bad hypothesis never halts the pipeline.
+            logger.warning("Embedding failed for hypothesis %r: %s — treating as novel", hypothesis.title, exc)
+            return RankedHypothesis(
+                hypothesis=hypothesis,
+                novelty_score=1.0,
+                similarity_to_known=0.0,
+                suppress=False,
+                suppression_reason=None,
+            )
+
+        # Cosine similarity can be negative, or land a hair above 1.0 through
+        # float32 rounding; keep novelty inside the [0, 1] scale on which 1.0
+        # means "fully novel".
+        novelty_score = min(1.0, max(0.0, 1.0 - max_sim))
+        suppress = bool(max_sim >= self._similarity_threshold)
+        suppression_reason = (
+            f"Similar to resolved incident (similarity {max_sim:.2f})"
+            if suppress
+            else None
+        )
+        return RankedHypothesis(
+            hypothesis=hypothesis,
+            novelty_score=novelty_score,
+            similarity_to_known=max_sim,
+            suppress=suppress,
+            suppression_reason=suppression_reason,
+        )
+
+    def _load_embedder(self) -> Any | None:
+        """Load the embedding service. Returns None if unavailable."""
+        try:
+            from app.services.embeddings import get_embedder
+            return get_embedder()
+        except Exception as exc:
+            # Broad catch is intentional: get_embedder() may raise on import or
+            # backend init failures from any number of third-party libraries.
+            logger.warning("Failed to import/initialise embedding service: %s", exc)
+            return None
+
+    def _get_corpus_embeddings(
+        self,
+        embedder: Any,
+        corpus_texts: list[str],
+        db_path: Path,
+    ) -> list[list[float]]:
+        """Return cached corpus embeddings, re-embedding if the corpus has changed.
+
+        The module-level cache is keyed by ``str(db_path)`` and is reused only
+        when the cached text list equals *corpus_texts*. Returns an empty list
+        (and caches nothing) when embedding fails.
+        """
+        cache_key = str(db_path)
+        cached = _corpus_cache.get(cache_key)
+
+        if cached is not None:
+            cached_texts, cached_embeddings = cached
+            if cached_texts == corpus_texts:
+                return cached_embeddings
+
+        logger.debug("Embedding corpus of %d resolved incidents", len(corpus_texts))
+        try:
+            raw_embeddings = embedder.embed_batch(corpus_texts)
+            # Normalise each embedding to a plain Python list for portability
+            corpus_embeddings: list[list[float]] = [
+                e.tolist() if hasattr(e, "tolist") else list(e)
+                for e in raw_embeddings
+            ]
+        except Exception as exc:
+            # Broad catch is intentional: embed_batch() may raise from any backend
+            # (network timeout, CUDA error, etc.) — treat as empty corpus so the
+            # pipeline can continue without suppression.
+            logger.warning("Corpus embedding failed: %s — treating as empty corpus", exc)
+            return []
+
+        _corpus_cache[cache_key] = (corpus_texts, corpus_embeddings)
+        return corpus_embeddings
+
+    def _passthrough(self, hypotheses: list[Hypothesis]) -> list[RankedHypothesis]:
+        """Return all hypotheses as non-suppressed, ranked by confidence descending."""
+        ranked = [
+            RankedHypothesis(
+                hypothesis=h,
+                novelty_score=1.0,
+                similarity_to_known=0.0,
+                suppress=False,
+                suppression_reason=None,
+            )
+            for h in hypotheses
+        ]
+        ranked.sort(key=lambda rh: rh.hypothesis.confidence, reverse=True)
+        return ranked
--- a/app/services/diagnose/synthesizer.py
+++ b/app/services/diagnose/synthesizer.py
@ -0,0 +1,210 @@
+"""Stage 5: Summary Synthesizer — LLM narrative from ranked hypotheses, with a deterministic fallback.
+
+Streaming upgrade (async SSE chunks) is tracked as a follow-up enhancement.
+This implementation is synchronous to match the rest of the pipeline.
+"""
+from __future__ import annotations
+
+import logging
+
+import httpx
+
+from app.context.retriever import RetrievedContext
+from app.services.diagnose.models import RankedHypothesis, TimelineResult
+
+logger = logging.getLogger(__name__)
+
+# System message sent with every synthesis request. It pins the five-section
+# response layout (verdict, timeline, root causes, actions, open questions)
+# that the model is asked to follow.
+_SYSTEM_PROMPT = (
+    "You are a Linux sysadmin diagnosing a system incident. "
+    "Write a concise, actionable incident diagnosis.\n\n"
+    "Format your response exactly as:\n"
+    "1. VERDICT: [CRITICAL|ERROR|WARN|INFO] — <what happened> (<X>% confidence)\n"
+    "2. TIMELINE: <what the logs show in sequence, 2-3 sentences>\n"
+    "3. ROOT CAUSES:\n"
+    "   - <hypothesis 1 title> (<confidence>%)\n"
+    "   - <hypothesis 2 title> (<confidence>%)\n"
+    "4. RECOMMENDED ACTIONS:\n"
+    "   - <action based on hypotheses>\n"
+    "5. INVESTIGATE FURTHER: <open questions, if any>"
+)
+
+
+def _extract_content(resp_json: dict) -> str | None:
+    """Pull text content from an OpenAI-compat chat completion response.
+
+    Reads ``choices[0].message.content`` and returns it stripped of
+    surrounding whitespace.
+
+    Returns ``None`` instead of raising whenever the payload does not have
+    the expected shape: not a dict, no choices, a missing or JSON-``null``
+    message, or content that is missing, not a string, or blank.
+    """
+    if not isinstance(resp_json, dict):
+        return None
+    choices = resp_json.get("choices")
+    if not isinstance(choices, list) or not choices:
+        return None
+    first = choices[0]
+    if not isinstance(first, dict):
+        return None
+    # A .get() default only covers a *missing* key; an explicit JSON null
+    # arrives as None and must be handled separately.
+    message = first.get("message")
+    if not isinstance(message, dict):
+        return None
+    content = message.get("content")
+    if not isinstance(content, str):
+        return None
+    return content.strip() or None
+
+
+def _build_hypothesis_block(ranked: list[RankedHypothesis]) -> str:
+    """Build the hypothesis block for the prompt (non-suppressed only, top 3).
+
+    Each hypothesis becomes two lines: severity, confidence percentage and
+    title, followed by the similarity/novelty line. Returns ``"(none)"`` when
+    no non-suppressed hypothesis exists.
+    """
+    active = [rh for rh in ranked if not rh.suppress][:3]
+    if not active:
+        return "(none)"
+    lines: list[str] = []
+    for rh in active:
+        h = rh.hypothesis
+        # round(), not int(): 0.57 * 100 is 56.99999999999999 in binary
+        # floating point and int() would report 56%.
+        conf_pct = round(h.confidence * 100)
+        novelty = f"{rh.novelty_score:.2f}"
+        # Suppressed hypotheses were filtered out above, so every entry that
+        # reaches this point is reported as not matching a resolved incident.
+        lines.append(
+            f"- [{h.severity}, {conf_pct}%] {h.title}\n"
+            f"  Similar resolved incident? No (novelty {novelty})"
+        )
+    return "\n".join(lines)
+
+
+def _build_context_block(ctx: RetrievedContext) -> str:
+    """Render up to five runbook chunks as ``[filename] text`` lines.
+
+    Chunk text is cut to its first 300 characters and a chunk without a
+    filename is labelled ``unknown``. Returns ``"(none)"`` when the context
+    holds no chunks.
+    """
+    rendered = [
+        "[{}] {}".format(
+            chunk.get("filename", "unknown"),
+            chunk.get("text", "")[:300],
+        )
+        for chunk in ctx.chunks[:5]
+    ]
+    if not rendered:
+        return "(none)"
+    return "\n".join(rendered)
+
+
+def _deterministic_fallback(
+    ranked: list[RankedHypothesis],
+    timeline: TimelineResult,
+) -> str:
+    """Build a deterministic fallback text when no LLM is available.
+
+    The verdict comes from the first non-suppressed hypothesis; when every
+    hypothesis is suppressed the first ranked one is used instead, and with
+    no hypotheses at all an ``UNKNOWN`` verdict is reported. Root causes list
+    the titles of the same (at most three) hypotheses.
+    """
+    active = [rh for rh in ranked if not rh.suppress][:3]
+    # Prefer non-suppressed hypotheses; fall back to the top ranked ones so a
+    # fully suppressed result still produces a meaningful verdict.
+    candidates = active or ranked[:3]
+
+    if candidates:
+        top = candidates[0].hypothesis
+        verdict_severity = top.severity
+        verdict_title = top.title
+        # round(), not int(): 0.57 * 100 is 56.99999999999999 in binary
+        # floating point and int() would report 56%.
+        verdict_conf = round(top.confidence * 100)
+    else:
+        verdict_severity = "UNKNOWN"
+        verdict_title = "No hypotheses generated"
+        verdict_conf = 0
+
+    root_causes = ", ".join(rh.hypothesis.title for rh in candidates) or "None"
+
+    return (
+        f"VERDICT: {verdict_severity} — {verdict_title} ({verdict_conf}% confidence)\n"
+        f"TIMELINE: {timeline.total_entries} entries across {len(timeline.clusters)} clusters.\n"
+        f"ROOT CAUSES: {root_causes}"
+    )
+
+
+class SummarySynthesizer:
+    """Stage 5 of the multi-agent diagnose pipeline.
+
+    Synthesizes a human-readable incident narrative from ranked hypotheses,
+    the reconstructed timeline, and RAG context. When no LLM is configured,
+    or the LLM yields no usable text, a deterministic fallback built from the
+    hypothesis data is returned instead.
+    """
+
+    def synthesize(
+        self,
+        ranked: list[RankedHypothesis],
+        timeline: TimelineResult,
+        ctx: RetrievedContext,
+        query: str,
+        llm_url: str | None = None,
+        llm_model: str | None = None,
+        llm_api_key: str | None = None,
+    ) -> str:
+        """Return synthesis text (single string, synchronous).
+
+        Falls back to a deterministic narrative when no LLM URL or model is
+        provided, or when the LLM call fails or returns no content.
+        """
+        fallback = _deterministic_fallback(ranked, timeline)
+
+        if not llm_url or not llm_model:
+            return fallback
+
+        hypothesis_block = _build_hypothesis_block(ranked)
+        context_block = _build_context_block(ctx)
+        dominant = ", ".join(timeline.dominant_sources[:5]) or "none"
+
+        user_message = (
+            f"Query: {query}\n\n"
+            f"Timeline summary:\n"
+            f"- {len(timeline.clusters)} clusters, "
+            f"{timeline.burst_count} bursts, "
+            f"{timeline.gap_count} silence gaps\n"
+            f"- Primary sources: {dominant}\n\n"
+            f"Top hypotheses:\n{hypothesis_block}\n\n"
+            f"Context from runbooks:\n{context_block}"
+        )
+
+        messages = [
+            {"role": "system", "content": _SYSTEM_PROMPT},
+            {"role": "user", "content": user_message},
+        ]
+
+        result = self._call_llm(
+            llm_url=llm_url,
+            llm_model=llm_model,
+            llm_api_key=llm_api_key,
+            messages=messages,
+        )
+        return result if result else fallback
+
+    def _call_llm(
+        self,
+        llm_url: str,
+        llm_model: str,
+        llm_api_key: str | None,
+        messages: list[dict],
+    ) -> str | None:
+        """Send messages to the LLM and return raw text content.
+
+        Tries the cf-orch task endpoint first, falls back to direct OpenAI-compat.
+        The fallback is also taken when the task endpoint answers 200 but the
+        response carries no usable content. Returns ``None`` when neither
+        endpoint produces text; no exception escapes this method.
+        """
+        headers = {"Authorization": f"Bearer {llm_api_key}"} if llm_api_key else {}
+        base_url = llm_url.rstrip("/")
+
+        try:
+            resp = httpx.post(
+                f"{base_url}/api/inference/task",
+                json={
+                    "product": "turnstone",
+                    "task": "log_analysis",
+                    "payload": {"messages": messages, "stream": False},
+                },
+                headers=headers,
+                timeout=120.0,
+            )
+            if resp.status_code == 200:
+                content = _extract_content(resp.json())
+                if content:
+                    return content
+                # A 200 without text is not a usable answer; keep going so
+                # the direct endpoint still gets its chance.
+                logger.debug(
+                    "Task endpoint returned no content — falling back to direct model"
+                )
+            else:
+                # 404 means "no task assigned" and is expected; any other
+                # error status is raised and handled as "unavailable" below.
+                if resp.status_code != 404:
+                    resp.raise_for_status()
+                logger.debug(
+                    "No task assignment for turnstone.log_analysis — falling back to direct model"
+                )
+        except Exception as exc:
+            # Broad catch is deliberate: any failure of the optional task
+            # endpoint (network, HTTP status, bad JSON) triggers the fallback.
+            logger.debug(
+                "Task endpoint unavailable (%s) — falling back to direct model", exc
+            )
+
+        try:
+            resp = httpx.post(
+                f"{base_url}/v1/chat/completions",
+                json={"model": llm_model, "messages": messages, "stream": False},
+                headers=headers,
+                timeout=120.0,
+            )
+            resp.raise_for_status()
+            return _extract_content(resp.json())
+        except Exception as exc:
+            logger.warning(
+                "LLM synthesizer failed (%s): %s", type(exc).__name__, exc
+            )
+            return None
--- a/app/services/diagnose/timeline.py
+++ b/app/services/diagnose/timeline.py
@ -0,0 +1,272 @@
+"""Stage 1: Timeline Reconstructor — pure Python, no ML."""
+
+from __future__ import annotations
+
+import hashlib
+import logging
+from collections import defaultdict
+from datetime import datetime, timezone
+
+from app.services.diagnose.models import EventCluster, TimelineResult
+from app.services.search import SearchResult
+
+logger = logging.getLogger(__name__)
+
+# Numeric rank per severity label; a larger value is more severe. "WARN" and
+# "WARNING" share a rank, and a missing severity (None) ranks lowest.
+_SEVERITY_ORDER: dict[str | None, int] = {
+    "CRITICAL": 5,
+    "ERROR": 4,
+    "WARN": 3,
+    "WARNING": 3,
+    "INFO": 2,
+    "DEBUG": 1,
+    None: 0,
+}
+
+
+def _parse_iso(s: str) -> datetime | None:
+    """Parse ISO 8601 string to UTC-aware datetime. Returns None on parse failure.
+
+    A trailing ``Z`` (UTC designator) is accepted on every supported Python
+    version. Naive timestamps are assumed to already be in UTC. Anything that
+    cannot be parsed, including non-string input, is logged and yields
+    ``None``.
+    """
+    text = s
+    # datetime.fromisoformat() only understands the "Z" suffix from Python
+    # 3.11 onwards; rewrite it as an explicit offset so older runtimes agree.
+    if isinstance(text, str) and text[-1:] in ("Z", "z"):
+        text = f"{text[:-1]}+00:00"
+    try:
+        dt = datetime.fromisoformat(text)
+    except (TypeError, ValueError):
+        logger.warning("Unparseable timestamp in log entry, treating as None: %r", s)
+        return None
+    if dt.tzinfo is None:
+        logger.debug("Naive timestamp treated as UTC: %s", s)
+        dt = dt.replace(tzinfo=timezone.utc)
+    return dt.astimezone(timezone.utc)
+
+
+def _sort_key(e: SearchResult) -> tuple[int, str]:
+    """Order entries by timestamp string, with timestamp-less entries last.
+
+    The leading 0/1 flag places every entry that has a ``timestamp_iso``
+    ahead of those without one; entries with a timestamp then compare on the
+    raw string.
+    """
+    stamp = e.timestamp_iso
+    return (0, stamp) if stamp is not None else (1, "")
+
+
+def _highest_severity(entries: list[SearchResult]) -> str:
+    """Pick the most severe label found among ``entries``.
+
+    Labels are ranked through ``_SEVERITY_ORDER``; a label missing from that
+    table ranks 0. On equal rank the earliest entry wins. ``"WARNING"`` is
+    reported as ``"WARN"``, and ``"UNKNOWN"`` is returned when there are no
+    entries or the winning entry has no severity.
+    """
+    winner = max(
+        entries,
+        key=lambda item: _SEVERITY_ORDER.get(item.severity, 0),
+        default=None,
+    )
+    label = None if winner is None else winner.severity
+    if label is None:
+        # The output type needs a concrete label, so None maps to "UNKNOWN"
+        return "UNKNOWN"
+    # Collapse the long spelling onto the label used by the output type
+    return "WARN" if label == "WARNING" else label
+
+
+def _representative_text(entries: list[SearchResult]) -> str:
+    """Return the text of the entry with the largest ``rank``.
+
+    Ties on rank go to the longer text, then to the earlier entry. An empty
+    list yields an empty string.
+    """
+    # NOTE(review): if SearchResult.rank is a raw SQLite FTS5 bm25 score,
+    # smaller values are the better matches and max() picks the weakest one —
+    # confirm the rank convention against the search module.
+    if entries:
+        chosen = max(entries, key=lambda item: (item.rank, len(item.text)))
+        return chosen.text
+    return ""
+
+
+def _cluster_id(entry_ids: list[str]) -> str:
+    """Derive a 12-character hex ID from the entry IDs, independent of their order.
+
+    The IDs are sorted, joined with commas and hashed with SHA-1; the first
+    12 hex digits of the digest form the ID.
+    """
+    canonical = ",".join(sorted(entry_ids))
+    digest = hashlib.sha1(canonical.encode())  # noqa: S324 — not used for security
+    return digest.hexdigest()[:12]
+
+
+def _make_event_cluster(
+    cluster_entries: list[SearchResult],
+    gap_before_seconds: float,
+    burst_threshold: int,
+    burst_window_seconds: int,
+) -> EventCluster:
+    """Summarise one group of entries as an ``EventCluster``.
+
+    Start, end and duration come from the entries whose timestamps parse;
+    with none, start and end are ``None`` and the duration is ``0.0``. The
+    cluster counts as a burst when it holds at least ``burst_threshold``
+    entries and spans no more than ``burst_window_seconds``.
+    ``gap_before_seconds`` is passed through unchanged.
+    """
+    parsed: list[datetime] = []
+    for item in cluster_entries:
+        if item.timestamp_iso is None:
+            continue
+        moment = _parse_iso(item.timestamp_iso)
+        if moment is not None:
+            parsed.append(moment)
+
+    if parsed:
+        earliest, latest = min(parsed), max(parsed)
+        start_iso = earliest.isoformat()
+        end_iso = latest.isoformat()
+        duration_seconds = (latest - earliest).total_seconds()
+    else:
+        start_iso = end_iso = None
+        duration_seconds = 0.0
+
+    ids = [item.entry_id for item in cluster_entries]
+    sources = {item.source_id for item in cluster_entries}
+    tags = {tag for item in cluster_entries for tag in item.matched_patterns}
+    too_few = len(cluster_entries) < burst_threshold
+    too_long = duration_seconds > burst_window_seconds
+
+    return EventCluster(
+        cluster_id=_cluster_id(ids),
+        entries=tuple(ids),
+        start_iso=start_iso,
+        end_iso=end_iso,
+        duration_seconds=duration_seconds,
+        source_ids=tuple(sorted(sources)),
+        pattern_tags=tuple(sorted(tags)),
+        severity=_highest_severity(cluster_entries),  # type: ignore[arg-type]  # SeverityLabel is a Literal; _highest_severity returns a compatible str
+        burst=not (too_few or too_long),
+        gap_before_seconds=gap_before_seconds,
+        representative_text=_representative_text(cluster_entries),
+    )
+
+
+class TimelineReconstructor:
+    """Turn a flat list of log entries into an ordered timeline of clusters.
+
+    Stage 1 of the multi-agent diagnose pipeline. Everything here is plain
+    Python: no ML model or LLM is involved.
+    """
+
+    def __init__(
+        self,
+        cluster_window_seconds: int = 30,
+        burst_threshold: int = 10,
+        burst_window_seconds: int = 5,
+        gap_significance_seconds: int = 30,
+    ) -> None:
+        # Maximum distance (seconds) from a cluster's anchor entry before a
+        # new cluster is started.
+        self._cluster_window = cluster_window_seconds
+        # Entry count and duration limits that make a cluster a "burst".
+        self._burst_threshold = burst_threshold
+        self._burst_window = burst_window_seconds
+        # Gaps longer than this many seconds are counted in gap_count.
+        self._gap_significance_seconds: int = gap_significance_seconds
+
+    def _sort_entries(self, entries: list[SearchResult]) -> list[SearchResult]:
+        """Return a new list ordered by ``_sort_key`` (timestamp-less entries last)."""
+        ordered = list(entries)
+        ordered.sort(key=_sort_key)
+        return ordered
+
+    def _group_into_raw_clusters(
+        self, sorted_entries: list[SearchResult]
+    ) -> list[list[SearchResult]]:
+        """Split already-sorted entries into consecutive time-window groups.
+
+        A group is anchored on its first entry with a parseable timestamp. An
+        entry opens a new group once it lies more than the cluster window
+        after that anchor. Entries without a usable timestamp always stay in
+        the group that is currently open.
+        """
+        groups: list[list[SearchResult]] = []
+        bucket: list[SearchResult] = []
+        anchor: datetime | None = None
+
+        for item in sorted_entries:
+            stamp = item.timestamp_iso
+            moment = _parse_iso(stamp) if stamp is not None else None
+
+            starts_new_group = (
+                bool(bucket)
+                and moment is not None
+                and anchor is not None
+                and (moment - anchor).total_seconds() > self._cluster_window
+            )
+            if starts_new_group:
+                groups.append(bucket)
+                bucket = [item]
+                anchor = moment
+                continue
+
+            bucket.append(item)
+            if anchor is None:
+                # First usable timestamp seen in this group becomes its anchor
+                anchor = moment
+
+        if bucket:
+            groups.append(bucket)
+
+        return groups
+
+    def _build_cluster(
+        self,
+        cluster_entries: list[SearchResult],
+        prev_end_iso: str | None,
+    ) -> EventCluster:
+        """Create the ``EventCluster`` for one group, including the gap before it.
+
+        The gap is the time from ``prev_end_iso`` to this group's earliest
+        parseable timestamp. It stays ``0.0`` when there is no previous end,
+        when the previous end does not parse, or when this group has no
+        usable timestamps.
+        """
+        gap_seconds = 0.0
+        if prev_end_iso is not None:
+            moments: list[datetime] = []
+            for item in cluster_entries:
+                if item.timestamp_iso is None:
+                    continue
+                moment = _parse_iso(item.timestamp_iso)
+                if moment is not None:
+                    moments.append(moment)
+            if moments:
+                earliest = min(moments)
+                previous_end = _parse_iso(prev_end_iso)
+                if previous_end is not None:
+                    gap_seconds = (earliest - previous_end).total_seconds()
+
+        return _make_event_cluster(
+            cluster_entries,
+            gap_before_seconds=gap_seconds,
+            burst_threshold=self._burst_threshold,
+            burst_window_seconds=self._burst_window,
+        )
+
+    def _dominant_sources_tuple(self, entries: list[SearchResult]) -> tuple[str, ...]:
+        """Return the distinct source IDs, most frequent first.
+
+        Sources with equal counts keep the order in which they first appear.
+        """
+        tally: dict[str, int] = {}
+        for item in entries:
+            tally[item.source_id] = tally.get(item.source_id, 0) + 1
+        by_count = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
+        return tuple(source for source, _count in by_count)
+
+    def reconstruct(self, entries: list[SearchResult]) -> TimelineResult:
+        """Build the full ``TimelineResult`` for ``entries``.
+
+        Entries are sorted, grouped into clusters and summarised. An empty
+        input produces an empty result with zero counts and no window.
+        """
+        if not entries:
+            return TimelineResult(
+                clusters=(),
+                total_entries=0,
+                window_start=None,
+                window_end=None,
+                gap_count=0,
+                burst_count=0,
+                dominant_sources=(),
+            )
+
+        ordered = self._sort_entries(entries)
+
+        built: list[EventCluster] = []
+        previous_end: str | None = None
+        for group in self._group_into_raw_clusters(ordered):
+            cluster = self._build_cluster(group, previous_end)
+            built.append(cluster)
+            previous_end = cluster.end_iso
+
+        timeline = tuple(built)
+        significant_gaps = [
+            c for c in timeline
+            if c.gap_before_seconds > self._gap_significance_seconds
+        ]
+        bursts = [c for c in timeline if c.burst]
+
+        return TimelineResult(
+            clusters=timeline,
+            total_entries=len(entries),
+            window_start=timeline[0].start_iso if timeline else None,
+            window_end=timeline[-1].end_iso if timeline else None,
+            gap_count=len(significant_gaps),
+            burst_count=len(bursts),
+            dominant_sources=self._dominant_sources_tuple(entries),
+        )
--- a/app/services/embeddings.py
+++ b/app/services/embeddings.py
@ -0,0 +1,229 @@
+"""Configurable embedding service — BSL licensed.
+
+Backends:
+  sentence_transformers — local in-process inference (default, no server needed)
+  ollama               — HTTP to a running Ollama instance
+
+Configuration (env vars):
+  TURNSTONE_EMBED_BACKEND   sentence_transformers | ollama  (default: sentence_transformers)
+  TURNSTONE_EMBED_MODEL     model name/path                 (backend-specific default)
+  TURNSTONE_EMBED_DEVICE    cpu | cuda                      (default: cpu; ST backend only)
+  TURNSTONE_LLM_URL         Ollama base URL                 (default: http://localhost:11434)
+
+When the configured backend cannot be initialised, EMBEDDING_AVAILABLE stays
+False and get_embedder() returns None — callers must handle this gracefully.
+"""
+from __future__ import annotations
+
+import logging
+import os
+import struct
+from typing import Protocol, runtime_checkable
+
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# ── Public availability flag ──────────────────────────────────────────────────
+
+# Starts False; get_embedder() sets it to True once a backend has been
+# constructed successfully.
+EMBEDDING_AVAILABLE: bool = False
+
+# ── Config ────────────────────────────────────────────────────────────────────
+
+# Environment is read once, when this module is imported; the backend and
+# device values are lower-cased before use.
+_BACKEND = os.environ.get("TURNSTONE_EMBED_BACKEND", "sentence_transformers").lower()
+_DEVICE  = os.environ.get("TURNSTONE_EMBED_DEVICE", "cpu").lower()
+_LLM_URL = os.environ.get("TURNSTONE_LLM_URL", "http://localhost:11434")
+
+# BAAI/bge-small-en-v1.5: 33MB, MIT, 49M downloads/month, 384-dim, 512-token max.
+# Benchmarked as the best quality-to-size ratio in the field (MTEB 62.17).
+# all-MiniLM-L6-v2 is a viable lighter alternative (23MB, 256-token max) if
+# inference speed is the primary constraint.
+_DEFAULT_MODEL: dict[str, str] = {
+    "sentence_transformers": "BAAI/bge-small-en-v1.5",
+    "ollama":                "nomic-embed-text",
+}
+# An explicit TURNSTONE_EMBED_MODEL wins; otherwise the per-backend default is
+# used. The all-MiniLM-L6-v2 fallback only applies to a backend name that is
+# not a key of _DEFAULT_MODEL.
+_MODEL = os.environ.get(
+    "TURNSTONE_EMBED_MODEL",
+    _DEFAULT_MODEL.get(_BACKEND, "sentence-transformers/all-MiniLM-L6-v2"),
+)
+
+
+# ── Protocol ──────────────────────────────────────────────────────────────────
+
+@runtime_checkable
+class Embedder(Protocol):
+    """Structural interface shared by every embedding backend.
+
+    Any object exposing these four members satisfies the protocol; no
+    inheritance is required.
+    """
+
+    @property
+    def dim(self) -> int:
+        """Length of the vectors this backend produces."""
+
+    @property
+    def model_name(self) -> str:
+        """Identifier of the underlying model, suitable for display."""
+
+    def embed(self, text: str) -> np.ndarray:
+        """Embed one string as a 1-D float32 array of length ``dim``."""
+
+    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
+        """Embed several strings, returning one 1-D float32 array per input."""
+
+
+# ── sentence-transformers backend ─────────────────────────────────────────────
+
+class SentenceTransformerEmbedder:
+    """In-process embedding backend built on the sentence-transformers library.
+
+    Constructing an instance loads the model. Per the original note, the
+    library downloads it from HuggingFace on first use and caches it under
+    ~/.cache/huggingface/ for later starts.
+    """
+
+    def __init__(self, model_name: str = _MODEL, device: str = _DEVICE) -> None:
+        # Imported here so this module still loads when the optional
+        # dependency is not installed.
+        from sentence_transformers import SentenceTransformer  # type: ignore[import]
+        logger.info("Loading embedding model %r on device %r ...", model_name, device)
+        self._model = SentenceTransformer(model_name, device=device)
+        self._model_name = model_name
+        # Measure the dimension from a real embedding instead of hard-coding it
+        probe = self._model.encode("test")
+        self._dim: int = int(probe.shape[0])
+        logger.info("Embedding model ready — dim=%d", self._dim)
+
+    @property
+    def dim(self) -> int:
+        return self._dim
+
+    @property
+    def model_name(self) -> str:
+        return self._model_name
+
+    def embed(self, text: str) -> np.ndarray:
+        """Embed one string; the model is asked for a normalised vector, returned as float32."""
+        encoded = self._model.encode(
+            text, convert_to_numpy=True, normalize_embeddings=True
+        )
+        return encoded.astype(np.float32)
+
+    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
+        """Embed many strings in batches of 32; an empty input returns an empty list."""
+        if not texts:
+            return []
+        matrix = self._model.encode(
+            texts, convert_to_numpy=True, normalize_embeddings=True, batch_size=32
+        )
+        return [row.astype(np.float32) for row in matrix]
+
+
+# ── Ollama backend ────────────────────────────────────────────────────────────
+
+class OllamaEmbedder:
+    """HTTP embedding via a running Ollama instance.
+
+    Requests go to ``<llm_url>/api/embeddings`` through one reusable
+    ``httpx.Client``. The embedding dimension is probed once at construction
+    time.
+    """
+
+    # Dimension assumed when the probe request fails or returns no vector.
+    _FALLBACK_DIM = 768
+
+    def __init__(
+        self,
+        model_name: str = _MODEL,
+        llm_url: str = _LLM_URL,
+        timeout: float = 30.0,
+    ) -> None:
+        import httpx  # already a project dependency
+        self._model_name = model_name
+        self._url = f"{llm_url.rstrip('/')}/api/embeddings"
+        self._timeout = timeout
+        self._client = httpx.Client(timeout=timeout)
+        # Probe dimension with a test call
+        self._dim = self._probe_dim()
+
+    def _probe_dim(self) -> int:
+        """Return the length of a probe embedding, or the fallback dimension.
+
+        The fallback is used when the request raises and also when the server
+        answers without an embedding; reporting a dimension of 0 would be
+        wrong for every later vector.
+        """
+        try:
+            vec = self._raw_embed("probe")
+        except Exception as exc:
+            logger.warning("Ollama dim probe failed (%s) — defaulting to 768", exc)
+            return self._FALLBACK_DIM
+        if not vec:
+            logger.warning(
+                "Ollama dim probe returned an empty embedding — defaulting to 768"
+            )
+            return self._FALLBACK_DIM
+        return len(vec)
+
+    def _raw_embed(self, text: str) -> list[float]:
+        """POST one prompt and return the ``embedding`` list (empty when absent).
+
+        Raises whatever ``httpx`` raises for transport errors or an error
+        status.
+        """
+        resp = self._client.post(
+            self._url, json={"model": self._model_name, "prompt": text}
+        )
+        resp.raise_for_status()
+        return resp.json().get("embedding") or []
+
+    @property
+    def dim(self) -> int:
+        return self._dim
+
+    @property
+    def model_name(self) -> str:
+        return self._model_name
+
+    def embed(self, text: str) -> np.ndarray:
+        """Embed one string as a float32 array (empty if the server sent no vector)."""
+        vec = self._raw_embed(text)
+        return np.array(vec, dtype=np.float32)
+
+    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
+        """Embed each string with its own request, preserving input order."""
+        return [self.embed(t) for t in texts]
+
+
+# ── Singleton factory ─────────────────────────────────────────────────────────
+
+# Module-wide cached embedder; stays None until get_embedder() succeeds.
+_embedder: Embedder | None = None
+
+
+def get_embedder() -> Embedder | None:
+    """Return the shared embedder for the configured backend, or None.
+
+    The instance is built on the first call and reused afterwards. When
+    construction fails, or the backend name is not recognised, a warning is
+    logged and None is returned; a later call will attempt construction
+    again. Callers should check EMBEDDING_AVAILABLE or test the result for
+    None.
+    """
+    global _embedder, EMBEDDING_AVAILABLE
+
+    if _embedder is None:
+        if _BACKEND not in ("sentence_transformers", "ollama"):
+            logger.warning("Unknown TURNSTONE_EMBED_BACKEND %r — embeddings disabled", _BACKEND)
+        elif _BACKEND == "ollama":
+            try:
+                candidate = OllamaEmbedder(_MODEL, _LLM_URL)
+            except Exception as exc:
+                logger.warning("Ollama embedder init failed: %s", exc)
+            else:
+                _embedder = candidate
+                EMBEDDING_AVAILABLE = True
+        else:
+            try:
+                candidate = SentenceTransformerEmbedder(_MODEL, _DEVICE)
+            except ImportError:
+                logger.warning(
+                    "sentence-transformers not installed — embeddings disabled. "
+                    "Install with: pip install sentence-transformers"
+                )
+            except Exception as exc:
+                logger.warning("Failed to load sentence-transformers model %r: %s", _MODEL, exc)
+            else:
+                _embedder = candidate
+                EMBEDDING_AVAILABLE = True
+
+    return _embedder
+
+
+# ── BLOB serialisation helpers ────────────────────────────────────────────────
+
+def pack_vector(vec: np.ndarray) -> bytes:
+    """Serialise a float32 numpy vector to a SQLite BLOB.
+
+    The BLOB is the vector's raw float32 values in native byte order, four
+    bytes per element, the same layout ``struct.pack("<n>f", ...)`` produces.
+    Input of another dtype is converted to float32 first. Intended for 1-D
+    vectors; a multi-dimensional array is written in C order.
+    """
+    # tobytes() copies the buffer directly instead of boxing every element
+    # into a Python float and re-packing it one by one.
+    return np.asarray(vec, dtype=np.float32).tobytes()
+
+
+def unpack_vector(blob: bytes) -> np.ndarray:
+    """Rebuild a float32 numpy vector from a BLOB of packed native floats.
+
+    The element count is the BLOB length divided by the size of one float.
+    ``struct.error`` is raised when the length is not a whole number of
+    floats.
+    """
+    count = len(blob) // 4  # each float32 occupies 4 bytes
+    values = struct.unpack(f"{count}f", blob)
+    return np.array(values, dtype=np.float32)
+
+
+def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
+    """Return the cosine of the angle between vectors ``a`` and ``b``.
+
+    The dot product is divided by both norms, so the inputs do not have to
+    be normalised beforehand. If either vector has zero norm the result is
+    0.0.
+    """
+    norm_a = np.linalg.norm(a)
+    norm_b = np.linalg.norm(b)
+    if not norm_a or not norm_b:
+        # Direction is undefined for a zero vector
+        return 0.0
+    scale = norm_a * norm_b
+    return float(np.dot(a, b) / scale)
--- a/app/services/incidents.py
+++ b/app/services/incidents.py
@ -6,7 +6,7 @@ import sqlite3
 import uuid
 from pathlib import Path

-from app.ingest.base import now_iso
+from app.glean.base import now_iso
 from app.services.models import Incident, ReceivedBundle
 from app.services.search import SearchResult, entries_in_window, search

--- a/app/services/models.py
+++ b/app/services/models.py
@ -10,7 +10,7 @@ class RetrievedEntry:

    entry_id: str
    source_id: str          # log file path or service name
-    sequence: int           # original line number — ingest order, not wall-clock order
+    sequence: int           # original line number — glean order, not wall-clock order
    timestamp_raw: str | None       # timestamp as it appeared in the log
    timestamp_iso: str | None       # parsed to ISO 8601 for sorting; None if unparseable
    ingest_time: str                # when Turnstone indexed this entry (wall clock)
@ -25,7 +25,7 @@ class RetrievedEntry:

@dataclass(frozen=True)
 class LogPattern:
-    """A named regex pattern for tagging entries at ingest time."""
+    """A named regex pattern for tagging entries at glean time."""

    name: str           # e.g. "device_disconnect", "auth_failure"
    pattern: str        # regex string
--- a/app/services/search.py
+++ b/app/services/search.py
@ -451,9 +451,8 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis
        else:
            suppressed += 1

-    # When did we last ingest anything?
    last_row = conn.execute("SELECT MAX(ingest_time) AS t FROM log_entries").fetchone()
-    last_ingested: str | None = last_row["t"] if last_row else None
+    last_gleaned: str | None = last_row["t"] if last_row else None

    conn.close()

@ -465,7 +464,7 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis
        "source_health": source_health,
        "recent_criticals": recent_criticals,
        "suppressed_criticals": suppressed,
-        "last_ingested": last_ingested,
+        "last_gleaned": last_gleaned,
    }


--- a/app/tasks/ingest_scheduler.py
+++ b/app/tasks/ingest_scheduler.py
@ -1,10 +1,10 @@
-"""Periodic batch ingest scheduler with optional CF submission.
+"""Periodic batch glean scheduler with optional CF submission.

-Runs ingest_sources on a configurable interval (TURNSTONE_INGEST_INTERVAL env var,
+Runs glean_sources on a configurable interval (TURNSTONE_GLEAN_INTERVAL env var,
 default 900s / 15 min). Set to 0 to disable.

 When TURNSTONE_SUBMIT_ENDPOINT is set, pushes pattern-matched entries to a remote
-Turnstone instance (the CF receiving store) after each ingest run.
+Turnstone instance (the CF receiving store) after each glean run.
 """
 from __future__ import annotations

@ -19,7 +19,7 @@ from typing import Any

 import httpx

-from app.ingest.pipeline import ingest_sources
+from app.glean.pipeline import glean_sources

 logger = logging.getLogger(__name__)

@ -96,14 +96,14 @@ async def submit_matched(
    if not entries:
        return {"ok": True, "submitted": 0, "skipped": True}

-    url = f"{submit_endpoint.rstrip('/')}/turnstone/api/ingest/batch"
+    url = f"{submit_endpoint.rstrip('/')}/turnstone/api/glean/batch"
    payload = {"source_host": source_host, "entries": entries}
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(url, json=payload)
            resp.raise_for_status()
        result = resp.json()
-        submitted = result.get("ingested", len(entries))
+        submitted = result.get("gleaned", len(entries))
        _state.last_submitted_at = datetime.now(tz=timezone.utc).isoformat()
        _state.last_submit_count = submitted
        _state.last_submit_error = None
@ -121,10 +121,15 @@ async def run_once(
    pattern_file: Path | None = None,
    submit_endpoint: str | None = None,
    source_host: str = "unknown",
+    force: bool = False,
 ) -> dict[str, Any]:
-    """Ingest all sources once, then submit matched entries if configured."""
+    """Ingest all sources once, then submit matched entries if configured.
+
+    Pass ``force=True`` to bypass fingerprint checks and re-glean all local
+    file sources regardless of whether they appear unchanged.
+    """
    if _lock.locked():
-        return {"ok": False, "error": "ingest already running", "skipped": True}
+        return {"ok": False, "error": "glean already running", "skipped": True}

    async with _lock:
        _state.running = True
@ -133,7 +138,7 @@ async def run_once(
            loop = asyncio.get_running_loop()
            stats: dict[str, int] = await loop.run_in_executor(
                None,
-                lambda: ingest_sources(sources_file, db_path, pattern_file),
+                lambda: glean_sources(sources_file, db_path, pattern_file, force=force),
            )
            duration = (datetime.now(tz=timezone.utc) - started).total_seconds()
            _state.last_run_at = started.isoformat()
@ -141,14 +146,14 @@ async def run_once(
            _state.last_stats = stats
            _state.last_error = None
            _state.run_count += 1
-            logger.info("Batch ingest complete in %.1fs — %s", duration, stats)
+            logger.info("Batch glean complete in %.1fs — %s", duration, stats)
        except Exception as exc:
            duration = (datetime.now(tz=timezone.utc) - started).total_seconds()
            _state.last_run_at = started.isoformat()
            _state.last_duration_s = round(duration, 2)
            _state.last_error = str(exc)
            _state.run_count += 1
-            logger.error("Batch ingest failed: %s", exc)
+            logger.error("Batch glean failed: %s", exc)
            _state.running = False
            return {"ok": False, "error": str(exc)}
        finally:
@ -168,7 +173,7 @@ async def scheduler_loop(
    submit_endpoint: str | None = None,
    source_host: str = "unknown",
 ) -> None:
-    """Run ingest + optional submission every interval_s seconds until cancelled."""
+    """Run glean + optional submission every interval_s seconds until cancelled."""
    logger.info("Ingest scheduler started — interval %ds, sources: %s", interval_s, sources_file)
    if submit_endpoint:
        logger.info("Submission enabled — endpoint: %s", submit_endpoint)
--- a/app/watch/watcher.py
+++ b/app/watch/watcher.py
@ -1,4 +1,4 @@
-"""Live watch: tail active log sources and ingest entries in near-real-time.
+"""Live watch: tail active log sources and glean entries in near-real-time.

 Each WatchSource runs a subprocess (journalctl -f, podman/docker logs -f)
 in a daemon thread and pipes lines through the existing ingestors into SQLite.
@ -18,12 +18,12 @@ from typing import Iterator

 import yaml

-from app.ingest import journald as journald_parser, syslog as syslog_parser
-from app.ingest import plaintext as plaintext_parser, servarr as servarr_parser, plex as plex_parser
-from app.ingest import qbittorrent as qbit_parser, caddy as caddy_parser
-from app.ingest.pipeline import _detect_format
-from app.ingest.base import _compile, load_patterns, now_iso
-from app.ingest.pipeline import _write_batch, _SCHEMA
+from app.glean import journald as journald_parser, syslog as syslog_parser
+from app.glean import plaintext as plaintext_parser, servarr as servarr_parser, plex as plex_parser
+from app.glean import qbittorrent as qbit_parser, caddy as caddy_parser
+from app.glean.pipeline import _detect_format
+from app.glean.base import _compile, load_patterns, now_iso
+from app.glean.pipeline import _write_batch, _SCHEMA
 from app.services.search import build_fts_index
 from app.services.models import RetrievedEntry

@ -85,7 +85,7 @@ class WatchSource:
            "source_id": self.config.source_id,
            "type": self.config.source_type,
            "running": self._thread is not None and self._thread.is_alive(),
-            "entries_ingested": self._entry_count,
+            "entries_gleaned": self._entry_count,
            "last_event": self._last_event,
            "error": self._error,
        }
--- a/docs/tautulli-setup.md
+++ b/docs/tautulli-setup.md
@ -39,7 +39,7 @@ notification agent:
 ## Webhook URL

 ```
-http://<turnstone-host>:8534/turnstone/api/ingest/tautulli
+http://<turnstone-host>:8534/turnstone/api/glean/tautulli
 ```

 Replace `<turnstone-host>` with the hostname or IP of the machine running
--- a/harvester/harvester.py
+++ b/harvester/harvester.py
@ -2,7 +2,7 @@
 """Turnstone Harvester — collect logs and ship them to a Turnstone instance.

 Subcommands:
-    push      Read sources.yaml, POST each log file to Turnstone /api/ingest/upload
+    push      Read sources.yaml, POST each log file to Turnstone /api/glean/upload
    incident  Tag an incident on the remote Turnstone instance

 Usage:
@ -97,8 +97,8 @@ def cmd_push(args: argparse.Namespace) -> int:
        logger.warning("No sources defined in %s", sources_path)
        return 0

-    upload_url = args.url.rstrip("/") + "/turnstone/api/ingest/upload"
-    total_ingested = 0
+    upload_url = args.url.rstrip("/") + "/turnstone/api/glean/upload"
+    total_gleaned = 0
    errors = 0

    for src in sources:
@ -110,9 +110,9 @@ def cmd_push(args: argparse.Namespace) -> int:
        logger.info("Pushing %s (%s) ...", src_id, src_path)
        try:
            result = _post_file(upload_url, src_path, src_id)
-            count = result.get("ingested", 0)
-            total_ingested += count
-            logger.info("  %s: %d entries ingested", src_id, count)
+            count = result.get("gleaned", 0)
+            total_gleaned += count
+            logger.info("  %s: %d entries gleaned", src_id, count)
        except urllib.error.HTTPError as exc:
            logger.error("  %s: HTTP %d — %s", src_id, exc.code, exc.read().decode(errors="replace"))
            errors += 1
@ -120,7 +120,7 @@ def cmd_push(args: argparse.Namespace) -> int:
            logger.error("  %s: %s", src_id, exc)
            errors += 1

-    logger.info("Done. Total ingested: %d entries, errors: %d", total_ingested, errors)
+    logger.info("Done. Total gleaned: %d entries, errors: %d", total_gleaned, errors)
    return 1 if errors else 0


--- a/harvester/sources.example.yaml
+++ b/harvester/sources.example.yaml
@ -46,6 +46,6 @@ sources:
  # Wazuh SIEM — alerts.json on the Wazuh manager
  # Turnstone auto-detects this format; source_id is qualified per agent automatically.
  # For push-based ingestion from Wazuh custom integrations, use:
-  #   POST /api/ingest/wazuh/alert  (single alert JSON body)
+  #   POST /api/glean/wazuh/alert  (single alert JSON body)
  # - id: wazuh
  #   path: /var/ossec/logs/alerts/alerts.json
--- a/manage.sh
+++ b/manage.sh
@ -120,9 +120,9 @@ usage() {
    echo -e "    ${GREEN}dev${NC}                      uvicorn --reload (:${API_PORT}) + Vite HMR (:${VITE_PORT})"
    echo ""
    echo "  Data:"
-    echo -e "    ${GREEN}ingest PATH [DB]${NC}         Ingest a log file or corpus directory"
-    echo -e "    ${GREEN}ingest-plex [HOST]${NC}       Pull Plex log from Cass (or HOST) and ingest"
-    echo -e "    ${GREEN}ingest-qbit [HOST]${NC}       Pull qBittorrent log locally or from HOST via SSH"
+    echo -e "    ${GREEN}glean PATH [DB]${NC}          Glean a log file or corpus directory"
+    echo -e "    ${GREEN}glean-plex [HOST]${NC}        Pull Plex log from Cass (or HOST) and glean"
+    echo -e "    ${GREEN}glean-qbit [HOST]${NC}        Pull qBittorrent log locally or from HOST via SSH"
    echo -e "    ${GREEN}build-fts${NC}                Rebuild the FTS search index"
    echo ""
    echo "  Tests:"
@ -134,8 +134,8 @@ usage() {
    echo "  Examples:"
    echo "    ./manage.sh start"
    echo "    ./manage.sh dev"
-    echo "    ./manage.sh ingest corpus/raw/"
-    echo "    ./manage.sh ingest corpus/raw/ data/custom.db"
+    echo "    ./manage.sh glean corpus/raw/"
+    echo "    ./manage.sh glean corpus/raw/ data/custom.db"
    echo ""
 }

@ -231,15 +231,15 @@ case "$CMD" in
        (cd web && npm run dev -- --port "$VITE_PORT")
        ;;

-    ingest)
+    glean)
        if [[ $# -lt 1 ]]; then
-            error "Usage: ./manage.sh ingest <file_or_dir> [DB_PATH]"
+            error "Usage: ./manage.sh glean <file_or_dir> [DB_PATH]"
        fi
-        info "Ingesting $1 → ${2:-$DB}…"
-        "$PYTHON" scripts/ingest_corpus.py "$1" "${2:-$DB}"
+        info "Gleaning $1 → ${2:-$DB}…"
+        "$PYTHON" scripts/glean_corpus.py "$1" "${2:-$DB}"
        ;;

-    ingest-plex)
+    glean-plex)
        PLEX_HOST="${1:-cass}"
        PLEX_LOG_DIR="/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs"
        TMP_DIR="/tmp/turnstone-plex-$$"
@ -264,16 +264,16 @@ case "$CMD" in
            ssh "$PLEX_HOST" "cat '${remote_path}'" > "$local_path"
        done

-        info "Ingesting ${#REMOTE_LOGS[@]} log file(s) into ${DB}…"
+        info "Gleaning ${#REMOTE_LOGS[@]} log file(s) into ${DB}…"
        for f in "$TMP_DIR"/*.log; do
-            "$PYTHON" scripts/ingest_corpus.py "$f" "$DB"
+            "$PYTHON" scripts/glean_corpus.py "$f" "$DB"
        done
        rm -rf "$TMP_DIR"
        info "Done. Restarting server…"
        exec bash "$0" restart
        ;;

-    ingest-qbit)
+    glean-qbit)
        QBIT_HOST="${1:-}"
        # Default log locations in priority order
        QBIT_LOG_PATHS=(
@ -316,8 +316,8 @@ case "$CMD" in
            info "  ← ${LOCAL_LOG}"
        fi

-        info "Ingesting into ${DB}…"
-        "$PYTHON" scripts/ingest_corpus.py "${TMP_DIR}"/*.log "$DB"
+        info "Gleaning into ${DB}…"
+        "$PYTHON" scripts/glean_corpus.py "${TMP_DIR}"/*.log "$DB"
        rm -rf "$TMP_DIR"
        info "Done. Restarting server…"
        exec bash "$0" restart
--- a/patterns/default.yaml
+++ b/patterns/default.yaml
@ -1,4 +1,4 @@
-# Turnstone pattern library — named regex patterns for log tagging at ingest time.
+# Turnstone pattern library — named regex patterns for log tagging at glean time.
 # Each matched pattern name is stored on RetrievedEntry.matched_patterns and
 # used to boost retrieval relevance for diagnostic queries.
 #
@ -128,6 +128,21 @@ patterns:
    severity: ERROR
    description: NFS mount or RPC timeout

+  - name: service_crash_loop
+    pattern: "(restart counter is at [0-9]|start request repeated too quickly|Restart limit hit)"
+    severity: WARN
+    description: systemd service crash-looping — restart counter incrementing or rate-limit hit; check for DNS resolution failures, missing dependencies, or bad config
+
+  - name: pkg_daemon_restart
+    pattern: "(invoke-rc\\.d|Unit process.*(apt-get|dpkg|preinst).*remains running after unit stopped|Stopped.*service.*openssh|Restarting.*OpenBSD Secure Shell)"
+    severity: WARN
+    description: Package manager restarted a system daemon — active SSH or service sessions may have been interrupted
+
+  - name: ssh_forward_conflict
+    pattern: "(channel_setup_fwd_listener_tcpip: cannot listen to port|error: bind.*Address already in use)"
+    severity: WARN
+    description: SSH port-forward conflict — previous session port still bound; stale sessions accumulating or rapid reconnects
+
  # Add device/service-specific patterns below this line:

  - name: qbit_tracker_error
--- a/patterns/sources-cluster.yaml
+++ b/patterns/sources-cluster.yaml
@ -1,15 +1,15 @@
-# Turnstone log sources — Heimdall cluster ingest.
+# Turnstone log sources — Heimdall cluster glean.
 # Covers: Heimdall (local), Navi, Sif, Cass, Strahl (SSH-collected),
 #         Docker services on Heimdall, and network device syslog.
 #
-# Collected by scripts/collect_cluster_logs.sh before each ingest run.
+# Collected by scripts/collect_cluster_logs.sh before each glean run.
 # All paths are container-side (/data/ = bind-mount of /devl/turnstone-cluster/data/).
 #
-# Cron (collect + ingest, every 15 min):
+# Cron (collect + glean, every 15 min):
 #   */15 * * * * bash /Library/Development/CircuitForge/turnstone/scripts/collect_cluster_logs.sh && \
-#     docker exec turnstone-cluster python scripts/ingest_corpus.py \
+#     docker exec turnstone-cluster python scripts/glean_corpus.py \
 #       --sources /patterns/sources-cluster.yaml --db /data/turnstone.db \
-#       >> /var/log/turnstone-cluster-ingest.log 2>&1
+#       >> /var/log/turnstone-cluster-glean.log 2>&1

 sources:
  # ── Heimdall (local) ─────────────────────────────────────────────────────────
--- a/patterns/sources.yaml
+++ b/patterns/sources.yaml
@ -1,8 +1,8 @@
 # Turnstone log sources — edit this file to add or remove services.
 # NOTE: the system-journal entry requires export_journal.sh to run on the HOST
-# before the container ingest step. See crontab setup instructions in the README.
-# Run ingest manually:
-#   sudo podman exec turnstone python scripts/ingest_corpus.py \
+# before the container glean step. See crontab setup instructions in the README.
+# Run glean manually:
+#   sudo podman exec turnstone python scripts/glean_corpus.py \
 #     --sources /patterns/sources.yaml --db /data/turnstone.db
 #
 # Paths here are container-side paths under the /opt bind mount.
@ -12,7 +12,7 @@
 sources:
  # ── System (exported by export_journal.sh on the host) ───────────────────
  # journal-export.jsonl and dmesg-export.txt are written to /opt/turnstone/data/
-  # by the export script before each ingest run.
+  # by the export script before each glean run.
  - id: system-journal
    path: /data/journal-export.jsonl

@ -73,7 +73,7 @@ sources:

  # ── MQTT / IoT (live — subscribe mode, no path needed) ───────────────────
  # Requires: pip install circuitforge-core[mqtt]
-  # These sources are handled by the live MQTT subscriber task (not batch ingest).
+  # These sources are handled by the live MQTT subscriber task (not batch glean).
  # Uncomment and configure to enable.
  #
  # Meshtastic MQTT bridge (node must have MQTT uplink enabled):
--- a/podman-standalone.sh
+++ b/podman-standalone.sh
@ -2,7 +2,7 @@
 # podman-standalone.sh — Turnstone rootful Podman setup (no Compose)
 #
 # For hosts running system Podman (non-rootless) with systemd.
-# Turnstone is a diagnostic log intelligence layer — ingest service logs,
+# Turnstone is a diagnostic log intelligence layer — glean service logs,
 # search by symptom, and view incidents in a lightweight web UI.
 #
 # ── Prerequisites ────────────────────────────────────────────────────────────
@ -28,18 +28,18 @@
 #   sudo systemctl daemon-reload
 #   sudo systemctl enable --now turnstone
 #
-# ── Ingesting logs ────────────────────────────────────────────────────────────
+# ── Gleaning logs ─────────────────────────────────────────────────────────────
 #   All service logs under /opt are accessible inside the container.
 #   Sources are configured in patterns/sources.yaml (bind-mounted at /patterns/).
 #
-#   To ingest all sources (run manually or via cron):
+#   To glean all sources (run manually or via cron):
 #
-#     sudo podman exec turnstone python scripts/ingest_corpus.py \
+#     sudo podman exec turnstone python scripts/glean_corpus.py \
 #       --sources /patterns/sources.yaml --db /data/turnstone.db
 #
 #   Example cron (every 15 minutes, add to root's crontab with: sudo crontab -e):
-#     */15 * * * * podman exec turnstone python scripts/ingest_corpus.py \
-#       --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-ingest.log 2>&1
+#     */15 * * * * podman exec turnstone python scripts/glean_corpus.py \
+#       --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-glean.log 2>&1
 #
 #   To add a new log source: edit /opt/turnstone/patterns/sources.yaml — no restart needed.
 #
@ -73,7 +73,7 @@ TZ=America/Los_Angeles
 #
 # ── Orchard submission (opt-in telemetry) ────────────────────────────────────
 # Set TURNSTONE_SUBMIT_ENDPOINT to push pattern-matched log entries to a CF
-# receiving instance after each ingest run. Only matched entries are sent —
+# receiving instance after each glean run. Only matched entries are sent —
 # no raw log content. Used to build Avocet training data.
 #
 #   export TURNSTONE_SUBMIT_ENDPOINT=https://harvest.circuitforge.tech/contrib2
@ -142,8 +142,8 @@ echo "Check container health with:"
 echo "  sudo podman ps"
 echo "  sudo podman logs turnstone"
 echo ""
-echo "To ingest all sources now:"
-echo "  sudo podman exec turnstone python scripts/ingest_corpus.py \\"
+echo "To glean all sources now:"
+echo "  sudo podman exec turnstone python scripts/glean_corpus.py \\"
 echo "    --sources /patterns/sources.yaml --db /data/turnstone.db"
 echo ""
 echo "To add a new source: edit /opt/turnstone/patterns/sources.yaml — no restart needed."
--- a/requirements.txt
+++ b/requirements.txt
@ -6,3 +6,4 @@ aiofiles>=23.0.0
 python-multipart>=0.0.9
 dateparser>=1.2.0
 httpx>=0.27.0
+paramiko
--- a/scripts/build_fts_index.py
+++ b/scripts/build_fts_index.py
@ -1,4 +1,4 @@
-"""CLI: build (or update) the FTS5 full-text search index after ingest."""
+"""CLI: build (or update) the FTS5 full-text search index after glean."""
 from __future__ import annotations

 import sys
@ -13,7 +13,7 @@ if __name__ == "__main__":

    if not db_path.exists():
        print(f"ERROR: database not found: {db_path}", file=sys.stderr)
-        print("Run ingest first: python scripts/ingest_corpus.py", file=sys.stderr)
+        print("Run glean first: python scripts/glean_corpus.py", file=sys.stderr)
        sys.exit(1)

    print(f"Building FTS index for {db_path} ...")
--- a/scripts/collect_cluster_logs.sh
+++ b/scripts/collect_cluster_logs.sh
@ -20,7 +20,7 @@ SSH_OPTS="-o ConnectTimeout=5 -o BatchMode=yes -o StrictHostKeyChecking=no"
 PYTHON=/devl/miniconda3/envs/cf/bin/python
 INGEST="${PYTHON} /Library/Development/CircuitForge/turnstone/scripts/ingest_corpus.py"
 DB=/devl/turnstone-cluster/data/turnstone.db
-LOG=/devl/turnstone-cluster/data/ingest.log
+LOG=/devl/turnstone-cluster/data/glean.log

 mkdir -p "${DATA_DIR}"

@ -141,7 +141,7 @@ fi
  # Remote journals (explicit source IDs via YAML)
  ${INGEST} --sources /devl/turnstone-cluster/patterns/sources-cluster.yaml --db "${DB}"

-  # Docker and Plex logs (source IDs derived from filenames by directory ingest)
+  # Docker and Plex logs (source IDs derived from filenames by directory glean)
  for dir in "${HEIMDALL_DIR}" "${NAVI_DIR}" "${STRAHL_DIR}" "${PLEX_DIR}"; do
    [[ -d "${dir}" ]] && ls "${dir}"/*.jsonl "${dir}"/*.log 2>/dev/null | grep -q . && \
      ${INGEST} "${dir}" "${DB}" || true
--- a/scripts/export_journal.sh
+++ b/scripts/export_journal.sh
@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Export recent system messages to files the Turnstone container can ingest.
+# Export recent system messages to files the Turnstone container can glean.
 #
 # Exports:
 #   journal-export.jsonl  — journald (if journalctl is available)
@ -11,11 +11,11 @@
 # Usage (standalone):
 #   sudo bash /opt/turnstone/scripts/export_journal.sh
 #
-# Cron (combined with ingest):
+# Cron (combined with glean):
 #   */15 * * * * bash /opt/turnstone/scripts/export_journal.sh && \
-#     podman exec turnstone python scripts/ingest_corpus.py \
+#     podman exec turnstone python scripts/glean_corpus.py \
 #       --sources /patterns/sources.yaml --db /data/turnstone.db \
-#       >> /var/log/turnstone-ingest.log 2>&1
+#       >> /var/log/turnstone-glean.log 2>&1

 set -euo pipefail

--- a/scripts/ingest_corpus.py
+++ b/scripts/ingest_corpus.py
@ -1,11 +1,11 @@
-"""CLI: ingest a log file or corpus directory into the Turnstone SQLite database.
+"""CLI: glean a log file or corpus directory into the Turnstone SQLite database.

 Usage:
    # Single file or directory (legacy)
-    python scripts/ingest_corpus.py <file_or_dir> [db_path]
+    python scripts/glean_corpus.py <file_or_dir> [db_path]

    # Sources config (multi-service)
-    python scripts/ingest_corpus.py --sources <sources.yaml> [--db <db_path>]
+    python scripts/glean_corpus.py --sources <sources.yaml> [--db <db_path>]
 """
 from __future__ import annotations

@ -17,7 +17,7 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

 sys.path.insert(0, str(Path(__file__).parent.parent))

-from app.ingest.pipeline import ingest, ingest_file, ingest_sources
+from app.glean.pipeline import glean_dir, glean_file, glean_sources


 def _print_stats(stats: dict[str, int]) -> None:
@ -33,33 +33,33 @@ if __name__ == "__main__":
    if not args:
        print(
            "Usage:\n"
-            "  ingest_corpus.py <file_or_dir> [db_path]\n"
-            "  ingest_corpus.py --sources <sources.yaml> [--db <db_path>]",
+            "  glean_corpus.py <file_or_dir> [db_path]\n"
+            "  glean_corpus.py --sources <sources.yaml> [--db <db_path>]",
            file=sys.stderr,
        )
        sys.exit(1)

    if args[0] == "--sources":
        if len(args) < 2:
-            print("Usage: ingest_corpus.py --sources <sources.yaml> [--db <db_path>]", file=sys.stderr)
+            print("Usage: glean_corpus.py --sources <sources.yaml> [--db <db_path>]", file=sys.stderr)
            sys.exit(1)
        sources_file = Path(args[1])
        db_path = Path("data/turnstone.db")
        if "--db" in args:
            db_path = Path(args[args.index("--db") + 1])
        db_path.parent.mkdir(parents=True, exist_ok=True)
-        print(f"Ingesting sources from {sources_file} → {db_path}")
-        stats = ingest_sources(sources_file, db_path)
+        print(f"Gleaning sources from {sources_file} → {db_path}")
+        stats = glean_sources(sources_file, db_path)
        _print_stats(stats)
    else:
        target = Path(args[0])
        db_path = Path(args[1]) if len(args) > 1 else Path("data/turnstone.db")
        db_path.parent.mkdir(parents=True, exist_ok=True)
-        print(f"Ingesting {target} → {db_path}")
+        print(f"Gleaning {target} → {db_path}")
        if target.is_file():
-            stats = ingest_file(target, db_path)
+            stats = glean_file(target, db_path)
        elif target.is_dir():
-            stats = ingest(target, db_path)
+            stats = glean_dir(target, db_path)
        else:
            print(f"Error: {target} is not a file or directory", file=sys.stderr)
            sys.exit(1)
--- a/tests/context/test_doc_upload.py
+++ b/tests/context/test_doc_upload.py
@ -3,7 +3,7 @@ import sqlite3
 import pytest
 from pathlib import Path

-from app.ingest.doc_upload import ingest_upload
+from app.glean.doc_upload import glean_upload
 from app.context.store import list_facts, list_documents
 from app.context.chunker import UnsupportedDocType

@ -40,7 +40,7 @@ services:
    ports:
      - "32400:32400"
 """
-    result = ingest_upload(db, "docker-compose.yml", yaml_bytes)
+    result = glean_upload(db, "docker-compose.yml", yaml_bytes)
    assert result["doc_type"] == "yaml"
    assert result["facts_written"] >= 1
    assert result["chunks_written"] >= 1
@ -53,7 +53,7 @@ services:

 def test_ingest_markdown_no_facts(db):
    md = b"# Runbook\n\nRestart plex with `systemctl restart plex`."
-    result = ingest_upload(db, "runbook.md", md)
+    result = glean_upload(db, "runbook.md", md)
    assert result["doc_type"] == "markdown"
    assert result["facts_written"] == 0
    assert result["chunks_written"] >= 1
@ -61,4 +61,4 @@ def test_ingest_markdown_no_facts(db):

 def test_ingest_raises_on_bad_type(db):
    with pytest.raises(UnsupportedDocType):
-        ingest_upload(db, "report.pdf", b"data")
+        glean_upload(db, "report.pdf", b"data")
--- a/tests/context/test_embedder.py
+++ b/tests/context/test_embedder.py
@ -1,13 +1,17 @@
-"""Tests for app/context/embedder.py — graceful no-op without sqlite-vec."""
+"""Tests for app/context/embedder.py — delegates to app.services.embeddings."""
 import sqlite3
+import struct
 from pathlib import Path
-from unittest.mock import patch
+from unittest.mock import MagicMock, patch
+
+import numpy as np
 import pytest
+
 from app.context import embedder as emb_mod


-@pytest.fixture
-def db(tmp_path):
+@pytest.fixture()
+def db(tmp_path: Path) -> Path:
    db_path = tmp_path / "t.db"
    conn = sqlite3.connect(str(db_path))
    conn.executescript("""
@ -20,34 +24,78 @@ def db(tmp_path):
                REFERENCES context_documents(id) ON DELETE CASCADE,
            chunk_index INTEGER NOT NULL, text TEXT NOT NULL, embedding BLOB
        );
-        INSERT INTO context_documents VALUES ('d1','test.md','markdown','hello',5,'2026-01-01T00:00:00+00:00');
+        INSERT INTO context_documents
+            VALUES ('d1','test.md','markdown','hello',5,'2026-01-01T00:00:00+00:00');
        INSERT INTO context_chunks VALUES ('c1','d1',0,'hello world',NULL);
+        INSERT INTO context_chunks VALUES ('c2','d1',1,'second chunk',NULL);
    """)
    conn.commit()
    conn.close()
    return db_path


-def test_embed_skipped_when_extension_absent(db):
-    with patch.object(emb_mod, "EMBEDDING_AVAILABLE", False):
-        count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434")
+def _mock_embedder(dim: int = 3) -> MagicMock:
+    """Return a mock Embedder that returns constant dim-length vectors.
+
+    ``embed_batch`` always returns the same list of ten zero vectors of
+    length *dim*, whatever it is called with; tests that need specific
+    vectors replace ``embed_batch.return_value`` themselves.
+    """
+    m = MagicMock()
+    m.dim = dim
+    # Ten entries is more than the two chunks the ``db`` fixture inserts.
+    m.embed_batch.return_value = [np.zeros(dim, dtype=np.float32)] * 10
+    return m
+
+
+class TestEmbedChunks:
+    def test_returns_zero_when_no_embedder(self, db: Path) -> None:
+        with patch("app.context.embedder.get_embedder", return_value=None):
+            count = emb_mod.embed_chunks(db, "d1")
        assert count == 0

-
-def test_embed_calls_ollama_when_available(db):
-    import httpx
-
-    class FakeResponse:
-        status_code = 200
-        def raise_for_status(self): pass
-        def json(self): return {"embedding": [0.1, 0.2, 0.3]}
-
-    with patch.object(emb_mod, "EMBEDDING_AVAILABLE", True), \
-         patch("app.context.embedder.httpx.post", return_value=FakeResponse()):
-        count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434")
-    assert count == 1
-    # Verify blob was written
+    def test_returns_zero_when_no_unembedded_chunks(self, db: Path) -> None:
+        # Pre-fill both chunks with a blob
+        blob = struct.pack("3f", 0.1, 0.2, 0.3)
        conn = sqlite3.connect(str(db))
-    row = conn.execute("SELECT embedding FROM context_chunks WHERE id='c1'").fetchone()
+        conn.execute("UPDATE context_chunks SET embedding=?", (blob,))
+        conn.commit()
        conn.close()
-    assert row[0] is not None
+
+        embedder = _mock_embedder()
+        with patch("app.context.embedder.get_embedder", return_value=embedder):
+            count = emb_mod.embed_chunks(db, "d1")
+        assert count == 0
+        embedder.embed_batch.assert_not_called()
+
+    def test_embeds_all_null_chunks(self, db: Path) -> None:
+        embedder = _mock_embedder(dim=3)
+        with patch("app.context.embedder.get_embedder", return_value=embedder):
+            count = emb_mod.embed_chunks(db, "d1")
+        assert count == 2  # two chunks in fixture
+
+    def test_blobs_written_to_db(self, db: Path) -> None:
+        vec = np.array([0.1, 0.2, 0.3], dtype=np.float32)
+        embedder = _mock_embedder(dim=3)
+        embedder.embed_batch.return_value = [vec, vec]
+
+        with patch("app.context.embedder.get_embedder", return_value=embedder):
+            emb_mod.embed_chunks(db, "d1")
+
+        conn = sqlite3.connect(str(db))
+        rows = conn.execute(
+            "SELECT embedding FROM context_chunks WHERE document_id='d1'"
+        ).fetchall()
+        conn.close()
+        for (blob,) in rows:
+            assert blob is not None
+            unpacked = struct.unpack(f"{len(blob)//4}f", blob)
+            assert len(unpacked) == 3
+
+    def test_legacy_llm_url_param_accepted(self, db: Path) -> None:
+        """Ensure backward-compat signature still works (llm_url ignored)."""
+        embedder = _mock_embedder()
+        with patch("app.context.embedder.get_embedder", return_value=embedder):
+            count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434", "nomic-embed-text")
+        assert count == 2
+
+    def test_embed_batch_error_returns_zero(self, db: Path) -> None:
+        embedder = _mock_embedder()
+        embedder.embed_batch.side_effect = RuntimeError("model exploded")
+        with patch("app.context.embedder.get_embedder", return_value=embedder):
+            count = emb_mod.embed_chunks(db, "d1")
+        assert count == 0
--- a/tests/context/test_schema.py
+++ b/tests/context/test_schema.py
@ -2,7 +2,7 @@
 import sqlite3
 from pathlib import Path
 import pytest
-from app.ingest.pipeline import ensure_schema
+from app.glean.pipeline import ensure_schema


 def test_context_tables_created(tmp_path):
--- a/tests/test_blocklist_endpoints.py
+++ b/tests/test_blocklist_endpoints.py
@ -9,7 +9,7 @@ from unittest.mock import MagicMock, patch
@pytest.fixture
 def client(tmp_path):
    from fastapi.testclient import TestClient
-    from app.ingest.pipeline import ensure_schema
+    from app.glean.pipeline import ensure_schema
    import app.rest as rest_module

    db = tmp_path / "test.db"
@ -25,7 +25,7 @@ def client(tmp_path):
@pytest.fixture
 def client_with_candidate(tmp_path):
    from fastapi.testclient import TestClient
-    from app.ingest.pipeline import ensure_schema
+    from app.glean.pipeline import ensure_schema
    import app.rest as rest_module
    import sqlite3, uuid

--- a/tests/test_diagnose_classifier.py
+++ b/tests/test_diagnose_classifier.py
@ -0,0 +1,245 @@
+"""Tests for app/services/diagnose/classifier.py — SeverityClassifier.
+
+All ML-path tests mock ``transformers.pipeline`` so no model weights are
+downloaded during the test suite.
+"""
+from __future__ import annotations
+
+from dataclasses import FrozenInstanceError
+from pathlib import Path
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+import app.services.diagnose.classifier as clf_module
+from app.services.diagnose.classifier import SeverityClassifier
+from app.services.diagnose.models import ClassifiedTimeline, EventCluster, TimelineResult
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+# ---------------------------------------------------------------------------
+
+
+# autouse=True: applied to every test in this module without being requested.
+@pytest.fixture(autouse=True)
+def reset_ml_singleton():
+    """Ensure the module-level ML singleton is cleared before and after each test."""
+    # Setup: clear whatever a previous test may have left behind.
+    clf_module._ml_classifier = None
+    yield
+    # Teardown: clear it again once the test body has finished.
+    clf_module._ml_classifier = None
+
+
+# ---------------------------------------------------------------------------
+# Test-object builders
+# ---------------------------------------------------------------------------
+
+
+def _make_cluster(
+    representative_text: str = "test log",
+    pattern_tags: tuple[str, ...] = (),
+    severity: str = "INFO",
+) -> EventCluster:
+    """Build an ``EventCluster`` for tests with fixed placeholder fields.
+
+    Only the representative text, pattern tags and severity vary. Every
+    cluster gets ``cluster_id="abc123"``, which is the key the tests look up
+    in ``cluster_severities``.
+    """
+    return EventCluster(
+        cluster_id="abc123",
+        entries=("e1",),
+        start_iso=None,
+        end_iso=None,
+        duration_seconds=0.0,
+        source_ids=("src",),
+        pattern_tags=pattern_tags,
+        severity=severity,  # type: ignore[arg-type]
+        burst=False,
+        gap_before_seconds=0.0,
+        representative_text=representative_text,
+    )
+
+
+def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult:
+    """Wrap *clusters* in a ``TimelineResult`` with empty summary fields.
+
+    Counts are zero, the window bounds are ``None`` and ``dominant_sources``
+    is empty, regardless of how many clusters are passed in.
+    """
+    return TimelineResult(
+        clusters=clusters,
+        total_entries=0,
+        window_start=None,
+        window_end=None,
+        gap_count=0,
+        burst_count=0,
+        dominant_sources=(),
+    )
+
+
+def _mock_hf_pipeline(label: str, score: float) -> MagicMock:
+    """Return a mock HF pipeline callable that always yields one result.
+
+    Calling the mock returns a single-element list holding a dict with the
+    given ``label`` and ``score``, whatever arguments it is called with.
+    """
+    pipe = MagicMock()
+    pipe.return_value = [{"label": label, "score": score}]
+    return pipe
+
+
+# ---------------------------------------------------------------------------
+# Path A — ML classification
+# ---------------------------------------------------------------------------
+
+
+class TestMLPath:
+    """ML classification path.
+
+    Each test patches ``_get_ml_classifier`` to return a mock pipeline, so no
+    model is loaded, then checks the severity recorded for cluster "abc123".
+    """
+
+    def test_ml_error_maps_to_error(self) -> None:
+        """ML returning ERROR with score 0.98 → cluster severity ERROR."""
+        pipe = _mock_hf_pipeline("ERROR", 0.98)
+        with patch(
+            "app.services.diagnose.classifier._get_ml_classifier", return_value=pipe
+        ):
+            clf = SeverityClassifier(model_id="fake/model")
+            # ((x),) is a one-element tuple; the inner parentheses are redundant.
+            result = clf.classify(_make_timeline(((_make_cluster("disk error detected")),)))
+
+        assert result.cluster_severities["abc123"] == "ERROR"
+        assert result.classifier_used == "ml"
+        assert result.model_id == "fake/model"
+
+    def test_ml_critical_promotion(self) -> None:
+        """ERROR + score > 0.95 + 'kernel panic' in text → promoted to CRITICAL."""
+        pipe = _mock_hf_pipeline("ERROR", 0.97)
+        with patch(
+            "app.services.diagnose.classifier._get_ml_classifier", return_value=pipe
+        ):
+            clf = SeverityClassifier(model_id="fake/model")
+            result = clf.classify(
+                _make_timeline((_make_cluster("kernel panic: not syncing VFS"),))
+            )
+
+        assert result.cluster_severities["abc123"] == "CRITICAL"
+
+    def test_ml_debug_demotion(self) -> None:
+        """INFO + score < 0.4 → demoted to DEBUG."""
+        pipe = _mock_hf_pipeline("INFO", 0.3)
+        with patch(
+            "app.services.diagnose.classifier._get_ml_classifier", return_value=pipe
+        ):
+            clf = SeverityClassifier(model_id="fake/model")
+            result = clf.classify(_make_timeline((_make_cluster("routine ping"),)))
+
+        assert result.cluster_severities["abc123"] == "DEBUG"
+
+    def test_ml_warning_maps_to_warn(self) -> None:
+        """ML returning WARNING → mapped to WARN."""
+        pipe = _mock_hf_pipeline("WARNING", 0.85)
+        with patch(
+            "app.services.diagnose.classifier._get_ml_classifier", return_value=pipe
+        ):
+            clf = SeverityClassifier(model_id="fake/model")
+            result = clf.classify(_make_timeline((_make_cluster("low disk space"),)))
+
+        assert result.cluster_severities["abc123"] == "WARN"
+
+
+# ---------------------------------------------------------------------------
+# Path B — pattern_tags fallback
+# ---------------------------------------------------------------------------
+
+
+class TestPatternTagsPath:
+    """Pattern-tag path: empty ``model_id`` plus an explicit ``pattern_file``."""
+
+    def test_pattern_tags_resolve_error_severity(self, tmp_path: Path) -> None:
+        """Cluster with pattern_tag 'service_crash_loop' → ERROR from pattern file."""
+        # The test writes its own pattern file, so the severity asserted below
+        # comes from this YAML and not from any other pattern library.
+        pattern_yaml = tmp_path / "default.yaml"
+        pattern_yaml.write_text(
+            "patterns:\n"
+            "  - name: service_crash_loop\n"
+            "    pattern: crash\n"
+            "    severity: ERROR\n"
+            "    description: Service crashed in a loop\n"
+        )
+        clf = SeverityClassifier(model_id="", pattern_file=pattern_yaml)
+        cluster = _make_cluster(
+            representative_text="service crashed",
+            pattern_tags=("service_crash_loop",),
+        )
+        result = clf.classify(_make_timeline((cluster,)))
+
+        assert result.cluster_severities["abc123"] == "ERROR"
+        assert result.classifier_used == "pattern_tags"
+        assert result.model_id is None
+
+
+# ---------------------------------------------------------------------------
+# Path C — regex fallback
+# ---------------------------------------------------------------------------
+
+
+class TestRegexPath:
+    """Path C: classifier built with an empty model_id and no pattern file."""
+
+    def test_regex_detects_error(self) -> None:
+        """No ML, no pattern file: 'ERROR: disk full' → ERROR via regex."""
+        classifier = SeverityClassifier(model_id="")
+        timeline = _make_timeline((_make_cluster("ERROR: disk full"),))
+
+        outcome = classifier.classify(timeline)
+
+        assert outcome.cluster_severities["abc123"] == "ERROR"
+        assert outcome.classifier_used == "regex"
+
+    def test_regex_defaults_to_info_when_no_match(self) -> None:
+        """No severity keyword in text → defaults to INFO."""
+        classifier = SeverityClassifier(model_id="")
+        timeline = _make_timeline((_make_cluster("mount: disk mounted successfully"),))
+
+        outcome = classifier.classify(timeline)
+
+        assert outcome.cluster_severities["abc123"] == "INFO"
+
+
+# ---------------------------------------------------------------------------
+# Fallback behaviour
+# ---------------------------------------------------------------------------
+
+
+class TestImportErrorFallback:
+    """Behaviour when loading the ML classifier raises ImportError."""
+
+    def test_transformers_import_error_falls_back_to_pattern_tags(
+        self, tmp_path: Path
+    ) -> None:
+        """ImportError from transformers → clean fallback to pattern_tags path."""
+        pattern_file = tmp_path / "default.yaml"
+        pattern_file.write_text(
+            "patterns:\n"
+            "  - name: auth_failure\n"
+            "    pattern: auth\n"
+            "    severity: ERROR\n"
+            "    description: Auth failure\n"
+        )
+
+        # An exception instance used as side_effect is raised on every call to the mock.
+        with patch(
+            "app.services.diagnose.classifier._get_ml_classifier",
+            side_effect=ImportError("No module named 'transformers'"),
+        ):
+            classifier = SeverityClassifier(
+                model_id="fake/model", pattern_file=pattern_file
+            )
+            tagged_cluster = _make_cluster(
+                representative_text="auth failed",
+                pattern_tags=("auth_failure",),
+            )
+            outcome = classifier.classify(_make_timeline((tagged_cluster,)))
+
+        # ML was attempted (classifier_used == "ml") but pattern_tags resolved it
+        assert outcome.classifier_used == "ml"
+        assert outcome.cluster_severities["abc123"] == "ERROR"
+
+
+# ---------------------------------------------------------------------------
+# Edge cases
+# ---------------------------------------------------------------------------
+
+
+class TestEdgeCases:
+    """Boundary behaviour: empty input and immutability of the result object."""
+
+    def test_empty_timeline_produces_empty_severities(self) -> None:
+        """TimelineResult with no clusters → empty cluster_severities, no crash."""
+        classifier = SeverityClassifier(model_id="")
+
+        outcome = classifier.classify(_make_timeline())
+
+        assert isinstance(outcome, ClassifiedTimeline)
+        assert outcome.cluster_severities == {}
+        assert outcome.classifier_used == "regex"
+
+    def test_classified_timeline_is_frozen(self) -> None:
+        """ClassifiedTimeline must be frozen (FrozenInstanceError on mutation)."""
+        classifier = SeverityClassifier(model_id="")
+        single_cluster_timeline = _make_timeline((_make_cluster(),))
+
+        outcome = classifier.classify(single_cluster_timeline)
+
+        with pytest.raises(FrozenInstanceError):
+            outcome.classifier_used = "ml"  # type: ignore[misc]
--- a/tests/test_diagnose_hypothesizer.py
+++ b/tests/test_diagnose_hypothesizer.py
@ -0,0 +1,486 @@
+"""Tests for app/services/diagnose/hypothesizer.py — RootCauseHypothesizer.
+
+All tests use mocking; no real LLM calls are made.
+"""
+from __future__ import annotations
+
+import json
+import re
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from app.context.retriever import RetrievedContext
+from app.services.diagnose.hypothesizer import RootCauseHypothesizer
+from app.services.diagnose.models import (
+    ClassifiedTimeline,
+    EventCluster,
+    Hypothesis,
+    TimelineResult,
+)
+
+
+# ---------------------------------------------------------------------------
+# Fixture helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_cluster(
+    cluster_id: str = "c1",
+    representative_text: str = "kernel: oom-killer invoked",
+    severity: str = "ERROR",
+    source_ids: tuple[str, ...] = ("syslog",),
+    pattern_tags: tuple[str, ...] = ("oom",),
+    start_iso: str | None = "2024-01-01T00:00:00+00:00",
+) -> EventCluster:
+    """Build a single-entry, non-burst EventCluster for tests.
+
+    The keyword arguments set the fields tests vary; everything else is fixed
+    (one entry "e1", no end timestamp, 1.0 s duration, no preceding gap).
+    """
+    fixed_fields: dict[str, Any] = {
+        "entries": ("e1",),
+        "end_iso": None,
+        "duration_seconds": 1.0,
+        "burst": False,
+        "gap_before_seconds": 0.0,
+    }
+    return EventCluster(
+        cluster_id=cluster_id,
+        representative_text=representative_text,
+        severity=severity,
+        source_ids=source_ids,
+        pattern_tags=pattern_tags,
+        start_iso=start_iso,
+        **fixed_fields,
+    )
+
+
+def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult:
+    """Wrap *clusters* in a TimelineResult with no window, gaps, bursts or dominant sources.
+
+    ``total_entries`` is set to the number of clusters.
+    """
+    cluster_count = len(clusters)
+    return TimelineResult(
+        clusters=clusters,
+        total_entries=cluster_count,
+        window_start=None,
+        window_end=None,
+        dominant_sources=(),
+        gap_count=0,
+        burst_count=0,
+    )
+
+
+def _make_classified(
+    clusters: tuple[EventCluster, ...] = (),
+    cluster_severities: dict | None = None,
+) -> ClassifiedTimeline:
+    """Build a ClassifiedTimeline (classifier "pattern_tags", no model) around *clusters*.
+
+    When *cluster_severities* is omitted, each cluster's own ``severity`` is used,
+    keyed by its ``cluster_id``.
+    """
+    severities = (
+        {cluster.cluster_id: cluster.severity for cluster in clusters}
+        if cluster_severities is None
+        else cluster_severities
+    )
+    return ClassifiedTimeline(
+        timeline=_make_timeline(clusters),
+        cluster_severities=severities,
+        classifier_used="pattern_tags",
+        model_id=None,
+    )
+
+
+def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext:
+    """Build a RetrievedContext with no facts and the given document chunks.
+
+    Only ``None`` selects the default single runbook chunk. An explicit empty
+    list is passed through unchanged, so a test can model "nothing retrieved"
+    (the previous ``chunks or [...]`` replaced ``[]`` with the default).
+    """
+    if chunks is None:
+        chunks = [{"text": "Memory pressure runbook.", "filename": "runbook.md"}]
+    return RetrievedContext(facts=[], chunks=chunks)
+
+
+def _llm_json_response(items: list[dict[str, Any]]) -> MagicMock:
+    """Build a mock httpx.Response that returns the given list as JSON.
+
+    The mock reports status 200 and its ``json()`` returns a body whose
+    ``choices[0].message.content`` is *items* serialised with ``json.dumps``.
+    """
+    body = {"choices": [{"message": {"content": json.dumps(items)}}]}
+    response = MagicMock(status_code=200)
+    response.json.return_value = body
+    return response
+
+
+# Two well-formed hypothesis items used as the mocked LLM reply in the tests below.
+# Note the payload key is "supporting_clusters"; test_valid_json_response_returns_hypotheses
+# checks that it surfaces on the result as Hypothesis.supporting_cluster_ids.
+_SAMPLE_HYPOTHESES = [
+    {
+        "title": "OOM killer terminated critical process",
+        "description": "The kernel invoked the OOM killer due to memory exhaustion. A process was terminated unexpectedly. This caused service disruption.",
+        "confidence": 0.85,
+        "severity": "CRITICAL",
+        "supporting_clusters": ["c1"],
+    },
+    {
+        "title": "Disk I/O saturation",
+        "description": "High disk I/O latency was detected. Write operations stalled causing log backpressure. Check iostat for device utilisation.",
+        "confidence": 0.6,
+        "severity": "ERROR",
+        "supporting_clusters": ["c2"],
+    },
+]
+
+
+# ---------------------------------------------------------------------------
+# Test 1: Valid JSON response returns correct Hypothesis objects
+# ---------------------------------------------------------------------------
+
+
+def test_valid_json_response_returns_hypotheses():
+    """Valid LLM JSON array produces a list of Hypothesis objects with correct fields."""
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+    llm_reply = _llm_json_response(_SAMPLE_HYPOTHESES)
+
+    with patch("httpx.post", return_value=llm_reply):
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="why is memory failing?",
+            llm_url="http://localhost:11434",
+            llm_model="llama3",
+        )
+
+    assert len(hypotheses) == 2
+    oom = hypotheses[0]
+    assert isinstance(oom, Hypothesis)
+    assert oom.title == "OOM killer terminated critical process"
+    assert oom.confidence == pytest.approx(0.85)
+    assert oom.severity == "CRITICAL"
+    assert oom.supporting_cluster_ids == ("c1",)
+    disk = hypotheses[1]
+    assert disk.title == "Disk I/O saturation"
+    assert disk.severity == "ERROR"
+
+
+# ---------------------------------------------------------------------------
+# Test 2: hypothesis_id is a non-empty UUID string on each result
+# ---------------------------------------------------------------------------
+
+
+# Canonical lowercase UUID v4: 8-4-4-4-12 hex groups, third group starting with "4"
+# (version) and fourth group starting with 8, 9, a or b (variant). Uppercase does not match.
+_UUID_RE = re.compile(
+    r"^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$"
+)
+
+
+def test_hypothesis_id_is_uuid():
+    """Each returned Hypothesis carries a distinct UUID v4 hypothesis_id."""
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+    llm_reply = _llm_json_response(_SAMPLE_HYPOTHESES)
+
+    with patch("httpx.post", return_value=llm_reply):
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url="http://localhost:11434",
+            llm_model="llama3",
+        )
+
+    assert len(hypotheses) == 2
+    seen_ids = []
+    for hypothesis in hypotheses:
+        identifier = hypothesis.hypothesis_id
+        assert identifier, "hypothesis_id must not be empty"
+        assert _UUID_RE.match(identifier), (
+            f"hypothesis_id {identifier!r} is not a UUID v4"
+        )
+        seen_ids.append(identifier)
+    # Each ID must be distinct
+    assert len(set(seen_ids)) == len(seen_ids), "hypothesis_ids must be unique"
+
+
+# ---------------------------------------------------------------------------
+# Test 3: Malformed JSON response returns [] with a logged warning
+# ---------------------------------------------------------------------------
+
+
+def test_malformed_json_returns_empty_and_warns(caplog):
+    """When the LLM returns non-JSON text, hypothesize() returns [] and logs a warning."""
+    import logging
+
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+
+    non_json_reply = MagicMock(status_code=200)
+    non_json_reply.json.return_value = {
+        "choices": [{"message": {"content": "not valid json"}}]
+    }
+
+    with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=non_json_reply):
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url="http://localhost:11434",
+            llm_model="llama3",
+        )
+
+    assert hypotheses == []
+    # Any message containing "invalid JSON" also contains "JSON", so one check covers both.
+    assert any("JSON" in record.message for record in caplog.records)
+
+
+# ---------------------------------------------------------------------------
+# Test 4: Non-list JSON (dict) returns []
+# ---------------------------------------------------------------------------
+
+
+def test_non_list_json_returns_empty(caplog):
+    """When the LLM returns a JSON object instead of an array, hypothesize() returns []."""
+    import logging
+
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+
+    object_reply = MagicMock(status_code=200)
+    object_reply.json.return_value = {
+        "choices": [{"message": {"content": '{"error": "oops"}'}}]
+    }
+
+    with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=object_reply):
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url="http://localhost:11434",
+            llm_model="llama3",
+        )
+
+    assert hypotheses == []
+    # At least one log message must mention the expected shape ("array" or "list").
+    lowered_messages = [record.message.lower() for record in caplog.records]
+    assert any("array" in text or "list" in text for text in lowered_messages)
+
+
+# ---------------------------------------------------------------------------
+# Test 5: Empty clusters returns [] without any LLM call
+# ---------------------------------------------------------------------------
+
+
+def test_empty_clusters_returns_empty_no_llm_call():
+    """ClassifiedTimeline with no clusters returns [] and never calls the LLM."""
+    empty_classified = _make_classified(clusters=())
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+
+    with patch("httpx.post") as http_post:
+        hypotheses = hypothesizer.hypothesize(
+            empty_classified,
+            context,
+            query="test",
+            llm_url="http://localhost:11434",
+            llm_model="llama3",
+        )
+
+    assert hypotheses == []
+    http_post.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# Test 6: No LLM URL returns [] without any HTTP call
+# ---------------------------------------------------------------------------
+
+
+def test_no_llm_url_returns_empty_no_http_call():
+    """When llm_url is None, hypothesize() returns [] immediately with no HTTP requests."""
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+
+    with patch("httpx.post") as http_post:
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url=None,
+            llm_model="llama3",
+        )
+
+    assert hypotheses == []
+    http_post.assert_not_called()
+
+
+def test_empty_llm_url_returns_empty_no_http_call():
+    """When llm_url is empty string, hypothesize() returns [] immediately."""
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+
+    with patch("httpx.post") as http_post:
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url="",
+            llm_model="llama3",
+        )
+
+    assert hypotheses == []
+    http_post.assert_not_called()
+
+
+def test_no_llm_model_returns_empty_no_http_call():
+    """When llm_model is None, hypothesize() returns [] immediately."""
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+
+    with patch("httpx.post") as http_post:
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url="http://localhost:11434",
+            llm_model=None,
+        )
+
+    assert hypotheses == []
+    http_post.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# Test 7: max_hypotheses is respected
+# ---------------------------------------------------------------------------
+
+
+def test_max_hypotheses_respected():
+    """When LLM returns more items than max_hypotheses, only max_hypotheses are returned."""
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer(max_hypotheses=3)
+
+    # Six items that differ only in title: twice the configured cap of three.
+    oversized_reply = []
+    for index in range(6):
+        oversized_reply.append(
+            {
+                "title": f"Hypothesis {index}",
+                "description": "Some description. A second sentence. Third sentence here.",
+                "confidence": 0.5,
+                "severity": "ERROR",
+                "supporting_clusters": ["c1"],
+            }
+        )
+    llm_reply = _llm_json_response(oversized_reply)
+
+    with patch("httpx.post", return_value=llm_reply):
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url="http://localhost:11434",
+            llm_model="llama3",
+        )
+
+    assert len(hypotheses) == 3
+
+
+# ---------------------------------------------------------------------------
+# Test 8: Severity validation — WARNING → WARN, garbage → ERROR
+# ---------------------------------------------------------------------------
+
+
+def test_severity_warning_maps_to_warn():
+    """'WARNING' from the LLM is normalised to 'WARN'."""
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+
+    warning_item = {
+        "title": "A warning severity hypothesis",
+        "description": "Test description. Second sentence. Third.",
+        "confidence": 0.7,
+        "severity": "WARNING",
+        "supporting_clusters": ["c1"],
+    }
+    llm_reply = _llm_json_response([warning_item])
+
+    with patch("httpx.post", return_value=llm_reply):
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url="http://localhost:11434",
+            llm_model="llama3",
+        )
+
+    assert len(hypotheses) == 1
+    assert hypotheses[0].severity == "WARN"
+
+
+def test_severity_garbage_maps_to_error():
+    """An unrecognised severity string from the LLM defaults to 'ERROR'."""
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+
+    garbage_item = {
+        "title": "A garbage severity hypothesis",
+        "description": "Test description. Second sentence. Third.",
+        "confidence": 0.4,
+        "severity": "GARBAGE",
+        "supporting_clusters": ["c1"],
+    }
+    llm_reply = _llm_json_response([garbage_item])
+
+    with patch("httpx.post", return_value=llm_reply):
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url="http://localhost:11434",
+            llm_model="llama3",
+        )
+
+    assert len(hypotheses) == 1
+    assert hypotheses[0].severity == "ERROR"
+
+
+# ---------------------------------------------------------------------------
+# Test 9: Confidence field works with string floats from the LLM
+# ---------------------------------------------------------------------------
+
+
+def test_confidence_string_float_coercion():
+    """A confidence value returned as a string by the LLM is coerced to float via float()."""
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+
+    string_confidence_item = {
+        "title": "String confidence test",
+        "description": "Some description. Second sentence. Third.",
+        "confidence": "0.8",  # LLM returned a string, not a float
+        "severity": "INFO",
+        "supporting_clusters": ["c1"],
+    }
+    llm_reply = _llm_json_response([string_confidence_item])
+
+    with patch("httpx.post", return_value=llm_reply):
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url="http://localhost:11434",
+            llm_model="llama3",
+        )
+
+    assert len(hypotheses) == 1
+    confidence = hypotheses[0].confidence
+    assert isinstance(confidence, float)
+    assert confidence == pytest.approx(0.8)
+
+
+# ---------------------------------------------------------------------------
+# Test 10: Non-numeric confidence string falls back to default 0.5
+# ---------------------------------------------------------------------------
+
+
+def test_non_numeric_confidence_uses_default():
+    """LLM returning 'high' for confidence should not raise and defaults to 0.5."""
+    classified = _make_classified(clusters=(_make_cluster(),))
+    context = _make_ctx()
+    hypothesizer = RootCauseHypothesizer()
+
+    non_numeric_item = {
+        "title": "t",
+        "description": "d",
+        "confidence": "high",
+        "severity": "ERROR",
+        "supporting_clusters": [],
+    }
+    llm_reply = _llm_json_response([non_numeric_item])
+
+    with patch("httpx.post", return_value=llm_reply):
+        hypotheses = hypothesizer.hypothesize(
+            classified,
+            context,
+            query="test",
+            llm_url="http://localhost:11434",
+            llm_model="llama3",
+        )
+
+    assert len(hypotheses) == 1
+    confidence = hypotheses[0].confidence
+    assert isinstance(confidence, float)
+    assert confidence == pytest.approx(0.5)
--- a/tests/test_diagnose_pipeline.py
+++ b/tests/test_diagnose_pipeline.py
@ -0,0 +1,489 @@
+"""Tests for app/services/diagnose/pipeline.py and __init__.py feature flag wiring.
+
+All tests use mocking; no real LLM, ML, or DB calls are made.
+"""
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from app.context.retriever import RetrievedContext
+from app.services.diagnose.models import (
+    ClassifiedTimeline,
+    Hypothesis,
+    RankedHypothesis,
+    TimelineResult,
+)
+from app.services.search import SearchResult
+
+
+# ---------------------------------------------------------------------------
+# Shared helpers
+# ---------------------------------------------------------------------------
+
+def _make_search_result(
+    entry_id: str = "e1",
+    source_id: str = "syslog",
+    timestamp_iso: str | None = "2026-01-01T00:00:00+00:00",
+    severity: str | None = "ERROR",
+    text: str = "ssh: invalid user",
+) -> SearchResult:
+    """Build a SearchResult for tests.
+
+    The keyword arguments set the fields tests vary; the rest are fixed
+    (sequence 1, repeat_count 1, in order, matched pattern "ssh_fail", rank 1.0).
+    """
+    fixed_fields: dict[str, Any] = {
+        "sequence": 1,
+        "repeat_count": 1,
+        "out_of_order": False,
+        "matched_patterns": ["ssh_fail"],
+        "rank": 1.0,
+    }
+    return SearchResult(
+        entry_id=entry_id,
+        source_id=source_id,
+        timestamp_iso=timestamp_iso,
+        severity=severity,
+        text=text,
+        **fixed_fields,
+    )
+
+
+def _make_ctx() -> RetrievedContext:
+    """Return a RetrievedContext with no facts and no chunks."""
+    empty_context = RetrievedContext(facts=[], chunks=[])
+    return empty_context
+
+
+def _make_timeline(n_clusters: int = 2) -> TimelineResult:
+    """Return a fixed one-hour TimelineResult with an empty cluster tuple.
+
+    NOTE: *n_clusters* is accepted but not used; the result always has no
+    clusters, total_entries=5, one burst, no gaps and "syslog" as the only
+    dominant source.
+    """
+    one_hour_window = {
+        "window_start": "2026-01-01T00:00:00+00:00",
+        "window_end": "2026-01-01T01:00:00+00:00",
+    }
+    return TimelineResult(
+        clusters=(),
+        total_entries=5,
+        gap_count=0,
+        burst_count=1,
+        dominant_sources=("syslog",),
+        **one_hour_window,
+    )
+
+
+def _make_classified(timeline: TimelineResult | None = None) -> ClassifiedTimeline:
+    """Wrap *timeline* (or a default one) in a ClassifiedTimeline.
+
+    The result has no per-cluster severities, classifier "regex" and no model id.
+    """
+    chosen_timeline = timeline if timeline else _make_timeline()
+    return ClassifiedTimeline(
+        timeline=chosen_timeline,
+        classifier_used="regex",
+        cluster_severities={},
+        model_id=None,
+    )
+
+
+def _make_hypothesis(
+    hypothesis_id: str = "h1",
+    title: str = "SSH flood",
+    confidence: float = 0.87,
+    severity: str = "CRITICAL",
+) -> Hypothesis:
+    """Build a Hypothesis supported by cluster "c1" with no runbook references."""
+    fixed_fields: dict[str, Any] = {
+        "description": "Multiple failed SSH attempts.",
+        "supporting_cluster_ids": ("c1",),
+        "runbook_refs": (),
+    }
+    return Hypothesis(
+        hypothesis_id=hypothesis_id,
+        title=title,
+        confidence=confidence,
+        severity=severity,
+        **fixed_fields,
+    )
+
+
+def _make_ranked(hypothesis: Hypothesis | None = None, suppress: bool = False) -> RankedHypothesis:
+    """Wrap *hypothesis* (or a default one) in a RankedHypothesis.
+
+    Scores are fixed (novelty 0.95, similarity 0.05). A suppression reason is
+    set only when *suppress* is true.
+    """
+    chosen = hypothesis if hypothesis else _make_hypothesis()
+    if suppress:
+        reason = "similar to known"
+    else:
+        reason = None
+    return RankedHypothesis(
+        hypothesis=chosen,
+        novelty_score=0.95,
+        similarity_to_known=0.05,
+        suppress=suppress,
+        suppression_reason=reason,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Helper: collect all events from run_pipeline
+# ---------------------------------------------------------------------------
+
+async def _collect_pipeline_events(**kwargs) -> list[dict[str, Any]]:
+    """Drain run_pipeline(**kwargs) and return every yielded event, in order."""
+    from app.services.diagnose.pipeline import run_pipeline
+
+    return [event async for event in run_pipeline(**kwargs)]
+
+
+def _default_pipeline_kwargs(entries=None, db_path=None) -> dict:
+    """Return the keyword arguments most tests pass to run_pipeline.
+
+    Only ``None`` triggers a default. An explicit empty ``entries`` list is
+    forwarded as-is; the previous ``entries or [...]`` silently replaced it
+    with one synthetic result, so the empty-input path could not be requested.
+
+    The LLM settings are all ``None`` and the query window is the fixed hour
+    2026-01-01T00:00–01:00 UTC.
+    """
+    if entries is None:
+        entries = [_make_search_result()]
+    if db_path is None:
+        db_path = Path("/tmp/fake.db")
+    return dict(
+        db_path=db_path,
+        entries=entries,
+        ctx=_make_ctx(),
+        query="ssh brute force",
+        since="2026-01-01T00:00:00+00:00",
+        until="2026-01-01T01:00:00+00:00",
+        llm_url=None,
+        llm_model=None,
+        llm_api_key=None,
+    )
+
+
+# ---------------------------------------------------------------------------
+# Mock factories for all 5 stage classes
+# ---------------------------------------------------------------------------
+
+def _mock_all_stages(
+    hypotheses=None,
+    ranked=None,
+    synthesis_text="VERDICT: CRITICAL — SSH flood (87% confidence)",
+):
+    """Return a mapping of patch target → MagicMock standing in for each stage class.
+
+    Each mock replaces the class itself: calling it yields ``return_value``,
+    whose stage method (reconstruct / classify / hypothesize / suppress /
+    synthesize) returns a canned result. *hypotheses* and *ranked* default to
+    one default item each when ``None``.
+    """
+
+    def _stage_class(method_name: str, canned_result: Any) -> MagicMock:
+        # The instance is stage_cls.return_value; configure its one stage method.
+        stage_cls = MagicMock()
+        getattr(stage_cls.return_value, method_name).return_value = canned_result
+        return stage_cls
+
+    timeline = _make_timeline()
+    classified = _make_classified(timeline)
+    if hypotheses is None:
+        hypotheses = [_make_hypothesis()]
+    if ranked is None:
+        ranked = [_make_ranked()]
+
+    return {
+        "app.services.diagnose.pipeline.TimelineReconstructor": _stage_class("reconstruct", timeline),
+        "app.services.diagnose.pipeline.SeverityClassifier": _stage_class("classify", classified),
+        "app.services.diagnose.pipeline.RootCauseHypothesizer": _stage_class("hypothesize", hypotheses),
+        "app.services.diagnose.pipeline.FalsePositiveSuppressor": _stage_class("suppress", ranked),
+        "app.services.diagnose.pipeline.SummarySynthesizer": _stage_class("synthesize", synthesis_text),
+    }
+
+
+# ---------------------------------------------------------------------------
+# 1. Feature flag off: legacy summarize() path runs, not run_pipeline
+# ---------------------------------------------------------------------------
+
+class TestFeatureFlagOff:
+    """diagnose_stream behaviour with MULTI_AGENT_ENABLED patched to False."""
+
+    @pytest.mark.asyncio
+    async def test_legacy_path_when_flag_off(self):
+        """With MULTI_AGENT_ENABLED=False, run_pipeline is never called."""
+        from app.services import diagnose as diagnose_module
+
+        search_hits = [_make_search_result()]
+
+        with (
+            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", False),
+            patch("app.services.diagnose.search", return_value=search_hits),
+            patch("app.services.diagnose.entries_in_window", return_value=[]),
+            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
+            patch("app.services.diagnose.format_context_block", return_value=None),
+            patch("app.services.diagnose.run_pipeline") as pipeline_spy,
+            patch("app.services.diagnose.summarize", return_value=None),
+        ):
+            stream = diagnose_module.diagnose_stream(
+                db_path=Path("/tmp/fake.db"),
+                query="ssh failures",
+                llm_url=None,
+                llm_model=None,
+            )
+            events = [event async for event in stream]
+
+        # run_pipeline must NOT have been called
+        pipeline_spy.assert_not_called()
+
+        # SSE sequence must end with done
+        event_types = [event["type"] for event in events]
+        assert "done" in event_types
+        assert event_types[-1] == "done"
+
+    @pytest.mark.asyncio
+    async def test_legacy_done_event_is_last(self):
+        """Legacy path: done is always the last event."""
+        from app.services import diagnose as diagnose_module
+
+        with (
+            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", False),
+            patch("app.services.diagnose.search", return_value=[]),
+            patch("app.services.diagnose.entries_in_window", return_value=[]),
+            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
+            patch("app.services.diagnose.format_context_block", return_value=None),
+        ):
+            stream = diagnose_module.diagnose_stream(
+                db_path=Path("/tmp/fake.db"),
+                query="check logs",
+            )
+            events = [event async for event in stream]
+
+        assert events[-1] == {"type": "done"}
+
+
+# ---------------------------------------------------------------------------
+# 2. Feature flag on, all stages mocked: verify SSE event sequence
+# ---------------------------------------------------------------------------
+
+class TestFeatureFlagOn:
+    @pytest.mark.asyncio
+    async def test_pipeline_stage_events_in_order(self):
+        """pipeline_stage events must be emitted stages 1→2→3→4 in order."""
+        from contextlib import ExitStack
+
+        stage_mocks = _mock_all_stages()
+        kwargs = _default_pipeline_kwargs()
+        with ExitStack() as stack:
+            # Patch every target the factory returns, so a stage added there
+            # cannot be left unpatched here.
+            for target, replacement in stage_mocks.items():
+                stack.enter_context(patch(target, replacement))
+            events = await _collect_pipeline_events(**kwargs)
+
+        stage_events = [e for e in events if e.get("type") == "pipeline_stage"]
+        stages = [e["stage"] for e in stage_events]
+        assert stages == [1, 2, 3, 4]
+
+    @pytest.mark.asyncio
+    async def test_hypotheses_event_after_stage4(self):
+        """hypotheses event must appear after pipeline_stage stage=4."""
+        from contextlib import ExitStack
+
+        stage_mocks = _mock_all_stages()
+        kwargs = _default_pipeline_kwargs()
+        with ExitStack() as stack:
+            # Patch every target the factory returns, so a stage added there
+            # cannot be left unpatched here.
+            for target, replacement in stage_mocks.items():
+                stack.enter_context(patch(target, replacement))
+            events = await _collect_pipeline_events(**kwargs)
+
+        # Position of the stage-4 marker and of the first hypotheses event.
+        stage4_idx = next(
+            i for i, e in enumerate(events)
+            if e.get("type") == "pipeline_stage" and e.get("stage") == 4
+        )
+        hyp_idx = next(i for i, e in enumerate(events) if e.get("type") == "hypotheses")
+        assert hyp_idx > stage4_idx
+
+    @pytest.mark.asyncio
+    async def test_reasoning_event_emitted(self):
+        """reasoning event must be present when synthesizer returns text."""
+        from contextlib import ExitStack
+
+        stage_mocks = _mock_all_stages(synthesis_text="VERDICT: CRITICAL — SSH flood")
+        kwargs = _default_pipeline_kwargs()
+        with ExitStack() as stack:
+            # Patch every target the factory returns, so a stage added there
+            # cannot be left unpatched here.
+            for target, replacement in stage_mocks.items():
+                stack.enter_context(patch(target, replacement))
+            events = await _collect_pipeline_events(**kwargs)
+
+        reasoning_events = [e for e in events if e.get("type") == "reasoning"]
+        assert len(reasoning_events) == 1
+        assert "VERDICT" in reasoning_events[0]["text"]
+
+    @pytest.mark.asyncio
+    async def test_done_event_is_last(self):
+        """The final event of the pipeline sequence is exactly ``{"type": "done"}``."""
+        base = "app.services.diagnose.pipeline"
+        stage_mocks = _mock_all_stages()
+        pipeline_kwargs = _default_pipeline_kwargs()
+
+        # Swap every stage class used by the pipeline module for its mock.
+        with (
+            patch(f"{base}.TimelineReconstructor", stage_mocks[f"{base}.TimelineReconstructor"]),
+            patch(f"{base}.SeverityClassifier", stage_mocks[f"{base}.SeverityClassifier"]),
+            patch(f"{base}.RootCauseHypothesizer", stage_mocks[f"{base}.RootCauseHypothesizer"]),
+            patch(f"{base}.FalsePositiveSuppressor", stage_mocks[f"{base}.FalsePositiveSuppressor"]),
+            patch(f"{base}.SummarySynthesizer", stage_mocks[f"{base}.SummarySynthesizer"]),
+        ):
+            events = await _collect_pipeline_events(**pipeline_kwargs)
+
+        last_event = events[-1]
+        assert last_event == {"type": "done"}
+
+    @pytest.mark.asyncio
+    async def test_pipeline_wired_from_diagnose_stream(self):
+        """With MULTI_AGENT_ENABLED patched on, diagnose_stream goes through run_pipeline."""
+        from app.services import diagnose as diagnose_module
+
+        async def fake_pipeline(**kwargs):
+            # Stand-in for run_pipeline: one status, one stage event, then done.
+            yield {"type": "status", "message": "Building timeline…"}
+            yield {"type": "pipeline_stage", "stage": 1, "name": "timeline", "message": "Built 1 clusters, 0 bursts"}
+            yield {"type": "done"}
+
+        search_hits = [_make_search_result()]
+
+        with (
+            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", True),
+            patch("app.services.diagnose.search", return_value=search_hits),
+            patch("app.services.diagnose.entries_in_window", return_value=[]),
+            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
+            patch("app.services.diagnose.format_context_block", return_value=None),
+            patch("app.services.diagnose.run_pipeline", side_effect=fake_pipeline),
+        ):
+            stream = diagnose_module.diagnose_stream(
+                db_path=Path("/tmp/fake.db"),
+                query="ssh failures",
+            )
+            events = [event async for event in stream]
+
+        event_types = [evt["type"] for evt in events]
+        assert "pipeline_stage" in event_types
+        assert event_types[-1] == "done"
+        # Exactly one done event: it came from the pipeline, so the legacy
+        # summarize() path must not have emitted a second one.
+        assert event_types.count("done") == 1
+
+
+# ---------------------------------------------------------------------------
+# 3. Empty entries: pipeline completes with done
+# ---------------------------------------------------------------------------
+
+class TestEmptyEntries:
+    """Pipeline behaviour for ``entries=[]`` with the five stage classes patched out."""
+
+    @pytest.mark.asyncio
+    async def test_empty_entries_pipeline_completes(self):
+        """An empty entry list still produces an event stream that ends in done."""
+        base = "app.services.diagnose.pipeline"
+        stage_mocks = _mock_all_stages(hypotheses=[], ranked=[])
+        pipeline_kwargs = _default_pipeline_kwargs(entries=[])
+
+        with (
+            patch(f"{base}.TimelineReconstructor", stage_mocks[f"{base}.TimelineReconstructor"]),
+            patch(f"{base}.SeverityClassifier", stage_mocks[f"{base}.SeverityClassifier"]),
+            patch(f"{base}.RootCauseHypothesizer", stage_mocks[f"{base}.RootCauseHypothesizer"]),
+            patch(f"{base}.FalsePositiveSuppressor", stage_mocks[f"{base}.FalsePositiveSuppressor"]),
+            patch(f"{base}.SummarySynthesizer", stage_mocks[f"{base}.SummarySynthesizer"]),
+        ):
+            events = await _collect_pipeline_events(**pipeline_kwargs)
+
+        event_types = [evt["type"] for evt in events]
+        assert "done" in event_types
+        assert event_types[-1] == "done"
+
+    @pytest.mark.asyncio
+    async def test_empty_entries_all_stage_events_present(self):
+        """An empty entry list still yields four pipeline_stage events."""
+        base = "app.services.diagnose.pipeline"
+        stage_mocks = _mock_all_stages(hypotheses=[], ranked=[])
+        pipeline_kwargs = _default_pipeline_kwargs(entries=[])
+
+        with (
+            patch(f"{base}.TimelineReconstructor", stage_mocks[f"{base}.TimelineReconstructor"]),
+            patch(f"{base}.SeverityClassifier", stage_mocks[f"{base}.SeverityClassifier"]),
+            patch(f"{base}.RootCauseHypothesizer", stage_mocks[f"{base}.RootCauseHypothesizer"]),
+            patch(f"{base}.FalsePositiveSuppressor", stage_mocks[f"{base}.FalsePositiveSuppressor"]),
+            patch(f"{base}.SummarySynthesizer", stage_mocks[f"{base}.SummarySynthesizer"]),
+        ):
+            events = await _collect_pipeline_events(**pipeline_kwargs)
+
+        stage_events = [evt for evt in events if evt.get("type") == "pipeline_stage"]
+        assert len(stage_events) == 4
+
+
+# ---------------------------------------------------------------------------
+# 4. No LLM: Stage 3 and Stage 5 return empty/fallback; done still emitted
+# ---------------------------------------------------------------------------
+
+class TestNoLLM:
+    """Pipeline behaviour when the mocked stages return empty or fallback results."""
+
+    @pytest.mark.asyncio
+    async def test_no_llm_pipeline_completes_with_done(self):
+        """With no hypotheses and a fallback verdict, the stream still ends in done."""
+        base = "app.services.diagnose.pipeline"
+        stage_mocks = _mock_all_stages(
+            hypotheses=[],
+            ranked=[],
+            synthesis_text="VERDICT: UNKNOWN — no hypotheses generated",
+        )
+        # The default kwargs are used unmodified; per the original note they
+        # already carry llm_url and llm_model as None.
+        pipeline_kwargs = _default_pipeline_kwargs()
+
+        with (
+            patch(f"{base}.TimelineReconstructor", stage_mocks[f"{base}.TimelineReconstructor"]),
+            patch(f"{base}.SeverityClassifier", stage_mocks[f"{base}.SeverityClassifier"]),
+            patch(f"{base}.RootCauseHypothesizer", stage_mocks[f"{base}.RootCauseHypothesizer"]),
+            patch(f"{base}.FalsePositiveSuppressor", stage_mocks[f"{base}.FalsePositiveSuppressor"]),
+            patch(f"{base}.SummarySynthesizer", stage_mocks[f"{base}.SummarySynthesizer"]),
+        ):
+            events = await _collect_pipeline_events(**pipeline_kwargs)
+
+        last_event = events[-1]
+        assert last_event == {"type": "done"}
+
+    @pytest.mark.asyncio
+    async def test_no_llm_no_reasoning_event_when_synthesis_empty(self):
+        """An empty synthesis string produces no reasoning event at all."""
+        base = "app.services.diagnose.pipeline"
+        stage_mocks = _mock_all_stages(synthesis_text="")
+        pipeline_kwargs = _default_pipeline_kwargs()
+
+        with (
+            patch(f"{base}.TimelineReconstructor", stage_mocks[f"{base}.TimelineReconstructor"]),
+            patch(f"{base}.SeverityClassifier", stage_mocks[f"{base}.SeverityClassifier"]),
+            patch(f"{base}.RootCauseHypothesizer", stage_mocks[f"{base}.RootCauseHypothesizer"]),
+            patch(f"{base}.FalsePositiveSuppressor", stage_mocks[f"{base}.FalsePositiveSuppressor"]),
+            patch(f"{base}.SummarySynthesizer", stage_mocks[f"{base}.SummarySynthesizer"]),
+        ):
+            events = await _collect_pipeline_events(**pipeline_kwargs)
+
+        reasoning = [evt for evt in events if evt.get("type") == "reasoning"]
+        assert len(reasoning) == 0
+
+
+# ---------------------------------------------------------------------------
+# 5. Stage 1 cluster count in pipeline_stage message
+# ---------------------------------------------------------------------------
+
+class TestStage1Message:
+    """Checks on the pipeline_stage event emitted for stage 1 (timeline).
+
+    Both tests look the event up with ``next(..., None)`` and assert it was
+    found, so a missing event fails with a clear message instead of a
+    StopIteration escaping from the coroutine.
+    """
+
+    @pytest.mark.asyncio
+    async def test_stage1_message_contains_cluster_count(self):
+        """pipeline_stage stage=1 message must report cluster count."""
+        timeline = TimelineResult(
+            clusters=tuple(),
+            total_entries=10,
+            window_start=None,
+            window_end=None,
+            gap_count=0,
+            burst_count=3,
+            dominant_sources=("syslog",),
+        )
+        classified = _make_classified(timeline)
+
+        # Each mock stands in for a stage class: calling it returns an instance
+        # whose single stage method returns the canned value.
+        mock_reconstructor = MagicMock()
+        mock_reconstructor.return_value.reconstruct.return_value = timeline
+        mock_classifier = MagicMock()
+        mock_classifier.return_value.classify.return_value = classified
+        mock_hypothesizer = MagicMock()
+        mock_hypothesizer.return_value.hypothesize.return_value = []
+        mock_suppressor = MagicMock()
+        mock_suppressor.return_value.suppress.return_value = []
+        mock_synthesizer = MagicMock()
+        mock_synthesizer.return_value.synthesize.return_value = "VERDICT: INFO — nothing found"
+
+        kwargs = _default_pipeline_kwargs()
+        with (
+            patch("app.services.diagnose.pipeline.TimelineReconstructor", mock_reconstructor),
+            patch("app.services.diagnose.pipeline.SeverityClassifier", mock_classifier),
+            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mock_hypothesizer),
+            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mock_suppressor),
+            patch("app.services.diagnose.pipeline.SummarySynthesizer", mock_synthesizer),
+        ):
+            events = await _collect_pipeline_events(**kwargs)
+
+        stage1 = next(
+            (e for e in events if e.get("type") == "pipeline_stage" and e.get("stage") == 1),
+            None,
+        )
+        assert stage1 is not None, "no pipeline_stage event with stage=1 was emitted"
+        # 0 clusters (empty tuple), 3 bursts
+        assert "0" in stage1["message"]  # cluster count
+        assert "3" in stage1["message"]  # burst count
+
+    @pytest.mark.asyncio
+    async def test_stage1_name_is_timeline(self):
+        """pipeline_stage stage=1 must have name='timeline'."""
+        mocks = _mock_all_stages()
+        kwargs = _default_pipeline_kwargs()
+
+        with (
+            patch("app.services.diagnose.pipeline.TimelineReconstructor", mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
+            patch("app.services.diagnose.pipeline.SeverityClassifier", mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
+            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
+            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
+            patch("app.services.diagnose.pipeline.SummarySynthesizer", mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
+        ):
+            events = await _collect_pipeline_events(**kwargs)
+
+        stage1 = next(
+            (e for e in events if e.get("type") == "pipeline_stage" and e.get("stage") == 1),
+            None,
+        )
+        assert stage1 is not None, "no pipeline_stage event with stage=1 was emitted"
+        assert stage1["name"] == "timeline"
--- a/tests/test_diagnose_suppressor.py
+++ b/tests/test_diagnose_suppressor.py
@ -0,0 +1,432 @@
+"""Tests for app/services/diagnose/suppressor.py — FalsePositiveSuppressor.
+
+All tests use mocking; no real model downloads are made.
+"""
+from __future__ import annotations
+
+import math
+import sqlite3
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+import app.services.diagnose.suppressor as sup_module
+from app.services.diagnose.models import Hypothesis, RankedHypothesis
+from app.services.diagnose.suppressor import FalsePositiveSuppressor
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_hypothesis(
+    title: str = "Test",
+    description: str = "A test hypothesis.",
+    confidence: float = 0.8,
+    severity: str = "ERROR",
+) -> Hypothesis:
+    """Build a Hypothesis with fixed id ``"test-id"`` and no cluster ids or runbook refs."""
+    hyp = Hypothesis(
+        hypothesis_id="test-id",
+        supporting_cluster_ids=(),
+        runbook_refs=(),
+        title=title,
+        description=description,
+        confidence=confidence,
+        severity=severity,  # type: ignore[arg-type]
+    )
+    return hyp
+
+
+def _make_db_with_incidents(incidents: list[tuple[str, str]], db_path: Path) -> Path:
+    """Create a SQLite database at *db_path* with one ``incidents`` row per pair.
+
+    Args:
+        incidents: ``(label, notes)`` pairs. Every row is stored with the same
+            fixed ``ended_at`` timestamp. An empty list creates the table
+            with zero rows.
+        db_path: Location of the database file to create.
+
+    Returns:
+        ``db_path``, so the call can be used inline.
+    """
+    # sqlite3's connection context manager only commits or rolls back; it never
+    # closes the connection, so close it explicitly here.
+    conn = sqlite3.connect(str(db_path))
+    try:
+        conn.execute(
+            "CREATE TABLE incidents "
+            "(id INTEGER PRIMARY KEY, label TEXT, notes TEXT, ended_at TEXT)"
+        )
+        conn.executemany(
+            "INSERT INTO incidents (label, notes, ended_at) VALUES (?, ?, ?)",
+            [(label, notes, "2024-01-01T00:00:00") for label, notes in incidents],
+        )
+        conn.commit()
+    finally:
+        conn.close()
+    return db_path
+
+
+def _make_empty_db(db_path: Path) -> Path:
+    """Create a SQLite DB at *db_path* that has no ``incidents`` table.
+
+    Only a table named ``unrelated`` is created. Returns ``db_path``.
+    """
+    # The sqlite3 connection context manager does not close the connection,
+    # so close it explicitly.
+    conn = sqlite3.connect(str(db_path))
+    try:
+        conn.execute("CREATE TABLE unrelated (id INTEGER PRIMARY KEY)")
+        conn.commit()
+    finally:
+        conn.close()
+    return db_path
+
+
+def _make_mock_embedder(
+    embed_return: list[float] | None = None,
+    embed_batch_return: list[list[float]] | None = None,
+) -> MagicMock:
+    """Return a MagicMock embedder whose results expose ``.tolist()``.
+
+    ``embed`` returns one wrapped vector and ``embed_batch`` a list of wrapped
+    vectors. An argument left as ``None`` falls back to a 384-element unit
+    vector along the first dimension (a one-item batch for ``embed_batch``).
+    """
+    unit_vec = [1.0] + [0.0] * 383
+
+    def _as_array_like(values: list[float]) -> MagicMock:
+        # Stand-in for an array object: only ``.tolist()`` is configured.
+        wrapped = MagicMock()
+        wrapped.tolist.return_value = values
+        return wrapped
+
+    single = unit_vec if embed_return is None else embed_return
+    batch = [unit_vec] if embed_batch_return is None else embed_batch_return
+
+    mock = MagicMock()
+    mock.embed.return_value = _as_array_like(single)
+    mock.embed_batch.return_value = list(map(_as_array_like, batch))
+    return mock
+
+
+# ---------------------------------------------------------------------------
+# Autouse fixture: reset module-level cache between tests
+# ---------------------------------------------------------------------------
+
+@pytest.fixture(autouse=True)
+def reset_suppressor_cache():
+    """Clear ``sup_module._corpus_cache`` before and after every test in this module."""
+    # The attribute is looked up on the module at each call rather than bound
+    # once, so the post-test clear hits whatever object is there at that point.
+    sup_module._corpus_cache.clear()
+    yield
+    sup_module._corpus_cache.clear()
+
+
+# ---------------------------------------------------------------------------
+# Test 1: No model configured — passthrough, ranked by confidence
+# ---------------------------------------------------------------------------
+
+def test_no_model_passthrough_ranked_by_confidence(tmp_path):
+    """With model_id='' every hypothesis passes through unsuppressed, ordered by confidence desc."""
+    hypotheses = [
+        _make_hypothesis(title="Low", confidence=0.3),
+        _make_hypothesis(title="High", confidence=0.9),
+        _make_hypothesis(title="Mid", confidence=0.6),
+    ]
+
+    db_path = tmp_path / "turnstone.db"
+    ranked = FalsePositiveSuppressor(model_id="").suppress(hypotheses, db_path)
+
+    assert len(ranked) == 3
+    for item in ranked:
+        assert isinstance(item, RankedHypothesis)
+        assert item.novelty_score == pytest.approx(1.0)
+        assert item.similarity_to_known == pytest.approx(0.0)
+        assert item.suppress is False
+        assert item.suppression_reason is None
+
+    # Output order must be confidence, highest first.
+    observed = [item.hypothesis.confidence for item in ranked]
+    assert observed == sorted(observed, reverse=True)
+
+
+# ---------------------------------------------------------------------------
+# Test 2: High similarity → suppressed
+# ---------------------------------------------------------------------------
+
+def test_high_similarity_suppresses_hypothesis(tmp_path):
+    """A hypothesis whose embedding equals the corpus embedding is suppressed."""
+    unit = [1.0] + [0.0] * 383
+    # Hypothesis and corpus point the same way → cosine similarity of 1.0.
+    embedder = _make_mock_embedder(
+        embed_return=list(unit),
+        embed_batch_return=[list(unit)],
+    )
+
+    db_path = _make_db_with_incidents(
+        [("OOM killer", "Memory pressure caused OOM kill")],
+        tmp_path / "turnstone.db",
+    )
+    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
+
+    with patch.object(suppressor, "_load_embedder", return_value=embedder):
+        results = suppressor.suppress([_make_hypothesis()], db_path)
+
+    assert len(results) == 1
+    ranked = results[0]
+    assert ranked.suppress is True
+    assert ranked.suppression_reason is not None
+    assert "Similar to resolved incident" in ranked.suppression_reason
+    assert ranked.similarity_to_known == pytest.approx(1.0, abs=0.01)
+    assert ranked.novelty_score == pytest.approx(0.0, abs=0.01)
+
+
+# ---------------------------------------------------------------------------
+# Test 3: Low similarity → not suppressed
+# ---------------------------------------------------------------------------
+
+def test_low_similarity_does_not_suppress(tmp_path):
+    """A hypothesis embedding orthogonal to the corpus embedding is not suppressed."""
+    first_axis = [1.0] + [0.0] * 383
+    second_axis = [0.0, 1.0] + [0.0] * 382  # orthogonal to first_axis → similarity 0.0
+
+    embedder = _make_mock_embedder(
+        embed_return=first_axis,
+        embed_batch_return=[second_axis],
+    )
+
+    db_path = _make_db_with_incidents(
+        [("Disk I/O", "Storage saturation caused latency")],
+        tmp_path / "turnstone.db",
+    )
+    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
+
+    with patch.object(suppressor, "_load_embedder", return_value=embedder):
+        results = suppressor.suppress([_make_hypothesis()], db_path)
+
+    assert len(results) == 1
+    ranked = results[0]
+    assert ranked.suppress is False
+    assert ranked.suppression_reason is None
+    assert ranked.similarity_to_known == pytest.approx(0.0, abs=0.01)
+    assert ranked.novelty_score == pytest.approx(1.0, abs=0.01)
+
+
+# ---------------------------------------------------------------------------
+# Test 3b: Borderline similarity — exactly at threshold vs. just below
+# ---------------------------------------------------------------------------
+
+def test_similarity_threshold_boundary(tmp_path):
+    """similarity == threshold is suppressed; similarity just below threshold is not.
+
+    This test locks down the boundary semantics: suppress when max_sim >= threshold,
+    not when novelty_score < threshold (the inverted form that was the original bug).
+    With threshold=0.85:
+      - similarity=0.85 → suppressed (at boundary, inclusive)
+      - similarity=0.84 → NOT suppressed (just below)
+    """
+    db_path = _make_db_with_incidents(
+        [("Disk I/O", "Storage saturation caused latency")],
+        tmp_path / "turnstone.db",
+    )
+
+    # Corpus unit vector along first axis
+    corpus_vec = [1.0] + [0.0] * 383
+
+    for sim_value, expected_suppress in [(0.85, True), (0.84, False)]:
+        # Build a unit-length hypothesis embedding at the target similarity:
+        # [sim, sqrt(1 - sim^2), 0, ...] has cosine similarity ≈ sim to corpus_vec.
+        # NOTE(review): the 0.85 case sits exactly on the threshold, so it relies
+        # on the suppressor's float arithmetic not rounding below 0.85 — confirm.
+        hyp_vec = [sim_value, math.sqrt(max(0.0, 1.0 - sim_value ** 2))] + [0.0] * 382
+
+        mock_embedder = _make_mock_embedder(
+            embed_return=hyp_vec,
+            embed_batch_return=[corpus_vec],
+        )
+
+        suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
+
+        with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
+            results = suppressor.suppress([_make_hypothesis()], db_path)
+
+        assert len(results) == 1
+        result = results[0]
+        assert result.suppress is expected_suppress, (
+            f"similarity={sim_value:.2f}: expected suppress={expected_suppress}, "
+            f"got suppress={result.suppress} (similarity_to_known={result.similarity_to_known:.4f})"
+        )
+
+
+# ---------------------------------------------------------------------------
+# Test 4: Empty hypotheses list returns []
+# ---------------------------------------------------------------------------
+
+def test_empty_hypotheses_returns_empty(tmp_path):
+    """An empty hypothesis list comes back as ``[]``.
+
+    No embedder is patched in and the database file is never created here.
+    """
+    suppressor = FalsePositiveSuppressor(model_id="test-model")
+    missing_db = tmp_path / "turnstone.db"
+    assert suppressor.suppress([], missing_db) == []
+
+
+# ---------------------------------------------------------------------------
+# Test 5: Ranking by novelty_score * confidence
+# ---------------------------------------------------------------------------
+
+def test_ranking_by_novelty_times_confidence(tmp_path):
+    """Results are sorted by novelty_score * confidence descending."""
+    # Intended scores, taking novelty = 1 - similarity to the single corpus vector:
+    #   A: similarity 0.1 → novelty 0.9, confidence 0.5 → score 0.45
+    #   B: similarity 0.5 → novelty 0.5, confidence 0.9 → score 0.45 (ties with A)
+    #   C: similarity 0.2 → novelty 0.8, confidence 0.9 → score 0.72 (highest)
+    # C must therefore rank first; the relative order of A and B is not asserted.
+    #
+    # Each hypothesis vector is [s, sin(acos(s)), 0, ...]: unit length, with
+    # cosine similarity s against the corpus vector [1, 0, 0, ...].
+
+    # Corpus: single vector [1, 0, 0, ...]
+    corpus_vec = [1.0] + [0.0] * 383
+
+    # H_A: similarity = 0.1 → novelty = 0.9, confidence = 0.5 → score = 0.45
+    angle_a = math.acos(0.1)
+    vec_a = [0.1, math.sin(angle_a)] + [0.0] * 382
+
+    # H_B: similarity = 0.5 → novelty = 0.5, confidence = 0.9 → score = 0.45
+    angle_b = math.acos(0.5)
+    vec_b = [0.5, math.sin(angle_b)] + [0.0] * 382
+
+    # H_C: similarity = 0.2 → novelty = 0.8, confidence = 0.9 → score = 0.72 (highest)
+    angle_c = math.acos(0.2)
+    vec_c = [0.2, math.sin(angle_c)] + [0.0] * 382
+
+    h_a = _make_hypothesis(title="A", confidence=0.5)
+    h_b = _make_hypothesis(title="B", confidence=0.9)
+    h_c = _make_hypothesis(title="C", confidence=0.9)
+
+    # Mutable one-element list so the nested function can advance the counter.
+    call_count = [0]
+    vecs_in_order = [vec_a, vec_b, vec_c]
+
+    # Hands out vec_a, vec_b, vec_c by call order (wrapping around after three).
+    # NOTE(review): this assumes suppress() calls embed() once per hypothesis,
+    # in list order — confirm against suppressor.py.
+    def side_effect_embed(text: str) -> MagicMock:
+        m = MagicMock()
+        m.tolist.return_value = vecs_in_order[call_count[0] % len(vecs_in_order)]
+        call_count[0] += 1
+        return m
+
+    mock_embedder = MagicMock()
+    batch_m = MagicMock()
+    batch_m.tolist.return_value = corpus_vec
+    mock_embedder.embed_batch.return_value = [batch_m]
+    mock_embedder.embed.side_effect = side_effect_embed
+
+    db_path = _make_db_with_incidents(
+        [("OOM", "Memory exhaustion")],
+        tmp_path / "turnstone.db",
+    )
+    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
+
+    with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
+        results = suppressor.suppress([h_a, h_b, h_c], db_path)
+
+    assert len(results) == 3
+    titles = [r.hypothesis.title for r in results]
+    # H_C should be first (highest novelty*confidence score)
+    assert titles[0] == "C", f"Expected C first, got {titles}"
+    # Verify sort is descending by novelty*confidence
+    scores = [r.novelty_score * r.hypothesis.confidence for r in results]
+    assert scores == sorted(scores, reverse=True)
+
+
+# ---------------------------------------------------------------------------
+# Test 6: DB with no resolved incidents → novelty_score=1.0
+# ---------------------------------------------------------------------------
+
+def test_no_resolved_incidents_in_db_passthrough(tmp_path):
+    """An incidents table with zero rows leaves the hypothesis fully novel and unsuppressed."""
+    # The table exists but holds no rows.
+    db_path = _make_db_with_incidents([], tmp_path / "turnstone.db")
+    embedder = _make_mock_embedder()
+    suppressor = FalsePositiveSuppressor(model_id="test-model")
+
+    with patch.object(suppressor, "_load_embedder", return_value=embedder):
+        results = suppressor.suppress([_make_hypothesis()], db_path)
+
+    assert len(results) == 1
+    only = results[0]
+    assert only.novelty_score == pytest.approx(1.0)
+    assert only.suppress is False
+    # With nothing to compare against, the corpus must never be embedded.
+    embedder.embed_batch.assert_not_called()
+
+
+# ---------------------------------------------------------------------------
+# Test 7: DB query failure → graceful fallback, no crash
+# ---------------------------------------------------------------------------
+
+def test_db_query_failure_graceful_fallback(tmp_path):
+    """A database lacking the incidents table yields a passthrough result, not an exception."""
+    # This DB only has an 'unrelated' table, so there is no 'incidents' to query.
+    db_path = _make_empty_db(tmp_path / "turnstone.db")
+    embedder = _make_mock_embedder()
+    suppressor = FalsePositiveSuppressor(model_id="test-model")
+
+    with patch.object(suppressor, "_load_embedder", return_value=embedder):
+        results = suppressor.suppress([_make_hypothesis()], db_path)
+
+    assert len(results) == 1
+    only = results[0]
+    assert only.novelty_score == pytest.approx(1.0)
+    assert only.suppress is False
+
+
+# ---------------------------------------------------------------------------
+# Test 8: Embedding service unavailable (returns None) → graceful fallback
+# ---------------------------------------------------------------------------
+
+def test_embedding_service_unavailable_passthrough(tmp_path):
+    """When ``_load_embedder`` returns None, suppress() falls back to passthrough."""
+    db_path = _make_db_with_incidents(
+        [("OOM", "Memory pressure")],
+        tmp_path / "turnstone.db",
+    )
+    suppressor = FalsePositiveSuppressor(model_id="test-model")
+    hypothesis = _make_hypothesis(confidence=0.7)
+
+    with patch.object(suppressor, "_load_embedder", return_value=None):
+        results = suppressor.suppress([hypothesis], db_path)
+
+    assert len(results) == 1
+    only = results[0]
+    assert only.novelty_score == pytest.approx(1.0)
+    assert only.suppress is False
+    assert only.suppression_reason is None
+
+
+# ---------------------------------------------------------------------------
+# Test 9: Corpus cache invalidated when corpus changes
+# ---------------------------------------------------------------------------
+
+def test_corpus_cache_invalidated_on_corpus_change(tmp_path):
+    """When the corpus changes between calls, embed_batch is called again."""
+    # First DB: one incident
+    db_path = _make_db_with_incidents(
+        [("OOM", "Memory pressure")],
+        tmp_path / "turnstone.db",
+    )
+
+    corpus_vec_1 = [1.0] + [0.0] * 383
+    corpus_vec_2 = [0.0, 1.0] + [0.0] * 382
+
+    hyp_vec = [1.0] + [0.0] * 383
+
+    # embedder will be called twice for embed_batch (different corpus each time)
+    mock_embedder = MagicMock()
+    single_m = MagicMock()
+    single_m.tolist.return_value = hyp_vec
+
+    batch_m1 = MagicMock()
+    batch_m1.tolist.return_value = corpus_vec_1
+    batch_m2 = MagicMock()
+    batch_m2.tolist.return_value = corpus_vec_2
+
+    mock_embedder.embed.return_value = single_m
+    # NOTE(review): the second batch returns one vector although the corpus
+    # then holds two incidents. Only the call count is asserted, so this
+    # passes — confirm the suppressor tolerates the length mismatch.
+    mock_embedder.embed_batch.side_effect = [[batch_m1], [batch_m2]]
+
+    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
+
+    with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
+        # First call — populates cache
+        results_1 = suppressor.suppress([_make_hypothesis()], db_path)
+        assert mock_embedder.embed_batch.call_count == 1
+
+        # Mutate the DB to add a second incident (changes corpus). The
+        # connection is closed explicitly because sqlite3's context manager
+        # only commits; it does not close.
+        conn = sqlite3.connect(str(db_path))
+        try:
+            conn.execute(
+                "INSERT INTO incidents (label, notes, ended_at) VALUES (?, ?, ?)",
+                ("Disk I/O", "Storage saturation", "2024-01-02T00:00:00"),
+            )
+            conn.commit()
+        finally:
+            conn.close()
+
+        # Second call — corpus changed, should re-embed
+        results_2 = suppressor.suppress([_make_hypothesis()], db_path)
+        assert mock_embedder.embed_batch.call_count == 2, (
+            "embed_batch should be called again when corpus changes"
+        )
+
+    assert len(results_1) == 1
+    assert len(results_2) == 1
--- a/tests/test_diagnose_synthesizer.py
+++ b/tests/test_diagnose_synthesizer.py
@ -0,0 +1,285 @@
+"""Tests for app/services/diagnose/synthesizer.py — SummarySynthesizer.
+
+All tests use mocking; no real LLM calls are made.
+"""
+from __future__ import annotations
+
+from unittest.mock import MagicMock, patch
+
+from app.context.retriever import RetrievedContext
+from app.services.diagnose.models import Hypothesis, RankedHypothesis, TimelineResult
+from app.services.diagnose.synthesizer import SummarySynthesizer
+
+
+# ---------------------------------------------------------------------------
+# Fixture helpers
+# ---------------------------------------------------------------------------
+
+def _make_hypothesis(
+    hypothesis_id: str = "h1",
+    title: str = "SSH flood from external IPs",
+    description: str = "Repeated failed login attempts from multiple IPs.",
+    confidence: float = 0.87,
+    severity: str = "CRITICAL",
+) -> Hypothesis:
+    """Build a Hypothesis for tests; each field shown in the signature can be overridden.
+
+    The supporting cluster ids are always ``("c1",)`` and the runbook
+    references are always empty.
+    """
+    cluster_ids = ("c1",)
+    no_runbooks = ()
+    return Hypothesis(
+        severity=severity,  # type: ignore[arg-type]
+        confidence=confidence,
+        description=description,
+        title=title,
+        hypothesis_id=hypothesis_id,
+        supporting_cluster_ids=cluster_ids,
+        runbook_refs=no_runbooks,
+    )
+
+
+def _make_ranked(
+    hypothesis: Hypothesis | None = None,
+    novelty_score: float = 0.95,
+    similarity_to_known: float = 0.05,
+    suppress: bool = False,
+    suppression_reason: str | None = None,
+) -> RankedHypothesis:
+    """Wrap a hypothesis in a RankedHypothesis carrying the given ranking fields.
+
+    When no hypothesis (or a falsy one) is supplied, the default produced by
+    ``_make_hypothesis()`` is used.
+    """
+    if not hypothesis:
+        hypothesis = _make_hypothesis()
+    return RankedHypothesis(
+        hypothesis=hypothesis,
+        suppress=suppress,
+        suppression_reason=suppression_reason,
+        novelty_score=novelty_score,
+        similarity_to_known=similarity_to_known,
+    )
+
+
+def _make_timeline(
+    total_entries: int = 42,
+    n_clusters: int = 3,
+) -> TimelineResult:
+    """Build a TimelineResult with no clusters over a fixed one-hour window.
+
+    Only ``total_entries`` influences the result. ``n_clusters`` is accepted
+    but never read: ``clusters`` is always the empty tuple.
+    """
+    start, end = "2026-01-01T00:00:00+00:00", "2026-01-01T01:00:00+00:00"
+    return TimelineResult(
+        clusters=(),
+        total_entries=total_entries,
+        window_start=start,
+        window_end=end,
+        gap_count=1,
+        burst_count=2,
+        dominant_sources=("syslog", "auth"),
+    )
+
+
+def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext:
+    """Build a RetrievedContext holding one fixed fact plus the given chunks.
+
+    A single runbook chunk is substituted when ``chunks`` is ``None`` or empty.
+    """
+    if not chunks:
+        chunks = [{"filename": "runbook.md", "text": "Restart sshd if flooded"}]
+    host_fact = {"category": "network", "key": "host", "value": "heimdall", "source": "facts"}
+    return RetrievedContext(facts=[host_fact], chunks=chunks)
+
+
+# ---------------------------------------------------------------------------
+# Test cases
+# ---------------------------------------------------------------------------
+
+class TestSynthesizerWithHypotheses:
+    """With hypotheses, result must contain VERDICT."""
+
+    @staticmethod
+    def _llm_reply(content: str) -> MagicMock:
+        """Return a mocked HTTP 200 response whose JSON carries *content* as the chat message."""
+        reply = MagicMock()
+        reply.status_code = 200
+        reply.json.return_value = {"choices": [{"message": {"content": content}}]}
+        return reply
+
+    def test_returns_verdict_string_with_llm(self):
+        """With httpx.post mocked to succeed, the synthesized text contains VERDICT."""
+        synth = SummarySynthesizer()
+        hypotheses = [_make_ranked()]
+        tl = _make_timeline()
+        context = _make_ctx()
+        reply = self._llm_reply(
+            "VERDICT: CRITICAL — SSH flood (87% confidence)\nTIMELINE: lots of hits."
+        )
+
+        with patch("httpx.post", return_value=reply):
+            text = synth.synthesize(
+                ranked=hypotheses,
+                timeline=tl,
+                ctx=context,
+                query="ssh brute force",
+                llm_url="http://localhost:11434",
+                llm_model="llama3",
+            )
+
+        assert "VERDICT" in text
+
+    def test_returns_nonempty_string(self):
+        """With httpx.post mocked to succeed, the result is a non-empty str."""
+        synth = SummarySynthesizer()
+        hypotheses = [_make_ranked()]
+        tl = _make_timeline()
+        context = _make_ctx()
+        reply = self._llm_reply("VERDICT: CRITICAL — SSH flood (87% confidence)")
+
+        with patch("httpx.post", return_value=reply):
+            text = synth.synthesize(
+                ranked=hypotheses,
+                timeline=tl,
+                ctx=context,
+                query="why is auth failing",
+                llm_url="http://localhost:11434",
+                llm_model="llama3",
+            )
+
+        assert isinstance(text, str)
+        assert len(text) > 0
+
+
+class TestSynthesizerSuppressedHypotheses:
+    """Suppressed hypotheses must be excluded from the LLM prompt."""
+
+    def test_suppressed_hypotheses_excluded_from_prompt(self):
+        """Send one active and one suppressed hypothesis; inspect the captured user message.
+
+        ``httpx.post`` is replaced by a fake that records the chat messages it
+        receives, so the prompt content can be checked without a real LLM.
+        """
+        suppressed = _make_ranked(
+            hypothesis=_make_hypothesis(
+                hypothesis_id="h2",
+                title="Wazuh alert processing backlog",
+                severity="ERROR",
+                confidence=0.72,
+            ),
+            suppress=True,
+            suppression_reason="similar to 2025-04 SSH incident",
+            novelty_score=0.1,
+        )
+        active = _make_ranked(
+            hypothesis=_make_hypothesis(
+                hypothesis_id="h1",
+                title="SSH flood from external IPs",
+                severity="CRITICAL",
+                confidence=0.87,
+            ),
+            suppress=False,
+            novelty_score=0.95,
+        )
+
+        # Filled by fake_post with every message dict found in the request body.
+        captured_messages: list = []
+
+        def fake_post(url, json=None, headers=None, timeout=None):
+            # Accept either a body wrapped as {"payload": {"messages": [...]}}
+            # or a plain {"messages": [...]} body.
+            if json and "payload" in json:
+                captured_messages.extend(json["payload"].get("messages", []))
+            elif json and "messages" in json:
+                captured_messages.extend(json.get("messages", []))
+            mock_resp = MagicMock()
+            mock_resp.status_code = 200
+            mock_resp.json.return_value = {
+                "choices": [{"message": {"content": "VERDICT: CRITICAL — SSH flood"}}]
+            }
+            return mock_resp
+
+        synthesizer = SummarySynthesizer()
+        with patch("httpx.post", side_effect=fake_post):
+            synthesizer.synthesize(
+                ranked=[active, suppressed],
+                timeline=_make_timeline(),
+                ctx=_make_ctx(),
+                query="auth failures",
+                llm_url="http://localhost:11434",
+                llm_model="llama3",
+            )
+
+        # The user message should contain the active hypothesis title
+        # and NOT contain the suppressed one (or mark it suppressed)
+        # First message with role "user"; falls back to "" when none was captured,
+        # which makes the positive assertion below fail.
+        user_content = next(
+            (m["content"] for m in captured_messages if m.get("role") == "user"), ""
+        )
+        assert "SSH flood from external IPs" in user_content
+        # Wazuh should not appear as a standalone top-level hypothesis
+        # (suppressed items are excluded from the active list sent to the LLM)
+        assert "Wazuh alert processing backlog" not in user_content
+
+
+class TestSynthesizerNoLLM:
+    """No LLM configured: must return deterministic fallback (not empty)."""
+
+    def test_no_llm_url_returns_fallback(self):
+        """Omitting llm_url and llm_model still yields non-empty text with VERDICT."""
+        text = SummarySynthesizer().synthesize(
+            ranked=[_make_ranked()],
+            timeline=_make_timeline(),
+            ctx=_make_ctx(),
+            query="disk errors",
+        )
+
+        assert isinstance(text, str)
+        assert len(text) > 0
+        assert "VERDICT" in text
+
+    def test_no_llm_model_returns_fallback(self):
+        """With llm_url but no llm_model, the text names the default hypothesis title."""
+        text = SummarySynthesizer().synthesize(
+            ranked=[_make_ranked()],
+            timeline=_make_timeline(),
+            ctx=_make_ctx(),
+            query="oom killer",
+            llm_url="http://localhost:11434",
+            # llm_model omitted
+        )
+
+        assert "VERDICT" in text
+        assert "SSH flood from external IPs" in text
+
+    def test_llm_failure_returns_fallback(self):
+        """A ConnectionError raised by httpx.post does not prevent a VERDICT result."""
+        synth = SummarySynthesizer()
+        hypotheses = [_make_ranked()]
+        refused = ConnectionError("refused")
+
+        with patch("httpx.post", side_effect=refused):
+            text = synth.synthesize(
+                ranked=hypotheses,
+                timeline=_make_timeline(),
+                ctx=_make_ctx(),
+                query="why is disk full",
+                llm_url="http://localhost:11434",
+                llm_model="llama3",
+            )
+
+        assert "VERDICT" in text
+        assert len(text) > 0
+
+
+class TestSynthesizerEmptyRanked:
+    """Empty ranked list: must return deterministic fallback text, not raise."""
+
+    def test_empty_ranked_no_llm_returns_fallback(self):
+        """No hypotheses and no LLM settings still produce non-empty VERDICT text."""
+        text = SummarySynthesizer().synthesize(
+            ranked=[],
+            timeline=_make_timeline(),
+            ctx=_make_ctx(),
+            query="check everything",
+        )
+
+        assert isinstance(text, str)
+        assert len(text) > 0
+        assert "VERDICT" in text
+
+    def test_empty_ranked_with_llm_returns_fallback_or_llm_text(self):
+        """Even with empty ranked, we attempt LLM and return something."""
+        synth = SummarySynthesizer()
+
+        payload = {
+            "choices": [{"message": {"content": "VERDICT: UNKNOWN — no hypotheses generated"}}]
+        }
+        reply = MagicMock()
+        reply.status_code = 200
+        reply.json.return_value = payload
+
+        with patch("httpx.post", return_value=reply):
+            text = synth.synthesize(
+                ranked=[],
+                timeline=_make_timeline(),
+                ctx=_make_ctx(),
+                query="nothing found",
+                llm_url="http://localhost:11434",
+                llm_model="llama3",
+            )
+
+        # Only type and non-emptiness are checked; either source of text passes.
+        assert isinstance(text, str)
+        assert len(text) > 0
--- a/tests/test_diagnose_timeline.py
+++ b/tests/test_diagnose_timeline.py
@ -0,0 +1,234 @@
+"""Tests for app/services/diagnose/timeline.py — TimelineReconstructor."""
+from __future__ import annotations
+
+from app.services.diagnose.timeline import TimelineReconstructor
+from app.services.diagnose.models import TimelineResult
+from app.services.search import SearchResult
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+def _make_entry(
+    entry_id: str = "e1",
+    source_id: str = "src-a",
+    timestamp_iso: str | None = "2026-01-01T00:00:00+00:00",
+    severity: str | None = "INFO",
+    rank: float = 0.0,
+    text: str = "log line",
+    matched_patterns: list[str] | None = None,
+    sequence: int = 1,
+) -> SearchResult:
+    """Build a SearchResult for tests.
+
+    ``repeat_count`` is fixed at 1 and ``out_of_order`` at False. A missing
+    or empty ``matched_patterns`` becomes a fresh empty list.
+    """
+    patterns = matched_patterns if matched_patterns else []
+    return SearchResult(
+        text=text,
+        rank=rank,
+        severity=severity,
+        timestamp_iso=timestamp_iso,
+        sequence=sequence,
+        source_id=source_id,
+        entry_id=entry_id,
+        matched_patterns=patterns,
+        repeat_count=1,
+        out_of_order=False,
+    )
+
+
+def _ts(offset_seconds: int) -> str:
+    """Return the ISO string for 2026-01-01T00:00:00+00:00 shifted by *offset_seconds*."""
+    import datetime as _dt
+
+    origin = _dt.datetime(2026, 1, 1, tzinfo=_dt.timezone.utc)
+    return (origin + _dt.timedelta(seconds=offset_seconds)).isoformat()
+
+
+# ---------------------------------------------------------------------------
+# Test cases
+# ---------------------------------------------------------------------------
+
+class TestEmptyInput:
+    """Reconstructing a timeline from no entries."""
+
+    def test_empty_returns_empty_timeline(self):
+        """An empty list yields a TimelineResult with zero counts and no window."""
+        expected = TimelineResult(
+            clusters=(),
+            dominant_sources=(),
+            window_start=None,
+            window_end=None,
+            total_entries=0,
+            gap_count=0,
+            burst_count=0,
+        )
+        actual = TimelineReconstructor().reconstruct([])
+        assert actual == expected
+
+
+class TestSingleEntry:
+    """Reconstructing a timeline from exactly one entry."""
+
+    def test_single_entry_one_cluster(self):
+        """One entry gives one non-burst cluster with no preceding gap."""
+        only = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
+        timeline = TimelineReconstructor().reconstruct([only])
+
+        assert len(timeline.clusters) == 1
+        first = timeline.clusters[0]
+        assert first.gap_before_seconds == 0.0
+        assert first.burst is False
+        assert timeline.total_entries == 1
+
+
+class TestClusteringWithinWindow:
+    """Entries closer together than the cluster window."""
+
+    def test_two_entries_10s_apart_same_cluster(self):
+        """Two entries 10 s apart share one cluster under a 30 s window."""
+        pair = [
+            _make_entry(entry_id=eid, timestamp_iso=_ts(offset))
+            for eid, offset in (("e1", 0), ("e2", 10))
+        ]
+        timeline = TimelineReconstructor(cluster_window_seconds=30).reconstruct(pair)
+
+        assert len(timeline.clusters) == 1
+        assert len(timeline.clusters[0].entries) == 2
+
+
+class TestClusteringOutsideWindow:
+    """Entries further apart than the cluster window."""
+
+    @staticmethod
+    def _reconstruct_60s_apart():
+        """Reconstruct two entries 60 s apart using a 30 s cluster window."""
+        reconstructor = TimelineReconstructor(cluster_window_seconds=30)
+        first = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
+        second = _make_entry(entry_id="e2", timestamp_iso=_ts(60))
+        return reconstructor.reconstruct([first, second])
+
+    def test_two_entries_60s_apart_two_clusters(self):
+        """The second entry opens a new cluster whose gap is at least 60 s."""
+        timeline = self._reconstruct_60s_apart()
+
+        assert len(timeline.clusters) == 2
+        later = timeline.clusters[1]
+        assert later.gap_before_seconds >= 60.0
+
+    def test_gap_count_correct_for_60s_gap(self):
+        """Exactly one gap is counted for the 60 s separation."""
+        timeline = self._reconstruct_60s_apart()
+
+        assert timeline.gap_count == 1
+
+
+class TestBurst:
+    """Burst flagging on a dense run of entries."""
+
+    def test_15_entries_within_3s_is_burst(self):
+        """Fifteen entries at offsets 0, 1 and 2 s form one cluster flagged as a burst."""
+        rt = TimelineReconstructor(
+            cluster_window_seconds=30,
+            burst_threshold=10,
+            burst_window_seconds=5,
+        )
+        # All 15 entries within a 3-second window — well under burst_window=5
+        # The offsets cycle 0, 1, 2 (i % 3), so the input list is not in
+        # chronological order even though sequence increases monotonically.
+        entries = [
+            _make_entry(entry_id=f"e{i}", timestamp_iso=_ts(i % 3), sequence=i)
+            for i in range(15)
+        ]
+        result = rt.reconstruct(entries)
+        # All should land in one cluster
+        assert len(result.clusters) == 1
+        assert result.clusters[0].burst is True
+        assert result.burst_count == 1
+
+
+class TestNullTimestamps:
+    """Entries whose timestamp_iso is None."""
+
+    def test_null_timestamp_joins_current_cluster(self):
+        """A timestamp-less entry is added to the cluster already in progress."""
+        dated = _make_entry(entry_id="e1", timestamp_iso=_ts(0))
+        undated = _make_entry(entry_id="e2", timestamp_iso=None)
+
+        # Should not raise, and null entry should join the existing cluster
+        timeline = TimelineReconstructor(cluster_window_seconds=30).reconstruct(
+            [dated, undated]
+        )
+
+        assert len(timeline.clusters) == 1
+        assert "e2" in timeline.clusters[0].entries
+
+    def test_null_timestamp_does_not_start_new_cluster(self):
+        """A timestamp-less entry between two nearby dated entries does not split them."""
+        stamps = (("e1", _ts(0)), ("e2", None), ("e3", _ts(5)))
+        sequence = [
+            _make_entry(entry_id=eid, timestamp_iso=stamp) for eid, stamp in stamps
+        ]
+        timeline = TimelineReconstructor(cluster_window_seconds=30).reconstruct(sequence)
+
+        # e3 is within 30s of e1, so all three in one cluster
+        assert len(timeline.clusters) == 1
+
+    def test_all_null_timestamps_one_cluster_no_crash(self):
+        """With no timestamps at all, one cluster results and every bound is None."""
+        undated = [
+            _make_entry(entry_id=eid, timestamp_iso=None) for eid in ("e1", "e2")
+        ]
+        timeline = TimelineReconstructor().reconstruct(undated)
+
+        assert len(timeline.clusters) == 1
+        only = timeline.clusters[0]
+        assert only.start_iso is None
+        assert only.end_iso is None
+        assert timeline.window_start is None
+        assert timeline.window_end is None
+
+
+class TestDominantSources:
+    """Ordering of TimelineResult.dominant_sources."""
+
+    def test_dominant_sources_ordered_by_count_descending(self):
+        """The source with three entries is listed before the source with one."""
+        # src-b has 3 entries, src-a has 1
+        source_ids = ("src-a", "src-b", "src-b", "src-b")
+        entries = [
+            _make_entry(entry_id=f"e{idx + 1}", source_id=src, timestamp_iso=_ts(idx))
+            for idx, src in enumerate(source_ids)
+        ]
+        timeline = TimelineReconstructor().reconstruct(entries)
+
+        assert timeline.dominant_sources[0] == "src-b"
+        assert timeline.dominant_sources[1] == "src-a"
+
+
+class TestRepresentativeText:
+    """Selection of a cluster's representative_text."""
+
+    def test_representative_text_uses_highest_rank(self):
+        """The entry with rank -1.0 is chosen over the entry with rank -5.0."""
+        weaker = _make_entry(entry_id="e1", timestamp_iso=_ts(0), rank=-5.0, text="low score")
+        stronger = _make_entry(entry_id="e2", timestamp_iso=_ts(1), rank=-1.0, text="high score")
+
+        timeline = TimelineReconstructor().reconstruct([weaker, stronger])
+
+        assert timeline.clusters[0].representative_text == "high score"
+
+    def test_representative_text_tiebreak_on_longest_text(self):
+        """When ranks are equal, the longer text is chosen."""
+        brief = _make_entry(entry_id="e1", timestamp_iso=_ts(0), rank=0.0, text="short")
+        wordy = _make_entry(entry_id="e2", timestamp_iso=_ts(1), rank=0.0, text="much longer text here")
+
+        timeline = TimelineReconstructor().reconstruct([brief, wordy])
+
+        assert timeline.clusters[0].representative_text == "much longer text here"
+
+
+class TestClusterId:
+    """Format of generated cluster ids."""
+
+    def test_cluster_id_is_12_char_hex(self):
+        """A cluster id is 12 characters long and uses only lowercase hex digits."""
+        timeline = TimelineReconstructor().reconstruct(
+            [_make_entry(entry_id="abc123", timestamp_iso=_ts(0))]
+        )
+        identifier = timeline.clusters[0].cluster_id
+
+        assert len(identifier) == 12
+        hex_digits = "0123456789abcdef"
+        assert not [ch for ch in identifier if ch not in hex_digits]
+
+
+class TestSeverity:
+    """Severity assigned to a cluster of mixed-severity entries."""
+
+    def test_critical_wins_over_error(self):
+        """A cluster holding ERROR, CRITICAL and INFO entries reports CRITICAL."""
+        levels = ("ERROR", "CRITICAL", "INFO")
+        entries = [
+            _make_entry(entry_id=f"e{pos + 1}", timestamp_iso=_ts(pos), severity=level)
+            for pos, level in enumerate(levels)
+        ]
+        timeline = TimelineReconstructor().reconstruct(entries)
+
+        assert timeline.clusters[0].severity == "CRITICAL"
+
+
+class TestPatternTags:
+    """Pattern tags collected on a cluster."""
+
+    def test_pattern_tags_union_across_entries(self):
+        """Tags from every entry in the cluster appear in pattern_tags."""
+        oom = _make_entry(entry_id="e1", timestamp_iso=_ts(0), matched_patterns=["oom-killer"])
+        disk = _make_entry(entry_id="e2", timestamp_iso=_ts(1), matched_patterns=["disk-full"])
+
+        timeline = TimelineReconstructor().reconstruct([oom, disk])
+        collected = set(timeline.clusters[0].pattern_tags)
+
+        assert "oom-killer" in collected
+        assert "disk-full" in collected
--- a/tests/test_ingest_dmesg.py
+++ b/tests/test_ingest_dmesg.py
@ -1,7 +1,7 @@
-"""Tests for the dmesg log ingestor."""
+"""Tests for the dmesg log gleaner."""
 from __future__ import annotations

-from app.ingest.dmesg_log import is_dmesg_log, parse
+from app.glean.dmesg_log import is_dmesg_log, parse

 RELATIVE_SAMPLE = """\
 [    0.000000] Linux version 6.8.0-65-generic
--- a/tests/test_glean_fingerprint.py
+++ b/tests/test_glean_fingerprint.py
@ -0,0 +1,236 @@
+"""Tests for fingerprint-based incremental glean skipping (issue #30).
+
+Verifies that _glean_files() (and its public wrappers) skip local files whose
+mtime+size fingerprint has not changed since the last glean, and that force=True
+bypasses that check.
+"""
+from __future__ import annotations
+
+import sqlite3
+import time
+from pathlib import Path
+
+import pytest
+
+from app.glean.pipeline import (
+    _fingerprint,
+    _fp_unchanged,
+    _save_fingerprint,
+    ensure_schema,
+    glean_dir,
+    glean_file,
+)
+from app.glean.base import now_iso
+
+
+# ── Fixtures ──────────────────────────────────────────────────────────────────
+
+@pytest.fixture()
+def db_path(tmp_path: Path) -> Path:
+    """Path to ``test.db`` under tmp_path, returned after ensure_schema has run on it."""
+    database = tmp_path / "test.db"
+    ensure_schema(database)
+    return database
+
+
+@pytest.fixture()
+def log_file(tmp_path: Path) -> Path:
+    """A minimal plaintext log file."""
+    target = tmp_path / "test.log"
+    content = "May 24 10:00:00 heimdall kernel: test message\n"
+    target.write_text(content)
+    return target
+
+
+# ── Unit: fingerprint helpers ──────────────────────────────────────────────────
+
+class TestFingerprintHelpers:
+    """Unit tests for _fingerprint, _fp_unchanged and _save_fingerprint.
+
+    Each connection is closed in a ``finally`` block so that a failing
+    assertion does not leave an open SQLite handle behind.
+    """
+
+    def test_fingerprint_returns_mtime_and_size(self, log_file: Path) -> None:
+        """_fingerprint returns the file's (st_mtime, st_size) pair."""
+        mtime, size = _fingerprint(log_file)
+        st = log_file.stat()
+        assert mtime == st.st_mtime
+        assert size == st.st_size
+
+    def test_fp_unchanged_returns_false_when_no_record(self, db_path: Path, log_file: Path) -> None:
+        """Without a saved fingerprint the file is reported as changed."""
+        conn = sqlite3.connect(str(db_path))
+        try:
+            mtime, size = _fingerprint(log_file)
+            assert _fp_unchanged(conn, log_file, mtime, size) is False
+        finally:
+            conn.close()
+
+    def test_fp_unchanged_returns_true_after_save(self, db_path: Path, log_file: Path) -> None:
+        """A fingerprint saved with the same mtime and size is reported as unchanged."""
+        conn = sqlite3.connect(str(db_path))
+        try:
+            mtime, size = _fingerprint(log_file)
+            _save_fingerprint(conn, log_file, mtime, size, now_iso())
+            conn.commit()
+            assert _fp_unchanged(conn, log_file, mtime, size) is True
+        finally:
+            conn.close()
+
+    def test_fp_unchanged_returns_false_on_size_change(self, db_path: Path, log_file: Path) -> None:
+        """A different size than the saved one is reported as changed."""
+        conn = sqlite3.connect(str(db_path))
+        try:
+            mtime, size = _fingerprint(log_file)
+            _save_fingerprint(conn, log_file, mtime, size, now_iso())
+            conn.commit()
+            # Simulate size change (new content appended)
+            assert _fp_unchanged(conn, log_file, mtime, size + 1) is False
+        finally:
+            conn.close()
+
+    def test_fp_unchanged_returns_false_on_mtime_change(self, db_path: Path, log_file: Path) -> None:
+        """A different mtime than the saved one is reported as changed."""
+        conn = sqlite3.connect(str(db_path))
+        try:
+            mtime, size = _fingerprint(log_file)
+            _save_fingerprint(conn, log_file, mtime, size, now_iso())
+            conn.commit()
+            assert _fp_unchanged(conn, log_file, mtime + 1.0, size) is False
+        finally:
+            conn.close()
+
+    def test_save_fingerprint_upserts(self, db_path: Path, log_file: Path) -> None:
+        """Second save with different values replaces the first (UPSERT semantics)."""
+        conn = sqlite3.connect(str(db_path))
+        try:
+            _save_fingerprint(conn, log_file, 1000.0, 100, "2026-01-01T00:00:00Z")
+            conn.commit()
+            _save_fingerprint(conn, log_file, 2000.0, 200, "2026-01-02T00:00:00Z")
+            conn.commit()
+            row = conn.execute(
+                "SELECT mtime, size FROM glean_fingerprints WHERE path = ?",
+                (str(log_file),),
+            ).fetchone()
+            assert row == (2000.0, 200)
+        finally:
+            conn.close()
+
+
+# ── Integration: glean_file skipping ─────────────────────────────────────────
+
+class TestGleanFileFingerprint:
+    """Integration tests for fingerprint handling in glean_file."""
+
+    def test_first_glean_writes_fingerprint(self, db_path: Path, log_file: Path) -> None:
+        """After the first glean a fingerprint row matching the file's mtime and size exists."""
+        glean_file(log_file, db_path)
+        conn = sqlite3.connect(str(db_path))
+        row = conn.execute(
+            "SELECT mtime, size FROM glean_fingerprints WHERE path = ?",
+            (str(log_file),),
+        ).fetchone()
+        conn.close()
+        assert row is not None
+        mtime, size = _fingerprint(log_file)
+        assert row == (mtime, size)
+
+    def test_second_glean_skips_unchanged_file(self, db_path: Path, log_file: Path) -> None:
+        """Gleaning an untouched file a second time reports zero entries."""
+        stats_first = glean_file(log_file, db_path)
+        count_first = sum(stats_first.values())
+
+        # Re-glean without touching the file — should produce 0 new entries.
+        stats_second = glean_file(log_file, db_path)
+        count_second = sum(stats_second.values())
+
+        assert count_first >= 1, "First glean should find at least one entry"
+        assert count_second == 0, "Second glean should skip unchanged file"
+
+    def test_second_glean_runs_when_file_grows(self, db_path: Path, log_file: Path) -> None:
+        """After the file grows, the stored fingerprint size matches the new file size."""
+        glean_file(log_file, db_path)
+
+        # Append a new line and update mtime by rewriting.
+        original = log_file.read_text()
+        log_file.write_text(original + "May 24 10:01:00 heimdall kernel: second message\n")
+
+        stats_second = glean_file(log_file, db_path)
+        # INSERT OR IGNORE means the original entry won't re-count, but parsing
+        # does happen — at minimum the new line is processed.
+        # NOTE(review): ">= 0" holds for any non-negative sum, so this line does
+        # not by itself show that the glean ran; the fingerprint size check
+        # below is the effective assertion.
+        assert sum(stats_second.values()) >= 0  # glean ran (not skipped)
+
+        # Confirm fingerprint updated to new size.
+        conn = sqlite3.connect(str(db_path))
+        row = conn.execute(
+            "SELECT size FROM glean_fingerprints WHERE path = ?",
+            (str(log_file),),
+        ).fetchone()
+        conn.close()
+        assert row is not None
+        assert row[0] == log_file.stat().st_size
+
+    def test_force_bypasses_fingerprint(self, db_path: Path, log_file: Path) -> None:
+        """force=True re-gleans an unchanged file, observed as a later gleaned_at value."""
+        glean_file(log_file, db_path)
+
+        # Without force: skipped.
+        stats_no_force = glean_file(log_file, db_path)
+        assert sum(stats_no_force.values()) == 0
+
+        # With force: glean runs (INSERT OR IGNORE means count may be 0, but
+        # we verify the fingerprint was re-saved with a fresh gleaned_at).
+        conn_before = sqlite3.connect(str(db_path))
+        ts_before = conn_before.execute(
+            "SELECT gleaned_at FROM glean_fingerprints WHERE path = ?",
+            (str(log_file),),
+        ).fetchone()[0]
+        conn_before.close()
+
+        # NOTE(review): assumes the stored gleaned_at resolves finer than this
+        # 10 ms pause and compares in chronological order — confirm.
+        time.sleep(0.01)  # ensure gleaned_at advances
+        glean_file(log_file, db_path, force=True)
+
+        conn_after = sqlite3.connect(str(db_path))
+        ts_after = conn_after.execute(
+            "SELECT gleaned_at FROM glean_fingerprints WHERE path = ?",
+            (str(log_file),),
+        ).fetchone()[0]
+        conn_after.close()
+
+        assert ts_after > ts_before, "force=True should update gleaned_at timestamp"
+
+
+# ── Integration: glean_dir skipping ──────────────────────────────────────────
+
+class TestGleanDirFingerprint:
+    """Integration tests for fingerprint handling in glean_dir."""
+
+    def test_glean_dir_skips_unchanged_on_second_run(self, db_path: Path, tmp_path: Path) -> None:
+        """A second glean_dir over untouched files reports zero entries."""
+        first_log = tmp_path / "a.log"
+        second_log = tmp_path / "b.log"
+        first_log.write_text("May 24 10:00:00 heimdall kernel: msg one\n")
+        second_log.write_text("May 24 10:00:00 heimdall kernel: msg two\n")
+
+        glean_dir(tmp_path, db_path)
+        rerun_stats = glean_dir(tmp_path, db_path)
+
+        total = sum(rerun_stats.values())
+        assert total == 0, "Both unchanged files should be skipped"
+
+    def test_glean_dir_force_reruns_all(self, db_path: Path, tmp_path: Path) -> None:
+        """force=True re-gleans an unchanged file, observed as a later gleaned_at value."""
+        log1 = tmp_path / "a.log"
+        log1.write_text("May 24 10:00:00 heimdall kernel: msg one\n")
+
+        def read_gleaned_at():
+            # Open, query and close a fresh connection on every read.
+            conn = sqlite3.connect(str(db_path))
+            value = conn.execute(
+                "SELECT gleaned_at FROM glean_fingerprints WHERE path = ?",
+                (str(log1),),
+            ).fetchone()[0]
+            conn.close()
+            return value
+
+        glean_dir(tmp_path, db_path)
+
+        # force=True: runs even though nothing changed; INSERT OR IGNORE keeps DB clean.
+        stamp_before = read_gleaned_at()
+
+        time.sleep(0.01)
+        glean_dir(tmp_path, db_path, force=True)
+
+        stamp_after = read_gleaned_at()
+
+        assert stamp_after > stamp_before
+
+
+# ── Schema: ensure fingerprints table created ─────────────────────────────────
+
+class TestEnsureSchema:
+    """Schema creation via ensure_schema."""
+
+    def test_fingerprints_table_exists_after_ensure_schema(self, tmp_path: Path) -> None:
+        """A fresh database lists glean_fingerprints among its tables."""
+        fresh = tmp_path / "fresh.db"
+        ensure_schema(fresh)
+
+        conn = sqlite3.connect(str(fresh))
+        rows = conn.execute(
+            "SELECT name FROM sqlite_master WHERE type='table'"
+        ).fetchall()
+        conn.close()
+
+        table_names = {record[0] for record in rows}
+        assert "glean_fingerprints" in table_names
+
+    def test_ensure_schema_idempotent(self, tmp_path: Path) -> None:
+        """Calling ensure_schema twice on the same DB must not raise."""
+        fresh = tmp_path / "fresh.db"
+        for _ in range(2):  # second call — should be a no-op
+            ensure_schema(fresh)
--- a/tests/test_glean_pipeline_ssh.py
+++ b/tests/test_glean_pipeline_ssh.py
@ -0,0 +1,444 @@
+"""Tests for SSH source handling in app/glean/pipeline.py.
+
+Verifies that glean_sources() correctly:
+- Dispatches SSH sources to SSHTransport (local sources unchanged)
+- Routes each glean-type to the right command builder + parser
+- Writes parsed entries to SQLite
+- Gracefully skips sources on SSHConnectionError or SSHCommandError
+"""
+from __future__ import annotations
+
+import json
+import sqlite3
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+import pytest
+import yaml
+
+from app.glean.pipeline import glean_sources, ensure_schema
+from app.glean.ssh import SSHConnectionError, SSHCommandError
+
+
+# ── Shared fixtures ───────────────────────────────────────────────────────────
+
+# One journald record serialized as a single JSON line (newline-terminated).
+# NOTE(review): __REALTIME_TIMESTAMP is presumably microseconds since the Unix
+# epoch, as in journalctl's JSON output — confirm against the parser.
+JOURNALD_LINE = json.dumps({
+    "__REALTIME_TIMESTAMP": "1747000000000000",
+    "PRIORITY": "3",
+    "MESSAGE": "SSH brute force detected from 192.168.1.99",
+    "SYSLOG_IDENTIFIER": "sshd",
+    "_HOSTNAME": "rack01",
+}) + "\n"
+
+# One syslog-style line: timestamp, host, process[pid], message.
+SYSLOG_LINE = "May 20 22:00:00 rack01 sshd[1234]: Failed password for invalid user admin\n"
+
+# One free-form application log line with a leading date/time and level.
+PLAINTEXT_LINE = "2026-05-20 22:00:00 ERROR app crashed with exit code 1\n"
+
+# One timestamped container log line tagged with its stream (stderr).
+DOCKER_LINE = "2026-05-20T22:00:00.000000000Z stderr F container startup failed\n"
+
+
+def _ssh_sources_yaml(sources: list[dict]) -> str:
+    """Serialize *sources* as YAML text under a top-level ``sources`` key."""
+    return yaml.dump({"sources": sources})
+
+
+def _mock_transport(lines: list[str] | None = None):
+    """Return a mock transport whose exec_stream yields the given lines.
+
+    The returned object stands in for what ``SSHTransport.__enter__`` hands
+    back (see ``_patch_transport``). Every ``exec_stream`` call receives a
+    fresh iterator over the same canned lines, so sources that issue several
+    commands do not find the stream already exhausted after the first call.
+
+    Args:
+        lines: Lines each ``exec_stream`` call should yield; ``None`` or an
+            empty list yields nothing.
+
+    Returns:
+        A ``MagicMock`` with ``exec_stream`` configured as described.
+    """
+    mock_t = MagicMock()
+    canned = list(lines or [])
+    # side_effect (not return_value) so each call builds its own iterator.
+    mock_t.exec_stream.side_effect = lambda *args, **kwargs: iter(canned)
+    return mock_t
+
+
+def _patch_transport(mock_t):
+    """Patch SSHTransport in pipeline so __enter__ returns mock_t.
+
+    The patcher is started here and returned still active, so the caller is
+    responsible for calling ``stop()`` on it (the tests do so in ``finally``).
+
+    Returns:
+        A ``(patcher, MockClass)`` tuple: the started patcher and the mock
+        that replaces ``app.glean.pipeline.SSHTransport``.
+    """
+    p = patch("app.glean.pipeline.SSHTransport")
+    MockClass = p.start()
+    # SSHTransport(...) used as a context manager yields mock_t.
+    MockClass.return_value.__enter__.return_value = mock_t
+    # Returning None from __exit__ lets exceptions propagate out of the block.
+    MockClass.return_value.__exit__.return_value = None
+    return p, MockClass
+
+
+def _entry_count(db_path: Path) -> int:
+    """Return the number of rows in the ``log_entries`` table of *db_path*.
+
+    The connection is closed even when the query raises (for example if the
+    table does not exist), so a failing test does not leak an open handle.
+    """
+    conn = sqlite3.connect(db_path)
+    try:
+        return conn.execute("SELECT COUNT(*) FROM log_entries").fetchone()[0]
+    finally:
+        conn.close()
+
+
+# ── journald type ─────────────────────────────────────────────────────────────
+
+class TestSSHJournaldGlean:
+    """glean_sources() handling of ``journald`` glean items on an SSH source."""
+
+    def test_journald_entries_written_to_db(self, tmp_path):
+        """A journald JSON line from the mocked transport lands in log_entries."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "journald"}],
+        }]))
+
+        mock_t = _mock_transport([JOURNALD_LINE])
+        p, MockClass = _patch_transport(mock_t)
+        try:
+            stats = glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        assert _entry_count(db_path) >= 1
+        # The stats key only needs to contain the source id, not equal it.
+        assert any("rack01" in k for k in stats)
+
+    def test_journald_args_passed_to_command_builder(self, tmp_path):
+        """Configured ``args`` appear in the command handed to exec_stream."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "journald", "args": ["--since", "1 hour ago"]}],
+        }]))
+
+        mock_t = _mock_transport([JOURNALD_LINE])
+        p, _ = _patch_transport(mock_t)
+        try:
+            glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        # The command passed to exec_stream must contain the args
+        call_args = mock_t.exec_stream.call_args[0][0]
+        assert "--since" in call_args
+        assert "1 hour ago" in call_args
+
+    def test_journald_unit_shorthand(self, tmp_path):
+        """The ``unit`` value appears in the command handed to exec_stream."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "journald", "unit": "sshd"}],
+        }]))
+
+        mock_t = _mock_transport([])
+        p, _ = _patch_transport(mock_t)
+        try:
+            glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        # First positional argument of the most recent exec_stream call.
+        call_args = mock_t.exec_stream.call_args[0][0]
+        assert "sshd" in call_args
+
+
+# ── syslog type ───────────────────────────────────────────────────────────────
+
+class TestSSHSyslogGlean:
+    """glean_sources() handling of ``syslog`` glean items on an SSH source."""
+
+    def test_syslog_entries_written_to_db(self, tmp_path):
+        """A syslog line from the mocked transport lands in log_entries."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01-syslog",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "syslog", "path": "/var/log/syslog"}],
+        }]))
+
+        mock_t = _mock_transport([SYSLOG_LINE])
+        p, _ = _patch_transport(mock_t)
+        try:
+            # Return value is not inspected here; only the DB side effect is.
+            glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        assert _entry_count(db_path) >= 1
+
+    def test_syslog_command_contains_path(self, tmp_path):
+        """The configured ``path`` appears in the command handed to exec_stream."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "syslog", "path": "/var/log/auth.log"}],
+        }]))
+
+        mock_t = _mock_transport([])
+        p, _ = _patch_transport(mock_t)
+        try:
+            glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        # First positional argument of the most recent exec_stream call.
+        call_args = mock_t.exec_stream.call_args[0][0]
+        assert "/var/log/auth.log" in call_args
+
+
+# ── plaintext type ────────────────────────────────────────────────────────────
+
+class TestSSHPlaintextGlean:
+    """glean_sources() handling of ``plaintext`` glean items on an SSH source."""
+
+    def test_plaintext_entries_written_to_db(self, tmp_path):
+        """A plaintext line from the mocked transport lands in log_entries."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01-app",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "plaintext", "path": "/var/log/app/error.log"}],
+        }]))
+
+        mock_t = _mock_transport([PLAINTEXT_LINE])
+        p, _ = _patch_transport(mock_t)
+        try:
+            # Return value is not inspected here; only the DB side effect is.
+            glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        assert _entry_count(db_path) >= 1
+
+    def test_plaintext_command_contains_path(self, tmp_path):
+        """The configured ``path`` appears in the command handed to exec_stream."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "plaintext", "path": "/opt/myapp/app.log"}],
+        }]))
+
+        mock_t = _mock_transport([])
+        p, _ = _patch_transport(mock_t)
+        try:
+            glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        # First positional argument of the most recent exec_stream call.
+        call_args = mock_t.exec_stream.call_args[0][0]
+        assert "/opt/myapp/app.log" in call_args
+
+
+# ── docker type ───────────────────────────────────────────────────────────────
+
+class TestSSHDockerGlean:
+    """glean_sources() handling of ``docker`` glean items on an SSH source."""
+
+    def test_docker_single_container_command_issued(self, tmp_path):
+        """The container name appears in the command handed to exec_stream."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "docker", "containers": ["myapp"]}],
+        }]))
+
+        mock_t = _mock_transport([DOCKER_LINE])
+        p, _ = _patch_transport(mock_t)
+        try:
+            glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        call_args = mock_t.exec_stream.call_args[0][0]
+        assert "myapp" in call_args
+
+    def test_docker_multiple_containers_exec_per_container(self, tmp_path):
+        """Two configured containers produce two exec_stream calls."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "docker", "containers": ["app", "nginx"]}],
+        }]))
+
+        # Same shared helper as the sibling tests; no lines are streamed.
+        mock_t = _mock_transport([])
+        p, _ = _patch_transport(mock_t)
+        try:
+            glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        # One exec_stream call per container
+        assert mock_t.exec_stream.call_count == 2
+        all_cmds = " ".join(c[0][0] for c in mock_t.exec_stream.call_args_list)
+        assert "app" in all_cmds
+        assert "nginx" in all_cmds
+
+
+# ── error handling ────────────────────────────────────────────────────────────
+
+class TestSSHGleanErrorHandling:
+    """glean_sources() must skip failing SSH sources/items instead of raising."""
+
+    def test_connection_error_skips_source_returns_empty_stats(self, tmp_path):
+        """SSHConnectionError on connect writes no entries and reports no counts."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "unreachable",
+            "transport": "ssh",
+            "host": "192.168.99.99",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "journald"}],
+        }]))
+
+        with patch("app.glean.pipeline.SSHTransport") as MockClass:
+            # Entering the transport context fails, simulating an unreachable host.
+            MockClass.return_value.__enter__.side_effect = SSHConnectionError("no route")
+            MockClass.return_value.__exit__.return_value = None
+            stats = glean_sources(sources_file, db_path)
+
+        assert _entry_count(db_path) == 0
+        # Stats for the source should either be absent or zero
+        for v in stats.values():
+            assert v == 0
+
+    def test_command_error_skips_item_continues_next(self, tmp_path):
+        """A failing glean item does not stop later items on the same host."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        # Two glean items: first raises SSHCommandError, second yields a valid line
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [
+                {"type": "journald"},
+                {"type": "syslog", "path": "/var/log/syslog"},
+            ],
+        }]))
+
+        mock_t = MagicMock()
+        # side_effect list: exception instances are raised; other values are returned
+        mock_t.exec_stream.side_effect = [
+            SSHCommandError("journalctl: command not found"),  # raised on first call
+            iter([SYSLOG_LINE]),                               # returned on second call
+        ]
+
+        p, _ = _patch_transport(mock_t)
+        try:
+            # Should not raise — bad item is skipped, good item is processed
+            glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        # The syslog line should have been written
+        assert _entry_count(db_path) >= 1
+
+    def test_unknown_glean_type_skipped(self, tmp_path):
+        """An unrecognised glean ``type`` is ignored and writes nothing."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [{"type": "mqtt"}],  # not a valid remote type
+        }]))
+
+        mock_t = _mock_transport([])
+        p, _ = _patch_transport(mock_t)
+        try:
+            glean_sources(sources_file, db_path)  # must not raise
+        finally:
+            p.stop()
+
+        assert _entry_count(db_path) == 0
+
+
+# ── mixed local + SSH sources ─────────────────────────────────────────────────
+
+class TestMixedLocalAndSSH:
+    """sources.yaml files that mix local path sources with SSH sources."""
+
+    def test_local_and_ssh_both_processed(self, tmp_path):
+        """A local file source and an SSH source both contribute entries and stats."""
+        # Local syslog file
+        local_log = tmp_path / "local.log"
+        local_log.write_text(SYSLOG_LINE)
+
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([
+            {"id": "local-syslog", "path": str(local_log)},
+            {
+                "id": "remote01",
+                "transport": "ssh",
+                "host": "192.168.1.10",
+                "user": "admin",
+                "key_path": "~/.ssh/id_ed25519",
+                "glean": [{"type": "syslog", "path": "/var/log/syslog"}],
+            },
+        ]))
+
+        mock_t = _mock_transport([SYSLOG_LINE])
+        p, _ = _patch_transport(mock_t)
+        try:
+            stats = glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        # Both sources should have contributed entries
+        assert _entry_count(db_path) >= 2
+        assert "local-syslog" in stats
+        # The SSH stats key only needs to contain the source id.
+        assert any("remote01" in k for k in stats)
+
+    def test_local_only_sources_never_calls_ssh(self, tmp_path):
+        """With no SSH source configured, SSHTransport is never instantiated."""
+        local_log = tmp_path / "local.log"
+        local_log.write_text(SYSLOG_LINE)
+
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([
+            {"id": "local", "path": str(local_log)},
+        ]))
+
+        with patch("app.glean.pipeline.SSHTransport") as MockClass:
+            glean_sources(sources_file, db_path)
+            MockClass.assert_not_called()
+
+
+# ── multiple glean items per SSH source ───────────────────────────────────────
+
+class TestMultipleGleanItemsPerHost:
+    """An SSH source with several glean items reuses a single connection."""
+
+    def test_one_connection_multiple_commands(self, tmp_path):
+        """One SSHTransport instance is shared across all glean items for a host."""
+        sources_file = tmp_path / "sources.yaml"
+        db_path = tmp_path / "test.db"
+        sources_file.write_text(_ssh_sources_yaml([{
+            "id": "rack01",
+            "transport": "ssh",
+            "host": "192.168.1.10",
+            "user": "admin",
+            "key_path": "~/.ssh/id_ed25519",
+            "glean": [
+                {"type": "journald"},
+                {"type": "syslog", "path": "/var/log/syslog"},
+                {"type": "plaintext", "path": "/var/log/app.log"},
+            ],
+        }]))
+
+        # No lines are streamed; only the call pattern is inspected.
+        mock_t = _mock_transport([])
+        p, MockClass = _patch_transport(mock_t)
+        try:
+            glean_sources(sources_file, db_path)
+        finally:
+            p.stop()
+
+        # SSHTransport() should be instantiated only once for the one host
+        MockClass.assert_called_once()
+        # exec_stream should be called once per glean item
+        assert mock_t.exec_stream.call_count == 3
--- a/tests/test_ingest_qbittorrent.py
+++ b/tests/test_ingest_qbittorrent.py
@ -1,9 +1,9 @@
-"""Tests for the qBittorrent log ingestor."""
+"""Tests for the qBittorrent log gleaner."""
 from __future__ import annotations

 import pytest

-from app.ingest.qbittorrent import is_qbit_log, parse
+from app.glean.qbittorrent import is_qbit_log, parse

 # ---------------------------------------------------------------------------
 # Classic format sample  (pre-5.x GUI builds)
--- a/tests/test_glean_ssh.py
+++ b/tests/test_glean_ssh.py
@ -0,0 +1,185 @@
+"""Tests for SSH transport layer (app/glean/ssh.py).
+
+All SSH network I/O is mocked — no real SSH connection required.
+"""
+from __future__ import annotations
+
+import io
+from unittest.mock import MagicMock, patch, call
+
+import pytest
+
+from app.glean.ssh import (
+    SSHTransport,
+    SSHConnectionError,
+    SSHCommandError,
+    _build_journald_command,
+    _build_syslog_command,
+    _build_plaintext_command,
+    _build_docker_command,
+)
+
+
+# ── Command builders ──────────────────────────────────────────────────────────
+
+class TestBuildJournaldCommand:
+    """Checks on the command produced by _build_journald_command()."""
+
+    def test_no_args_returns_base_command(self):
+        """An empty config still yields a journalctl command with ``-o json``."""
+        cmd = _build_journald_command({})
+        assert "journalctl" in cmd
+        assert "-o json" in cmd
+
+    def test_args_list_appended(self):
+        """Every element of ``args`` appears in the command."""
+        cmd = _build_journald_command({"args": ["--since", "2 hours ago", "--unit", "sshd"]})
+        assert "--since" in cmd
+        assert "2 hours ago" in cmd
+        assert "--unit" in cmd
+        assert "sshd" in cmd
+
+    def test_unit_shorthand(self):
+        """``unit`` is rendered as ``--unit docker`` or ``--unit=docker``."""
+        cmd = _build_journald_command({"unit": "docker"})
+        assert "--unit docker" in cmd or "--unit=docker" in cmd
+
+
+class TestBuildSyslogCommand:
+    """Checks on the command produced by _build_syslog_command()."""
+
+    def test_returns_cat_command(self):
+        """The command contains ``cat`` and the configured path."""
+        cmd = _build_syslog_command({"path": "/var/log/syslog"})
+        assert "cat" in cmd
+        assert "/var/log/syslog" in cmd
+
+    def test_default_path_when_omitted(self):
+        """Without ``path`` the command still targets something under /var/log."""
+        cmd = _build_syslog_command({})
+        assert "cat" in cmd
+        assert "/var/log" in cmd
+
+
+class TestBuildPlaintextCommand:
+    """Checks on the command produced by _build_plaintext_command()."""
+
+    def test_cat_with_path(self):
+        """The command contains ``cat`` and the configured path."""
+        cmd = _build_plaintext_command({"path": "/var/log/app/error.log"})
+        assert "cat" in cmd
+        assert "/var/log/app/error.log" in cmd
+
+    def test_raises_without_path(self):
+        """A missing ``path`` raises ValueError or KeyError."""
+        with pytest.raises((ValueError, KeyError)):
+            _build_plaintext_command({})
+
+
+class TestBuildDockerCommand:
+    """Checks on the command(s) produced by _build_docker_command()."""
+
+    def test_single_container(self):
+        """The container name appears in the result."""
+        cmd = _build_docker_command({"containers": ["myapp"]})
+        assert "myapp" in cmd
+
+    def test_multiple_containers_returns_list(self):
+        """Both container names appear, whether the result is a str or a sequence.
+
+        Despite the test name, a single joined string is accepted as well.
+        """
+        cmds = _build_docker_command({"containers": ["app", "nginx"]})
+        # Multiple containers → must produce a command per container OR joined
+        assert "app" in (cmds if isinstance(cmds, str) else " ".join(cmds))
+        assert "nginx" in (cmds if isinstance(cmds, str) else " ".join(cmds))
+
+    def test_raises_without_containers(self):
+        """A missing ``containers`` key raises ValueError or KeyError."""
+        with pytest.raises((ValueError, KeyError)):
+            _build_docker_command({})
+
+
+# ── SSHTransport context manager ──────────────────────────────────────────────
+
+def _mock_ssh_client(stdout_lines: list[str] | None = None):
+    """Return a mock SSHClient whose exec_command yields the given lines.
+
+    ``exec_command`` returns a ``(stdin, stdout, stderr)`` triple where
+    iterating ``stdout`` yields *stdout_lines*, ``stderr.read()`` returns
+    ``b""`` and ``stdout.channel.recv_exit_status()`` reports success (0).
+    Tests that need a failing command override the channel or stderr on the
+    returned triple.
+    """
+    client = MagicMock()
+    stdout = MagicMock()
+    stdout.__iter__ = MagicMock(return_value=iter(stdout_lines or []))
+    # Explicit success status; an auto-created MagicMock would be truthy and
+    # compare unequal to 0, which reads as a failed command.
+    stdout.channel.recv_exit_status.return_value = 0
+    stderr = MagicMock()
+    stderr.read.return_value = b""
+    client.exec_command.return_value = (MagicMock(), stdout, stderr)
+    return client
+
+
+class TestSSHTransportConnect:
+    """Connection lifecycle of SSHTransport with paramiko.SSHClient mocked out."""
+
+    def test_connects_with_key_path(self, tmp_path):
+        """connect() is called once, with the host as ``hostname`` or first positional."""
+        key_file = tmp_path / "id_ed25519"
+        key_file.write_bytes(b"fake-key")
+        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
+            MockClient.return_value = _mock_ssh_client()
+            with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)):
+                pass
+        MockClient.return_value.connect.assert_called_once()
+        connect_call = MockClient.return_value.connect.call_args
+        host_kwarg = connect_call.kwargs.get("hostname")
+        # Guard the positional lookup so a keyword-only call with a wrong host
+        # fails the assertion instead of raising IndexError.
+        host_positional = connect_call.args[0] if connect_call.args else None
+        assert "10.0.0.1" in (host_kwarg, host_positional)
+
+    def test_disconnects_on_exit(self, tmp_path):
+        """Leaving the context normally closes the client exactly once."""
+        key_file = tmp_path / "id_ed25519"
+        key_file.write_bytes(b"fake-key")
+        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
+            mock_client = _mock_ssh_client()
+            MockClient.return_value = mock_client
+            with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)):
+                pass
+        mock_client.close.assert_called_once()
+
+    def test_disconnects_on_exception(self, tmp_path):
+        """An exception inside the context still closes the client and propagates."""
+        key_file = tmp_path / "id_ed25519"
+        key_file.write_bytes(b"fake-key")
+        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
+            mock_client = _mock_ssh_client()
+            MockClient.return_value = mock_client
+            with pytest.raises(RuntimeError):
+                with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)):
+                    raise RuntimeError("boom")
+        mock_client.close.assert_called_once()
+
+    def test_raises_ssh_connection_error_on_auth_failure(self, tmp_path):
+        """paramiko.AuthenticationException surfaces as SSHConnectionError matching "auth"."""
+        import paramiko
+        key_file = tmp_path / "id_ed25519"
+        key_file.write_bytes(b"fake-key")
+        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
+            MockClient.return_value.connect.side_effect = paramiko.AuthenticationException("denied")
+            with pytest.raises(SSHConnectionError, match="auth"):
+                with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)):
+                    pass
+
+    def test_raises_ssh_connection_error_on_no_route(self, tmp_path):
+        """A generic paramiko.SSHException surfaces as SSHConnectionError."""
+        import paramiko
+        key_file = tmp_path / "id_ed25519"
+        key_file.write_bytes(b"fake-key")
+        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
+            MockClient.return_value.connect.side_effect = paramiko.SSHException("no route")
+            with pytest.raises(SSHConnectionError):
+                with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)):
+                    pass
+
+
+class TestSSHTransportExecStream:
+    """Behaviour of SSHTransport.exec_stream with paramiko.SSHClient mocked out."""
+
+    def test_yields_stdout_lines(self, tmp_path):
+        """Lines read from the command's stdout are yielded in order, unchanged."""
+        key_file = tmp_path / "id_ed25519"
+        key_file.write_bytes(b"fake-key")
+        lines = ["line one\n", "line two\n", "line three\n"]
+        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
+            MockClient.return_value = _mock_ssh_client(lines)
+            with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)) as t:
+                result = list(t.exec_stream("echo hello"))
+        assert result == lines
+
+    def test_raises_ssh_command_error_on_nonzero_exit(self, tmp_path):
+        """A non-zero exit status raises SSHCommandError carrying the stderr text."""
+        key_file = tmp_path / "id_ed25519"
+        key_file.write_bytes(b"fake-key")
+        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
+            mock_client = _mock_ssh_client([])
+            # Simulate non-zero exit code
+            channel = MagicMock()
+            channel.recv_exit_status.return_value = 1
+            mock_client.exec_command.return_value[1].channel = channel
+            mock_client.exec_command.return_value[2].read.return_value = b"command not found"
+            MockClient.return_value = mock_client
+            with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)) as t:
+                with pytest.raises(SSHCommandError, match="command not found"):
+                    list(t.exec_stream("notacommand"))
+
+    def test_strips_trailing_newlines(self, tmp_path):
+        """Surrounding whitespace and the newline are passed through untouched.
+
+        Despite the test name, exec_stream is expected NOT to strip anything.
+        """
+        key_file = tmp_path / "id_ed25519"
+        key_file.write_bytes(b"fake-key")
+        lines = ["  line with spaces  \n"]
+        with patch("app.glean.ssh.paramiko.SSHClient") as MockClient:
+            MockClient.return_value = _mock_ssh_client(lines)
+            with SSHTransport(host="10.0.0.1", user="admin", key_path=str(key_file)) as t:
+                # exec_stream should yield the raw lines; stripping is parser's job
+                result = list(t.exec_stream("echo hello"))
+        # Pin the raw content, not just the count, so accidental stripping fails.
+        assert result == lines
--- a/tests/test_ingest_syslog.py
+++ b/tests/test_ingest_syslog.py
@ -1,7 +1,7 @@
-"""Tests for the syslog (RFC 3164) ingestor."""
+"""Tests for the syslog (RFC 3164) gleaner."""
 from __future__ import annotations

-from app.ingest.syslog import is_syslog, parse
+from app.glean.syslog import is_syslog, parse

 SYSLOG_SAMPLE = """\
 May 11 14:23:01 example-node sshd[1234]: Accepted publickey for x from 192.168.1.1 port 54321 ssh2
--- a/tests/test_ingest_tautulli.py
+++ b/tests/test_ingest_tautulli.py
@ -1,10 +1,10 @@
-"""Tests for the Tautulli webhook ingestor."""
+"""Tests for the Tautulli webhook gleaner."""
 from __future__ import annotations

 import pytest
 from unittest.mock import patch

-from app.ingest.tautulli import is_tautulli_payload, parse_webhook
+from app.glean.tautulli import is_tautulli_payload, parse_webhook


 # ---------------------------------------------------------------------------
@ -253,7 +253,7 @@ class TestEndpoint:
    @pytest.fixture
    def client(self, tmp_path):
        from fastapi.testclient import TestClient
-        from app.ingest.pipeline import ensure_schema
+        from app.glean.pipeline import ensure_schema
        import app.rest as rest_module

        db = tmp_path / "test.db"
@ -267,14 +267,14 @@ class TestEndpoint:

    def test_missing_action_returns_400(self, client):
        resp = client.post(
-            "/turnstone/api/ingest/tautulli",
+            "/turnstone/api/glean/tautulli",
            json={"session_key": "x"},
        )
        assert resp.status_code == 400

    def test_wrong_token_returns_403(self, tmp_path):
        from fastapi.testclient import TestClient
-        from app.ingest.pipeline import ensure_schema
+        from app.glean.pipeline import ensure_schema
        import app.rest as rest_module

        db = tmp_path / "test.db"
@ -288,7 +288,7 @@ class TestEndpoint:
             patch.object(rest_module, "_compiled_patterns", []):
            with TestClient(rest_module.app, raise_server_exceptions=True) as c:
                resp = c.post(
-                    "/turnstone/api/ingest/tautulli",
+                    "/turnstone/api/glean/tautulli",
                    json=_ERROR_PAYLOAD,
                    headers={"X-Tautulli-Token": "wrong"},
                )
@ -296,7 +296,7 @@ class TestEndpoint:

    def test_valid_payload_returns_200(self, client):
        resp = client.post(
-            "/turnstone/api/ingest/tautulli",
+            "/turnstone/api/glean/tautulli",
            json=_ERROR_PAYLOAD,
        )
        assert resp.status_code == 200
--- a/tests/test_ingest_wazuh.py
+++ b/tests/test_ingest_wazuh.py
@ -1,11 +1,11 @@
-"""Tests for the Wazuh alert ingestor."""
+"""Tests for the Wazuh alert gleaner."""
 from __future__ import annotations

 import json
 from datetime import datetime

-from app.ingest.wazuh import is_wazuh_alert, parse
-from app.ingest.pipeline import _detect_format
+from app.glean.wazuh import is_wazuh_alert, parse
+from app.glean.pipeline import _detect_format

 _ALERT = {
    "timestamp": "2024-01-15T10:23:45.123+0000",
--- a/tests/test_service_blocklist.py
+++ b/tests/test_service_blocklist.py
@ -8,7 +8,7 @@ from pathlib import Path

 class TestSchema:
    def test_blocklist_candidates_table_exists(self, tmp_path):
-        from app.ingest.pipeline import ensure_schema
+        from app.glean.pipeline import ensure_schema
        db = tmp_path / "test.db"
        ensure_schema(db)
        conn = sqlite3.connect(str(db))
@ -16,7 +16,7 @@ class TestSchema:
        assert "blocklist_candidates" in tables

    def test_blocklist_candidates_columns(self, tmp_path):
-        from app.ingest.pipeline import ensure_schema
+        from app.glean.pipeline import ensure_schema
        db = tmp_path / "test.db"
        ensure_schema(db)
        conn = sqlite3.connect(str(db))
@ -28,7 +28,7 @@ class TestSchema:
        }

    def test_status_default_is_pending(self, tmp_path):
-        from app.ingest.pipeline import ensure_schema
+        from app.glean.pipeline import ensure_schema
        import uuid
        db = tmp_path / "test.db"
        ensure_schema(db)
@ -89,7 +89,7 @@ class TestTelemetry:
 class TestExtraction:
    @pytest.fixture
    def db(self, tmp_path):
-        from app.ingest.pipeline import ensure_schema
+        from app.glean.pipeline import ensure_schema
        p = tmp_path / "test.db"
        ensure_schema(p)
        return p
@ -195,7 +195,7 @@ class TestExtraction:
 class TestCandidateManagement:
    @pytest.fixture
    def db_with_candidate(self, tmp_path):
-        from app.ingest.pipeline import ensure_schema
+        from app.glean.pipeline import ensure_schema
        import sqlite3, uuid
        db = tmp_path / "test.db"
        ensure_schema(db)
--- a/tests/test_services_diagnose.py
+++ b/tests/test_services_diagnose.py
@ -54,7 +54,7 @@ def test_keywords_cleaned_of_extra_spaces():


 def test_diagnose_with_explicit_window_sets_time_detected(tmp_path):
-    from app.ingest.pipeline import ensure_schema
+    from app.glean.pipeline import ensure_schema
    db = tmp_path / "test.db"
    ensure_schema(db)
    result = diagnose(db, query="plex", since="2026-05-11T14:00:00+00:00", until="2026-05-11T15:00:00+00:00")
--- a/web/src/components/QuickCapture.vue
+++ b/web/src/components/QuickCapture.vue
@ -104,7 +104,7 @@
      <p v-if="severityFilter" class="mb-1">No {{ severityFilter }} entries in this result set.</p>
      <template v-else>
        <p class="mb-1">No log evidence found for "{{ lastQuery }}"</p>
-        <p class="text-sm">Check the Sources tab to confirm data is ingested, or try a broader description.</p>
+        <p class="text-sm">Check the Sources tab to confirm data is gleaned, or try a broader description.</p>
      </template>
    </div>

--- a/web/src/views/DashboardView.vue
+++ b/web/src/views/DashboardView.vue
@ -10,7 +10,7 @@
          class="w-2 h-2 rounded-full flex-shrink-0"
        ></span>
        <span :class="watchActive ? 'text-green-400' : 'text-text-dim'" class="text-xs">
-          {{ watchActive ? `Live — ${watchSources.length} source${watchSources.length !== 1 ? 's' : ''} watched` : 'Manual ingest mode' }}
+          {{ watchActive ? `Live — ${watchSources.length} source${watchSources.length !== 1 ? 's' : ''} watched` : 'Manual glean mode' }}
        </span>
      </div>

@ -20,8 +20,8 @@
        class="flex items-center gap-2 rounded border border-surface-border bg-surface-raised px-4 py-2.5 text-xs text-text-dim"
      >
        <span class="text-sev-warn">⚠</span>
-        <span v-if="watchActive">Live watch active — last event: <span class="text-text-muted">{{ shortTs(stats.last_ingested) }}</span>. Waiting for new entries to arrive.</span>
-        <span v-else>Last ingested: <span class="text-text-muted">{{ shortTs(stats.last_ingested) }}</span> — 24h counts reflect this window, not today.</span>
+        <span v-if="watchActive">Live watch active — last event: <span class="text-text-muted">{{ shortTs(stats.last_gleaned) }}</span>. Waiting for new entries to arrive.</span>
+        <span v-else>Last gleaned: <span class="text-text-muted">{{ shortTs(stats.last_gleaned) }}</span> — 24h counts reflect this window, not today.</span>
      </div>
    </div>

@ -171,7 +171,7 @@ interface StatsResponse {
  criticals_24h: number
  errors_24h: number
  suppressed_criticals: number
-  last_ingested: string | null
+  last_gleaned: string | null
  source_health: SourceHealth[]
  recent_criticals: Array<{
    entry_id: string
@ -186,7 +186,7 @@ interface WatchSourceStatus {
  source_id: string
  type: string
  running: boolean
-  entries_ingested: number
+  entries_gleaned: number
  last_event: string | null
  error: string | null
 }
@ -211,8 +211,8 @@ const watchActive = computed(() =>
 )

+// True when the `last_gleaned` timestamp in the stats payload lies more than
+// 25 hours in the past. Evaluates to false while stats are not loaded or
+// `last_gleaned` is null/empty, so no staleness is reported without a timestamp.
 const isStale = computed(() => {
-  if (!stats.value?.last_ingested) return false
-  const age = Date.now() - new Date(stats.value.last_ingested).getTime()
+  if (!stats.value?.last_gleaned) return false
+  const age = Date.now() - new Date(stats.value.last_gleaned).getTime()
   return age > 25 * 60 * 60 * 1000  // older than 25h
 })

--- a/web/src/views/LogSearchView.vue
+++ b/web/src/views/LogSearchView.vue
@ -106,7 +106,7 @@
        </div>
        <div v-else class="text-center">
          <p class="text-base mb-1">No results for "{{ store.query }}"</p>
-          <p class="text-sm">Try broader terms or check the Sources tab to confirm data is ingested.</p>
+          <p class="text-sm">Try broader terms or check the Sources tab to confirm data is gleaned.</p>
        </div>
      </div>

--- a/web/src/views/SourcesView.vue
+++ b/web/src/views/SourcesView.vue
@ -3,7 +3,7 @@
    <div class="mb-6 flex items-start justify-between gap-4">
      <div>
        <h1 class="text-text-primary text-xl font-semibold mb-1">Log Sources</h1>
-        <p class="text-text-dim text-sm">All hosts and services in the ingested corpus.</p>
+        <p class="text-text-dim text-sm">All hosts and services in the gleaned corpus.</p>
      </div>
      <label class="btn-secondary text-sm cursor-pointer shrink-0">
        <span>Upload log file</span>
@ -21,12 +21,12 @@

    <div v-else-if="sources.length === 0" class="text-text-dim py-12 text-center">
      <p class="mb-1">No log sources found.</p>
-      <p class="text-sm">Run the ingest pipeline: <code class="bg-surface-raised px-1 rounded">python scripts/ingest_corpus.py</code></p>
+      <p class="text-sm">Run the glean pipeline: <code class="bg-surface-raised px-1 rounded">python scripts/glean_corpus.py</code></p>
    </div>

    <div v-else class="rounded border border-surface-border overflow-hidden">
      <div class="overflow-x-auto">
-        <table class="w-full text-sm min-w-[560px]">
+        <table class="w-full text-sm min-w-[620px]">
          <thead class="bg-surface-raised border-b border-surface-border">
            <tr>
              <th class="text-left px-4 py-2.5 text-text-dim font-medium text-xs uppercase tracking-wider">Source</th>
@ -40,29 +40,72 @@
          <tbody>
            <tr
              v-for="src in sources"
-              :key="src.source_id"
+              :key="src.id"
              class="border-b border-surface-border hover:bg-surface-raised transition-colors"
            >
-              <td class="px-4 py-2.5 text-accent">{{ src.source_id }}</td>
-              <td class="px-4 py-2.5 text-text-muted text-right tabular-nums">{{ src.entry_count.toLocaleString() }}</td>
+              <!-- Source name + badges -->
+              <td class="px-4 py-2.5">
+                <div class="flex flex-wrap items-center gap-1.5">
+                  <span class="text-accent font-mono text-xs">{{ src.id }}</span>
+                  <!-- SSH transport badge -->
+                  <span
+                    v-if="src.transport === 'ssh'"
+                    class="inline-flex items-center gap-1 px-1.5 py-0.5 rounded text-[10px] font-medium
+                           bg-blue-900/30 text-blue-400 border border-blue-800/40"
+                    :title="`SSH: ${src.user}@${src.host}`"
+                  >
+                    <svg class="w-2.5 h-2.5" viewBox="0 0 16 16" fill="currentColor" aria-hidden="true">
+                      <path d="M2 3a1 1 0 011-1h10a1 1 0 011 1v2a1 1 0 01-1 1H3a1 1 0 01-1-1V3zm0 5a1 1 0 011-1h4a1 1 0 110 2H3a1 1 0 01-1-1zm0 4a1 1 0 011-1h2a1 1 0 110 2H3a1 1 0 01-1-1z"/>
+                    </svg>
+                    ssh
+                  </span>
+                  <!-- Glean-type pills for SSH sources -->
+                  <span
+                    v-for="gtype in (src.glean_types ?? [])"
+                    :key="gtype"
+                    class="px-1.5 py-0.5 rounded text-[10px] font-medium
+                           bg-surface-raised text-text-dim border border-surface-border"
+                  >{{ gtype }}</span>
+                  <!-- Upload badge for DB-only sources not in sources.yaml -->
+                  <span
+                    v-if="src.dbOnly"
+                    class="px-1.5 py-0.5 rounded text-[10px] font-medium
+                           bg-surface-raised text-text-dim border border-surface-border"
+                  >uploaded</span>
+                </div>
+                <!-- SSH host subtitle -->
+                <div v-if="src.transport === 'ssh'" class="text-text-dim text-xs mt-0.5 font-mono">
+                  {{ src.user }}@{{ src.host }}
+                </div>
+              </td>
+
+              <!-- Entry count -->
+              <td class="px-4 py-2.5 text-text-muted text-right tabular-nums">
+                {{ src.entry_count.toLocaleString() }}
+              </td>
+
+              <!-- Error count -->
              <td class="px-4 py-2.5 text-right tabular-nums">
                <span :class="src.error_count > 0 ? 'text-sev-error' : 'text-text-dim'">
                  {{ src.error_count.toLocaleString() }}
                </span>
              </td>
+
              <td class="px-4 py-2.5 text-text-dim text-xs">{{ formatTs(src.earliest) }}</td>
              <td class="px-4 py-2.5 text-text-dim text-xs">{{ formatTs(src.latest) }}</td>
+
+              <!-- Actions -->
              <td class="px-4 py-2.5">
                <div class="flex items-center justify-end gap-2">
                  <button
-                    :disabled="busy.has(src.source_id)"
-                    @click="reingest(src.source_id)"
+                    :disabled="busy.has(src.id) || src.dbOnly"
+                    @click="reglean(src.id)"
                    class="text-text-dim hover:text-accent transition-colors text-xs px-2 py-1 rounded hover:bg-surface disabled:opacity-40"
-                    title="Re-ingest from sources.yaml"
-                  >{{ busy.has(src.source_id) ? '…' : 'reingest' }}</button>
+                    :title="src.dbOnly ? 'Not in sources.yaml — cannot re-glean' : 'Re-glean from sources.yaml'"
+                  >{{ busy.has(src.id) ? '…' : 'reglean' }}</button>
                  <button
-                    :disabled="busy.has(src.source_id)"
-                    @click="deleteSource(src.source_id)"
+                    :disabled="busy.has(src.id)"
+                    @click="deleteSource(src.id)"
                    class="text-text-dim hover:text-sev-error transition-colors text-xs px-2 py-1 rounded hover:bg-surface disabled:opacity-40"
                    title="Delete all entries for this source"
                  >delete</button>
@ -78,9 +121,36 @@

 <script setup lang="ts">
 import { ref, onMounted } from 'vue'
-import type { LogSource } from '@/stores/search'

-const sources = ref<LogSource[]>([])
+// Unified source row shown in the table (merges configured + DB-only sources).
+// loadSources() builds the list as configured rows followed by DB-only rows.
+interface SourceRow {
+  // Used as the v-for key and as the id passed to the reglean / delete actions.
+  id: string
+  transport: 'local' | 'ssh'
+  // SSH-specific; the template renders these as `user@host` when transport === 'ssh'.
+  host?: string
+  user?: string
+  // Rendered as one pill per entry; a missing list is treated as empty.
+  glean_types?: string[]
+  // Local-specific
+  path?: string
+  // DB stats (always present, default 0/null). The template calls
+  // toLocaleString() on both counts unguarded, so they must be numbers.
+  entry_count: number
+  error_count: number
+  earliest: string | null
+  latest: string | null
+  // True when this source exists in the DB but not in sources.yaml (e.g. uploads).
+  // Such rows show an "uploaded" badge and have their reglean button disabled.
+  dbOnly?: boolean
+}
+
+// Element type loadSources() assigns to the `sources` array of the
+// /api/sources/configured response: a SourceRow without the client-side
+// `dbOnly` flag. Declared as a type alias rather than an empty-bodied interface,
+// which adds no members and is flagged by typescript-eslint's empty-interface rules.
+type ConfiguredSource = Omit<SourceRow, 'dbOnly'>
+// Element type loadSources() assigns to the `sources` array of the
+// /api/sources response (per-source stats taken from the DB).
+interface DbSource {
+  // Becomes SourceRow.id for DB-only rows. loadSources() treats ids equal to a
+  // configured SSH source id, or prefixed with "<that id>/", as already covered.
+  source_id: string
+  entry_count: number
+  error_count: number
+  // Passed to formatTs() for display; null is allowed by the type.
+  earliest: string | null
+  latest: string | null
+}
+
+const sources = ref<SourceRow[]>([])
 const loading = ref(true)
 const busy = ref(new Set<string>())
 const actionMsg = ref('')
@ -90,11 +160,52 @@ const BASE = import.meta.env.BASE_URL.replace(/\/$/, '')

 async function loadSources(): Promise<void> {
  try {
-    const res = await fetch(`${BASE}/api/sources`)
-    if (res.ok) {
-      const data = await res.json()
-      sources.value = data.sources
+    // Primary list: configured sources from sources.yaml (enriched with DB stats).
+    // This makes SSH sources visible even before their first glean.
+    const [configuredRes, dbRes] = await Promise.all([
+      fetch(`${BASE}/api/sources/configured`),
+      fetch(`${BASE}/api/sources`),
+    ])
+
+    const configuredData = configuredRes.ok ? await configuredRes.json() : { sources: [] }
+    const dbData = dbRes.ok ? await dbRes.json() : { sources: [] }
+
+    const configuredSources: ConfiguredSource[] = configuredData.sources ?? []
+    const dbSources: DbSource[] = dbData.sources ?? []
+
+    // Build a set of all IDs represented by configured sources.
+    // SSH sources own all sub-source IDs like "rack01/journald" too.
+    const coveredIds = new Set<string>()
+    for (const s of configuredSources) {
+      coveredIds.add(s.id)
    }
+
+    // For SSH sources, also mark sub-source IDs (rack01/…) as covered so they
+    // don't appear as separate "uploaded" rows.
+    for (const s of configuredSources) {
+      if (s.transport === 'ssh') {
+        for (const db of dbSources) {
+          if (db.source_id.startsWith(s.id + '/') || db.source_id === s.id) {
+            coveredIds.add(db.source_id)
+          }
+        }
+      }
+    }
+
+    // DB-only sources: uploaded files or manually gleaned sources not in sources.yaml.
+    const dbOnly: SourceRow[] = dbSources
+      .filter(db => !coveredIds.has(db.source_id))
+      .map(db => ({
+        id: db.source_id,
+        transport: 'local' as const,
+        entry_count: db.entry_count,
+        error_count: db.error_count,
+        earliest: db.earliest,
+        latest: db.latest,
+        dbOnly: true,
+      }))
+
+    sources.value = [...configuredSources as SourceRow[], ...dbOnly]
  } finally {
    loading.value = false
  }
@ -118,7 +229,13 @@ async function deleteSource(sourceId: string): Promise<void> {
      const data = await res.json()
      actionMsg.value = `Deleted ${data.deleted.toLocaleString()} entries for "${sourceId}"`
      actionError.value = false
-      sources.value = sources.value.filter(s => s.source_id !== sourceId)
+      // Remove DB-only rows; zero-out configured-source stats instead of hiding.
+      sources.value = sources.value
+        .filter(s => !(s.id === sourceId && s.dbOnly))
+        .map(s => s.id === sourceId
+          ? { ...s, entry_count: 0, error_count: 0, earliest: null, latest: null }
+          : s
+        )
    } else {
      const data = await res.json()
      actionMsg.value = data.detail ?? 'Delete failed'
@ -129,19 +246,19 @@ async function deleteSource(sourceId: string): Promise<void> {
  }
 }

-async function reingest(sourceId: string): Promise<void> {
+async function reglean(sourceId: string): Promise<void> {
  setBusy(sourceId, true)
  actionMsg.value = ''
  actionError.value = false
  try {
-    const res = await fetch(`${BASE}/api/sources/${encodeURIComponent(sourceId)}/ingest`, { method: 'POST' })
+    const res = await fetch(`${BASE}/api/sources/${encodeURIComponent(sourceId)}/glean`, { method: 'POST' })
    const data = await res.json()
    if (res.ok) {
-      actionMsg.value = `Re-ingest complete: ${data.ingested.toLocaleString()} new entries for "${sourceId}"`
+      actionMsg.value = `Re-glean complete: ${data.gleaned.toLocaleString()} new entries for "${sourceId}"`
      actionError.value = false
      await loadSources()
    } else {
-      actionMsg.value = data.detail ?? 'Re-ingest failed'
+      actionMsg.value = data.detail ?? 'Re-glean failed'
      actionError.value = true
    }
  } finally {
@ -156,10 +273,10 @@ async function handleUpload(e: Event): Promise<void> {
  actionError.value = false
  const form = new FormData()
  form.append('file', file)
-  const res = await fetch(`${BASE}/api/ingest/upload`, { method: 'POST', body: form })
+  const res = await fetch(`${BASE}/api/glean/upload`, { method: 'POST', body: form })
  const data = await res.json()
  if (res.ok) {
-    actionMsg.value = `Uploaded: ${data.ingested.toLocaleString()} entries ingested as "${data.source_id}"`
+    actionMsg.value = `Uploaded: ${data.gleaned.toLocaleString()} entries gleaned as "${data.source_id}"`
    actionError.value = false
    await loadSources()
  } else {