134 changed files with 1236 additions and 19372 deletions
--- a/.env.example
+++ b/.env.example
@ -10,7 +10,7 @@
 # GPU_SERVER_URL — URL of your GPU inference server (Ollama, vLLM, or cf-orch coordinator).
 # Paid+ users: leave unset to auto-default to https://orch.circuitforge.tech via CF_LICENSE_KEY.
 # Local Ollama (default if unset): http://localhost:11434
-# Local cf-orch coordinator: http://<YOUR_HOST_IP>:7700
+# Local cf-orch coordinator: http://<YOUR_HOST_IP>:7700
 # CF_ORCH_URL is also accepted as a backward-compatible alias.
 # GPU_SERVER_URL=http://localhost:11434
@ -22,83 +22,3 @@
 # --- Bundle endpoint (optional) ---
 # Remote endpoint to push diagnostic bundles for escalation.
 # TURNSTONE_BUNDLE_ENDPOINT=https://example.com/api/bundles
 # --- Log corpus export to Avocet (optional) ---
 # Push ERROR/CRITICAL entries and labeled incidents to the Avocet corpus endpoint
 # for logreading fine-tune training. Requires a consent token issued by CF.
 # Contact alan@circuitforge.tech to register your node and receive a token.
 # Watermarks are stored at data/corpus_watermark.txt and data/incident_watermark.txt.
 # AVOCET_CORPUS_ENDPOINT=https://avocet.circuitforge.tech/api/corpus/log-batch
 # AVOCET_CONSENT_TOKEN=your-uuid-token-here
 # TURNSTONE_SOURCE_HOST=my-server-name   # defaults to system hostname if unset
 # --- Periodic batch glean ---
 # Seconds between automatic glean runs from sources.yaml. Set to 0 to disable.
 # TURNSTONE_GLEAN_INTERVAL=900
 # --- Multi-agent diagnose pipeline (experimental) ---
 # Enable the 5-stage ML pipeline instead of the single-LLM summarize() call.
 # TURNSTONE_MULTI_AGENT_DIAGNOSE=true
 # Stage 2 — ML severity classifier (optional; falls back to pattern_tags then regex).
 # Recommended: byviz/bylastic_classification_logs (~300MB, downloaded from HuggingFace)
 # TURNSTONE_CLASSIFIER_MODEL=byviz/bylastic_classification_logs
 # Stage 4 — Embedding backend for false-positive suppression.
 # sentence_transformers: in-process local model (downloads on first use)
 # ollama: uses a running Ollama instance (no download needed if model is already pulled)
 # TURNSTONE_EMBED_BACKEND=sentence_transformers
 # TURNSTONE_EMBED_MODEL=BAAI/bge-small-en-v1.5
 # TURNSTONE_EMBED_DEVICE=cpu
 # --- Cybersec scoring pipeline (zero-shot, second-pass on flagged entries) ---
 # Runs a zero-shot classifier on entries already flagged by the anomaly scorer
 # or that have pattern matches — a focused second opinion using cybersec vocabulary.
 # The DeBERTa-v3-base-mnli model (required by the diagnose pipeline) is the recommended
 # zero-shot classifier — it produces human-readable cybersec labels with no fine-tuning.
 # TURNSTONE_CYBERSEC_MODEL=MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli
 # TURNSTONE_CYBERSEC_DEVICE=cpu
 # TURNSTONE_CYBERSEC_THRESHOLD=0.60   # lower than anomaly threshold (zero-shot is calibrated differently)
 # --- Anomaly scoring pipeline (IDS / watchdog) ---
 # Batch-scores every ingested log entry after each glean cycle.
 # Any HuggingFace text-classification model works; the byviz classifier (already
 # required by the diagnose pipeline) is the recommended starting point.
 # Detections above the threshold are inserted into the detections table and
 # surfaced in the Security Alerts tab.
 #
 # Set TURNSTONE_ANOMALY_MODEL to enable; leave unset to disable (safe default).
 # TURNSTONE_ANOMALY_MODEL=byviz/bylastic_classification_logs
 # TURNSTONE_ANOMALY_DEVICE=cpu          # or "cuda" / "mps" for GPU inference
 # TURNSTONE_ANOMALY_THRESHOLD=0.80      # confidence floor for detection insertion
 # TURNSTONE_ANOMALY_INTERVAL=0          # standalone loop (0 = glean-triggered only)
 #
 # HuggingFace model cache — share with the host to avoid re-downloading models.
 # HF_HOME=/hf_cache                     # inside container (set in docker-compose)
 # HF_CACHE_PATH=/Library/Assets/LLM    # host bind-mount source (docker-compose only)
 # --- Air-gapped / offline deployment ---
 # Set to 1 to block all HuggingFace hub network access at runtime.
 # Pre-download models to ~/.cache/huggingface/ before deploying — see docs/air-gapped-deployment.md.
 # TURNSTONE_OFFLINE_MODE=1
 # --- API authentication ---
 # When set, all /api/ requests require: Authorization: Bearer <token>
 # Generate a token: python -c "import secrets; print(secrets.token_urlsafe(32))"
 # TURNSTONE_API_KEY=your-secret-token-here
 # --- The Orchard (harvest receiver only) ---
 # Set on the central harvest.circuitforge.tech instance to enable branch management.
 # TURNSTONE_ORCHARD_ADMIN_KEY=your-admin-secret-here
 # TURNSTONE_ORCHARD_DATA_ROOT=/devl/docker/turnstone-submissions
 # TURNSTONE_ORCHARD_CADDYFILE=/devl/caddy-proxy/Caddyfile
 # TURNSTONE_ORCHARD_CADDY_CONTAINER=caddy-proxy
 # TURNSTONE_ORCHARD_HARVEST_HOST=https://harvest.circuitforge.tech
 # TURNSTONE_ORCHARD_PORT_BASE=8538
 # TURNSTONE_ORCHARD_IMAGE=localhost/turnstone:latest
 # --- Orchard branch (submitting node) ---
 # Set TURNSTONE_SUBMIT_ENDPOINT to push pattern-matched log entries to the harvest receiver.
 # Generate your branch slug and API key via: POST /api/orchard/graft on the harvest instance.
 # TURNSTONE_SUBMIT_ENDPOINT=https://harvest.circuitforge.tech/your-slug
 # TURNSTONE_BRANCH_KEY=api-key-from-graft-response
--- a/.nfs0000000000bbcf52000002e7
+++ b/.nfs0000000000bbcf52000002e7
@ -1,308 +0,0 @@
 #!/usr/bin/env bash
 # manage.sh — Turnstone diagnostic intelligence layer
 # Usage: ./manage.sh <command> [args]
 set -euo pipefail
 # Only emit color codes when stdout is a real terminal
 if [[ -t 1 ]]; then
    RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'; BLUE='\033[0;34m'; NC='\033[0m'
 else
    RED=''; GREEN=''; YELLOW=''; BLUE=''; NC=''
 fi
 # Message helpers: each prints a coloured "[turnstone]" tag followed by all of
 # its arguments. info/success/warn write to stdout.
 info()    { echo -e "${BLUE}[turnstone]${NC} $*"; }
 success() { echo -e "${GREEN}[turnstone]${NC} $*"; }
 warn()    { echo -e "${YELLOW}[turnstone]${NC} $*"; }
 # error writes to stderr and then terminates the script with exit status 1,
 # so callers never return from it.
 error()   { echo -e "${RED}[turnstone]${NC} $*" >&2; exit 1; }
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR"
 API_PORT=8534           # FastAPI: serves REST API + built Vue SPA
 VITE_PORT=5174          # Vite HMR port in dev mode (proxies /api → 8534)
 LOG_DIR="log"
 API_PID_FILE=".turnstone-api.pid"
 DB="${TURNSTONE_DB:-${SCRIPT_DIR}/data/turnstone.db}"
 CONDA_BASE="${CONDA_BASE:-/devl/miniconda3}"
 PYTHON="${CONDA_BASE}/envs/cf/bin/python"
 # ── Helpers ───────────────────────────────────────────────────────────────────
 _is_alive() {
    # Succeed only when the PID file ($1) exists and the PID stored in it
    # answers a `kill -0` probe (signal 0 checks the process without
    # signalling it). Any probe error output is discarded.
    local pidfile="$1"
    if [[ ! -f "$pidfile" ]]; then
        return 1
    fi
    kill -0 "$(<"$pidfile")" 2>/dev/null
 }
 _kill_pid_file() {
    # Stop the process recorded in a PID file.
    #   $1  path to the PID file
    #   $2  label used in the messages
    # Three outcomes: no PID file (warn), PID file present but the process is
    # gone (warn and delete the stale file), or process alive (send the default
    # kill signal, delete the file when the signal was sent, report success).
    local pid_file="$1" label="$2"
    local pid

    if [[ ! -f "$pid_file" ]]; then
        warn "$label not running."
        return
    fi

    pid=$(<"$pid_file")
    if ! kill -0 "$pid" 2>/dev/null; then
        warn "Stale PID file for $label (PID $pid not running). Cleaning up."
        rm -f "$pid_file"
        return
    fi

    kill "$pid" && rm -f "$pid_file"
    success "$label stopped (PID $pid)."
 }
 _wait_for_port() {
    # Wait for a freshly started service to accept TCP connections.
    #   $1  port probed on 127.0.0.1
    #   $2  label used in error messages
    #   $3  PID file of the process expected to bind the port
    # Makes up to 20 probes spaced 0.5 s apart. Returns 0 on the first
    # successful connect; aborts via error() if the process dies first or the
    # port never opens.
    local port="$1" label="$2" pid_file="$3"
    local attempt
    for ((attempt = 1; attempt <= 20; attempt++)); do
        sleep 0.5
        # Opening /dev/tcp/<host>/<port> in a subshell is a connect() probe.
        if (echo "" >/dev/tcp/127.0.0.1/"$port") 2>/dev/null; then
            return 0
        fi
        if _is_alive "$pid_file"; then
            continue
        fi
        rm -f "$pid_file"
        error "$label died during startup. Check ${LOG_DIR}/api.log"
    done
    error "$label did not bind to port $port within 10 s."
 }
 # ── Usage ─────────────────────────────────────────────────────────────────────
 # Print the command reference to stdout: production commands, the dev
 # command, data commands, the test command, the resolved DB path, and a few
 # example invocations. Lines using `echo -e` embed the colour variables.
 usage() {
    echo ""
    echo -e "  ${BLUE}Turnstone — Diagnostic Log Intelligence${NC}"
    echo ""
    echo "  Usage: ./manage.sh <command> [args]"
    echo ""
    echo "  Production-like (built SPA + uvicorn):"
    echo -e "    ${GREEN}start${NC}                    Build Vue SPA, start FastAPI + SPA on :${API_PORT}"
    echo -e "    ${GREEN}stop${NC}                     Stop the server"
    echo -e "    ${GREEN}restart${NC}                  Stop then start"
    echo -e "    ${GREEN}status${NC}                   Show running process"
    echo -e "    ${GREEN}logs${NC}                     Tail server log"
    echo -e "    ${GREEN}open${NC}                     Open UI in browser"
    echo ""
    echo "  Development (hot-reload):"
    echo -e "    ${GREEN}dev${NC}                      uvicorn --reload (:${API_PORT}) + Vite HMR (:${VITE_PORT})"
    echo ""
    echo "  Data:"
    echo -e "    ${GREEN}ingest PATH [DB]${NC}         Ingest a log file or corpus directory"
    echo -e "    ${GREEN}ingest-plex [HOST]${NC}       Pull Plex log from Cass (or HOST) and ingest"
    echo -e "    ${GREEN}ingest-qbit [HOST]${NC}       Pull qBittorrent log locally or from HOST via SSH"
    echo -e "    ${GREEN}build-fts${NC}                Rebuild the FTS search index"
    echo ""
    echo "  Tests:"
    echo -e "    ${GREEN}test [args]${NC}              Run pytest suite"
    echo ""
    echo "  DB: ${DB}"
    echo "  Conda env: cf"
    echo ""
    echo "  Examples:"
    echo "    ./manage.sh start"
    echo "    ./manage.sh dev"
    echo "    ./manage.sh ingest corpus/raw/"
    echo "    ./manage.sh ingest corpus/raw/ data/custom.db"
    echo ""
 }
 # ── Commands ──────────────────────────────────────────────────────────────────
 CMD="${1:-help}"
 shift || true
 case "$CMD" in
    start)
        # Build the Vue SPA, then launch uvicorn in the background and wait
        # until it accepts connections on API_PORT. A second `start` while the
        # recorded PID is alive is a no-op that exits 0.
        if _is_alive "$API_PID_FILE"; then
            warn "Already running (PID $(<"$API_PID_FILE")) — use 'restart' to rebuild."
            exit 0
        fi
        mkdir -p "$LOG_DIR" data
        info "Building Vue SPA…"
        # Record npm's exit status on its own. The earlier
        # `… | tee | grep … || true` pipeline threw it away, so a failed build
        # was still announced as "SPA built" and the server was started anyway.
        build_status=0
        (cd web && npm run build) > "${LOG_DIR}/build.log" 2>&1 || build_status=$?
        # Show only the summary/error lines; grep finding nothing is not fatal.
        grep -E "built in|error" "${LOG_DIR}/build.log" || true
        if [[ "$build_status" -ne 0 ]]; then
            error "SPA build failed (exit ${build_status}). See ${LOG_DIR}/build.log"
        fi
        success "SPA built → web/dist/"
        info "Starting on port ${API_PORT}…"
        # nohup + background so the server outlives this shell; $! is its PID.
        TURNSTONE_DB="$DB" nohup "$PYTHON" -m uvicorn app.rest:app \
            --host 0.0.0.0 --port "$API_PORT" \
            >> "${LOG_DIR}/api.log" 2>&1 &
        echo $! > "$API_PID_FILE"
        _wait_for_port "$API_PORT" "Turnstone" "$API_PID_FILE"
        success "Running → http://localhost:${API_PORT}  (PID $(<"$API_PID_FILE"))"
        ;;
    # stop: terminate the process recorded in the API PID file.
    stop)
        _kill_pid_file "$API_PID_FILE" "Turnstone"
        ;;
    # restart: run `stop` in a child shell, then replace this shell with `start`.
    restart)
        bash "$0" stop
        exec bash "$0" start
        ;;
    # status: report whether the PID in the API PID file is alive.
    status)
        echo ""
        if _is_alive "$API_PID_FILE"; then
            success "Turnstone RUNNING  PID $(<"$API_PID_FILE")  → http://localhost:${API_PORT}"
        else
            echo -e "  Turnstone ${RED}STOPPED${NC}"
        fi
        echo ""
        ;;
    # logs: follow the server log until interrupted.
    logs)
        tail -f "${LOG_DIR}/api.log"
        ;;
    # open: launch the UI URL with xdg-open or open, whichever is found first;
    # when neither exists, just print the URL.
    open)
        URL="http://localhost:${API_PORT}"
        info "Opening ${URL}"
        if command -v xdg-open &>/dev/null; then xdg-open "$URL"
        elif command -v open &>/dev/null; then open "$URL"
        else echo "$URL"
        fi
        ;;
    # dev: run uvicorn with --reload in the background (unless a dev API is
    # already alive), then run the Vite dev server in the foreground.
    dev)
        DEV_API_PID=".turnstone-dev-api.pid"
        mkdir -p "$LOG_DIR" data
        if _is_alive "$DEV_API_PID"; then
            warn "Dev API already running (PID $(<"$DEV_API_PID"))"
        else
            info "Starting uvicorn --reload on port ${API_PORT}…"
            TURNSTONE_DB="$DB" nohup "$PYTHON" -m uvicorn app.rest:app \
                --host 0.0.0.0 --port "$API_PORT" --reload \
                >> "${LOG_DIR}/api.log" 2>&1 &
            echo $! > "$DEV_API_PID"
            _wait_for_port "$API_PORT" "FastAPI (dev)" "$DEV_API_PID"
            success "API (hot-reload) → http://localhost:${API_PORT}"
        fi
        # Kill the PID recorded in the dev PID file (if readable and non-empty)
        # and remove the file once the kill succeeded.
        _cleanup_dev() {
            local pid
            pid=$(<"$DEV_API_PID" 2>/dev/null) || true
            [[ -n "${pid:-}" ]] && kill "$pid" 2>/dev/null && rm -f "$DEV_API_PID"
            info "Dev servers stopped."
        }
        # NOTE(review): the handler is registered for EXIT as well as INT/TERM,
        # so it presumably runs twice after Ctrl-C (signal, then exit) — confirm.
        trap _cleanup_dev EXIT INT TERM
        info "Starting Vite HMR on port ${VITE_PORT}…"
        success "Frontend (HMR) → http://localhost:${VITE_PORT}"
        # Foreground: this call blocks until the Vite dev server exits.
        (cd web && npm run dev -- --port "$VITE_PORT")
        ;;
    # ingest: pass a file or directory ($1) to scripts/ingest_corpus.py.
    # The optional second argument overrides the default DB path.
    ingest)
        if [[ $# -lt 1 ]]; then
            error "Usage: ./manage.sh ingest <file_or_dir> [DB_PATH]"
        fi
        info "Ingesting $1 → ${2:-$DB}…"
        "$PYTHON" scripts/ingest_corpus.py "$1" "${2:-$DB}"
        ;;
    ingest-plex)
        # Copy every "Plex Media Server*.log" from the Plex host over SSH into
        # a scratch directory, ingest each one into $DB, then restart the server.
        #   $1  SSH host to pull from (default: cass)
        PLEX_HOST="${1:-cass}"
        PLEX_LOG_DIR="/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs"
        TMP_DIR="/tmp/turnstone-plex-$$"
        mkdir -p "$TMP_DIR"
        info "Listing Plex logs on ${PLEX_HOST}…"
        # Get list of all rotated + active Plex logs.
        # The listing is captured with command substitution so ssh's exit status
        # is observable; with `mapfile < <(ssh …) || …` the status of the process
        # substitution was never seen and the SSH-failure branch could not fire.
        plex_listing=""
        plex_ssh_status=0
        plex_listing=$(ssh "$PLEX_HOST" \
            "ls '${PLEX_LOG_DIR}'/Plex\ Media\ Server*.log 2>/dev/null") \
            || plex_ssh_status=$?
        # ssh itself exits 255 on connection/authentication errors; any other
        # non-zero status is the remote `ls` (e.g. no matching files).
        if [[ "$plex_ssh_status" -eq 255 ]]; then
            rm -rf "$TMP_DIR"
            error "SSH to ${PLEX_HOST} failed."
        fi
        REMOTE_LOGS=()
        if [[ -n "$plex_listing" ]]; then
            mapfile -t REMOTE_LOGS <<< "$plex_listing"
        fi
        if [[ ${#REMOTE_LOGS[@]} -eq 0 ]]; then
            rm -rf "$TMP_DIR"
            error "No Plex logs found on ${PLEX_HOST} at ${PLEX_LOG_DIR}"
        fi
        for remote_path in "${REMOTE_LOGS[@]}"; do
            # Plex Media Server.1.log → cass-plex_media_server.1.log
            local_name="${PLEX_HOST}-$(basename "$remote_path" | tr ' ' '_' | tr '[:upper:]' '[:lower:]')"
            local_path="${TMP_DIR}/${local_name}"
            info "  ← $(basename "$remote_path")"
            ssh "$PLEX_HOST" "cat '${remote_path}'" > "$local_path"
        done
        info "Ingesting ${#REMOTE_LOGS[@]} log file(s) into ${DB}…"
        for f in "$TMP_DIR"/*.log; do
            "$PYTHON" scripts/ingest_corpus.py "$f" "$DB"
        done
        rm -rf "$TMP_DIR"
        info "Done. Restarting server…"
        exec bash "$0" restart
        ;;
    # ingest-qbit: locate a qBittorrent log (on $1 via SSH when a host is
    # given, otherwise on this machine), copy it to a scratch directory, ingest
    # it into $DB, then restart the server.
    ingest-qbit)
        QBIT_HOST="${1:-}"
        # Default log locations in priority order
        # NOTE(review): $HOME is expanded on the local machine, yet the same
        # paths are tested on the remote host below — confirm the remote user's
        # home directory matches.
        QBIT_LOG_PATHS=(
            "$HOME/.local/share/qBittorrent/logs/qbittorrent.log"
            "$HOME/.config/qBittorrent/logs/qbittorrent.log"
            "/var/log/qbittorrent/qbittorrent.log"
        )
        TMP_DIR="/tmp/turnstone-qbit-$$"
        mkdir -p "$TMP_DIR"
        if [[ -n "$QBIT_HOST" ]]; then
            info "Fetching qBittorrent log from ${QBIT_HOST}…"
            REMOTE_LOG=""
            # The first candidate path that exists on the remote host wins.
            for p in "${QBIT_LOG_PATHS[@]}"; do
                if ssh "$QBIT_HOST" "test -f '$p'" 2>/dev/null; then
                    REMOTE_LOG="$p"
                    break
                fi
            done
            if [[ -z "$REMOTE_LOG" ]]; then
                rm -rf "$TMP_DIR"
                error "No qBittorrent log found on ${QBIT_HOST}. Tried: ${QBIT_LOG_PATHS[*]}"
            fi
            # Prefix the local copy with the host name.
            local_name="${QBIT_HOST}-qbittorrent.log"
            ssh "$QBIT_HOST" "cat '$REMOTE_LOG'" > "${TMP_DIR}/${local_name}"
            info "  ← ${REMOTE_LOG} (${QBIT_HOST})"
        else
            LOCAL_LOG=""
            # The first candidate path that exists locally wins.
            for p in "${QBIT_LOG_PATHS[@]}"; do
                if [[ -f "$p" ]]; then
                    LOCAL_LOG="$p"
                    break
                fi
            done
            if [[ -z "$LOCAL_LOG" ]]; then
                rm -rf "$TMP_DIR"
                error "No qBittorrent log found locally. Tried: ${QBIT_LOG_PATHS[*]}"
            fi
            cp "$LOCAL_LOG" "${TMP_DIR}/qbittorrent.log"
            info "  ← ${LOCAL_LOG}"
        fi
        info "Ingesting into ${DB}…"
        # Either branch above leaves exactly one *.log file in TMP_DIR.
        "$PYTHON" scripts/ingest_corpus.py "${TMP_DIR}"/*.log "$DB"
        rm -rf "$TMP_DIR"
        info "Done. Restarting server…"
        exec bash "$0" restart
        ;;
    # build-fts: run scripts/build_fts_index.py against $DB.
    build-fts)
        info "Rebuilding FTS index for ${DB}…"
        TURNSTONE_DB="$DB" "$PYTHON" scripts/build_fts_index.py "$DB"
        success "FTS index rebuilt."
        ;;
    # test: run pytest from the cf conda env on tests/ with TURNSTONE_DB set to
    # ":memory:"; extra arguments are forwarded to pytest.
    test)
        info "Running test suite…"
        PYTEST="${CONDA_BASE}/envs/cf/bin/pytest"
        [[ -x "$PYTEST" ]] || error "pytest not found in cf env at ${PYTEST}"
        TURNSTONE_DB=":memory:" "$PYTEST" tests/ -v "$@"
        ;;
    # help: print the usage text (also the default when no command is given).
    help|--help|-h)
        usage
        ;;
    # Anything else is rejected via error(), which exits with status 1.
    *)
        error "Unknown command: ${CMD}. Run './manage.sh help' for usage."
        ;;
 esac
--- a/11
+++ b/11
@ -18,7 +18,7 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
 # sqlite-vec: optional vector search extension for context embedding (Paid tier)
-RUN set -e; \
+RUN set -eux; \
    SVEC_VER=0.1.6; \
    ARCH=$(uname -m); \
    case "$ARCH" in \
@ -26,11 +26,10 @@ RUN set -e; \
      aarch64) SVEC_ARCH="aarch64-linux-gnu" ;; \
      *)        echo "sqlite-vec: unsupported arch $ARCH — skipping" && exit 0 ;; \
    esac; \
-    curl -fsSL -o /tmp/sqlite_vec.tar.gz \
+    wget -q -O /tmp/sqlite_vec.tar.gz \
-      "https://github.com/asg017/sqlite-vec/releases/download/v${SVEC_VER}/sqlite-vec-${SVEC_VER}-loadable-linux-${SVEC_ARCH}.tar.gz" \
+      "https://github.com/asg017/sqlite-vec/releases/download/v${SVEC_VER}/sqlite-vec-${SVEC_VER}-loadable-linux-${SVEC_ARCH}.tar.gz"; \
-    && tar -xz -C /usr/lib/python3/ -f /tmp/sqlite_vec.tar.gz --wildcards '*.so' \
+    tar -xz -C /usr/lib/python3/ -f /tmp/sqlite_vec.tar.gz --wildcards '*.so' || true; \
-    && rm /tmp/sqlite_vec.tar.gz \
+    rm /tmp/sqlite_vec.tar.gz
    || echo "sqlite-vec optional extension unavailable — vector search disabled"
 COPY app/ ./app/
 COPY patterns/ ./patterns/
--- a/README.md
+++ b/README.md
@ -28,8 +28,8 @@ Service logs (journald, Docker, syslog, Caddy, Plex, arr stack, qBittorrent, dme
 ## Features
- **Multi-source glean** — journald, Docker, syslog, Caddy, dmesg, Plex, Servarr (arr stack), qBittorrent, plaintext; paths configured in `patterns/sources.yaml`
+- **Multi-source ingest** — journald, Docker, syslog, Caddy, dmesg, Plex, Servarr (arr stack), qBittorrent, plaintext; paths configured in `patterns/sources.yaml`
- **Pattern tagging** — named regex patterns applied at glean time (`service_restart`, `auth_failure`, `oom`, `segfault`, `disk_full`, `timeout`, …); extend in `patterns/default.yaml`
+- **Pattern tagging** — named regex patterns applied at ingest time (`service_restart`, `auth_failure`, `oom`, `segfault`, `disk_full`, `timeout`, …); extend in `patterns/default.yaml`
 - **Full-text search** — SQLite FTS5 index across all ingested entries; filter by source, severity, time window
 - **Natural-language time queries** — "what happened yesterday morning", "show me errors from the last 3 hours"; powered by dateparser
 - **Incident management** — create, label, and track incidents; attach supporting log entries
@ -101,13 +101,13 @@ sources:
    path: /var/log/caddy/access.log
 ```
-For `journald` sources, run `scripts/export_journal.sh` on the host before each glean (e.g. via cron). Missing paths are skipped with a warning — safe to leave entries for services that are temporarily down.
+For `journald` sources, run `scripts/export_journal.sh` on the host before each ingest (e.g. via cron). Missing paths are skipped with a warning — safe to leave entries for services that are temporarily down.
 ---
 ## Pattern library
-Named patterns in `patterns/default.yaml` are matched against every log entry at glean time. Matched pattern names are stored and used to boost search relevance for diagnostic queries.
+Named patterns in `patterns/default.yaml` are matched against every log entry at ingest time. Matched pattern names are stored and used to boost search relevance for diagnostic queries.
 ```yaml
 patterns:
@ -157,7 +157,6 @@ Copy `.env.example` to `.env` (or pass as `-e` flags to Docker/Podman). All vari
 | `TURNSTONE_PATTERNS` | `./patterns` | Pattern directory (default.yaml, sources.yaml, watch.yaml). |
 | `TURNSTONE_SOURCE_HOST` | `unknown` | Host identifier stamped on ingested entries. |
 | `TURNSTONE_BUNDLE_ENDPOINT` | — | Remote URL to push diagnostic bundles for escalation. |
 | `TURNSTONE_GLEAN_INTERVAL` | `900` | Seconds between automatic batch glean runs. Set to `0` to disable. |
 ---
--- a/app/context/embedder.py
+++ b/app/context/embedder.py
@ -1,81 +1,64 @@
-"""Context chunk embedding — BSL licensed.
+"""Ollama embedding client with sqlite-vec storage — BSL licensed."""
 Thin wrapper around app.services.embeddings that handles the DB I/O for
 context_chunks.  All backend configuration (model, device, backend type) is
 delegated to the service layer via TURNSTONE_EMBED_* env vars.
 Re-exports EMBEDDING_AVAILABLE so callers that imported it from here continue
 to work without changes.
 """
 from __future__ import annotations
 import logging
 import sqlite3
 import struct
 from pathlib import Path
-from app.services.embeddings import (
+import httpx
    EMBEDDING_AVAILABLE,  # re-export for backward compat
    get_embedder,
    pack_vector,
 )
 __all__ = ["EMBEDDING_AVAILABLE", "embed_chunks"]
 logger = logging.getLogger(__name__)
 EMBEDDING_AVAILABLE: bool = False
 try:
    import sqlite_vec  # type: ignore[import]  # noqa: F401
    EMBEDDING_AVAILABLE = True
    logger.debug("sqlite-vec loaded — embedding pipeline enabled")
 except ImportError:
    logger.debug("sqlite-vec not available — embedding pipeline disabled")
 def embed_chunks(
    db_path: Path,
    document_id: str,
-    # Legacy params kept for backward compat — ignored when the ST backend is active.
+    llm_url: str,
-    llm_url: str = "",
+    model: str = "nomic-embed-text",
    model: str = "",
    timeout: float = 60.0,
 ) -> int:
-    """Embed all un-embedded chunks for *document_id*.
+    """Embed all unembedded chunks for a document. Returns count embedded. No-op when EMBEDDING_AVAILABLE is False."""
-
+    if not EMBEDDING_AVAILABLE:
    Uses the configured embedder (sentence-transformers by default; Ollama when
    TURNSTONE_EMBED_BACKEND=ollama).  Returns the count of newly embedded chunks.
    Returns 0 silently when no embedder is available.
    The legacy ``llm_url`` and ``model`` parameters are accepted but ignored when
    the sentence-transformers backend is active — configure via env vars instead.
    """
    embedder = get_embedder()
    if embedder is None:
        return 0
-    conn = sqlite3.connect(str(db_path), timeout=30.0)
+    conn = sqlite3.connect(str(db_path))
    conn.execute("PRAGMA journal_mode=WAL")
    conn.row_factory = sqlite3.Row
    rows = conn.execute(
-        "SELECT id, text FROM context_chunks WHERE document_id = ? AND embedding IS NULL",
+        "SELECT id, text FROM context_chunks WHERE document_id=? AND embedding IS NULL",
        (document_id,),
    ).fetchall()
    if not rows:
        conn.close()
        return 0
    texts = [r["text"] for r in rows]
    ids   = [r["id"]   for r in rows]
    count = 0
-    try:
+    for row in rows:
-        vectors = embedder.embed_batch(texts)
+        try:
-        for chunk_id, vec in zip(ids, vectors):
+            resp = httpx.post(
-            blob = pack_vector(vec)
+                f"{llm_url.rstrip('/')}/api/embeddings",
-            conn.execute(
+                json={"model": model, "prompt": row["text"]},
-                "UPDATE context_chunks SET embedding = ? WHERE id = ?",
+                timeout=timeout,
                (blob, chunk_id),
            )
-            count += 1
+            resp.raise_for_status()
-        conn.commit()
+            vector: list[float] = resp.json().get("embedding") or []
-    except Exception as exc:
+            if vector:
-        logger.warning("Batch embedding failed for document %s: %s", document_id, exc)
+                blob = struct.pack(f"{len(vector)}f", *vector)
-    finally:
+                conn.execute(
-        conn.close()
+                    "UPDATE context_chunks SET embedding=? WHERE id=?",
                    (blob, row["id"]),
                )
                count += 1
        except Exception as exc:
            logger.warning("Embedding chunk %s failed: %s", row["id"], exc)
-    logger.debug("Embedded %d chunk(s) for document %s", count, document_id)
+    conn.commit()
    conn.close()
    return count
--- a/app/context/retriever.py
+++ b/app/context/retriever.py
@ -1,30 +1,10 @@
-"""Context retrieval — structured keyword lookup (Free) + chunk search — MIT licensed.
+"""Context retrieval — structured keyword lookup (Free) + chunk search — MIT licensed."""
 Two retrieval modes for context_chunks:
  Vector search  — cosine similarity over stored embeddings (when available)
  Keyword search — LIKE-based fallback when no embedder is configured
 Both modes are called from retrieve_context(); the best available mode is used
 automatically so callers need not check EMBEDDING_AVAILABLE themselves.
 """
 from __future__ import annotations
 import logging
 import sqlite3
 from dataclasses import dataclass, field
 from pathlib import Path
 import numpy as np
 from app.services.embeddings import (
    EMBEDDING_AVAILABLE,
    cosine_similarity,
    get_embedder,
    unpack_vector,
 )
 logger = logging.getLogger(__name__)
@dataclass
 class RetrievedContext:
@ -32,12 +12,10 @@ class RetrievedContext:
    chunks: list[dict[str, str]] = field(default_factory=list)
 # ── Structured fact retrieval (always runs) ───────────────────────────────────
 def get_relevant_facts(db_path: Path, query: str) -> list[dict[str, str]]:
    """Keyword match against context_facts. Always runs — Free tier."""
    try:
-        conn = sqlite3.connect(str(db_path), timeout=30.0)
+        conn = sqlite3.connect(str(db_path))
        conn.execute("PRAGMA journal_mode=WAL")
        conn.row_factory = sqlite3.Row
        keywords = [w.lower() for w in query.split() if len(w) > 2]
@ -64,70 +42,10 @@ def get_relevant_facts(db_path: Path, query: str) -> list[dict[str, str]]:
        return []
-# ── Chunk retrieval: vector path ──────────────────────────────────────────────
+def _search_chunks(db_path: Path, query: str) -> list[dict[str, str]]:
-
+    """Keyword search across context_chunks. Fallback when no embeddings."""
 def _search_chunks_vector(
    db_path: Path,
    query: str,
    top_k: int = 3,
 ) -> list[dict[str, str]]:
    """Cosine similarity search over embedded context_chunks.
    Loads all stored embeddings into memory and scores in-process with numpy.
    Skips any chunk whose BLOB dimension does not match the current model dim
    (stale embeddings from a previous model — they will be re-embedded on the
    next document upload).
    Returns at most *top_k* results ordered by similarity descending.
    """
    embedder = get_embedder()
    if embedder is None:
        return []
    try:
-        query_vec: np.ndarray = embedder.embed(query)
+        conn = sqlite3.connect(str(db_path))
        model_dim: int = embedder.dim
    except Exception as exc:
        logger.warning("Query embedding failed: %s", exc)
        return []
    try:
        conn = sqlite3.connect(str(db_path), timeout=30.0)
        conn.execute("PRAGMA journal_mode=WAL")
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT cc.id, cc.text, cc.embedding, cd.filename"
            " FROM context_chunks cc"
            " JOIN context_documents cd ON cc.document_id = cd.id"
            " WHERE cc.embedding IS NOT NULL"
        ).fetchall()
        conn.close()
    except sqlite3.OperationalError:
        return []
    scored: list[tuple[float, dict[str, str]]] = []
    for row in rows:
        blob: bytes = row["embedding"]
        # Guard against blobs from a different-dimension model
        if len(blob) // 4 != model_dim:
            continue
        try:
            chunk_vec = unpack_vector(blob)
            score = cosine_similarity(query_vec, chunk_vec)
            scored.append((score, {"text": row["text"], "filename": row["filename"]}))
        except Exception:
            continue
    scored.sort(key=lambda t: t[0], reverse=True)
    return [item for _, item in scored[:top_k]]
 # ── Chunk retrieval: keyword fallback ─────────────────────────────────────────
 def _search_chunks_keyword(db_path: Path, query: str) -> list[dict[str, str]]:
    """LIKE-based keyword search across context_chunks. Fallback when no embedder."""
    try:
        conn = sqlite3.connect(str(db_path), timeout=30.0)
        conn.execute("PRAGMA journal_mode=WAL")
        conn.row_factory = sqlite3.Row
        keywords = [w.lower() for w in query.split() if len(w) > 2][:5]
@ -148,29 +66,16 @@ def _search_chunks_keyword(db_path: Path, query: str) -> list[dict[str, str]]:
        return []
 # ── Public interface ──────────────────────────────────────────────────────────
 def retrieve_context(db_path: Path, query: str) -> RetrievedContext:
-    """Retrieve structured facts and relevant chunks for a query.
+    """Retrieve structured facts and relevant chunks for a query."""
-
+    return RetrievedContext(
-    Chunk retrieval uses vector search when an embedder is available and at
+        facts=get_relevant_facts(db_path, query),
-    least one embedded chunk exists; falls back to keyword search otherwise.
+        chunks=_search_chunks(db_path, query),
-    """
+    )
    facts = get_relevant_facts(db_path, query)
    if EMBEDDING_AVAILABLE:
        chunks = _search_chunks_vector(db_path, query)
        if not chunks:
            # Vector search returned nothing (no embedded chunks yet) — fall back.
            chunks = _search_chunks_keyword(db_path, query)
    else:
        chunks = _search_chunks_keyword(db_path, query)
    return RetrievedContext(facts=facts, chunks=chunks)
 def format_context_block(ctx: RetrievedContext) -> str | None:
-    """Format context for injection into an LLM prompt. Returns None when empty."""
+    """Format context for injection into LLM prompt. Returns None when empty."""
    lines: list[str] = []
    if ctx.facts:
        lines.append("Known environment facts:")
--- a/app/context/store.py
+++ b/app/context/store.py
@ -1,13 +1,12 @@
 """Context fact and document CRUD — MIT licensed."""
 from __future__ import annotations
 import sqlite3
 import uuid
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
 from app.db import get_conn, resolve_tenant_id
@dataclass(frozen=True)
 class ContextFact:
@ -29,8 +28,15 @@ class ContextDocument:
    uploaded_at: str
 def _connect(db_path: Path) -> sqlite3.Connection:
    conn = sqlite3.connect(str(db_path))
    conn.execute("PRAGMA journal_mode=WAL")
    conn.execute("PRAGMA foreign_keys=ON")
    conn.row_factory = sqlite3.Row
    return conn
 def add_fact(db_path: Path, category: str, key: str, value: str, source: str | None = None) -> ContextFact:
    tid = resolve_tenant_id()
    fact = ContextFact(
        id=str(uuid.uuid4()),
        category=category,
@ -39,28 +45,27 @@ def add_fact(db_path: Path, category: str, key: str, value: str, source: str | N
        source=source,
        created_at=datetime.now(timezone.utc).isoformat(),
    )
-    with get_conn(db_path) as conn:
+    conn = _connect(db_path)
-        conn.execute(
+    conn.execute(
-            "INSERT INTO context_facts(id, tenant_id, category, key, value, source, created_at) VALUES (?,?,?,?,?,?,?)",
+        "INSERT INTO context_facts(id, category, key, value, source, created_at) VALUES (?,?,?,?,?,?)",
-            (fact.id, tid, fact.category, fact.key, fact.value, fact.source, fact.created_at),
+        (fact.id, fact.category, fact.key, fact.value, fact.source, fact.created_at),
-        )
+    )
-        conn.commit()
+    conn.commit()
    conn.close()
    return fact
 def list_facts(db_path: Path, category: str | None = None) -> list[ContextFact]:
-    tid = resolve_tenant_id()
+    conn = _connect(db_path)
-    with get_conn(db_path) as conn:
+    if category:
-        if category:
+        rows = conn.execute(
-            rows = conn.execute(
+            "SELECT * FROM context_facts WHERE category=? ORDER BY created_at", (category,)
-                "SELECT * FROM context_facts WHERE category=? AND (tenant_id=? OR tenant_id='') ORDER BY created_at",
+        ).fetchall()
-                (category, tid),
+    else:
-            ).fetchall()
+        rows = conn.execute(
-        else:
+            "SELECT * FROM context_facts ORDER BY category, created_at"
-            rows = conn.execute(
+        ).fetchall()
-                "SELECT * FROM context_facts WHERE (tenant_id=? OR tenant_id='') ORDER BY category, created_at",
+    conn.close()
                (tid,),
            ).fetchall()
    return [
        ContextFact(
            id=r["id"], category=r["category"], key=r["key"],
@ -71,13 +76,10 @@ def list_facts(db_path: Path, category: str | None = None) -> list[ContextFact]:
 def delete_fact(db_path: Path, fact_id: str) -> bool:
-    tid = resolve_tenant_id()
+    conn = _connect(db_path)
-    with get_conn(db_path) as conn:
+    cursor = conn.execute("DELETE FROM context_facts WHERE id=?", (fact_id,))
-        cursor = conn.execute(
+    conn.commit()
-            "DELETE FROM context_facts WHERE id=? AND (tenant_id=? OR tenant_id='')",
+    conn.close()
            (fact_id, tid),
        )
        conn.commit()
    return cursor.rowcount > 0
@ -88,7 +90,6 @@ def add_document(
    full_text: str,
    file_size: int | None = None,
 ) -> ContextDocument:
    tid = resolve_tenant_id()
    doc = ContextDocument(
        id=str(uuid.uuid4()),
        filename=filename,
@ -97,24 +98,24 @@ def add_document(
        file_size=file_size,
        uploaded_at=datetime.now(timezone.utc).isoformat(),
    )
-    with get_conn(db_path) as conn:
+    conn = _connect(db_path)
-        conn.execute(
+    conn.execute(
-            "INSERT INTO context_documents(id, tenant_id, filename, doc_type, full_text, file_size, uploaded_at)"
+        "INSERT INTO context_documents(id, filename, doc_type, full_text, file_size, uploaded_at)"
-            " VALUES (?,?,?,?,?,?,?)",
+        " VALUES (?,?,?,?,?,?)",
-            (doc.id, tid, doc.filename, doc.doc_type, doc.full_text, doc.file_size, doc.uploaded_at),
+        (doc.id, doc.filename, doc.doc_type, doc.full_text, doc.file_size, doc.uploaded_at),
-        )
+    )
-        conn.commit()
+    conn.commit()
    conn.close()
    return doc
 def list_documents(db_path: Path) -> list[ContextDocument]:
-    tid = resolve_tenant_id()
+    conn = _connect(db_path)
-    with get_conn(db_path) as conn:
+    rows = conn.execute(
-        rows = conn.execute(
+        "SELECT id, filename, doc_type, full_text, file_size, uploaded_at"
-            "SELECT id, filename, doc_type, full_text, file_size, uploaded_at"
+        " FROM context_documents ORDER BY uploaded_at DESC"
-            " FROM context_documents WHERE (tenant_id=? OR tenant_id='') ORDER BY uploaded_at DESC",
+    ).fetchall()
-            (tid,),
+    conn.close()
        ).fetchall()
    return [
        ContextDocument(
            id=r["id"], filename=r["filename"], doc_type=r["doc_type"],
@ -125,11 +126,8 @@ def list_documents(db_path: Path) -> list[ContextDocument]:
 def delete_document(db_path: Path, doc_id: str) -> bool:
-    tid = resolve_tenant_id()
+    conn = _connect(db_path)
-    with get_conn(db_path) as conn:
+    cursor = conn.execute("DELETE FROM context_documents WHERE id=?", (doc_id,))
-        cursor = conn.execute(
+    conn.commit()
-            "DELETE FROM context_documents WHERE id=? AND (tenant_id=? OR tenant_id='')",
+    conn.close()
            (doc_id, tid),
        )
        conn.commit()
    return cursor.rowcount > 0
--- a/app/db/init.py
+++ b/app/db/init.py
@ -1,36 +0,0 @@
 """Turnstone database abstraction — unified SQLite / Postgres interface.
 Public API:
    BACKEND          — Backend.SQLITE or Backend.POSTGRES
    get_conn(path)   — context manager yielding a DbConn
    resolve_tenant_id() — this node's tenant ID (env or hostname)
    q(sql)           — rewrite ? placeholders to %s for Postgres
    frag             — SQL fragment helpers (insert_or_ignore, source_group_expr, …)
    ensure_schema    — idempotent schema init
    close_pool       — call during shutdown when using Postgres
 """
 from app.db.backend import BACKEND, Backend
 from app.db.conn import DbConn, close_pool, get_conn
 from app.db.dialect import frag, q
 from app.db.schema import (
    ensure_context_schema,
    ensure_incidents_schema,
    ensure_schema,
    migrate_incidents_to_dedicated_db,
 )
 from app.db.tenant import resolve_tenant_id
 __all__ = [
    "BACKEND",
    "Backend",
    "DbConn",
    "close_pool",
    "get_conn",
    "frag",
    "q",
    "ensure_schema",
    "ensure_context_schema",
    "ensure_incidents_schema",
    "migrate_incidents_to_dedicated_db",
    "resolve_tenant_id",
 ]
--- a/app/db/backend.py
+++ b/app/db/backend.py
@ -1,20 +0,0 @@
 """Backend detection — SQLITE (default) or POSTGRES based on DATABASE_URL."""
 from __future__ import annotations
 import os
 from enum import Enum
 class Backend(Enum):
    """Storage engines Turnstone can run against."""

    # Member values are the lowercase engine names.
    SQLITE = "sqlite"
    POSTGRES = "postgres"
 def _detect() -> Backend:
    """Pick the backend from the ``DATABASE_URL`` environment variable.

    A value starting with one of the PostgreSQL URL schemes selects Postgres;
    anything else, including an unset or empty variable, selects SQLite.
    """
    postgres_schemes = ("postgresql://", "postgres://", "postgresql+psycopg://")
    database_url = os.environ.get("DATABASE_URL", "")
    is_postgres = database_url.startswith(postgres_schemes)
    return Backend.POSTGRES if is_postgres else Backend.SQLITE
 # Resolved once, when this module is first imported.
 BACKEND: Backend = _detect()
--- a/app/db/conn.py
+++ b/app/db/conn.py
@ -1,137 +0,0 @@
 """Uniform connection wrapper over sqlite3 and psycopg3.
 Usage:
    with get_conn(db_path) as conn:
        conn.execute("SELECT ...", (param,))
        conn.commit()
 For Postgres, db_path is ignored — all connections go through the shared pool.
 The pool is initialized lazily on first use from DATABASE_URL.
 """
 from __future__ import annotations
 import logging
 import os
 import sqlite3
 from contextlib import contextmanager
 from pathlib import Path
 from typing import Any, Generator
 from app.db.backend import BACKEND, Backend
 logger = logging.getLogger(__name__)
 _pool: Any = None  # psycopg_pool.ConnectionPool, typed as Any to avoid import-time errors
 class _NopCursor:
    """Returned when a PRAGMA or other SQLite-only statement is skipped on Postgres."""
    rowcount = 0
    def fetchall(self) -> list:
        return []
    def fetchone(self) -> None:
        return None
    def __iter__(self):
        return iter([])
 class DbConn:
    """Uniform ``execute`` API over a raw sqlite3 or psycopg connection.

    SQL is written with SQLite-style ``?`` placeholders. On Postgres every
    ``?`` is rewritten to ``%s`` and statements starting with ``PRAGMA`` are
    skipped (a ``_NopCursor`` is returned instead of executing them).

    Row access is always dict-like:
      - SQLite: conn.row_factory = sqlite3.Row  (supports row["col"] and row[0])
      - Postgres: row_factory = dict_row        (returns plain dicts)
    """

    __slots__ = ("_c", "_backend")

    def __init__(self, raw: Any, backend: Backend) -> None:
        self._backend = backend
        self._c = raw

    def _prep(self, sql: str) -> str | None:
        """Return None to skip (PRAGMA on Postgres), else return ready-to-execute SQL."""
        statement = sql.strip()
        if self._backend != Backend.POSTGRES:
            return statement
        if statement.lower().startswith("pragma"):
            return None
        # NOTE: a plain textual replace — a literal "?" inside the SQL is rewritten too.
        return statement.replace("?", "%s")

    def execute(self, sql: str, params: Any = ()) -> Any:
        """Run one statement; returns the driver cursor, or a ``_NopCursor`` if skipped."""
        statement = self._prep(sql)
        return _NopCursor() if statement is None else self._c.execute(statement, params)

    def executemany(self, sql: str, params_seq: Any) -> Any:
        """Run one statement per parameter set; same skip rule as ``execute``."""
        statement = self._prep(sql)
        return _NopCursor() if statement is None else self._c.executemany(statement, params_seq)

    def commit(self) -> None:
        """Commit on the underlying connection."""
        self._c.commit()

    def close(self) -> None:
        """Close the underlying connection."""
        self._c.close()

    def __enter__(self) -> "DbConn":
        return self

    def __exit__(self, *_: Any) -> None:
        # Leaving a ``with DbConn(...)`` block closes the connection; nothing is committed here.
        self.close()
 def _get_pool() -> Any:
    """Return the shared Postgres connection pool, opening it on first use.

    The pool is stored in the module-level ``_pool`` and reused by later calls.

    Raises:
        RuntimeError: if ``psycopg_pool`` cannot be imported, or if
            ``DATABASE_URL`` is not set in the environment.
    """
    global _pool
    if _pool is not None:
        return _pool
    # Each try body holds only the statement whose failure it explains, so an
    # ImportError/KeyError raised while the pool itself is being constructed
    # is no longer relabelled as a missing dependency or missing variable.
    try:
        from psycopg_pool import ConnectionPool  # type: ignore[import]
    except ImportError as exc:
        raise RuntimeError(
            "psycopg[binary,pool] is required for Postgres backend. "
            "Run: pip install 'psycopg[binary,pool]'"
        ) from exc
    try:
        url = os.environ["DATABASE_URL"]
    except KeyError:
        raise RuntimeError("DATABASE_URL must be set when using Postgres backend") from None
    _pool = ConnectionPool(url, min_size=2, max_size=10, open=True)
    logger.info("Postgres connection pool opened (DATABASE_URL set)")
    return _pool
@contextmanager
def get_conn(db_path: Path | None = None) -> Generator[DbConn, None, None]:
    """Yield a DbConn for the active backend.

    SQLite: opens a fresh connection to ``db_path`` (required), applies the
    WAL / busy-timeout / foreign-key pragmas, and closes it on exit.
    Postgres: borrows a connection from the shared pool; ``db_path`` is unused.

    Raises:
        ValueError: on the SQLite backend when ``db_path`` is None.
    """
    if BACKEND != Backend.POSTGRES:
        if db_path is None:
            raise ValueError("db_path is required for SQLite backend")
        sqlite_conn = sqlite3.connect(str(db_path), timeout=90.0)
        sqlite_conn.row_factory = sqlite3.Row
        try:
            for pragma in (
                "PRAGMA journal_mode=WAL",
                "PRAGMA busy_timeout=90000",
                "PRAGMA foreign_keys=ON",
            ):
                sqlite_conn.execute(pragma)
            yield DbConn(sqlite_conn, BACKEND)
        finally:
            sqlite_conn.close()
        return

    shared_pool = _get_pool()
    from psycopg.rows import dict_row  # type: ignore[import]

    with shared_pool.connection() as pg_conn:
        pg_conn.row_factory = dict_row
        yield DbConn(pg_conn, BACKEND)
 def close_pool() -> None:
    """Close the Postgres connection pool — call during application shutdown.

    Does nothing when no pool has been opened.
    """
    global _pool
    if _pool is None:
        return
    _pool.close()
    _pool = None
    logger.info("Postgres connection pool closed")
--- a/app/db/dialect.py
+++ b/app/db/dialect.py
@ -1,93 +0,0 @@
 """Per-backend SQL fragments and placeholder rewriting.
 All production SQL should be written with SQLite-style `?` placeholders.
 Call q(sql) before passing to execute/executemany — it rewrites to %s for
 Postgres and leaves SQLite queries untouched.
 """
 from __future__ import annotations
 from app.db.backend import BACKEND, Backend
 def q(sql: str) -> str:
    """Return ``sql`` with every ``?`` turned into ``%s`` on Postgres.

    On SQLite the string is returned unchanged.
    """
    return sql.replace("?", "%s") if BACKEND == Backend.POSTGRES else sql
 class _Fragments:
    """SQL fragments that differ between backends.

    Each member checks the module-level BACKEND when it is called and returns
    the text for that backend.
    """

    @property
    def insert_or_ignore(self) -> str:
        """Leading keyword(s) for an insert: plain INSERT on Postgres."""
        return "INSERT OR IGNORE" if BACKEND != Backend.POSTGRES else "INSERT"

    @property
    def on_conflict_ignore(self) -> str:
        """Always the empty string, on both backends."""
        # Caller must substitute the column name(s) at use time when using Postgres.
        # For log_entries: ON CONFLICT (tenant_id, id) DO NOTHING
        # (that clause is what entries_conflict_clause() returns).
        return ""

    def insert_ignore_entries(self) -> str:
        """Opening ``INSERT ... INTO log_entries`` text for the active backend."""
        return "INSERT OR IGNORE INTO log_entries" if BACKEND != Backend.POSTGRES else "INSERT INTO log_entries"

    def entries_conflict_clause(self) -> str:
        """Trailing conflict clause for log_entries inserts; empty on SQLite."""
        return "ON CONFLICT (tenant_id, id) DO NOTHING" if BACKEND == Backend.POSTGRES else ""

    def fingerprint_upsert(self) -> str:
        """Complete upsert statement for glean_fingerprints (five parameters).

        The Postgres text already uses ``%s`` placeholders; the SQLite text uses ``?``.
        """
        if BACKEND != Backend.POSTGRES:
            return (
                "INSERT OR REPLACE INTO glean_fingerprints (tenant_id, path, mtime, size, gleaned_at)"
                " VALUES (?,?,?,?,?)"
            )
        return (
            "INSERT INTO glean_fingerprints (tenant_id, path, mtime, size, gleaned_at)"
            " VALUES (%s, %s, %s, %s, %s)"
            " ON CONFLICT (tenant_id, path)"
            " DO UPDATE SET mtime=EXCLUDED.mtime, size=EXCLUDED.size, gleaned_at=EXCLUDED.gleaned_at"
        )

    def source_group_expr(self, col: str = "source_id") -> str:
        """SQL expression that collapses prefix:host:unit → prefix:host stem.

        ``col`` is interpolated into the SQL text as-is.
        """
        if BACKEND == Backend.POSTGRES:
            return f"""
                CASE
                    WHEN array_length(string_to_array({col}, ':'), 1) >= 3
                    THEN split_part({col}, ':', 1) || ':' || split_part({col}, ':', 2)
                    ELSE {col}
                END
            """
        return f"""
            CASE
                WHEN INSTR(SUBSTR({col}, INSTR({col}, ':')+1), ':') > 0
                THEN SUBSTR({col}, 1,
                         INSTR({col}, ':')
                         + INSTR(SUBSTR({col}, INSTR({col}, ':')+1), ':')
                         - 1)
                ELSE {col}
            END
        """

    def fts_match_clause(self) -> str:
        """WHERE clause fragment for FTS query. Caller supplies the query param."""
        return (
            "text_tsv @@ websearch_to_tsquery('english', %s)"
            if BACKEND == Backend.POSTGRES
            else "log_fts MATCH ?"
        )

    def fts_rank_expr(self) -> str:
        """ORDER BY expression for FTS rank (best match first). Postgres needs the query twice."""
        if BACKEND != Backend.POSTGRES:
            # FTS5 rank is negative BM25; ASC = most-negative = best match
            return "rank ASC"
        # ts_rank returns 0..1 where higher is better; pass the query again as param
        return "ts_rank(text_tsv, websearch_to_tsquery('english', %s)) DESC"
 # Shared instance used by callers.
 frag = _Fragments()
--- a/app/db/schema.py
+++ b/app/db/schema.py
@ -1,537 +0,0 @@
 """Schema creation and idempotent migrations for all Turnstone databases.
 Three logical databases (main, context, incidents) map to:
  - SQLite: three separate .db files (avoids write-lock contention)
  - Postgres: three table-groups in one physical DB (row-level locking makes separation unnecessary)
 All ensure_* functions are idempotent: safe to call on every startup.
 """
 from __future__ import annotations
 import logging
 import sqlite3
 from pathlib import Path
 from app.db.backend import BACKEND, Backend
 from app.db.conn import get_conn
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # SQLite DDL — kept as executescript strings (SQLite only)
 # ---------------------------------------------------------------------------
 _MAIN_SCHEMA_SQLITE = """
 CREATE TABLE IF NOT EXISTS log_entries (
    id               TEXT NOT NULL,
    tenant_id        TEXT NOT NULL DEFAULT '',
    source_id        TEXT NOT NULL,
    sequence         INTEGER NOT NULL,
    timestamp_raw    TEXT,
    timestamp_iso    TEXT,
    ingest_time      TEXT NOT NULL,
    severity         TEXT,
    repeat_count     INTEGER DEFAULT 1,
    out_of_order     INTEGER DEFAULT 0,
    matched_patterns TEXT DEFAULT '[]',
    text             TEXT NOT NULL,
    anomaly_score    REAL,
    anomaly_label    TEXT,
    anomaly_scored_at TEXT,
    ml_score         REAL,
    ml_label         TEXT,
    ml_scored_at     TEXT,
    PRIMARY KEY (tenant_id, id)
 );
 CREATE INDEX IF NOT EXISTS idx_source      ON log_entries(source_id);
 CREATE INDEX IF NOT EXISTS idx_tenant_src  ON log_entries(tenant_id, source_id);
 CREATE INDEX IF NOT EXISTS idx_timestamp   ON log_entries(timestamp_iso);
 CREATE INDEX IF NOT EXISTS idx_ts_repeat   ON log_entries(timestamp_iso, repeat_count);
 CREATE INDEX IF NOT EXISTS idx_severity    ON log_entries(tenant_id, severity);
 CREATE INDEX IF NOT EXISTS idx_patterns    ON log_entries(matched_patterns);
 CREATE INDEX IF NOT EXISTS idx_anomaly     ON log_entries(tenant_id, anomaly_score);
 CREATE INDEX IF NOT EXISTS idx_ml_scored   ON log_entries(tenant_id, ml_scored_at);
 CREATE TABLE IF NOT EXISTS detections (
    id              TEXT PRIMARY KEY,
    tenant_id       TEXT NOT NULL DEFAULT '',
    entry_id        TEXT NOT NULL,
    source_id       TEXT NOT NULL,
    anomaly_label   TEXT NOT NULL,
    anomaly_score   REAL NOT NULL,
    severity        TEXT NOT NULL,
    text            TEXT NOT NULL,
    timestamp_iso   TEXT,
    detected_at     TEXT NOT NULL,
    acknowledged    INTEGER NOT NULL DEFAULT 0,
    acknowledged_at TEXT,
    notes           TEXT NOT NULL DEFAULT '',
    scorer          TEXT NOT NULL DEFAULT 'anomaly'
 );
 CREATE INDEX IF NOT EXISTS idx_detections_tenant   ON detections(tenant_id, detected_at);
 CREATE INDEX IF NOT EXISTS idx_detections_ack      ON detections(acknowledged);
 CREATE INDEX IF NOT EXISTS idx_detections_label    ON detections(anomaly_label);
 CREATE INDEX IF NOT EXISTS idx_detections_entry    ON detections(entry_id);
 CREATE INDEX IF NOT EXISTS idx_detections_scorer   ON detections(scorer);
 CREATE TABLE IF NOT EXISTS glean_fingerprints (
    tenant_id  TEXT NOT NULL DEFAULT '',
    path       TEXT NOT NULL,
    mtime      REAL NOT NULL,
    size       INTEGER NOT NULL,
    gleaned_at TEXT NOT NULL,
    PRIMARY KEY (tenant_id, path)
 );
 CREATE TABLE IF NOT EXISTS incidents (
    id          TEXT PRIMARY KEY,
    tenant_id   TEXT NOT NULL DEFAULT '',
    label       TEXT NOT NULL,
    issue_type  TEXT NOT NULL DEFAULT '',
    started_at  TEXT,
    ended_at    TEXT,
    notes       TEXT NOT NULL DEFAULT '',
    created_at  TEXT NOT NULL,
    severity    TEXT NOT NULL DEFAULT 'medium'
 );
 CREATE INDEX IF NOT EXISTS idx_incidents_time   ON incidents(started_at, ended_at);
 CREATE INDEX IF NOT EXISTS idx_incidents_tenant ON incidents(tenant_id);
 CREATE TABLE IF NOT EXISTS received_bundles (
    id          TEXT PRIMARY KEY,
    tenant_id   TEXT NOT NULL DEFAULT '',
    source_host TEXT NOT NULL,
    issue_type  TEXT NOT NULL DEFAULT '',
    label       TEXT NOT NULL,
    severity    TEXT NOT NULL DEFAULT 'medium',
    started_at  TEXT,
    bundled_at  TEXT NOT NULL,
    entry_count INTEGER NOT NULL DEFAULT 0,
    bundle_json TEXT NOT NULL
 );
 CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at);
 CREATE INDEX IF NOT EXISTS idx_bundles_type    ON received_bundles(issue_type);
 CREATE TABLE IF NOT EXISTS sent_bundles (
    id           TEXT PRIMARY KEY,
    tenant_id    TEXT NOT NULL DEFAULT '',
    incident_id  TEXT NOT NULL,
    exported_at  TEXT NOT NULL,
    sanitized    INTEGER NOT NULL DEFAULT 0,
    entry_count  INTEGER NOT NULL DEFAULT 0,
    bundle_json  TEXT NOT NULL
 );
 CREATE INDEX IF NOT EXISTS idx_sent_bundles_incident ON sent_bundles(incident_id);
 CREATE INDEX IF NOT EXISTS idx_sent_bundles_time     ON sent_bundles(exported_at);
 CREATE TABLE IF NOT EXISTS blocklist_candidates (
    id                 TEXT PRIMARY KEY,
    tenant_id          TEXT NOT NULL DEFAULT '',
    domain_or_ip       TEXT NOT NULL,
    source_device_ip   TEXT,
    source_device_name TEXT,
    first_seen         TEXT NOT NULL,
    last_seen          TEXT NOT NULL,
    hit_count          INTEGER DEFAULT 1,
    status             TEXT DEFAULT 'pending',
    pushed_at          TEXT,
    log_evidence       TEXT DEFAULT '[]',
    matched_rule       TEXT,
    llm_score          REAL,
    llm_reason         TEXT
 );
 CREATE INDEX IF NOT EXISTS idx_blocklist_device  ON blocklist_candidates(source_device_ip);
 CREATE INDEX IF NOT EXISTS idx_blocklist_status  ON blocklist_candidates(status);
 CREATE INDEX IF NOT EXISTS idx_blocklist_domain  ON blocklist_candidates(domain_or_ip);
 CREATE INDEX IF NOT EXISTS idx_blocklist_tenant  ON blocklist_candidates(tenant_id);
 CREATE TABLE IF NOT EXISTS ssh_targets (
    id            TEXT PRIMARY KEY,
    label         TEXT NOT NULL,
    host          TEXT NOT NULL,
    port          INTEGER NOT NULL DEFAULT 22,
    user          TEXT NOT NULL,
    key_path      TEXT NOT NULL,
    last_tested   TEXT,
    last_ok       INTEGER DEFAULT NULL,
    last_error    TEXT,
    created_at    TEXT NOT NULL,
    updated_at    TEXT NOT NULL
 );
 """
 _CONTEXT_SCHEMA_SQLITE = """
 CREATE TABLE IF NOT EXISTS context_facts (
    id           TEXT PRIMARY KEY,
    tenant_id    TEXT NOT NULL DEFAULT '',
    category     TEXT NOT NULL,
    key          TEXT NOT NULL,
    value        TEXT NOT NULL,
    source       TEXT,
    created_at   TEXT NOT NULL
 );
 CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category);
 CREATE INDEX IF NOT EXISTS idx_facts_key      ON context_facts(key);
 CREATE INDEX IF NOT EXISTS idx_facts_tenant   ON context_facts(tenant_id);
 CREATE TABLE IF NOT EXISTS context_documents (
    id           TEXT PRIMARY KEY,
    tenant_id    TEXT NOT NULL DEFAULT '',
    filename     TEXT NOT NULL,
    doc_type     TEXT NOT NULL,
    full_text    TEXT NOT NULL,
    file_size    INTEGER,
    uploaded_at  TEXT NOT NULL
 );
 CREATE INDEX IF NOT EXISTS idx_docs_tenant ON context_documents(tenant_id);
 CREATE TABLE IF NOT EXISTS context_chunks (
    id           TEXT PRIMARY KEY,
    tenant_id    TEXT NOT NULL DEFAULT '',
    document_id  TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
    chunk_index  INTEGER NOT NULL,
    text         TEXT NOT NULL,
    embedding    BLOB
 );
 CREATE INDEX IF NOT EXISTS idx_chunks_doc    ON context_chunks(document_id);
 CREATE INDEX IF NOT EXISTS idx_chunks_tenant ON context_chunks(tenant_id);
 """
 # ---------------------------------------------------------------------------
 # Postgres DDL — executed statement-by-statement
 # ---------------------------------------------------------------------------
 _MAIN_SCHEMA_PG_STMTS = [
    """
    CREATE TABLE IF NOT EXISTS log_entries (
        id               TEXT NOT NULL,
        tenant_id        TEXT NOT NULL DEFAULT '',
        source_id        TEXT NOT NULL,
        sequence         INTEGER NOT NULL,
        timestamp_raw    TEXT,
        timestamp_iso    TEXT,
        ingest_time      TEXT NOT NULL,
        severity         TEXT,
        repeat_count     INTEGER DEFAULT 1,
        out_of_order     INTEGER DEFAULT 0,
        matched_patterns TEXT DEFAULT '[]',
        text             TEXT NOT NULL,
        text_tsv         tsvector,
        anomaly_score    DOUBLE PRECISION,
        anomaly_label    TEXT,
        anomaly_scored_at TEXT,
        ml_score         DOUBLE PRECISION,
        ml_label         TEXT,
        ml_scored_at     TEXT,
        PRIMARY KEY (tenant_id, id)
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_tenant_src  ON log_entries(tenant_id, source_id)",
    "CREATE INDEX IF NOT EXISTS idx_timestamp   ON log_entries(timestamp_iso)",
    "CREATE INDEX IF NOT EXISTS idx_severity    ON log_entries(tenant_id, severity)",
    "CREATE INDEX IF NOT EXISTS idx_patterns    ON log_entries(matched_patterns)",
    "CREATE INDEX IF NOT EXISTS idx_fts_gin     ON log_entries USING GIN(text_tsv)",
    "CREATE INDEX IF NOT EXISTS idx_anomaly     ON log_entries(tenant_id, anomaly_score)",
    "CREATE INDEX IF NOT EXISTS idx_ml_scored   ON log_entries(tenant_id, ml_scored_at)",
    """
    CREATE TABLE IF NOT EXISTS detections (
        id              TEXT PRIMARY KEY,
        tenant_id       TEXT NOT NULL DEFAULT '',
        entry_id        TEXT NOT NULL,
        source_id       TEXT NOT NULL,
        anomaly_label   TEXT NOT NULL,
        anomaly_score   DOUBLE PRECISION NOT NULL,
        severity        TEXT NOT NULL,
        text            TEXT NOT NULL,
        timestamp_iso   TEXT,
        detected_at     TEXT NOT NULL,
        acknowledged    INTEGER NOT NULL DEFAULT 0,
        acknowledged_at TEXT,
        notes           TEXT NOT NULL DEFAULT '',
        scorer          TEXT NOT NULL DEFAULT 'anomaly'
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_detections_tenant   ON detections(tenant_id, detected_at)",
    "CREATE INDEX IF NOT EXISTS idx_detections_ack      ON detections(acknowledged)",
    "CREATE INDEX IF NOT EXISTS idx_detections_label    ON detections(anomaly_label)",
    "CREATE INDEX IF NOT EXISTS idx_detections_entry    ON detections(entry_id)",
    "CREATE INDEX IF NOT EXISTS idx_detections_scorer   ON detections(scorer)",
    """
    CREATE OR REPLACE FUNCTION _ts_update_text_tsv() RETURNS trigger AS $$
    BEGIN
        NEW.text_tsv := to_tsvector('english', COALESCE(NEW.text, ''));
        RETURN NEW;
    END;
    $$ LANGUAGE plpgsql
    """,
    """
    DO $$ BEGIN
        IF NOT EXISTS (
            SELECT 1 FROM pg_trigger WHERE tgname = 'trig_log_entries_tsv'
        ) THEN
            CREATE TRIGGER trig_log_entries_tsv
                BEFORE INSERT OR UPDATE OF text ON log_entries
                FOR EACH ROW EXECUTE FUNCTION _ts_update_text_tsv();
        END IF;
    END $$
    """,
    """
    CREATE TABLE IF NOT EXISTS glean_fingerprints (
        tenant_id  TEXT NOT NULL DEFAULT '',
        path       TEXT NOT NULL,
        mtime      DOUBLE PRECISION NOT NULL,
        size       BIGINT NOT NULL,
        gleaned_at TEXT NOT NULL,
        PRIMARY KEY (tenant_id, path)
    )
    """,
    """
    CREATE TABLE IF NOT EXISTS incidents (
        id          TEXT PRIMARY KEY,
        tenant_id   TEXT NOT NULL DEFAULT '',
        label       TEXT NOT NULL,
        issue_type  TEXT NOT NULL DEFAULT '',
        started_at  TEXT,
        ended_at    TEXT,
        notes       TEXT NOT NULL DEFAULT '',
        created_at  TEXT NOT NULL,
        severity    TEXT NOT NULL DEFAULT 'medium'
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_incidents_time   ON incidents(started_at, ended_at)",
    "CREATE INDEX IF NOT EXISTS idx_incidents_tenant ON incidents(tenant_id)",
    """
    CREATE TABLE IF NOT EXISTS received_bundles (
        id          TEXT PRIMARY KEY,
        tenant_id   TEXT NOT NULL DEFAULT '',
        source_host TEXT NOT NULL,
        issue_type  TEXT NOT NULL DEFAULT '',
        label       TEXT NOT NULL,
        severity    TEXT NOT NULL DEFAULT 'medium',
        started_at  TEXT,
        bundled_at  TEXT NOT NULL,
        entry_count INTEGER NOT NULL DEFAULT 0,
        bundle_json TEXT NOT NULL
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at)",
    "CREATE INDEX IF NOT EXISTS idx_bundles_type    ON received_bundles(issue_type)",
    """
    CREATE TABLE IF NOT EXISTS sent_bundles (
        id           TEXT PRIMARY KEY,
        tenant_id    TEXT NOT NULL DEFAULT '',
        incident_id  TEXT NOT NULL,
        exported_at  TEXT NOT NULL,
        sanitized    INTEGER NOT NULL DEFAULT 0,
        entry_count  INTEGER NOT NULL DEFAULT 0,
        bundle_json  TEXT NOT NULL
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_sent_bundles_incident ON sent_bundles(incident_id)",
    "CREATE INDEX IF NOT EXISTS idx_sent_bundles_time     ON sent_bundles(exported_at)",
    """
    CREATE TABLE IF NOT EXISTS blocklist_candidates (
        id                 TEXT PRIMARY KEY,
        tenant_id          TEXT NOT NULL DEFAULT '',
        domain_or_ip       TEXT NOT NULL,
        source_device_ip   TEXT,
        source_device_name TEXT,
        first_seen         TEXT NOT NULL,
        last_seen          TEXT NOT NULL,
        hit_count          INTEGER DEFAULT 1,
        status             TEXT DEFAULT 'pending',
        pushed_at          TEXT,
        log_evidence       TEXT DEFAULT '[]',
        matched_rule       TEXT,
        llm_score          DOUBLE PRECISION,
        llm_reason         TEXT
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_blocklist_device  ON blocklist_candidates(source_device_ip)",
    "CREATE INDEX IF NOT EXISTS idx_blocklist_status  ON blocklist_candidates(status)",
    "CREATE INDEX IF NOT EXISTS idx_blocklist_domain  ON blocklist_candidates(domain_or_ip)",
    "CREATE INDEX IF NOT EXISTS idx_blocklist_tenant  ON blocklist_candidates(tenant_id)",
 ]
 _CONTEXT_SCHEMA_PG_STMTS = [
    """
    CREATE TABLE IF NOT EXISTS context_facts (
        id           TEXT PRIMARY KEY,
        tenant_id    TEXT NOT NULL DEFAULT '',
        category     TEXT NOT NULL,
        key          TEXT NOT NULL,
        value        TEXT NOT NULL,
        source       TEXT,
        created_at   TEXT NOT NULL
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category)",
    "CREATE INDEX IF NOT EXISTS idx_facts_key      ON context_facts(key)",
    "CREATE INDEX IF NOT EXISTS idx_facts_tenant   ON context_facts(tenant_id)",
    """
    CREATE TABLE IF NOT EXISTS context_documents (
        id           TEXT PRIMARY KEY,
        tenant_id    TEXT NOT NULL DEFAULT '',
        filename     TEXT NOT NULL,
        doc_type     TEXT NOT NULL,
        full_text    TEXT NOT NULL,
        file_size    BIGINT,
        uploaded_at  TEXT NOT NULL
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_docs_tenant ON context_documents(tenant_id)",
    """
    CREATE TABLE IF NOT EXISTS context_chunks (
        id           TEXT PRIMARY KEY,
        tenant_id    TEXT NOT NULL DEFAULT '',
        document_id  TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
        chunk_index  INTEGER NOT NULL,
        text         TEXT NOT NULL,
        embedding    BYTEA
    )
    """,
    "CREATE INDEX IF NOT EXISTS idx_chunks_doc    ON context_chunks(document_id)",
    "CREATE INDEX IF NOT EXISTS idx_chunks_tenant ON context_chunks(tenant_id)",
 ]
 # ---------------------------------------------------------------------------
 # SQLite additive column migrations — applied after CREATE TABLE on every boot
 # ---------------------------------------------------------------------------
 _MAIN_MIGRATIONS_SQLITE = [
    "ALTER TABLE log_entries ADD COLUMN tenant_id TEXT NOT NULL DEFAULT ''",
    "ALTER TABLE incidents ADD COLUMN issue_type TEXT NOT NULL DEFAULT ''",
    "ALTER TABLE incidents ADD COLUMN tenant_id TEXT NOT NULL DEFAULT ''",
    "ALTER TABLE received_bundles ADD COLUMN tenant_id TEXT NOT NULL DEFAULT ''",
    "ALTER TABLE sent_bundles ADD COLUMN tenant_id TEXT NOT NULL DEFAULT ''",
    "ALTER TABLE blocklist_candidates ADD COLUMN tenant_id TEXT NOT NULL DEFAULT ''",
    "ALTER TABLE glean_fingerprints ADD COLUMN tenant_id TEXT NOT NULL DEFAULT ''",
    "ALTER TABLE glean_fingerprints ADD COLUMN mtime REAL",
    "ALTER TABLE glean_fingerprints ADD COLUMN size INTEGER",
    "ALTER TABLE glean_fingerprints ADD COLUMN gleaned_at TEXT",
    "ALTER TABLE log_entries ADD COLUMN anomaly_score REAL",
    "ALTER TABLE log_entries ADD COLUMN anomaly_label TEXT",
    "ALTER TABLE log_entries ADD COLUMN anomaly_scored_at TEXT",
    "ALTER TABLE log_entries ADD COLUMN ml_score REAL",
    "ALTER TABLE log_entries ADD COLUMN ml_label TEXT",
    "ALTER TABLE log_entries ADD COLUMN ml_scored_at TEXT",
    "ALTER TABLE detections ADD COLUMN scorer TEXT NOT NULL DEFAULT 'anomaly'",
    "ALTER TABLE log_entries ADD COLUMN anonymized INTEGER DEFAULT NULL",
 ]
 _CONTEXT_MIGRATIONS_SQLITE = [
    "ALTER TABLE context_facts ADD COLUMN tenant_id TEXT NOT NULL DEFAULT ''",
    "ALTER TABLE context_documents ADD COLUMN tenant_id TEXT NOT NULL DEFAULT ''",
    "ALTER TABLE context_chunks ADD COLUMN tenant_id TEXT NOT NULL DEFAULT ''",
 ]
 def _run_sqlite_migrations(conn: sqlite3.Connection, stmts: list[str]) -> None:
    for stmt in stmts:
        try:
            conn.execute(stmt)
        except sqlite3.OperationalError:
            pass  # column already exists or table not present yet — both are fine
 def _run_pg_stmts(stmts: list[str]) -> None:
    """Execute Postgres DDL statements on a dedicated autocommit connection.

    The connection is opened from ``DATABASE_URL`` with ``autocommit=True``, so
    every statement takes effect on its own. Blank statements are skipped.
    """
    from psycopg import connect as pg_connect  # type: ignore[import]
    import os

    database_url = os.environ["DATABASE_URL"]
    with pg_connect(database_url, autocommit=True) as conn:
        for statement in (raw.strip() for raw in stmts):
            if not statement:
                continue
            conn.execute(statement)
 # ---------------------------------------------------------------------------
 # Public API
 # ---------------------------------------------------------------------------
 def ensure_schema(db_path: Path) -> None:
    """Ensure main log/incidents/blocklist tables exist. Idempotent.

    On Postgres the DDL statement list is executed and ``db_path`` is unused.
    On SQLite the database at ``db_path`` is opened, migrated and created as
    needed; the connection is closed even if a step raises.
    """
    if BACKEND == Backend.POSTGRES:
        _run_pg_stmts(_MAIN_SCHEMA_PG_STMTS)
        logger.debug("Postgres main schema verified")
        return
    conn = sqlite3.connect(str(db_path), timeout=30.0)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        # Migrations first: add tenant_id to existing tables BEFORE index creation touches it
        _run_sqlite_migrations(conn, _MAIN_MIGRATIONS_SQLITE)
        conn.commit()
        conn.executescript(_MAIN_SCHEMA_SQLITE)
        # Second pass: on a fresh database the tables did not exist during the
        # first pass, so columns that live only in the migration list (e.g.
        # log_entries.anonymized) would otherwise be missing until the next boot.
        _run_sqlite_migrations(conn, _MAIN_MIGRATIONS_SQLITE)
        conn.commit()
    finally:
        conn.close()
    logger.debug("SQLite main schema verified at %s", db_path)
 def ensure_context_schema(db_path: Path) -> None:
    """Ensure context KB tables exist. Idempotent.

    On Postgres the DDL statement list is executed and ``db_path`` is unused.
    On SQLite the database at ``db_path`` is opened, migrated and created as
    needed; the connection is closed even if a step raises.
    """
    if BACKEND == Backend.POSTGRES:
        _run_pg_stmts(_CONTEXT_SCHEMA_PG_STMTS)
        logger.debug("Postgres context schema verified")
        return
    conn = sqlite3.connect(str(db_path), timeout=30.0)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.execute("PRAGMA foreign_keys=ON")
        # Add tenant_id to pre-existing tables before the schema script
        # creates indexes that reference it.
        _run_sqlite_migrations(conn, _CONTEXT_MIGRATIONS_SQLITE)
        conn.commit()
        conn.executescript(_CONTEXT_SCHEMA_SQLITE)
    finally:
        conn.close()
    logger.debug("SQLite context schema verified at %s", db_path)
 def migrate_incidents_to_dedicated_db(main_db: Path, incidents_db: Path) -> int:
    """One-shot migration: copy incidents/bundles rows from main DB to incidents DB.

    Safe to call on every startup — rows already in incidents_db are skipped.
    No-op for Postgres (single DB, no migration needed).

    Args:
        main_db: SQLite file to read ``incidents``, ``received_bundles`` and
            ``sent_bundles`` from. Tables missing there are skipped.
        incidents_db: SQLite file to insert into; the target tables must exist.

    Returns:
        Number of rows actually inserted into ``incidents_db``. Rows skipped by
        ``INSERT OR IGNORE`` are not counted, so a repeat run returns 0.
    """
    if BACKEND == Backend.POSTGRES:
        return 0
    migrated = 0
    src = sqlite3.connect(str(main_db), timeout=30.0)
    try:
        src.row_factory = sqlite3.Row
        dst = sqlite3.connect(str(incidents_db), timeout=30.0)
        try:
            for table in ("incidents", "received_bundles", "sent_bundles"):
                try:
                    rows = src.execute(f"SELECT * FROM {table}").fetchall()  # noqa: S608
                except sqlite3.OperationalError:
                    continue  # table cannot be read from the main DB — nothing to copy
                if not rows:
                    continue
                columns = rows[0].keys()
                cols = ", ".join(columns)
                placeholders = ", ".join("?" * len(columns))
                cursor = dst.executemany(
                    f"INSERT OR IGNORE INTO {table} ({cols}) VALUES ({placeholders})",  # noqa: S608
                    [tuple(r) for r in rows],
                )
                # rowcount sums the rows modified across the whole batch, so
                # ignored duplicates do not inflate the result.
                migrated += max(cursor.rowcount, 0)
            dst.commit()
        finally:
            dst.close()
    finally:
        src.close()
    return migrated
 def ensure_incidents_schema(db_path: Path) -> None:
    """Ensure incidents/bundles tables exist. Idempotent.

    For Postgres, incidents live in the same DB as log_entries (already created by
    ensure_schema), so this is a no-op.

    On SQLite this applies the same main migrations and schema script as
    ensure_schema to the database at ``db_path``; the connection is closed
    even if a step raises.
    """
    if BACKEND == Backend.POSTGRES:
        return
    conn = sqlite3.connect(str(db_path), timeout=30.0)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        # Add missing columns to pre-existing tables before the schema script
        # creates indexes that reference them.
        _run_sqlite_migrations(conn, _MAIN_MIGRATIONS_SQLITE)
        conn.commit()
        conn.executescript(_MAIN_SCHEMA_SQLITE)
        # Second pass: tables created just now by the script still need the
        # columns that exist only in the migration list.
        _run_sqlite_migrations(conn, _MAIN_MIGRATIONS_SQLITE)
        conn.commit()
    finally:
        conn.close()
    logger.debug("SQLite incidents schema verified at %s", db_path)
--- a/app/db/tenant.py
+++ b/app/db/tenant.py
@ -1,12 +0,0 @@
 """Tenant ID resolution — TURNSTONE_TENANT_ID env var, hostname fallback."""
 from __future__ import annotations
 import os
 import socket
 from functools import lru_cache
@lru_cache(maxsize=1)
def resolve_tenant_id() -> str:
    """Return this node's tenant ID, computed once and then served from cache.

    A non-empty ``TURNSTONE_TENANT_ID`` environment variable wins; otherwise
    the system hostname is used.
    """
    configured = os.environ.get("TURNSTONE_TENANT_ID")
    if configured:
        return configured
    return socket.gethostname()
--- a/app/glean/mqtt_subscriber.py
+++ b/app/glean/mqtt_subscriber.py
@ -1,166 +0,0 @@
 """Live MQTT glean subscriber for Turnstone.
 Reads ``type: mqtt`` entries from sources.yaml and subscribes to each broker
 in the background. Incoming messages are normalized to RetrievedEntry and
 written to the Turnstone SQLite database as they arrive.
 This runs as an asyncio task alongside the batch glean scheduler. It is
 started from the FastAPI lifespan in rest.py.
 MQTT source config format in sources.yaml::
    sources:
      - id: meshtastic-home
        type: mqtt
        broker_host: 10.1.10.5
        broker_port: 1883          # optional, default 1883
        broker_username: ~         # optional
        broker_password: ~         # optional
        topics:
          - msh/#                  # one or more topic patterns
        severity: INFO             # optional default severity for all messages
      - id: iot-sensors
        type: mqtt
        broker_host: localhost
        topics:
          - home/+/temperature
          - home/+/humidity
 """
 from __future__ import annotations
 import asyncio
 import hashlib
 import json
 import logging
 import sqlite3
 from datetime import datetime, timezone
 from pathlib import Path
 import yaml
 from app.services.models import RetrievedEntry
 logger = logging.getLogger(__name__)
 def _load_mqtt_sources(sources_file: Path) -> list[dict]:
    """Return only the ``type: mqtt`` entries from sources.yaml.

    Returns an empty list when the file is missing, empty, not a mapping at
    the top level, or has no usable ``sources`` list.
    """
    if not sources_file.exists():
        return []
    with sources_file.open() as f:
        data = yaml.safe_load(f) or {}
    if not isinstance(data, dict):
        return []
    # ``sources:`` with no value parses to None, and stray scalars in the
    # list have no ``.get`` — tolerate both instead of raising.
    entries = data.get("sources") or []
    return [s for s in entries if isinstance(s, dict) and s.get("type") == "mqtt"]
 def _make_entry_id(source_id: str, seq: int, text: str) -> str:
    """Build ``<source_id>:<seq>:<digest>`` for one message.

    The digest is the first 16 hex characters of the SHA-1 of
    ``"<source_id>:<seq>:<text>"``.
    """
    prefix = f"{source_id}:{seq}"
    digest = hashlib.sha1(f"{prefix}:{text}".encode()).hexdigest()
    return f"{prefix}:{digest[:16]}"
 def _write_entry(db_path: Path, entry: RetrievedEntry) -> None:
    """Insert one entry into ``log_entries``, ignoring duplicate ids.

    A fresh connection is opened per call and always closed afterwards.
    (Using the connection itself as a context manager only commits or
    rolls back; it does not close the handle.)

    Args:
        db_path: SQLite database file containing the ``log_entries`` table.
        entry: Entry to persist; ``matched_patterns`` is stored as JSON text
            and ``out_of_order`` as 0/1.
    """
    conn = sqlite3.connect(db_path, timeout=30.0)
    try:
        conn.execute(
            """
            INSERT OR IGNORE INTO log_entries
                (id, source_id, sequence, timestamp_raw, timestamp_iso,
                 ingest_time, severity, repeat_count, out_of_order,
                 matched_patterns, text)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                entry.entry_id,
                entry.source_id,
                entry.sequence,
                entry.timestamp_raw,
                entry.timestamp_iso,
                entry.ingest_time,
                entry.severity,
                entry.repeat_count,
                1 if entry.out_of_order else 0,
                json.dumps(entry.matched_patterns),
                entry.text,
            ),
        )
        conn.commit()
    finally:
        # Closing without a commit discards the pending insert, so a failed
        # execute leaves the database untouched.
        conn.close()
 async def _run_source_subscriber(source: dict, db_path: Path) -> None:
    """Subscribe to one MQTT source and persist every message it delivers.

    Each incoming message becomes one ``RetrievedEntry`` whose text is
    ``"[<topic>] <payload text>"`` and whose three timestamps are all the
    UTC receive time. Entries are numbered by a counter shared across all
    topics of this source, starting at 1 on every (re)start of this function.

    NOTE(review): nothing in this function retries or reconnects; any
    reconnect-on-error behaviour would have to come from ``MQTTClient.run()``
    — confirm against circuitforge-core.

    Args:
        source: One ``type: mqtt`` mapping from sources.yaml. ``id`` and
            ``broker_host`` are required; ``topics`` may be a list or a
            single topic string.
        db_path: SQLite database the entries are written to.
    """
    try:
        from circuitforge_core.mqtt import MQTTClient, MQTTConfig
    except ImportError:
        logger.error(
            "circuitforge-core[mqtt] is not installed — MQTT source %r skipped. "
            "Run: pip install circuitforge-core[mqtt]",
            source.get("id"),
        )
        return
    source_id: str = source["id"]
    host: str = source["broker_host"]
    port: int = int(source.get("broker_port", 1883))
    username: str | None = source.get("broker_username") or source.get("username")
    password: str | None = source.get("broker_password") or source.get("password")
    topics = source.get("topics", ["#"])
    if isinstance(topics, str):
        # ``topics: msh/#`` (scalar) would otherwise be iterated per character.
        topics = [topics]
    default_severity: str = source.get("severity", "INFO")
    cfg = MQTTConfig(
        host=host,
        port=port,
        username=username,
        password=password,
        client_id=f"turnstone-{source_id}",
    )
    client = MQTTClient(cfg)
    loop = asyncio.get_running_loop()
    seq = 0
    for topic in topics:
        # Defaults bind the per-source values at definition time.
        @client.on(topic)
        async def _handle(msg, _src=source_id, _sev=default_severity):
            nonlocal seq
            seq += 1
            now = datetime.now(tz=timezone.utc).isoformat()
            text = msg.text()
            entry = RetrievedEntry(
                entry_id=_make_entry_id(_src, seq, text),
                source_id=_src,
                sequence=seq,
                timestamp_raw=now,
                timestamp_iso=now,
                ingest_time=now,
                severity=_sev,
                repeat_count=1,
                out_of_order=False,
                matched_patterns=[],
                text=f"[{msg.topic}] {text}",
            )
            # The SQLite write can block for up to its 30 s lock timeout;
            # run it on the default executor so the event loop stays responsive.
            await loop.run_in_executor(None, _write_entry, db_path, entry)
            logger.debug("MQTT[%s] %s: %s", _src, msg.topic, text[:120])
    logger.info("MQTT subscriber starting: %s → %s:%d topics=%s", source_id, host, port, topics)
    await client.run()
 async def run_mqtt_subscribers(sources_file: Path, db_path: Path) -> None:
    """Start one subscriber task per MQTT source. Runs until cancelled.

    A subscriber that fails is logged as soon as it stops and does not take
    the others down: this coroutine keeps supervising the remaining tasks
    and returns only once every subscriber has finished. On cancellation all
    subscriber tasks are cancelled and awaited before the cancellation is
    re-raised.

    Args:
        sources_file: sources.yaml to read ``type: mqtt`` entries from.
        db_path: SQLite database passed to each subscriber.
    """
    sources = _load_mqtt_sources(sources_file)
    if not sources:
        logger.debug("No MQTT sources configured in %s", sources_file)
        return
    logger.info("Starting %d MQTT subscriber(s)", len(sources))

    def _report_failure(task: asyncio.Task) -> None:
        # Log immediately instead of waiting for the sibling tasks to end.
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.error("MQTT subscriber task %s failed: %r", task.get_name(), exc)

    tasks = [
        asyncio.create_task(
            _run_source_subscriber(src, db_path),
            name=f"mqtt-{src.get('id', i)}",
        )
        for i, src in enumerate(sources)
    ]
    for task in tasks:
        task.add_done_callback(_report_failure)
    try:
        # return_exceptions=True: one failing source must not end supervision
        # of the others (they would keep running but never be cancelled).
        await asyncio.gather(*tasks, return_exceptions=True)
    except asyncio.CancelledError:
        for t in tasks:
            t.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise
--- a/app/glean/pipeline.py
+++ b/app/glean/pipeline.py
@ -1,641 +0,0 @@
 """Glean pipeline: auto-detect format, parse, write to SQLite or Postgres."""
 from __future__ import annotations
 import json
 import logging
 import re
 import sqlite3  # still used in migrate_incidents_to_dedicated_db (SQLite-only migration)
 from pathlib import Path
 from typing import Any, Iterator
 from app.db import (
    frag,
    get_conn,
    resolve_tenant_id,
 )
 from app.db.schema import (
    ensure_context_schema,
    ensure_incidents_schema,
    ensure_schema,
    migrate_incidents_to_dedicated_db,
 )
 import yaml
 from app.glean import caddy, dmesg_log, docker_log, journald, plaintext, plex, qbittorrent, servarr, syslog, wazuh
 from app.glean.base import _compile, load_patterns, now_iso
 from app.glean.ssh import (
    SSHTransport,
    SSHConnectionError,
    SSHCommandError,
    _build_docker_command,
    _build_journald_command,
    _build_plaintext_command,
    _build_syslog_command,
 )
 from app.services.models import LogPattern, RetrievedEntry
 from app.services.search import build_fts_index
 logger = logging.getLogger(__name__)
 # SQLite DDL script: the log_entries table and its indexes, legacy copies of
 # the incidents/bundles and context tables, blocklist_candidates, and
 # glean_fingerprints. Every statement uses IF NOT EXISTS.
 # NOTE(review): no code visible in this part of the file references _SCHEMA,
 # and it defines no tenant_id column although _write_batch and _fp_unchanged
 # below use one — presumably superseded by app/db/schema.py; confirm before
 # relying on it.
 _SCHEMA = """
 CREATE TABLE IF NOT EXISTS log_entries (
    id              TEXT PRIMARY KEY,
    source_id       TEXT NOT NULL,
    sequence        INTEGER NOT NULL,
    timestamp_raw   TEXT,
    timestamp_iso   TEXT,
    ingest_time     TEXT NOT NULL,
    severity        TEXT,
    repeat_count    INTEGER DEFAULT 1,
    out_of_order    INTEGER DEFAULT 0,
    matched_patterns TEXT DEFAULT '[]',
    text            TEXT NOT NULL,
    anonymized      INTEGER DEFAULT NULL
 );
 CREATE INDEX IF NOT EXISTS idx_source      ON log_entries(source_id);
 CREATE INDEX IF NOT EXISTS idx_timestamp   ON log_entries(timestamp_iso);
 CREATE INDEX IF NOT EXISTS idx_ts_repeat   ON log_entries(timestamp_iso, repeat_count);
 CREATE INDEX IF NOT EXISTS idx_severity    ON log_entries(severity);
 CREATE INDEX IF NOT EXISTS idx_patterns    ON log_entries(matched_patterns);
 -- incidents tables moved to ensure_incidents_schema() / INCIDENTS_DB_PATH
 -- kept here as no-ops so legacy single-file deployments still work
 CREATE TABLE IF NOT EXISTS incidents (
    id          TEXT PRIMARY KEY,
    label       TEXT NOT NULL,
    issue_type  TEXT NOT NULL DEFAULT '',
    started_at  TEXT,
    ended_at    TEXT,
    notes       TEXT NOT NULL DEFAULT '',
    created_at  TEXT NOT NULL,
    severity    TEXT NOT NULL DEFAULT 'medium'
 );
 CREATE TABLE IF NOT EXISTS received_bundles (
    id          TEXT PRIMARY KEY,
    source_host TEXT NOT NULL,
    issue_type  TEXT NOT NULL DEFAULT '',
    label       TEXT NOT NULL,
    severity    TEXT NOT NULL DEFAULT 'medium',
    started_at  TEXT,
    bundled_at  TEXT NOT NULL,
    entry_count INTEGER NOT NULL DEFAULT 0,
    bundle_json TEXT NOT NULL
 );
 CREATE TABLE IF NOT EXISTS sent_bundles (
    id           TEXT PRIMARY KEY,
    incident_id  TEXT NOT NULL,
    exported_at  TEXT NOT NULL,
    sanitized    INTEGER NOT NULL DEFAULT 0,
    entry_count  INTEGER NOT NULL DEFAULT 0,
    bundle_json  TEXT NOT NULL
 );
 -- context tables moved to ensure_context_schema() / CONTEXT_DB_PATH
 -- kept here as no-ops so legacy single-file deployments still work
 CREATE TABLE IF NOT EXISTS context_facts (
    id           TEXT PRIMARY KEY,
    category     TEXT NOT NULL,
    key          TEXT NOT NULL,
    value        TEXT NOT NULL,
    source       TEXT,
    created_at   TEXT NOT NULL
 );
 CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category);
 CREATE INDEX IF NOT EXISTS idx_facts_key      ON context_facts(key);
 CREATE TABLE IF NOT EXISTS context_documents (
    id           TEXT PRIMARY KEY,
    filename     TEXT NOT NULL,
    doc_type     TEXT NOT NULL,
    full_text    TEXT NOT NULL,
    file_size    INTEGER,
    uploaded_at  TEXT NOT NULL
 );
 CREATE TABLE IF NOT EXISTS context_chunks (
    id           TEXT PRIMARY KEY,
    document_id  TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
    chunk_index  INTEGER NOT NULL,
    text         TEXT NOT NULL,
    embedding    BLOB
 );
 CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id);
 CREATE TABLE IF NOT EXISTS blocklist_candidates (
    id                 TEXT PRIMARY KEY,
    domain_or_ip       TEXT NOT NULL,
    source_device_ip   TEXT,
    source_device_name TEXT,
    first_seen         TEXT NOT NULL,
    last_seen          TEXT NOT NULL,
    hit_count          INTEGER DEFAULT 1,
    status             TEXT DEFAULT 'pending',
    pushed_at          TEXT,
    log_evidence       TEXT DEFAULT '[]',
    matched_rule       TEXT,
    llm_score          REAL,
    llm_reason         TEXT
 );
 CREATE INDEX IF NOT EXISTS idx_blocklist_device ON blocklist_candidates(source_device_ip);
 CREATE INDEX IF NOT EXISTS idx_blocklist_status ON blocklist_candidates(status);
 CREATE INDEX IF NOT EXISTS idx_blocklist_domain ON blocklist_candidates(domain_or_ip);
 CREATE TABLE IF NOT EXISTS glean_fingerprints (
    path       TEXT PRIMARY KEY,
    mtime      REAL NOT NULL,
    size       INTEGER NOT NULL,
    gleaned_at TEXT NOT NULL
 );
 """
 # SQLite DDL script for the context tables only: context_facts,
 # context_documents and context_chunks with their indexes. The statements
 # repeat the context section of _SCHEMA above.
 # NOTE(review): not referenced by any code visible in this part of the file;
 # presumably superseded by app/db/schema.py — confirm before relying on it.
 _CONTEXT_SCHEMA = """
 CREATE TABLE IF NOT EXISTS context_facts (
    id           TEXT PRIMARY KEY,
    category     TEXT NOT NULL,
    key          TEXT NOT NULL,
    value        TEXT NOT NULL,
    source       TEXT,
    created_at   TEXT NOT NULL
 );
 CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category);
 CREATE INDEX IF NOT EXISTS idx_facts_key      ON context_facts(key);
 CREATE TABLE IF NOT EXISTS context_documents (
    id           TEXT PRIMARY KEY,
    filename     TEXT NOT NULL,
    doc_type     TEXT NOT NULL,
    full_text    TEXT NOT NULL,
    file_size    INTEGER,
    uploaded_at  TEXT NOT NULL
 );
 CREATE TABLE IF NOT EXISTS context_chunks (
    id           TEXT PRIMARY KEY,
    document_id  TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
    chunk_index  INTEGER NOT NULL,
    text         TEXT NOT NULL,
    embedding    BLOB
 );
 CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id);
 """
 # ensure_schema / ensure_context_schema / ensure_incidents_schema / migrate_incidents_to_dedicated_db
 # are now implemented in app/db/schema.py and re-exported via app/db/__init__.py.
 # The imports at the top of this file bring them in; these names are kept as module-level
 # symbols so existing callers (rest.py, tests) still find them here without changes.
 # _INCIDENTS_SCHEMA and its ensure_/migrate_ functions moved to app/db/schema.py
 def _fingerprint(path: Path) -> tuple[float, int]:
    """Stat *path* and return ``(mtime, size)`` without reading its content."""
    info = path.stat()
    modified, length = info.st_mtime, info.st_size
    return modified, length
 def _fp_unchanged(conn: Any, path: Path, mtime: float, size: int) -> bool:
    """Tell whether the fingerprint stored for *path* equals ``(mtime, size)``.

    The lookup accepts a row belonging to the current tenant or one with an
    empty tenant id. A missing row counts as "changed" (returns False).
    """
    tenant = resolve_tenant_id()
    query = "SELECT mtime, size FROM glean_fingerprints WHERE path = ? AND (tenant_id = ? OR tenant_id = '')"
    stored = conn.execute(query, (str(path), tenant)).fetchone()
    return stored is not None and stored["mtime"] == mtime and stored["size"] == size
 def _save_fingerprint(
    conn: Any,
    path: Path,
    mtime: float,
    size: int,
    gleaned_at: str,
 ) -> None:
    """Record ``(mtime, size, gleaned_at)`` for *path* under the current tenant.

    Executes the backend-specific upsert statement from ``frag``; the caller
    is responsible for committing.
    """
    params = (resolve_tenant_id(), str(path), mtime, size, gleaned_at)
    statement = frag.fingerprint_upsert()
    conn.execute(statement, params)
 def _detect_format(first_line: str) -> str:
    """Guess the log format of a file from its first line.

    A first line that parses as a JSON *object* is checked for journald,
    docker, Wazuh and Caddy markers, in that order. Anything else — including
    JSON scalars, strings and arrays such as ``123``, ``null`` or ``[1]`` —
    falls through to the text-format sniffers, ending at ``"plaintext"``.

    Returns:
        One of ``"journald"``, ``"docker"``, ``"wazuh"``, ``"caddy"``,
        ``"plex"``, ``"qbittorrent"``, ``"servarr"``, ``"dmesg"``,
        ``"syslog"`` or ``"plaintext"``.
    """
    try:
        obj = json.loads(first_line)
    except json.JSONDecodeError:
        obj = None
    # Only a JSON object can carry the marker keys; a bare number/bool/null
    # is valid JSON too and must not reach the ``in`` tests below.
    if isinstance(obj, dict):
        if "__REALTIME_TIMESTAMP" in obj:
            return "journald"
        if "SOURCE" in obj and str(obj.get("SOURCE", "")).startswith("docker:"):
            return "docker"
        if wazuh.is_wazuh_alert(obj):
            return "wazuh"
        if "ts" in obj and ("msg" in obj or "message" in obj or "request" in obj):
            return "caddy"
    if plex.is_plex_log(first_line):
        return "plex"
    if qbittorrent.is_qbit_log(first_line):
        return "qbittorrent"
    if servarr.is_servarr_log(first_line):
        return "servarr"
    if dmesg_log.is_dmesg_log(first_line):
        return "dmesg"
    if syslog.is_syslog(first_line):
        return "syslog"
    return "plaintext"
 def _parse_file(
    path: Path,
    compiled: list[tuple[LogPattern, object]],
    ingest_time: str,
    source_id: str | None = None,
 ) -> Iterator[RetrievedEntry]:
    """Yield parsed entries from *path*, choosing the parser from its first line.

    The file is read as text with undecodable bytes replaced. An empty file
    yields nothing. ``source_id`` defaults to the file's stem.
    """
    source_id = source_id or path.stem
    with path.open("r", errors="replace") as handle:
        remaining = iter(handle)
        try:
            head = next(remaining)
        except StopIteration:
            return
        fmt = _detect_format(head.strip())
        logger.info("Detected format %r for %s", fmt, path.name)

        def replay():
            # Put the sniffed first line back in front of the rest.
            yield head
            yield from remaining

        # Format name -> module providing ``parse``; unknown names use plaintext.
        parser_modules = {
            "journald": journald,
            "wazuh": wazuh,
            "docker": docker_log,
            "caddy": caddy,
            "plex": plex,
            "qbittorrent": qbittorrent,
            "servarr": servarr,
            "dmesg": dmesg_log,
            "syslog": syslog,
        }
        module = parser_modules.get(fmt, plaintext)
        yield from module.parse(replay(), source_id, compiled, ingest_time)
 def _write_batch(conn: Any, batch: list[RetrievedEntry]) -> None:
    """Insert every entry of *batch* under the current tenant in one ``executemany``.

    The INSERT prefix and conflict clause come from ``frag``. No commit is
    issued here.
    """
    tenant = resolve_tenant_id()
    conflict_clause = frag.entries_conflict_clause()
    sql = f"""
        {frag.insert_ignore_entries()}
          (tenant_id, id, source_id, sequence, timestamp_raw, timestamp_iso,
           ingest_time, severity, repeat_count, out_of_order,
           matched_patterns, text)
        VALUES (?,?,?,?,?,?,?,?,?,?,?,?)
        {conflict_clause}
    """

    def as_row(entry: RetrievedEntry) -> tuple:
        # Column order must match the column list in ``sql``.
        return (
            tenant,
            entry.entry_id,
            entry.source_id,
            entry.sequence,
            entry.timestamp_raw,
            entry.timestamp_iso,
            entry.ingest_time,
            entry.severity,
            entry.repeat_count,
            int(entry.out_of_order),
            json.dumps(list(entry.matched_patterns)),
            entry.text,
        )

    rows = [as_row(entry) for entry in batch]
    conn.executemany(sql, rows)
 def _glean_files(
    files: list[Path],
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    source_id_map: dict[Path, str] | None = None,
    force: bool = False,
 ) -> dict[str, int]:
    """Parse *files* into the database at *db_path* and rebuild the FTS index.

    Files whose stored fingerprint still matches are skipped unless *force*
    is set. Entries are written and committed in chunks of *batch_size*;
    the fingerprint is saved after a file has been fully written.

    Returns:
        Mapping of source id to the number of entries written for it in this
        run (``0`` for skipped files).
    """
    compiled = _compile(load_patterns(pattern_file or Path("patterns/default.yaml")))
    ingest_time = now_iso()
    id_overrides = source_id_map or {}
    ensure_schema(db_path)
    totals: dict[str, int] = {}
    unchanged: list[str] = []
    with get_conn(db_path) as conn:
        for log_file in files:
            source_id = id_overrides.get(log_file, log_file.stem)
            mtime, size = _fingerprint(log_file)
            if not force and _fp_unchanged(conn, log_file, mtime, size):
                logger.debug("Skipping unchanged file: %s", log_file.name)
                unchanged.append(log_file.name)
                # Make sure the source appears in the result even when skipped.
                totals.setdefault(source_id, 0)
                continue
            written = 0
            pending: list[RetrievedEntry] = []
            for entry in _parse_file(log_file, compiled, ingest_time, source_id=source_id):
                pending.append(entry)
                if len(pending) < batch_size:
                    continue
                _write_batch(conn, pending)
                conn.commit()
                written += len(pending)
                pending.clear()
            if pending:
                # Trailing partial chunk.
                _write_batch(conn, pending)
                conn.commit()
                written += len(pending)
            _save_fingerprint(conn, log_file, mtime, size, ingest_time)
            conn.commit()
            totals[source_id] = totals.get(source_id, 0) + written
            logger.info("Gleaned %d entries from %s (source: %s)", written, log_file.name, source_id)
    if unchanged:
        logger.info("Skipped %d unchanged file(s): %s", len(unchanged), ", ".join(unchanged))
    logger.info("Building FTS index...")
    build_fts_index(db_path)
    logger.info("FTS index ready")
    return totals
 def _stream_and_write(
    transport: SSHTransport,
    cmd: str,
    parser,
    source_id: str,
    compiled: list[tuple[LogPattern, object]],
    ingest_time: str,
    conn: Any,
    batch_size: int,
 ) -> int:
    """Stream *cmd* output through *parser* and write entries to *conn*.

    Catches SSHCommandError per-item so one bad command doesn't abort the rest
    of the glean items for this host. Entries that were parsed before the
    command failed are still written, so the outcome does not depend on
    where the failure falls relative to a *batch_size* boundary.

    Args:
        transport: Open SSH transport used to run *cmd*.
        cmd: Remote shell command whose stdout is parsed.
        parser: Callable ``(lines, source_id, compiled, ingest_time)`` that
            yields entries.
        source_id: Source id stamped on the entries and used in log messages.
        compiled: Compiled patterns handed to *parser*.
        ingest_time: Ingest timestamp handed to *parser*.
        conn: Database connection; committed after every written chunk.
        batch_size: Maximum number of entries per write.

    Returns:
        The number of entries written.
    """
    count = 0
    batch: list[RetrievedEntry] = []

    def flush() -> None:
        # Write and commit whatever is pending, then reset the buffer.
        nonlocal count
        if not batch:
            return
        _write_batch(conn, batch)
        conn.commit()
        count += len(batch)
        batch.clear()

    try:
        for entry in parser(transport.exec_stream(cmd), source_id, compiled, ingest_time):
            batch.append(entry)
            if len(batch) >= batch_size:
                flush()
    except SSHCommandError as exc:
        logger.warning("SSH command failed for source %r (cmd: %s): %s", source_id, cmd, exc)
    # Runs on both paths: exec_stream reports a non-zero exit only after all
    # output has been yielded, so the pending entries are real data.
    flush()
    logger.info("Gleaned %d entries from SSH source %s", count, source_id)
    return count
 def _glean_ssh_source(
    src: dict,  # type: ignore[type-arg]
    compiled: list[tuple[LogPattern, object]],
    ingest_time: str,
    conn: Any,
    batch_size: int,
 ) -> dict[str, int]:
    """Open one SSHTransport connection for *src* and glean all its glean items.

    One SSH connection is shared across all items in the ``glean:`` list so
    the handshake overhead is paid only once per host per glean run.

    Items with an unknown ``type`` or with a spec the command builder rejects
    (missing ``path``/``containers``, empty container list) are logged and
    skipped; the remaining items still run. The entire source is skipped on
    SSHConnectionError.

    Args:
        src: One ``transport: ssh`` mapping from sources.yaml; ``host``,
            ``user`` and ``key_path`` are required.
        compiled: Compiled patterns handed to the parsers.
        ingest_time: Ingest timestamp handed to the parsers.
        conn: Database connection the entries are written to.
        batch_size: Maximum number of entries per write.

    Returns:
        Stats dict mapping ``{source_id: entry_count}`` for each item (one
        key per container for multi-container docker items).

    Raises:
        KeyError: if ``host``, ``user`` or ``key_path`` is missing from *src*.
    """
    host_id = src.get("id", src.get("host", "unknown"))
    host = src["host"]
    user = src["user"]
    key_path = str(Path(src["key_path"]).expanduser())
    port = int(src.get("port", 22))
    # ``glean:`` with no value parses to None.
    glean_items: list[dict] = src.get("glean") or []  # type: ignore[type-arg]
    # Item types that map to exactly one remote command: (builder, parser).
    single_command = {
        "journald": (_build_journald_command, journald.parse),
        "syslog": (_build_syslog_command, syslog.parse),
        "plaintext": (_build_plaintext_command, plaintext.parse),
    }
    stats: dict[str, int] = {}
    try:
        with SSHTransport(host=host, user=user, key_path=key_path, port=port) as t:
            for item in glean_items:
                item_type = item.get("type", "plaintext")
                # Per-item source_id — falls back to host_id/type for un-labelled items
                item_id = item.get("id") or f"{host_id}/{item_type}"
                if item_type != "docker" and item_type not in single_command:
                    logger.warning(
                        "Unknown SSH glean type %r for source %r — skipping item",
                        item_type, host_id,
                    )
                    continue
                try:
                    if item_type == "docker":
                        cmds = _build_docker_command(item)
                    else:
                        builder, parser = single_command[item_type]
                        cmd = builder(item)
                except (KeyError, ValueError) as exc:
                    # The builders raise on incomplete specs; one bad item
                    # must not abort the other items or other sources.
                    logger.warning(
                        "Invalid %s glean item for source %r — skipping item: %r",
                        item_type, host_id, exc,
                    )
                    continue
                if item_type != "docker":
                    count = _stream_and_write(
                        t, cmd, parser, item_id, compiled, ingest_time, conn, batch_size
                    )
                    stats[item_id] = stats.get(item_id, 0) + count
                    continue
                if isinstance(cmds, str):
                    cmds = [cmds]
                containers: list[str] = item.get("containers", [])
                for i, cmd in enumerate(cmds):
                    # Use the container name as the final path segment when available
                    container_name = containers[i] if i < len(containers) else str(i)
                    container_id = f"{item_id}/{container_name}" if len(cmds) > 1 else item_id
                    count = _stream_and_write(
                        t, cmd, docker_log.parse, container_id,
                        compiled, ingest_time, conn, batch_size,
                    )
                    stats[container_id] = stats.get(container_id, 0) + count
    except SSHConnectionError as exc:
        logger.warning("SSH connection failed for source %r: %s", host_id, exc)
    return stats
 def glean_ssh_source(
    src: dict,  # type: ignore[type-arg]
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
 ) -> dict[str, int]:
    """Glean one SSH source dict into *db_path* on behalf of the REST layer.

    Wraps :func:`_glean_ssh_source`: compiles the patterns, ensures the
    schema, opens the DB connection, and rebuilds the FTS index afterwards.

    Returns:
        Stats mapping ``{sub_source_id: entry_count}``.
    """
    patterns_path = pattern_file or Path("patterns/default.yaml")
    loaded = load_patterns(patterns_path)
    compiled = _compile(loaded)
    started = now_iso()
    ensure_schema(db_path)
    with get_conn(db_path) as conn:
        result = _glean_ssh_source(src, compiled, started, conn, batch_size)
    logger.info("Rebuilding FTS index after SSH source glean...")
    build_fts_index(db_path)
    return result
 def glean_dir(
    corpus_dir: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    force: bool = False,
 ) -> dict[str, int]:
    """Glean every ``*.jsonl`` and ``*.log`` file found under *corpus_dir*.

    The directory is searched recursively; ``.jsonl`` files (sorted) are
    processed before ``.log`` files (sorted). ``force=True`` bypasses the
    fingerprint check so unchanged files are gleaned again.
    """
    jsonl_files = sorted(corpus_dir.rglob("*.jsonl"))
    log_files = sorted(corpus_dir.rglob("*.log"))
    candidates = [*jsonl_files, *log_files]
    return _glean_files(candidates, db_path, pattern_file, batch_size, force=force)
 def glean_file(
    log_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    force: bool = False,
 ) -> dict[str, int]:
    """Glean one log file of any supported format into *db_path*.

    ``force=True`` re-gleans the file even when its fingerprint is unchanged.
    """
    single = [log_file]
    return _glean_files(single, db_path, pattern_file=pattern_file, force=force)
 def glean_sources(
    sources_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    force: bool = False,
 ) -> dict[str, int]:
    """Glean all sources listed in a sources.yaml config file.

    Supports two source types:

    Local file sources (default):
        sources:
          - id: sonarr
            path: /opt/sonarr/config/logs/sonarr.0.txt

    SSH remote sources (transport: ssh):
        sources:
          - id: rack01
            transport: ssh
            host: 192.168.1.10
            user: admin
            key_path: ~/.ssh/id_ed25519
            glean:
              - type: journald
                args: ["--since", "2 hours ago"]
              - type: syslog
                path: /var/log/syslog
              - type: plaintext
                path: /var/log/app/error.log
              - type: docker
                containers: [myapp, nginx]

    Entries with ``type: mqtt`` are ignored here; they are consumed by the
    live subscriber in app/glean/mqtt_subscriber.py. Local entries without a
    ``path``, missing local paths and SSH connection failures are logged as
    warnings so the cron keeps running when a source is temporarily down.
    An empty sources file is treated as having no sources.

    Returns:
        Mapping of source id to the number of entries written in this run.
    """
    with open(sources_file) as f:
        # An empty file parses to None.
        config = yaml.safe_load(f) or {}
    local_sources: list[dict] = []  # type: ignore[type-arg]
    ssh_sources: list[dict] = []    # type: ignore[type-arg]
    for src in config.get("sources") or []:
        if src.get("transport") == "ssh":
            ssh_sources.append(src)
        elif src.get("type") == "mqtt":
            # Not a file source: has broker_host/topics instead of a path.
            continue
        else:
            local_sources.append(src)
    # ── Local file sources ─────────────────────────────────────────────────
    files: list[Path] = []
    source_id_map: dict[Path, str] = {}
    for src in local_sources:
        raw_path = src.get("path")
        if not raw_path:
            logger.warning("Source %r has no path, skipping", src.get("id", "?"))
            continue
        path = Path(raw_path)
        if not path.exists():
            logger.warning("Source %r not found, skipping: %s", src.get("id", "?"), path)
            continue
        files.append(path)
        if "id" in src:
            source_id_map[path] = src["id"]
    if not files and not ssh_sources:
        logger.warning("No sources found — check sources.yaml paths")
        return {}
    stats: dict[str, int] = {}
    if files:
        stats.update(_glean_files(files, db_path, pattern_file, batch_size, source_id_map, force=force))
    # ── SSH remote sources ─────────────────────────────────────────────────
    if not ssh_sources:
        return stats
    # Compile patterns once, share across all SSH sources in this run.
    effective_pattern_file = pattern_file or Path("patterns/default.yaml")
    compiled = _compile(load_patterns(effective_pattern_file))
    ingest_time = now_iso()
    ensure_schema(db_path)
    with get_conn(db_path) as conn:
        for src in ssh_sources:
            ssh_stats = _glean_ssh_source(src, compiled, ingest_time, conn, batch_size)
            for k, v in ssh_stats.items():
                stats[k] = stats.get(k, 0) + v
        conn.commit()
    # _glean_files already rebuilt the index if local sources ran; rebuilding
    # again here covers the entries added over SSH.
    logger.info("Rebuilding FTS index after SSH glean...")
    build_fts_index(db_path)
    return stats
--- a/app/glean/ssh.py
+++ b/app/glean/ssh.py
@ -1,225 +0,0 @@
 """SSH transport layer for remote log gleaning (issue #22).
 Wraps Paramiko to provide a clean context-manager interface for executing
 remote commands and streaming their stdout output.  All format parsing is
 delegated to the existing per-format parsers (journald, syslog, plaintext,
 docker); this module is transport only.
 Key design choices:
 - Key-based auth only — no password prompts in a daemon context.
 - exec_stream is a generator; exit-status check fires after all lines are
  yielded, so callers must drain the iterator (e.g. list()) to trigger it.
 - Command builders live here because they encode SSH/remote-execution idioms
  (journalctl flags, docker logs invocation) that the generic parsers don't
  need to know about.
 Example sources.yaml snippet::
    sources:
      - id: rack01
        transport: ssh
        host: 192.168.1.10
        user: admin
        key_path: ~/.ssh/id_ed25519
        glean:
          - type: journald
            args: ["--since", "2 hours ago"]
          - type: syslog
            path: /var/log/syslog
          - type: plaintext
            path: /var/log/app/error.log
          - type: docker
            containers: [myapp, nginx]
 """
 from __future__ import annotations
 import shlex
 from collections.abc import Iterator
 from typing import Union
 import paramiko
 # Names exported by ``from app.glean.ssh import *``. The underscore-prefixed
 # command builders are listed on purpose: app/glean/pipeline.py imports them
 # by name alongside the transport and exception classes.
 __all__ = [
    "SSHConnectionError",
    "SSHCommandError",
    "SSHTransport",
    "_build_journald_command",
    "_build_syslog_command",
    "_build_plaintext_command",
    "_build_docker_command",
 ]
 # Default syslog path used when none is specified in the source spec.
 _SYSLOG_DEFAULT_PATH = "/var/log/syslog"
 # ── Custom exceptions ─────────────────────────────────────────────────────────
 class SSHConnectionError(Exception):
    """Signals that no usable SSH session exists.

    Raised by ``SSHTransport`` when connecting or authenticating fails, and
    when a command is attempted without an open connection.
    """
 class SSHCommandError(Exception):
    """Signals that a remote command finished with a non-zero exit status."""
 # ── Transport context manager ─────────────────────────────────────────────────
 class SSHTransport:
    """Context manager wrapping a Paramiko SSH connection.

    Opens the connection on ``__enter__`` and closes it on ``__exit__``,
    even if an exception propagates.  A private-key file is the only
    credential handed to Paramiko; no password is ever supplied.

    Usage::

        with SSHTransport(host="10.0.0.1", user="admin",
                          key_path="~/.ssh/id_ed25519") as t:
            for line in t.exec_stream("journalctl -o json --since '1 hour ago'"):
                process(line)
    """
    def __init__(
        self,
        host: str,
        user: str,
        key_path: str,
        port: int = 22,
    ) -> None:
        """Store the connection parameters; nothing is opened until ``__enter__``.

        Args:
            host: Remote hostname or IP address.
            user: Login name on the remote host.
            key_path: Path to the private key file; a leading ``~`` is
                expanded to the current user's home directory.
            port: SSH port, 22 by default.
        """
        from os.path import expanduser

        self._host = host
        self._user = user
        # Paramiko opens key_filename as given, so "~/.ssh/..." (as in the
        # class example) must be expanded here.
        self._key_path = expanduser(key_path)
        self._port = port
        self._client: paramiko.SSHClient | None = None
    # ── context manager protocol ──────────────────────────────────────────────
    def __enter__(self) -> "SSHTransport":
        """Connect and authenticate, returning ``self``.

        Raises:
            SSHConnectionError: on authentication failure, any other Paramiko
                SSH error, or an OS-level failure (host unreachable,
                connection refused, timeout, DNS error, unreadable key file).
        """
        client = paramiko.SSHClient()
        # NOTE(security): AutoAddPolicy accepts any host key that is not
        # already known, which leaves the first connection open to a
        # man-in-the-middle. Left unchanged because rejecting unknown hosts
        # would break existing deployments — review.
        client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            client.connect(
                hostname=self._host,
                username=self._user,
                key_filename=self._key_path,
                port=self._port,
            )
        except paramiko.AuthenticationException as exc:
            client.close()
            raise SSHConnectionError(
                f"SSH auth failed for {self._user}@{self._host}: {exc}"
            ) from exc
        except (paramiko.SSHException, OSError) as exc:
            # OSError covers socket-level failures, which Paramiko does not
            # wrap in SSHException; callers only handle SSHConnectionError.
            client.close()
            raise SSHConnectionError(
                f"SSH connection failed to {self._host}: {exc}"
            ) from exc
        self._client = client
        return self
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:  # type: ignore[override]
        """Close the connection if one is open; never suppresses exceptions."""
        if self._client is not None:
            self._client.close()
            self._client = None
        # Return None (falsy) so any in-flight exception is not suppressed.
    # ── remote execution ──────────────────────────────────────────────────────
    def exec_stream(self, command: str) -> Iterator[str]:
        """Execute *command* on the remote host and yield stdout lines.

        The exit-status check runs after all stdout lines have been yielded,
        so callers must drain the iterator to trigger it::

            list(transport.exec_stream(cmd))   # raises if exit != 0

        Because this is a generator, the "not connected" check also fires
        only when iteration starts, not when the method is called.

        Raises:
            SSHConnectionError: if called outside a ``with`` block.
            SSHCommandError: if the remote command exits non-zero; the
                message carries the command's stderr output.
        """
        if self._client is None:
            raise SSHConnectionError(
                "Not connected — use SSHTransport as a context manager"
            )
        _, stdout, stderr = self._client.exec_command(command)
        for line in stdout:
            yield line
        exit_code = stdout.channel.recv_exit_status()
        # Guard against MagicMock in tests: only treat real integer exit codes.
        if isinstance(exit_code, int) and exit_code != 0:
            error_msg = stderr.read().decode(errors="replace").strip()
            raise SSHCommandError(
                f"Command failed (exit {exit_code}): {error_msg}"
            )
 # ── Command builders ──────────────────────────────────────────────────────────
 def _build_journald_command(spec: dict) -> str:  # type: ignore[type-arg]
    """Build a ``journalctl`` command string from a glean source spec.

    Spec keys:
    - ``unit`` — shorthand for ``--unit <name>`` (inserted before ``args``).
    - ``args`` — list of extra journalctl arguments. Each element is passed
      as exactly one argument: it is shell-quoted, so ``"2 hours ago"`` in
      ``["--since", "2 hours ago"]`` reaches journalctl as a single word.
      Non-string elements are converted with ``str``; a plain string is
      split into arguments with shell rules first.

    Returns a single shell command string.
    """
    parts = ["journalctl", "-o json", "--no-pager"]
    if "unit" in spec:
        parts.append(f"--unit {shlex.quote(str(spec['unit']))}")
    extra = spec.get("args") or []
    if isinstance(extra, str):
        # ``args: "--since today"`` — would otherwise be iterated per character.
        extra = shlex.split(extra)
    # The command is run by a remote shell, so unquoted spaces or shell
    # metacharacters would split or reinterpret an argument.
    parts.extend(shlex.quote(str(arg)) for arg in extra)
    return " ".join(parts)
 def _build_syslog_command(spec: dict) -> str:  # type: ignore[type-arg]
    """Return the shell command that dumps a syslog-format file with ``cat``.

    Spec keys:
    - ``path`` — file to read; ``/var/log/syslog`` when absent.

    The path is shell-quoted.
    """
    target = spec.get("path", _SYSLOG_DEFAULT_PATH)
    quoted = shlex.quote(target)
    return "cat " + quoted
 def _build_plaintext_command(spec: dict) -> str:  # type: ignore[type-arg]
    """Return the shell command that dumps an arbitrary log file with ``cat``.

    Spec keys:
    - ``path`` — **required** file to read; it is shell-quoted.

    Raises:
        KeyError: if ``path`` is absent from the spec.
    """
    # No default on purpose: a missing path surfaces as KeyError.
    target = spec["path"]
    return " ".join(("cat", shlex.quote(target)))
 def _build_docker_command(
    spec: dict,  # type: ignore[type-arg]
 ) -> Union[str, list[str]]:
    """Build ``docker logs`` command(s) for one or more named containers.

    Spec keys:
    - ``containers`` — **required** list of container names or IDs. A single
      name given as a plain string is treated as a one-element list.

    Returns a single command string when there is one container, or a list
    of command strings when there are multiple (one command per container so
    each can be streamed independently). Names are shell-quoted.

    Raises:
        KeyError: if ``containers`` is absent from the spec.
        ValueError: if ``containers`` is empty.
    """
    containers = spec["containers"]  # intentional KeyError if missing
    if not containers:
        raise ValueError("'containers' must be a non-empty list")
    if isinstance(containers, str):
        # ``containers: myapp`` (scalar) would otherwise yield one command
        # per character of the name.
        containers = [containers]
    commands = [f"docker logs {shlex.quote(c)}" for c in containers]
    return commands[0] if len(commands) == 1 else commands
--- a/app/glean/wazuh.py
+++ b/app/glean/wazuh.py
@ -1,161 +0,0 @@
 """Wazuh SIEM alert parser.
 Handles Wazuh's alerts.json format (JSON Lines — one alert object per line):
    /var/ossec/logs/alerts/alerts.json  (on the Wazuh manager)
 Each line is a complete JSON object. Key fields used:
    timestamp     — ISO 8601 with timezone offset ("2024-01-15T10:23:45.123+0000")
    rule.level    — 1-15 (maps to Turnstone severity)
    rule.id       — Wazuh rule ID
    rule.description — human-readable rule description (primary message text)
    rule.groups   — list of category tags
    agent.name    — hostname that generated the original event
    agent.ip      — agent IP address
    full_log      — original raw log line that triggered the alert
    location      — log file or input that was monitored
    data          — dict of decoded fields (srcip, dstip, url, etc.)
 """
 from __future__ import annotations
 import json
 from datetime import datetime, timezone
 from typing import Iterator
 from app.glean.base import (
    SourceState, apply_patterns, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
 # Wazuh rule levels 1-15 → Turnstone severity labels.
 # Levels < 4 are normally informational, 7+ begin to matter operationally,
 # 10+ correspond to SIEM-worthy events, 13+ are critical.
 # Level 6 is the only level mapped to NOTICE.  Levels absent from this table
 # (0, or anything above 15) are reported as INFO by parse(), which looks the
 # level up with that fallback.
 _LEVEL_SEVERITY: dict[int, str] = {
    1:  "DEBUG",  2:  "DEBUG",  3:  "DEBUG",
    4:  "INFO",   5:  "INFO",   6:  "NOTICE",
    7:  "WARN",   8:  "WARN",   9:  "WARN",
    10: "ERROR",  11: "ERROR",  12: "ERROR",
    13: "CRITICAL", 14: "CRITICAL", 15: "CRITICAL",
 }
 def is_wazuh_alert(obj: dict) -> bool:
    """Tell whether a decoded JSON object has the shape of a Wazuh alert.
    An alert must carry ``rule`` and ``agent`` sub-objects and at least one of
    the ``timestamp`` / ``manager`` keys.
    """
    if not isinstance(obj.get("rule"), dict):
        return False
    if not isinstance(obj.get("agent"), dict):
        return False
    return "timestamp" in obj or "manager" in obj
 def _parse_timestamp(raw: str) -> str:
    """Convert Wazuh's ISO 8601 timestamp to UTC ISO 8601."""
    if not raw:
        return ""
    for fmt in (
        "%Y-%m-%dT%H:%M:%S.%f%z",
        "%Y-%m-%dT%H:%M:%S%z",
        "%Y-%m-%dT%H:%M:%S.%fZ",
        "%Y-%m-%dT%H:%M:%SZ",
    ):
        try:
            dt = datetime.strptime(raw, fmt)
            return dt.astimezone(timezone.utc).isoformat()
        except ValueError:
            continue
    return raw
 def _build_text(alert: dict) -> str:
    """Compose a readable, searchable text representation of the alert."""
    rule = alert.get("rule", {})
    agent = alert.get("agent", {})
    agent_name = agent.get("name", "unknown")
    agent_ip = agent.get("ip", "")
    rule_id = rule.get("id", "")
    rule_desc = rule.get("description", "(no description)")
    groups = rule.get("groups", [])
    location = alert.get("location", "")
    full_log = alert.get("full_log", "")
    parts: list[str] = []
    # Header line: agent + rule context
    agent_tag = f"{agent_name}/{agent_ip}" if agent_ip else agent_name
    group_tag = ",".join(groups) if groups else ""
    header = f"[wazuh][agent:{agent_tag}][rule:{rule_id}]"
    if group_tag:
        header += f"[{group_tag}]"
    parts.append(f"{header} {rule_desc}")
    if location:
        parts.append(f"location: {location}")
    # Extra decoded fields (srcip, dstip, url, user, etc.)
    data = alert.get("data", {})
    if isinstance(data, dict) and data:
        kv = " | ".join(f"{k}={v}" for k, v in sorted(data.items()) if v)
        if kv:
            parts.append(kv)
    if full_log and full_log.strip() != rule_desc.strip():
        parts.append(f"raw: {full_log.strip()}")
    return "\n".join(parts)
 def parse(
    lines: Iterator[str],
    source_id: str,
    compiled_patterns: list[tuple[LogPattern, object]],
    ingest_time: str | None = None,
 ) -> Iterator[RetrievedEntry]:
    """Parse Wazuh ``alerts.json`` lines into ``RetrievedEntry`` objects.
    Blank lines, lines that are not valid JSON, and JSON values that are not
    objects are skipped.  Structurally odd alerts (null ``rule``/``agent``,
    non-numeric ``rule.level``, non-string ``timestamp``) are normalised instead
    of aborting the stream.
    Args:
        lines: Raw JSON Lines input, one alert per line.
        source_id: Base source identifier; suffixed with ``:<agent name>`` when
            the alert names an agent.
        compiled_patterns: Pattern list handed to ``apply_patterns``.
        ingest_time: Timestamp recorded on every entry; defaults to ``now_iso()``.
    Yields:
        One entry per alert that produces non-empty text.
    """
    ingest_time = ingest_time or now_iso()
    state = SourceState()
    for raw_line in lines:
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            alert = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        if not isinstance(alert, dict):
            continue
        # A key that is present but null (or any non-object) would break the
        # ``.get`` calls here and in _build_text; replace it with an empty dict.
        rule = alert.get("rule")
        if not isinstance(rule, dict):
            rule = alert["rule"] = {}
        agent = alert.get("agent")
        if not isinstance(agent, dict):
            agent = alert["agent"] = {}
        ts_raw = alert.get("timestamp", "")
        if not isinstance(ts_raw, str):
            ts_raw = "" if ts_raw is None else str(ts_raw)
        ts_iso = _parse_timestamp(ts_raw)
        try:
            level = int(rule.get("level", 0))
        except (TypeError, ValueError):
            level = 0  # not in _LEVEL_SEVERITY, so the INFO fallback applies
        severity = _LEVEL_SEVERITY.get(level, "INFO")
        # Qualify source_id by agent so logs from different hosts stay separate.
        agent_name = agent.get("name", "")
        src = f"{source_id}:{agent_name}" if agent_name else source_id
        text = _build_text(alert)
        if not text:
            continue
        repeat, out_of_order = state.observe(text, ts_iso)
        matched = apply_patterns(text, compiled_patterns)
        yield RetrievedEntry(
            entry_id=make_entry_id(src, state.sequence, text),
            source_id=src,
            sequence=state.sequence,
            timestamp_raw=ts_raw,
            timestamp_iso=ts_iso,
            ingest_time=ingest_time,
            severity=severity,
            repeat_count=repeat,
            out_of_order=out_of_order,
            matched_patterns=matched,
            text=text,
        )
--- a/app/ingest/init.py
+++ b/app/ingest/init.py
--- a/app/ingest/base.py
+++ b/app/ingest/base.py
@ -33,7 +33,6 @@ def load_patterns(path: Path) -> list[LogPattern]:
            pattern=p["pattern"],
            severity=p["severity"],
            description=p["description"],
            domain=p.get("domain", ""),
        )
        for p in raw.get("patterns", [])
    ]
--- a/app/ingest/caddy.py
+++ b/app/ingest/caddy.py
@ -4,7 +4,7 @@ from __future__ import annotations
 import json
 from typing import Iterator
-from app.glean.base import (
+from app.ingest.base import (
    SourceState, apply_patterns, epoch_float_to_iso,
    make_entry_id, now_iso,
 )
--- a/app/ingest/dmesg_log.py
+++ b/app/ingest/dmesg_log.py
@ -18,7 +18,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator
-from app.glean.base import (
+from app.ingest.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/doc_upload.py
+++ b/app/ingest/doc_upload.py
@ -1,19 +1,18 @@
 """Upload adapter: processes file bytes and writes to context store — MIT licensed."""
 from __future__ import annotations
 import sqlite3
 import uuid
 from pathlib import Path
 from typing import Any
 from app.context.chunker import process_upload
 from app.context.store import add_document, add_fact
 from app.db import get_conn, resolve_tenant_id
-def glean_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]:
+def ingest_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]:
    """Process an uploaded file and write to context store. Returns result summary."""
    doc_type, facts, chunks = process_upload(filename, content)
    tid = resolve_tenant_id()
    doc = add_document(
        db_path,
@ -26,13 +25,15 @@ def glean_upload(db_path: Path, filename: str, content: bytes) -> dict[str, Any]
    for fact in facts:
        add_fact(db_path, fact.category, fact.key, fact.value, source="upload")
-    with get_conn(db_path) as conn:
+    conn = sqlite3.connect(str(db_path))
-        for i, chunk_text in enumerate(chunks):
+    conn.execute("PRAGMA journal_mode=WAL")
-            conn.execute(
+    for i, chunk_text in enumerate(chunks):
-                "INSERT INTO context_chunks(id, tenant_id, document_id, chunk_index, text) VALUES (?,?,?,?,?)",
+        conn.execute(
-                (str(uuid.uuid4()), tid, doc.id, i, chunk_text),
+            "INSERT INTO context_chunks(id, document_id, chunk_index, text) VALUES (?,?,?,?)",
-            )
+            (str(uuid.uuid4()), doc.id, i, chunk_text),
-        conn.commit()
+        )
    conn.commit()
    conn.close()
    return {
        "document_id": doc.id,
--- a/app/ingest/docker_log.py
+++ b/app/ingest/docker_log.py
@ -4,7 +4,7 @@ from __future__ import annotations
 import json
 from typing import Iterator
-from app.glean.base import (
+from app.ingest.base import (
    SourceState, apply_patterns, detect_severity,
    make_entry_id, now_iso,
 )
--- a/app/ingest/journald.py
+++ b/app/ingest/journald.py
@ -4,7 +4,7 @@ from __future__ import annotations
 import json
 from typing import Iterator
-from app.glean.base import (
+from app.ingest.base import (
    SourceState, apply_patterns, epoch_micros_to_iso,
    make_entry_id, now_iso, SYSLOG_PRIORITY,
 )
--- a/app/ingest/pipeline.py
+++ b/app/ingest/pipeline.py
@ -0,0 +1,324 @@
 """Ingest pipeline: auto-detect format, parse, write to SQLite."""
 from __future__ import annotations
 import json
 import logging
 import re
 import sqlite3
 from pathlib import Path
 from typing import Iterator
 import yaml
 from app.ingest import caddy, dmesg_log, docker_log, journald, plaintext, plex, qbittorrent, servarr, syslog
 from app.ingest.base import _compile, load_patterns, now_iso
 from app.services.models import LogPattern, RetrievedEntry
 from app.services.search import build_fts_index
 logger = logging.getLogger(__name__)
 _SCHEMA = """
 CREATE TABLE IF NOT EXISTS log_entries (
    id              TEXT PRIMARY KEY,
    source_id       TEXT NOT NULL,
    sequence        INTEGER NOT NULL,
    timestamp_raw   TEXT,
    timestamp_iso   TEXT,
    ingest_time     TEXT NOT NULL,
    severity        TEXT,
    repeat_count    INTEGER DEFAULT 1,
    out_of_order    INTEGER DEFAULT 0,
    matched_patterns TEXT DEFAULT '[]',
    text            TEXT NOT NULL
 );
 CREATE INDEX IF NOT EXISTS idx_source      ON log_entries(source_id);
 CREATE INDEX IF NOT EXISTS idx_timestamp   ON log_entries(timestamp_iso);
 CREATE INDEX IF NOT EXISTS idx_ts_repeat   ON log_entries(timestamp_iso, repeat_count);
 CREATE INDEX IF NOT EXISTS idx_severity    ON log_entries(severity);
 CREATE INDEX IF NOT EXISTS idx_patterns    ON log_entries(matched_patterns);
 CREATE TABLE IF NOT EXISTS incidents (
    id          TEXT PRIMARY KEY,
    label       TEXT NOT NULL,
    issue_type  TEXT NOT NULL DEFAULT '',
    started_at  TEXT,
    ended_at    TEXT,
    notes       TEXT NOT NULL DEFAULT '',
    created_at  TEXT NOT NULL,
    severity    TEXT NOT NULL DEFAULT 'medium'
 );
 CREATE INDEX IF NOT EXISTS idx_incidents_time ON incidents(started_at, ended_at);
 CREATE TABLE IF NOT EXISTS received_bundles (
    id          TEXT PRIMARY KEY,
    source_host TEXT NOT NULL,
    issue_type  TEXT NOT NULL DEFAULT '',
    label       TEXT NOT NULL,
    severity    TEXT NOT NULL DEFAULT 'medium',
    started_at  TEXT,
    bundled_at  TEXT NOT NULL,
    entry_count INTEGER NOT NULL DEFAULT 0,
    bundle_json TEXT NOT NULL
 );
 CREATE INDEX IF NOT EXISTS idx_bundles_bundled ON received_bundles(bundled_at);
 CREATE INDEX IF NOT EXISTS idx_bundles_type    ON received_bundles(issue_type);
 CREATE TABLE IF NOT EXISTS context_facts (
    id           TEXT PRIMARY KEY,
    category     TEXT NOT NULL,
    key          TEXT NOT NULL,
    value        TEXT NOT NULL,
    source       TEXT,
    created_at   TEXT NOT NULL
 );
 CREATE INDEX IF NOT EXISTS idx_facts_category ON context_facts(category);
 CREATE INDEX IF NOT EXISTS idx_facts_key      ON context_facts(key);
 CREATE TABLE IF NOT EXISTS context_documents (
    id           TEXT PRIMARY KEY,
    filename     TEXT NOT NULL,
    doc_type     TEXT NOT NULL,
    full_text    TEXT NOT NULL,
    file_size    INTEGER,
    uploaded_at  TEXT NOT NULL
 );
 CREATE TABLE IF NOT EXISTS context_chunks (
    id           TEXT PRIMARY KEY,
    document_id  TEXT NOT NULL REFERENCES context_documents(id) ON DELETE CASCADE,
    chunk_index  INTEGER NOT NULL,
    text         TEXT NOT NULL,
    embedding    BLOB
 );
 CREATE INDEX IF NOT EXISTS idx_chunks_doc ON context_chunks(document_id);
 CREATE TABLE IF NOT EXISTS blocklist_candidates (
    id                 TEXT PRIMARY KEY,
    domain_or_ip       TEXT NOT NULL,
    source_device_ip   TEXT,
    source_device_name TEXT,
    first_seen         TEXT NOT NULL,
    last_seen          TEXT NOT NULL,
    hit_count          INTEGER DEFAULT 1,
    status             TEXT DEFAULT 'pending',
    pushed_at          TEXT,
    log_evidence       TEXT DEFAULT '[]',
    matched_rule       TEXT,
    llm_score          REAL,
    llm_reason         TEXT
 );
 CREATE INDEX IF NOT EXISTS idx_blocklist_device ON blocklist_candidates(source_device_ip);
 CREATE INDEX IF NOT EXISTS idx_blocklist_status ON blocklist_candidates(status);
 CREATE INDEX IF NOT EXISTS idx_blocklist_domain ON blocklist_candidates(domain_or_ip);
 """
 def ensure_schema(db_path: Path) -> None:
    """Create all tables and apply additive migrations. Safe to call on every startup.
    Opens the SQLite database at *db_path*, switches it to WAL journaling, runs
    the ``_SCHEMA`` script, then applies the ``ALTER TABLE`` migrations.  The
    connection is closed even when one of those steps raises.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        # Additive column migrations.  SQLite has no "ADD COLUMN IF NOT EXISTS":
        # re-adding an existing column raises OperationalError ("duplicate
        # column name"), which is the expected outcome on an up-to-date database.
        for stmt in [
            "ALTER TABLE incidents ADD COLUMN issue_type TEXT NOT NULL DEFAULT ''",
        ]:
            try:
                conn.execute(stmt)
            except sqlite3.OperationalError as exc:
                if "duplicate column name" not in str(exc).lower():
                    # Still best-effort, but no longer invisible.
                    logger.warning("Schema migration failed (%s): %s", stmt, exc)
        conn.commit()
    finally:
        conn.close()
 def _detect_format(first_line: str) -> str:
    try:
        obj = json.loads(first_line)
        if "__REALTIME_TIMESTAMP" in obj:
            return "journald"
        if "SOURCE" in obj and str(obj.get("SOURCE", "")).startswith("docker:"):
            return "docker"
        if "ts" in obj and ("msg" in obj or "message" in obj or "request" in obj):
            return "caddy"
    except (json.JSONDecodeError, AttributeError):
        pass
    if plex.is_plex_log(first_line):
        return "plex"
    if qbittorrent.is_qbit_log(first_line):
        return "qbittorrent"
    if servarr.is_servarr_log(first_line):
        return "servarr"
    if dmesg_log.is_dmesg_log(first_line):
        return "dmesg"
    if syslog.is_syslog(first_line):
        return "syslog"
    return "plaintext"
 def _parse_file(
    path: Path,
    compiled: list[tuple[LogPattern, object]],
    ingest_time: str,
    source_id: str | None = None,
 ) -> Iterator[RetrievedEntry]:
    """Stream parsed entries from one log file.
    The format is sniffed from the first line, which is then replayed ahead of
    the remaining lines so the chosen parser sees the whole file.  An empty
    file yields nothing.  ``source_id`` defaults to the file's stem.
    """
    if not source_id:
        source_id = path.stem
    with path.open("r", errors="replace") as handle:
        remaining = iter(handle)
        first = next(remaining, None)
        if first is None:
            return
        fmt = _detect_format(first.strip())
        logger.info("Detected format %r for %s", fmt, path.name)
        def replay():
            yield first
            yield from remaining
        # Detected format name → parser module; anything else is plaintext.
        modules = {
            "journald": journald,
            "docker": docker_log,
            "caddy": caddy,
            "plex": plex,
            "qbittorrent": qbittorrent,
            "servarr": servarr,
            "dmesg": dmesg_log,
            "syslog": syslog,
        }
        parser = modules.get(fmt, plaintext)
        yield from parser.parse(replay(), source_id, compiled, ingest_time)
 def _write_batch(conn: sqlite3.Connection, batch: list[RetrievedEntry]) -> None:
    conn.executemany(
        """
        INSERT OR IGNORE INTO log_entries
          (id, source_id, sequence, timestamp_raw, timestamp_iso,
           ingest_time, severity, repeat_count, out_of_order,
           matched_patterns, text)
        VALUES (?,?,?,?,?,?,?,?,?,?,?)
        """,
        [
            (
                e.entry_id, e.source_id, e.sequence,
                e.timestamp_raw, e.timestamp_iso, e.ingest_time,
                e.severity, e.repeat_count, int(e.out_of_order),
                json.dumps(list(e.matched_patterns)), e.text,
            )
            for e in batch
        ],
    )
 def _ingest_files(
    files: list[Path],
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
    source_id_map: dict[Path, str] | None = None,
 ) -> dict[str, int]:
    """Parse *files* into the SQLite database and rebuild the FTS index.
    Args:
        files: Log files to ingest, processed in the given order.
        db_path: SQLite database file; the schema is created if needed.
        pattern_file: Pattern definitions; defaults to ``patterns/default.yaml``.
        batch_size: Number of entries written and committed per batch.
        source_id_map: Optional per-file source id; files not listed use their
            stem.
    Returns:
        Mapping of source id to the number of entries handed to the database.
        Rows ignored as duplicates by ``INSERT OR IGNORE`` are still counted.
    """
    pattern_file = pattern_file or Path("patterns/default.yaml")
    patterns = load_patterns(pattern_file)
    compiled = _compile(patterns)
    ingest_time = now_iso()
    source_id_map = source_id_map or {}
    stats: dict[str, int] = {}
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        conn.commit()
        for log_file in files:
            source_id = source_id_map.get(log_file, log_file.stem)
            count = 0
            batch: list[RetrievedEntry] = []
            for entry in _parse_file(log_file, compiled, ingest_time, source_id=source_id):
                batch.append(entry)
                if len(batch) >= batch_size:
                    _write_batch(conn, batch)
                    conn.commit()
                    count += len(batch)
                    batch.clear()
            if batch:
                # Flush the final partial batch.
                _write_batch(conn, batch)
                conn.commit()
                count += len(batch)
            stats[source_id] = stats.get(source_id, 0) + count
            logger.info("Ingested %d entries from %s (source: %s)", count, log_file.name, source_id)
    finally:
        # Release the database handle even when a parser or write fails
        # part-way through; batches committed so far stay in place.
        conn.close()
    logger.info("Building FTS index...")
    build_fts_index(db_path)
    logger.info("FTS index ready")
    return stats
 def ingest(
    corpus_dir: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
 ) -> dict[str, int]:
    """Ingest every ``*.jsonl`` and ``*.log`` file directly inside *corpus_dir*.
    The ``.jsonl`` files are processed first, then the ``.log`` files, each
    group in sorted order.  Returns the per-source counts from ``_ingest_files``.
    """
    candidates: list[Path] = []
    for glob_pattern in ("*.jsonl", "*.log"):
        candidates.extend(sorted(corpus_dir.glob(glob_pattern)))
    return _ingest_files(candidates, db_path, pattern_file, batch_size)
 def ingest_file(
    log_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
 ) -> dict[str, int]:
    """Ingest one log file of any supported format.
    Thin wrapper around ``_ingest_files`` with a single-element file list and
    that function's default batch size.
    """
    single = [log_file]
    return _ingest_files(files=single, db_path=db_path, pattern_file=pattern_file)
 def ingest_sources(
    sources_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    batch_size: int = 1000,
 ) -> dict[str, int]:
    """Ingest all sources listed in a sources.yaml config file.
    sources.yaml format:
        sources:
          - id: sonarr
            path: /opt/sonarr/config/logs/sonarr.0.txt
          - id: qbittorrent
            path: /opt/qbittorrent/config/data/logs/qbittorrent.log
    Missing paths are skipped with a warning so the cron keeps running
    when a service is temporarily down.  Entries without a ``path`` key, an
    empty ``sources:`` list and an empty config file are handled the same way
    instead of raising.
    Returns:
        Per-source entry counts, or an empty dict when no source file exists.
    """
    with open(sources_file, encoding="utf-8") as f:
        # safe_load returns None for an empty document.
        config = yaml.safe_load(f) or {}
    # ``sources:`` with no value parses to None, hence the ``or []``.
    entries = (config.get("sources") if isinstance(config, dict) else None) or []
    files: list[Path] = []
    source_id_map: dict[Path, str] = {}
    for src in entries:
        if not isinstance(src, dict) or not src.get("path"):
            logger.warning("Source entry has no path, skipping: %r", src)
            continue
        path = Path(src["path"])
        if not path.exists():
            logger.warning("Source %r not found, skipping: %s", src.get("id", "?"), path)
            continue
        files.append(path)
        if "id" in src:
            source_id_map[path] = src["id"]
    if not files:
        logger.warning("No source files found — check sources.yaml paths")
        return {}
    return _ingest_files(files, db_path, pattern_file, batch_size, source_id_map)
--- a/app/ingest/plaintext.py
+++ b/app/ingest/plaintext.py
@ -10,7 +10,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator
-from app.glean.base import (
+from app.ingest.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
@ -32,11 +32,10 @@ def _extract_ts(line: str) -> tuple[str, str]:
        if m:
            ts_raw = m.group("ts")
            try:
-                # Strip fractional seconds / TZ for strptime compat.
+                # Strip fractional seconds / TZ for strptime compat
                # Normalise ISO 8601 T-separator to space so strptime format matches.
                clean = re.sub(r"(\.\d+)?([Zz]|[+-]\d{2}:?\d{2})?$", "", ts_raw).strip()
                clean = clean.replace("T", " ")
-                dt = datetime.strptime(clean, fmt.replace("T", " "))
+                dt = datetime.strptime(clean, fmt)
                if dt.year == 1900:
                    dt = dt.replace(year=datetime.now().year)
                dt = dt.astimezone(timezone.utc)
--- a/app/ingest/plex.py
+++ b/app/ingest/plex.py
@ -12,7 +12,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator
-from app.glean.base import (
+from app.ingest.base import (
    SourceState, apply_patterns, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/qbittorrent.py
+++ b/app/ingest/qbittorrent.py
@ -18,7 +18,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator
-from app.glean.base import (
+from app.ingest.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/servarr.py
+++ b/app/ingest/servarr.py
@ -12,7 +12,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator
-from app.glean.base import (
+from app.ingest.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/syslog.py
+++ b/app/ingest/syslog.py
@ -14,7 +14,7 @@ import re
 from datetime import datetime, timezone
 from typing import Iterator
-from app.glean.base import (
+from app.ingest.base import (
    SourceState, apply_patterns, detect_severity, make_entry_id, now_iso,
 )
 from app.services.models import LogPattern, RetrievedEntry
--- a/app/ingest/tautulli.py
+++ b/app/ingest/tautulli.py
@ -5,7 +5,7 @@ Tautulli sends all template values as strings, so all fields are treated as str.
 """
 from __future__ import annotations
-from app.glean.base import (
+from app.ingest.base import (
    apply_patterns,
    epoch_float_to_iso,
    make_entry_id,
--- a/app/mcp_server.py
+++ b/app/mcp_server.py
@ -11,7 +11,7 @@ from __future__ import annotations
 import logging
 import os
-import sqlite3  # still used for the pre-index-check on SQLite backend
+import sqlite3
 import sys
 from pathlib import Path
@ -53,15 +53,15 @@ _index_ready = False
 def _ensure_index() -> None:
-    """Build FTS index on first use; skip if already present (SQLite only)."""
+    """Build FTS index on first use; skip if already present."""
    global _index_ready
    if _index_ready:
        return
    try:
-        raw = sqlite3.connect(str(DB_PATH), timeout=30.0)
+        conn = sqlite3.connect(str(DB_PATH))
-        count = raw.execute("SELECT COUNT(*) FROM log_fts").fetchone()[0]
+        count = conn.execute("SELECT COUNT(*) FROM log_fts").fetchone()[0]
-        raw.close()
+        conn.close()
        if count > 0:
            _index_ready = True
            logger.info("FTS index present (%d entries)", count)
@ -93,8 +93,8 @@ def search_logs(
               Example: '"connection refused" OR "connection lost"'
        severity: Filter by level — EMERGENCY, ALERT, CRITICAL, ERROR, WARN, NOTICE, INFO, DEBUG.
        source: Partial match on source_id. Format is 'corpus:host:service'.
-                Example: 'myserver:caddy' matches all Caddy entries from myserver.
+                Example: 'xanderland:caddy' matches all Caddy entries from xanderland.
-        pattern: Filter by named pattern tag applied at glean time.
+        pattern: Filter by named pattern tag applied at ingest time.
                 Known tags: auth_failure, connection_lost, oom, segfault, disk_full,
                 timeout, caddy_tls_error, caddy_config_error, caddy_auth_error,
                 caddy_upstream_error, service_restart, service_update,
@ -176,7 +176,7 @@ def list_log_sources() -> str:
    """
    sources = list_sources(DB_PATH)
    if not sources:
-        return "No log sources found. Has the corpus been gleaned? Run: python scripts/glean_corpus.py"
+        return "No log sources found. Has the corpus been ingested? Run: python scripts/ingest_corpus.py"
    lines = [f"Corpus: {DB_PATH}", f"Sources ({len(sources)} total):\n"]
    for s in sources:
@ -192,7 +192,7 @@ def list_log_sources() -> str:
 if __name__ == "__main__":
    if not DB_PATH.exists():
        logger.error("Database not found: %s", DB_PATH)
-        logger.error("Run: python scripts/glean_corpus.py <corpus_dir> <db_path>")
+        logger.error("Run: python scripts/ingest_corpus.py <corpus_dir> <db_path>")
        sys.exit(1)
    logger.info("Starting Turnstone MCP server (DB: %s)", DB_PATH)
    mcp.run()
--- a/app/rest.py
+++ b/app/rest.py
--- a/app/services/anomaly.py
+++ b/app/services/anomaly.py
@ -1,305 +0,0 @@
 """Anomaly scoring pipeline — batch-score log_entries with a HF classifier.
 Designed to run after each glean cycle (or standalone).  When no model is
 configured the scorer is a no-op and returns immediately, so it is always
 safe to wire into the glean pipeline.
 Model: any HuggingFace text-classification model.  The existing Hybrid-BERT
 label map (from diagnose/classifier.py) is reused when the model produces
 NORMAL/SECURITY_ANOMALY/… outputs; other models get a generic severity map.
 Scoring strategy
 ----------------
 - Query unscored rows in batches (WHERE anomaly_scored_at IS NULL)
 - Run each entry text through the HF pipeline
 - Write anomaly_score + anomaly_label + anomaly_scored_at back
 - INSERT high-confidence hits (score >= threshold) into detections table,
  skipping duplicates so the scorer is safe to re-run
 """
 from __future__ import annotations
 import logging
 import os
 import time
 import uuid
 from dataclasses import dataclass
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 from app.db import get_conn, resolve_tenant_id
 from app.db.dialect import q
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Label maps — reuse Hybrid-BERT vocabulary from diagnose/classifier.py
 # ---------------------------------------------------------------------------
 _HYBRID_BERT_SEVERITY: dict[str, str] = {
    "NORMAL": "INFO",
    "SECURITY_ANOMALY": "ERROR",
    "SYSTEM_FAILURE": "CRITICAL",
    "PERFORMANCE_ISSUE": "WARN",
    "NETWORK_ANOMALY": "WARN",
    "CONFIG_ERROR": "ERROR",
    "HARDWARE_ISSUE": "CRITICAL",
 }
 _GENERIC_SEVERITY: dict[str, str] = {
    "CRITICAL": "CRITICAL",
    "ERROR": "ERROR",
    "WARNING": "WARN",
    "WARN": "WARN",
    "INFO": "INFO",
    "DEBUG": "DEBUG",
 }
 _ANOMALOUS_LABELS: frozenset[str] = frozenset(
    {
        "SECURITY_ANOMALY",
        "SYSTEM_FAILURE",
        "PERFORMANCE_ISSUE",
        "NETWORK_ANOMALY",
        "CONFIG_ERROR",
        "HARDWARE_ISSUE",
        "CRITICAL",
        "ERROR",
    }
 )
 _DEFAULT_THRESHOLD = float(os.environ.get("TURNSTONE_ANOMALY_THRESHOLD", "0.75"))
 _DEFAULT_MODEL = os.environ.get("TURNSTONE_ANOMALY_MODEL", "")
 _DEFAULT_DEVICE = os.environ.get("TURNSTONE_ANOMALY_DEVICE", "cpu")
 _DEFAULT_BATCH = int(os.environ.get("TURNSTONE_ANOMALY_BATCH", "256"))
 # ---------------------------------------------------------------------------
 # ML singleton
 # ---------------------------------------------------------------------------
 _pipeline: Any | None = None
 def _get_pipeline(model_id: str, device: str) -> Any:
    """Return the cached HF text-classification pipeline, building it on first use.
    ``transformers`` is imported inside the function, so it is only needed once
    a pipeline is actually requested.
    NOTE(review): the cache is a single module global and is not keyed by
    ``model_id``/``device`` — once built, later calls get the same pipeline
    whatever arguments they pass.
    """
    global _pipeline  # noqa: PLW0603
    if _pipeline is not None:
        return _pipeline
    from transformers import pipeline as hf_pipeline  # type: ignore[import-untyped]
    _pipeline = hf_pipeline("text-classification", model=model_id, device=device)
    return _pipeline
 def reset_pipeline() -> None:
    """Reset the cached pipeline singleton (test helper).
    After this call the next ``_get_pipeline()`` builds a fresh pipeline.
    """
    global _pipeline  # noqa: PLW0603
    _pipeline = None
 # ---------------------------------------------------------------------------
 # Result types
 # ---------------------------------------------------------------------------
@dataclass
 class ScoringResult:
    """Outcome summary returned by ``score_unscored``."""
    # Number of log entries that received a score during the run.
    scored: int = 0
    # Number of detection rows inserted during the run.
    detections: int = 0
    # True when scoring was not attempted because no model id was configured.
    skipped: bool = False
    # Text of the exception when model loading or inference failed, else None.
    error: str | None = None
 # ---------------------------------------------------------------------------
 # Internal helpers
 # ---------------------------------------------------------------------------
 def _map_label(raw_label: str, score: float) -> tuple[str, str]:
    """Normalise a model output label and look up its severity.
    The label is upper-cased, then resolved against the Hybrid-BERT map first
    and the generic map second; labels found in neither get ``WARN``.
    ``score`` is accepted but not used.
    Returns:
        ``(normalised_label, severity)``.
    """
    key = raw_label.upper()
    severity = _HYBRID_BERT_SEVERITY.get(key)
    if severity is None:
        severity = _GENERIC_SEVERITY.get(key, "WARN")
    return key, severity
 def _fetch_unscored(conn: Any, tenant_id: str, limit: int) -> list[dict]:
    """Fetch up to *limit* log entries that have no anomaly score yet.
    Only rows owned by *tenant_id* or carrying an empty tenant id are
    considered, most recently ingested first.  Each row is returned as a plain
    dict with ``id``, ``source_id``, ``text``, ``timestamp_iso`` and ``severity``.
    """
    query = q("""
        SELECT id, source_id, text, timestamp_iso, severity
        FROM log_entries
        WHERE anomaly_scored_at IS NULL
          AND (tenant_id = ? OR tenant_id = '')
        ORDER BY ingest_time DESC
        LIMIT ?
        """)
    cursor = conn.execute(query, (tenant_id, limit))
    return list(map(dict, cursor.fetchall()))
 def _write_scores(
    conn: Any,
    rows: list[dict],
    scored_at: str,
 ) -> None:
    """Store the anomaly score and label of every row in *rows*.
    Each row must provide ``id``, ``anomaly_score`` and ``anomaly_label``; all
    rows are stamped with the same *scored_at* value.  Nothing is committed here.
    """
    statement = q("UPDATE log_entries SET anomaly_score = ?, anomaly_label = ?, anomaly_scored_at = ? WHERE id = ?")
    bindings = []
    for row in rows:
        bindings.append((row["anomaly_score"], row["anomaly_label"], scored_at, row["id"]))
    conn.executemany(statement, bindings)
 def _insert_detections(conn: Any, rows: list[dict], tenant_id: str, detected_at: str) -> int:
    """Insert one ``detections`` row per entry in *rows*; return how many succeeded.
    Inserts are best-effort: a row that fails (for instance because a detection
    for that entry already exists) is skipped so the scorer can be re-run, but
    the failure is logged at debug level instead of vanishing.  Entry text is
    truncated to 2000 characters.  Nothing is committed here.
    NOTE(review): the handler stays broad because the concrete driver exception
    types behind ``conn`` are not visible from this module — confirm and narrow.
    """
    # Same statement for every row, so build it once.
    statement = q("""
                INSERT INTO detections
                    (id, tenant_id, entry_id, source_id, anomaly_label, anomaly_score,
                     severity, text, timestamp_iso, detected_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """)
    inserted = 0
    for r in rows:
        try:
            conn.execute(
                statement,
                (
                    str(uuid.uuid4()),
                    tenant_id,
                    r["id"],
                    r["source_id"],
                    r["anomaly_label"],
                    r["anomaly_score"],
                    r["severity"],
                    r["text"][:2000],
                    r.get("timestamp_iso"),
                    detected_at,
                ),
            )
            inserted += 1
        except Exception as exc:  # noqa: BLE001
            # duplicate entry_id or constraint violation — skip, but leave a trace
            logger.debug("Skipping detection for entry %r: %s", r.get("id"), exc)
    return inserted
 # ---------------------------------------------------------------------------
 # Public API
 # ---------------------------------------------------------------------------
 def score_unscored(
    db_path: Path,
    model_id: str = _DEFAULT_MODEL,
    device: str = _DEFAULT_DEVICE,
    batch_size: int = _DEFAULT_BATCH,
    threshold: float = _DEFAULT_THRESHOLD,
 ) -> ScoringResult:
    """Score all unscored log_entries in batches.
    Returns immediately (skipped=True) when model_id is empty — allows
    unconditional wiring without requiring the model to be configured.
    Args:
        db_path: Database passed to ``get_conn`` for every read and write.
        model_id: Model identifier handed to ``_get_pipeline``.
        device: Device string handed to ``_get_pipeline``.
        batch_size: Maximum number of rows fetched and scored per iteration.
        threshold: Minimum prediction score for an anomalous label to be
            recorded as a detection.
    Returns:
        A ``ScoringResult`` with the totals.  Model-load and inference failures
        are reported through its ``error`` field rather than raised.
    Raises:
        Exception: a failed score write is re-raised unless its message contains
            "database is locked", in which case up to three retries are made
            first.
    """
    if not model_id:
        return ScoringResult(skipped=True)
    try:
        pipe = _get_pipeline(model_id, device)
    except Exception as exc:
        logger.error("Failed to load anomaly model %r: %s", model_id, exc)
        return ScoringResult(error=str(exc))
    tenant_id = resolve_tenant_id()
    total_scored = 0
    total_detections = 0
    while True:
        # The read connection is released before inference starts.
        with get_conn(db_path) as conn:
            batch = _fetch_unscored(conn, tenant_id, batch_size)
        if not batch:
            break
        # Texts are cut to 512 characters before being handed to the model.
        texts = [r["text"][:512] for r in batch]
        try:
            predictions = pipe(texts, truncation=True, max_length=512)
        except Exception as exc:
            logger.error("Inference error on batch of %d: %s", len(batch), exc)
            return ScoringResult(scored=total_scored, detections=total_detections, error=str(exc))
        scored_at = datetime.now(tz=timezone.utc).isoformat()
        scored_rows: list[dict] = []
        detection_rows: list[dict] = []
        # NOTE(review): zip() stops at the shorter input; this assumes the
        # pipeline returns exactly one prediction per text — confirm.
        for row, pred in zip(batch, predictions):
            label, severity = _map_label(pred["label"], pred["score"])
            # "severity" here replaces the severity read from log_entries with
            # the one derived from the model label.
            enriched = {**row, "anomaly_score": pred["score"], "anomaly_label": label, "severity": severity}
            scored_rows.append(enriched)
            if label in _ANOMALOUS_LABELS and pred["score"] >= threshold:
                detection_rows.append(enriched)
        # Up to four write attempts; only "database is locked" errors are
        # retried (after a 10 s pause), anything else propagates immediately.
        for _attempt in range(4):
            try:
                with get_conn(db_path) as conn:
                    _write_scores(conn, scored_rows, scored_at)
                    det_count = _insert_detections(conn, detection_rows, tenant_id, scored_at)
                    conn.commit()
                break
            except Exception as exc:
                if "database is locked" in str(exc).lower() and _attempt < 3:
                    logger.warning("DB locked, retrying write in 10s (attempt %d/4)", _attempt + 1)
                    time.sleep(10)
                else:
                    raise
        total_scored += len(scored_rows)
        total_detections += det_count
        logger.info(
            "Scored %d entries, %d detections (threshold=%.2f)",
            len(scored_rows), det_count, threshold,
        )
        # A short batch means the unscored backlog is exhausted.
        if len(batch) < batch_size:
            break
    return ScoringResult(scored=total_scored, detections=total_detections)
 def list_detections(
    db_path: Path,
    limit: int = 100,
    unacked_only: bool = False,
    label: str | None = None,
    scorer: str | None = None,
 ) -> list[dict]:
    """Return up to *limit* detections, newest ``detected_at`` first.
    Results are always restricted to the current tenant (or rows with an empty
    tenant id).  Optional filters: unacknowledged rows only, an anomaly label
    (compared upper-cased) and a scorer name (compared lower-cased).
    """
    tenant_id = resolve_tenant_id()
    # Each filter is a (SQL fragment, bound values) pair, kept in order.
    filters: list[tuple[str, list[Any]]] = [
        ("(tenant_id = ? OR tenant_id = '')", [tenant_id]),
    ]
    if unacked_only:
        filters.append(("acknowledged = 0", []))
    if label:
        filters.append((q("anomaly_label = ?"), [label.upper()]))
    if scorer:
        filters.append((q("scorer = ?"), [scorer.lower()]))
    where = " AND ".join(fragment for fragment, _ in filters)
    bound = [value for _, values in filters for value in values]
    with get_conn(db_path) as conn:
        sql = q(f"SELECT * FROM detections WHERE {where} ORDER BY detected_at DESC LIMIT ?")  # noqa: S608
        found = conn.execute(sql, (*bound, limit)).fetchall()
    return list(map(dict, found))
 def acknowledge_detection(db_path: Path, detection_id: str, notes: str = "") -> bool:
    """Flag one detection as acknowledged and store *notes* on it.
    Only a detection owned by the current tenant (or with an empty tenant id)
    is updated; the acknowledgement time is the current UTC time.
    Returns:
        True when a row was updated, False when nothing matched.
    """
    tenant_id = resolve_tenant_id()
    stamp = datetime.now(tz=timezone.utc).isoformat()
    statement = q("""
            UPDATE detections
            SET acknowledged = 1, acknowledged_at = ?, notes = ?
            WHERE id = ? AND (tenant_id = ? OR tenant_id = '')
            """)
    with get_conn(db_path) as conn:
        cursor = conn.execute(statement, (stamp, notes, detection_id, tenant_id))
        conn.commit()
        changed = cursor.rowcount > 0
    return changed
--- a/app/services/blocklist.py
+++ b/app/services/blocklist.py
@ -4,12 +4,10 @@ from __future__ import annotations
 import dataclasses
 import json
 import re
 import sqlite3
 import uuid
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 from app.db import get_conn, resolve_tenant_id
 import yaml
@ -93,26 +91,26 @@ def _now_iso() -> str:
    return datetime.now(timezone.utc).isoformat()
-def _row_to_candidate(row: Any) -> BlocklistCandidate:
+def _row_to_candidate(row: tuple) -> BlocklistCandidate:
    return BlocklistCandidate(
-        id=row["id"],
+        id=row[0],
-        domain_or_ip=row["domain_or_ip"],
+        domain_or_ip=row[1],
-        source_device_ip=row["source_device_ip"],
+        source_device_ip=row[2],
-        source_device_name=row["source_device_name"],
+        source_device_name=row[3],
-        first_seen=row["first_seen"],
+        first_seen=row[4],
-        last_seen=row["last_seen"],
+        last_seen=row[5],
-        hit_count=row["hit_count"],
+        hit_count=row[6],
-        status=row["status"],
+        status=row[7],
-        pushed_at=row["pushed_at"],
+        pushed_at=row[8],
-        log_evidence=json.loads(row["log_evidence"] or "[]"),
+        log_evidence=json.loads(row[9] or "[]"),
-        matched_rule=row["matched_rule"],
+        matched_rule=row[10],
-        llm_score=row["llm_score"],
+        llm_score=row[11],
-        llm_reason=row["llm_reason"],
+        llm_reason=row[12],
    )
 def _upsert_candidate(
-    conn: Any,
+    conn: sqlite3.Connection,
    domain_or_ip: str,
    source_device_ip: str | None,
    source_device_name: str | None,
@ -121,29 +119,26 @@ def _upsert_candidate(
    now: str,
 ) -> bool:
    """Insert or update a candidate. Returns True if a new row was created."""
    tid = resolve_tenant_id()
    row = conn.execute(
        "SELECT id, hit_count, log_evidence FROM blocklist_candidates "
-        "WHERE domain_or_ip = ? AND source_device_ip IS ? AND (tenant_id = ? OR tenant_id = '')",
+        "WHERE domain_or_ip = ? AND source_device_ip IS ?",
-        (domain_or_ip, source_device_ip, tid),
+        (domain_or_ip, source_device_ip),
    ).fetchone()
    if row is None:
        conn.execute(
            """INSERT INTO blocklist_candidates
-               (id, tenant_id, domain_or_ip, source_device_ip, source_device_name,
+               (id, domain_or_ip, source_device_ip, source_device_name,
                first_seen, last_seen, hit_count, status, pushed_at, log_evidence, matched_rule)
-               VALUES (?, ?, ?, ?, ?, ?, ?, 1, 'pending', NULL, ?, ?)""",
+               VALUES (?, ?, ?, ?, ?, ?, 1, 'pending', NULL, ?, ?)""",
            (
-                str(uuid.uuid4()), tid, domain_or_ip, source_device_ip, source_device_name,
+                str(uuid.uuid4()), domain_or_ip, source_device_ip, source_device_name,
                now, now, json.dumps([entry_id]), matched_rule,
            ),
        )
        return True
-    existing_id = row["id"]
+    existing_id, hit_count, existing_evidence = row
    hit_count = row["hit_count"]
    existing_evidence = row["log_evidence"]
    evidence = json.loads(existing_evidence or "[]")
    if entry_id not in evidence:
        evidence.append(entry_id)
@ -177,16 +172,14 @@ def run_scan(
    now = _now_iso()
    count = 0
-    tid = resolve_tenant_id()
+    conn = sqlite3.connect(str(db_path))
-    with get_conn(db_path) as conn:
+    try:
        rows = conn.execute(
-            f"SELECT id, text FROM log_entries WHERE source_id IN ({placeholders}) AND (tenant_id = ? OR tenant_id = '')",  # noqa: S608
+            f"SELECT id, text FROM log_entries WHERE source_id IN ({placeholders})",
-            (*router_source_ids, tid),
+            router_source_ids,
        ).fetchall()
-        for row in rows:
+        for entry_id, text in rows:
            entry_id, text = row["id"], row["text"]
            # rest of loop body follows unchanged
            src_ip: str | None = None
            dst: str | None = None
@ -211,6 +204,8 @@ def run_scan(
            count += 1
        conn.commit()
    finally:
        conn.close()
    return count
@ -231,27 +226,26 @@ def list_candidates(
    status: str | None = None,
    device_ip: str | None = None,
 ) -> list[BlocklistCandidate]:
-    tid = resolve_tenant_id()
+    conn = sqlite3.connect(str(db_path))
-    conditions = ["(tenant_id = ? OR tenant_id = '')"]
+    try:
-    params: list = [tid]
+        query = f"{_CANDIDATE_SELECT} WHERE 1=1"
-    if status and status != "all":
+        params: list = []
-        conditions.append("status = ?")
+        if status and status != "all":
-        params.append(status)
+            query += " AND status = ?"
-    if device_ip:
+            params.append(status)
-        conditions.append("source_device_ip = ?")
+        if device_ip:
-        params.append(device_ip)
+            query += " AND source_device_ip = ?"
-    where = " AND ".join(conditions)
+            params.append(device_ip)
-    with get_conn(db_path) as conn:
+        query += " ORDER BY last_seen DESC"
-        rows = conn.execute(
+        rows = conn.execute(query, params).fetchall()
-            f"{_CANDIDATE_SELECT} WHERE {where} ORDER BY last_seen DESC",  # noqa: S608
+    finally:
-            params,
+        conn.close()
        ).fetchall()
    return [_row_to_candidate(r) for r in rows]
-def _get_candidate(conn: Any, candidate_id: str) -> BlocklistCandidate:
+def _get_candidate(conn: sqlite3.Connection, candidate_id: str) -> BlocklistCandidate:
    row = conn.execute(
-        f"{_CANDIDATE_SELECT} WHERE id=?",  # noqa: S608
+        f"{_CANDIDATE_SELECT} WHERE id=?",
        (candidate_id,),
    ).fetchone()
    if row is None:
@ -261,31 +255,43 @@ def _get_candidate(conn: Any, candidate_id: str) -> BlocklistCandidate:
 def get_candidate(db_path: Path, candidate_id: str) -> BlocklistCandidate:
    """Fetch a single candidate by ID. Raises KeyError if not found."""
-    with get_conn(db_path) as conn:
+    conn = sqlite3.connect(str(db_path))
    try:
        return _get_candidate(conn, candidate_id)
    finally:
        conn.close()
 def update_candidate_status(db_path: Path, candidate_id: str, new_status: str) -> BlocklistCandidate:
    if new_status not in _VALID_STATUSES:
        raise ValueError(f"Invalid status {new_status!r}. Must be one of {_VALID_STATUSES}")
-    with get_conn(db_path) as conn:
+    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("UPDATE blocklist_candidates SET status=? WHERE id=?", (new_status, candidate_id))
        conn.commit()
        return _get_candidate(conn, candidate_id)
    finally:
        conn.close()
 def mark_pushed(db_path: Path, candidate_id: str) -> BlocklistCandidate:
-    with get_conn(db_path) as conn:
+    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute(
            "UPDATE blocklist_candidates SET status='pushed', pushed_at=? WHERE id=?",
            (_now_iso(), candidate_id),
        )
        conn.commit()
        return _get_candidate(conn, candidate_id)
    finally:
        conn.close()
 def mark_unblocked(db_path: Path, candidate_id: str) -> BlocklistCandidate:
-    with get_conn(db_path) as conn:
+    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute("UPDATE blocklist_candidates SET status='unblocked' WHERE id=?", (candidate_id,))
        conn.commit()
        return _get_candidate(conn, candidate_id)
    finally:
        conn.close()
--- a/app/services/cybersec.py
+++ b/app/services/cybersec.py
@ -1,241 +0,0 @@
 """Cybersecurity-focused scoring pipeline using zero-shot classification.
 Runs a second-pass analysis on entries that were already flagged by the
 anomaly scorer or that have pattern matches.  Uses a zero-shot classification
 model (DeBERTa-v3-base-mnli is cached locally) so no fine-tuning is needed.
 The scorer writes ml_score / ml_label / ml_scored_at to log_entries and
 inserts high-confidence non-normal hits into the detections table tagged
 with scorer='cybersec'.
 Env vars
 --------
 TURNSTONE_CYBERSEC_MODEL  — HF model id for zero-shot classification.
                            Recommended: MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli
                            (already cached from the diagnose pipeline).
                            Set to empty string to disable (safe default).
 TURNSTONE_CYBERSEC_DEVICE — 'cpu' (default) or 'cuda'
 TURNSTONE_CYBERSEC_THRESHOLD — float confidence floor for detection insertion (default 0.60)
 """
 from __future__ import annotations
 import logging
 import uuid
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any
 from app.db import get_conn, resolve_tenant_id
 from app.db.dialect import q
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Candidate labels — cybersec vocabulary for zero-shot inference
 # ---------------------------------------------------------------------------
 # Severity recorded on a detection for each zero-shot label. The insertion
 # order of this table is also the order of the candidate-label list below.
 _LABEL_SEVERITY: dict[str, str] = {
    "authentication failure or brute force attack": "ERROR",
    "privilege escalation or unauthorized access":  "CRITICAL",
    "network intrusion or port scan":               "ERROR",
    "malware or suspicious process activity":       "CRITICAL",
    "data exfiltration or unusual outbound traffic":"CRITICAL",
    "normal system operation":                      "INFO",
 }
 # The single label that stands for "nothing suspicious".
 _NORMAL_LABEL = "normal system operation"
 # Candidate labels passed to the zero-shot classifier (same strings, same order).
 CYBERSEC_LABELS: list[str] = list(_LABEL_SEVERITY)
 # ---------------------------------------------------------------------------
 # Pipeline singleton
 # ---------------------------------------------------------------------------
 _pipeline: Any = None
 def _get_pipeline(model_id: str, device: str) -> Any:
    """Return the module-wide zero-shot pipeline, building it on first use.
    The pipeline is created once and cached in ``_pipeline``; later calls get
    the cached object back even if they pass a different model_id or device.
    ``device == "cuda"`` selects device index 0, anything else selects -1.
    """
    global _pipeline  # noqa: PLW0603
    if _pipeline is not None:
        return _pipeline
    # Imported here so transformers is only required once a pipeline is built.
    from transformers import pipeline as build_pipeline  # type: ignore[import-untyped]
    logger.info("loading cybersec zero-shot pipeline: %s on %s", model_id, device)
    device_index = 0 if device == "cuda" else -1
    _pipeline = build_pipeline(
        "zero-shot-classification",
        model=model_id,
        device=device_index,
    )
    logger.info("cybersec pipeline ready")
    return _pipeline
 def reset_pipeline() -> None:
    """Drop the cached pipeline so the next lookup rebuilds it (test helper)."""
    global _pipeline  # noqa: PLW0603
    # Rebinding to None is all that is needed: the loader rebuilds on None.
    _pipeline = None
 # ---------------------------------------------------------------------------
 # Result type
 # ---------------------------------------------------------------------------
@dataclass
 class CybersecResult:
    scored: int = 0
    detections: int = 0
    skipped: bool = False
    error: str | None = None
 # ---------------------------------------------------------------------------
 # Core scoring function
 # ---------------------------------------------------------------------------
 def score_security_entries(
    db_path: Path,
    model_id: str,
    device: str = "cpu",
    batch_size: int = 32,
    threshold: float = 0.60,
 ) -> CybersecResult:
    """Run the zero-shot cybersec classifier over flagged, not-yet-scored entries.
    Candidates are log_entries rows visible to the current tenant whose
    ml_scored_at is NULL and that either carry a non-NORMAL anomaly_label or a
    non-empty matched_patterns value. At most ``batch_size * 10`` rows are
    fetched per call and classified ``batch_size`` texts at a time.
    Every classified row gets ml_score / ml_label / ml_scored_at written back.
    When the top label is not the normal label and its score is at least
    ``threshold``, a row is also inserted into detections with
    scorer='cybersec'.
    Args:
        db_path: Database location handed to ``get_conn``.
        model_id: Hugging Face model id; an empty value disables scoring.
        device: ``"cuda"`` or anything else for CPU (see ``_get_pipeline``).
        batch_size: Number of texts per inference call.
        threshold: Minimum top-label score required to insert a detection.
    Returns:
        A CybersecResult. ``skipped`` is True when model_id is empty or no
        candidate rows exist. ``error`` is set when loading the pipeline or
        the scoring loop raised; the counters then hold what was accumulated
        up to the failure.
    """
    if not model_id:
        return CybersecResult(skipped=True)
    tenant_id = resolve_tenant_id()
    try:
        pipe = _get_pipeline(model_id, device)
    except Exception as exc:
        logger.error("failed to load cybersec pipeline: %s", exc)
        return CybersecResult(error=str(exc))
    total_scored = 0
    total_detections = 0
    try:
        with get_conn(db_path) as conn:
            # Only score entries that are worth a second look:
            # anomaly-flagged (non-normal) OR have at least one pattern match.
            rows = conn.execute(
                q("""
                SELECT id, source_id, text, timestamp_iso
                FROM log_entries
                WHERE (tenant_id = ? OR tenant_id = '')
                  AND ml_scored_at IS NULL
                  AND (
                      (anomaly_label IS NOT NULL AND anomaly_label != 'NORMAL')
                      OR (matched_patterns IS NOT NULL AND matched_patterns != '[]' AND matched_patterns != '')
                  )
                LIMIT ?
                """),
                (tenant_id, batch_size * 10),
            ).fetchall()
        if not rows:
            return CybersecResult(skipped=True)
        # Classify in chunks so a large backlog is never sent in one call.
        for i in range(0, len(rows), batch_size):
            chunk = rows[i : i + batch_size]
            texts = [r["text"] for r in chunk]
            try:
                results = pipe(texts, candidate_labels=CYBERSEC_LABELS, multi_label=False)
            except Exception as exc:
                # A failed chunk stays unscored (ml_scored_at NULL) and is
                # picked up again by a later run.
                logger.warning("zero-shot inference error on chunk %d: %s", i, exc)
                continue
            # NOTE(review): some transformers releases appear to return a bare
            # dict for a one-element input; zip() would then walk its keys.
            # Normalise to a list — confirm against the installed version.
            if isinstance(results, dict):
                results = [results]
            now = datetime.now(tz=timezone.utc).isoformat()
            with get_conn(db_path) as conn:
                for row, result in zip(chunk, results):
                    # The code relies on labels/scores being sorted best-first.
                    top_label: str = result["labels"][0]
                    top_score: float = result["scores"][0]
                    conn.execute(
                        q("""
                        UPDATE log_entries
                        SET ml_score = ?, ml_label = ?, ml_scored_at = ?
                        WHERE id = ? AND (tenant_id = ? OR tenant_id = '')
                        """),
                        (top_score, top_label, now, row["id"], tenant_id),
                    )
                    total_scored += 1
                    if top_score >= threshold and top_label != _NORMAL_LABEL:
                        severity = _LABEL_SEVERITY.get(top_label, "WARN")
                        try:
                            conn.execute(
                                q("""
                                INSERT INTO detections
                                  (id, tenant_id, entry_id, source_id, anomaly_label,
                                   anomaly_score, severity, text, timestamp_iso,
                                   detected_at, scorer)
                                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'cybersec')
                                """),
                                (
                                    str(uuid.uuid4()),
                                    tenant_id,
                                    row["id"],
                                    row["source_id"],
                                    top_label,
                                    top_score,
                                    severity,
                                    row["text"],
                                    row["timestamp_iso"],
                                    now,
                                ),
                            )
                            total_detections += 1
                        except Exception as exc:
                            # Still best-effort (the entry may already have a
                            # detection), but leave a trace so genuine insert
                            # failures are not silently lost.
                            logger.debug(
                                "cybersec detection insert skipped for entry %s: %s",
                                row["id"], exc,
                            )
                conn.commit()
    except Exception as exc:
        logger.error("cybersec scoring failed: %s", exc, exc_info=True)
        return CybersecResult(scored=total_scored, detections=total_detections, error=str(exc))
    return CybersecResult(scored=total_scored, detections=total_detections)
 # ---------------------------------------------------------------------------
 # Query helpers (used by REST layer)
 # ---------------------------------------------------------------------------
 def list_cybersec_detections(
    db_path: Path,
    limit: int = 100,
    unacked_only: bool = False,
    label: str | None = None,
 ) -> list[dict]:
    """List detections written by the cybersec scorer, newest first.
    Only rows with ``scorer = 'cybersec'`` that belong to the current tenant
    (or have an empty tenant_id) are returned.
    Args:
        db_path: Database location handed to ``get_conn``.
        limit: Maximum number of rows returned.
        unacked_only: When true, keep only rows with ``acknowledged = 0``.
        label: Optional anomaly_label filter, compared exactly as given.
    Returns:
        One plain dict per detection row, ordered by detected_at descending.
    """
    filters = ["(tenant_id = ? OR tenant_id = '')", "scorer = 'cybersec'"]
    args: list[Any] = [resolve_tenant_id()]
    if unacked_only:
        filters.append("acknowledged = 0")
    if label:
        filters.append(q("anomaly_label = ?"))
        args.append(label)
    where = " AND ".join(filters)
    # Only fixed clause text is interpolated; every value travels as a parameter.
    sql = q(f"SELECT * FROM detections WHERE {where} ORDER BY detected_at DESC LIMIT ?")  # noqa: S608
    with get_conn(db_path) as conn:
        fetched = conn.execute(sql, (*args, limit)).fetchall()
    return [dict(item) for item in fetched]
--- a/app/services/diagnose/legacy.py
+++ b/app/services/diagnose/legacy.py
@ -1,5 +1,4 @@
 """Frictionless diagnose service — NL time extraction + layered log search."""
 from __future__ import annotations
 import asyncio
@ -19,7 +18,6 @@ logger = logging.getLogger(__name__)
 try:
    from dateparser.search import search_dates as _search_dates  # type: ignore[import]
    _HAS_DATEPARSER = True
 except ImportError:
    _search_dates = None  # type: ignore[assignment]
@ -56,7 +54,7 @@ def parse_time_window(query: str) -> tuple[str | None, str | None, str]:
    m = _RELATIVE_RE.search(query)
    if m:
        since, until = _relative_window(m)
-        keywords = re.sub(r"\s{2,}", " ", query[: m.start()] + query[m.end() :]).strip()
+        keywords = re.sub(r"\s{2,}", " ", query[:m.start()] + query[m.end():]).strip()
        return since, until, keywords or query
    if _HAS_DATEPARSER and _search_dates is not None:
@ -70,25 +68,17 @@ def parse_time_window(query: str) -> tuple[str | None, str | None, str]:
            results = _search_dates(
                query,
                languages=["en"],
-                settings={
+                settings={"PREFER_DATES_FROM": "past", "TIMEZONE": tz_str, "RETURN_AS_TIMEZONE_AWARE": True},
                    "PREFER_DATES_FROM": "past",
                    "TIMEZONE": tz_str,
                    "RETURN_AS_TIMEZONE_AWARE": True,
                },
            )
        except Exception:
-            logger.warning(
+            logger.warning("dateparser failed on query %r — falling back to 60-min window", query)
                "dateparser failed on query %r — falling back to 60-min window", query
            )
            results = None
        if results:
            phrase, dt = results[0]
            if dt.tzinfo is None:
                dt = dt.replace(tzinfo=timezone.utc)
            else:
-                dt = dt.astimezone(
+                dt = dt.astimezone(timezone.utc)  # normalise to UTC for SQLite string compare
                    timezone.utc
                )  # normalise to UTC for SQLite string compare
            since = (dt - timedelta(minutes=30)).isoformat()
            until = (dt + timedelta(minutes=30)).isoformat()
            keywords = re.sub(r"\s{2,}", " ", query.replace(phrase, " ").strip())
@ -117,23 +107,8 @@ def diagnose(
    else:
        keywords = query
-    keyword_hits = search(
+    keyword_hits = search(db_path, query=keywords, since=since, until=until, source_filter=source_filter, limit=150, or_mode=True)
-        db_path,
+    window_hits = entries_in_window(db_path, since=since, until=until, source_filter=source_filter, limit=50, per_source_cap=15)
        query=keywords,
        since=since,
        until=until,
        source_filter=source_filter,
        limit=150,
        or_mode=True,
    )
    window_hits = entries_in_window(
        db_path,
        since=since,
        until=until,
        source_filter=source_filter,
        limit=50,
        per_source_cap=15,
    )
    seen: set[str] = set()
    merged: list[SearchResult] = []
@ -142,9 +117,7 @@ def diagnose(
            seen.add(r.entry_id)
            merged.append(r)
-    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
+    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[:200]
        :200
    ]
    by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
    by_source: dict[str, int] = {}
@ -156,9 +129,7 @@ def diagnose(
    reasoning: str | None = None
    if llm_url and llm_model:
-        reasoning = summarize(
+        reasoning = summarize(query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key)
            query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key
        )
    return {
        "summary": {
@ -206,9 +177,7 @@ async def diagnose_stream(
        yield {"type": "status", "message": "Parsing time window…"}
        time_detected = since is not None and until is not None
        if not time_detected:
-            parsed_since, parsed_until, keywords = await asyncio.to_thread(
+            parsed_since, parsed_until, keywords = await asyncio.to_thread(parse_time_window, query)
                parse_time_window, query
            )
            since = since or parsed_since
            until = until or parsed_until
            time_detected = keywords != query
@ -228,34 +197,23 @@ async def diagnose_stream(
        keyword_hits: list[SearchResult] = []
        window_hits = await asyncio.to_thread(
            lambda: entries_in_window(
-                db_path,
+                db_path, since, until,
-                since,
+                source_filter=source_filter, limit=200,
                until,
                source_filter=source_filter,
                limit=200,
            )
        )
    else:
        keyword_hits, window_hits = await asyncio.gather(
            asyncio.to_thread(
                lambda: search(
-                    db_path,
+                    db_path, keywords,
-                    keywords,
+                    source_filter=source_filter, since=since, until=until,
-                    source_filter=source_filter,
+                    limit=150, or_mode=True,
                    since=since,
                    until=until,
                    limit=150,
                    or_mode=True,
                )
            ),
            asyncio.to_thread(
                lambda: entries_in_window(
-                    db_path,
+                    db_path, since, until,
-                    since,
+                    source_filter=source_filter, limit=50, per_source_cap=15,
                    until,
                    source_filter=source_filter,
                    limit=50,
                    per_source_cap=15,
                )
            ),
        )
@ -267,9 +225,7 @@ async def diagnose_stream(
            seen.add(r.entry_id)
            merged.append(r)
-    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
+    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[:200]
        :200
    ]
    by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
    by_source: dict[str, int] = {}
@ -295,14 +251,7 @@ async def diagnose_stream(
    if llm_url and llm_model and combined:
        yield {"type": "status", "message": "Analyzing with LLM…"}
        reasoning = await asyncio.to_thread(
-            lambda: summarize(
+            lambda: summarize(query, combined, llm_url, llm_model, llm_api_key, context_block=context_block)
                query,
                combined,
                llm_url,
                llm_model,
                llm_api_key,
                context_block=context_block,
            )
        )
        if reasoning:
            yield {"type": "reasoning", "text": reasoning}
--- a/app/services/diagnose/__init__.py
+++ b/app/services/diagnose/__init__.py
@ -1,377 +0,0 @@
 """Frictionless diagnose service — NL time extraction + layered log search.
 This module is the public interface for the diagnose package.
 Full implementation lives here so that patch("app.services.diagnose._HAS_DATEPARSER")
 and patch("app.services.diagnose._search_dates") continue to target the correct
 namespace, preserving backward compatibility with existing tests.
 The verbatim original is preserved in legacy.py for reference.
 """
 from __future__ import annotations
 import asyncio
 import dataclasses
 import logging
 import os
 import re
 from collections.abc import AsyncGenerator
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Any
 from app.context.retriever import retrieve_context, format_context_block
 from app.services.llm import summarize
 from app.services.search import SearchResult, entries_in_window, search
 from app.services.diagnose.pipeline import run_pipeline
 logger = logging.getLogger(__name__)
 # dateparser is optional: when the import fails, _search_dates is None and
 # _HAS_DATEPARSER is False so callers can check before using it.
 try:
    from dateparser.search import search_dates as _search_dates  # type: ignore[import]
    _HAS_DATEPARSER = True
 except ImportError:
    _search_dates = None  # type: ignore[assignment]
    _HAS_DATEPARSER = False
 # Matches relative phrases such as "last hour", "past 30 minutes" or
 # "last couple of days". Named groups: n (explicit count), approx (fuzzy
 # quantifier), unit (minute/hour/day/week, optional plural "s").
 _RELATIVE_RE = re.compile(
    r"\b(?:last|past)\s+(?:(?P<n>\d+)|(?P<approx>a\s+few|few|couple(?:\s+of)?|several))?\s*(?P<unit>minute|hour|day|week)s?\b",
    re.IGNORECASE,
 )
 # Length of each supported unit, expressed in minutes.
 _RELATIVE_UNITS = {"minute": 1, "hour": 60, "day": 1440, "week": 10080}
 # Fuzzy quantifiers map to a reasonable span so "last few hours" → 3h window
 _APPROX_N = 3
 def _relative_window(match: re.Match) -> tuple[str, str]:
    """Turn a ``_RELATIVE_RE`` match into a (since_iso, until_iso) pair.
    The count is the explicit number when present, ``_APPROX_N`` for a fuzzy
    quantifier, and 1 when neither was given ("last hour").
    """
    count_text = match.group("n")
    unit_name = match.group("unit").lower()
    if count_text:
        count = int(count_text)
    elif match.group("approx"):
        count = _APPROX_N
    else:
        count = 1
    span_minutes = count * _RELATIVE_UNITS[unit_name]
    since_iso = _last_n_minutes(span_minutes)
    until_iso = _now_iso()
    return since_iso, until_iso
 def parse_time_window(query: str) -> tuple[str | None, str | None, str]:
    """Pull a time window out of a natural-language query.
    Resolution order:
      1. A relative phrase matched by ``_RELATIVE_RE`` ("last 2 hours").
      2. The first date/time phrase found by dateparser, when it is installed;
         the window is 30 minutes either side of that instant, in UTC.
      3. Otherwise the last 60 minutes.
    Returns:
        (since_iso, until_iso, keywords). ``keywords`` is the query with the
        matched time phrase removed, or the full query when nothing remains
        or nothing matched.
    """
    # Relative phrases are handled first because dateparser reads them as
    # absolute times.
    relative = _RELATIVE_RE.search(query)
    if relative is not None:
        since, until = _relative_window(relative)
        remainder = query[: relative.start()] + query[relative.end() :]
        keywords = re.sub(r"\s{2,}", " ", remainder).strip()
        return since, until, keywords or query
    found = None
    if _HAS_DATEPARSER and _search_dates is not None:
        # Pass the local UTC offset so a bare "3:35 am" is read as local time;
        # PREFER_DATES_FROM=past picks the most recent past occurrence.
        utc_offset = datetime.now().astimezone().utcoffset()
        offset_seconds = utc_offset.total_seconds() if utc_offset else 0
        # NOTE(review): int() keeps whole hours only, so an offset such as
        # +05:30 is passed on as UTC+5 — confirm whether that is acceptable.
        offset_h = int(offset_seconds / 3600)
        sign = "+" if offset_h >= 0 else ""
        tz_str = f"UTC{sign}{offset_h}"
        parser_settings = {
            "PREFER_DATES_FROM": "past",
            "TIMEZONE": tz_str,
            "RETURN_AS_TIMEZONE_AWARE": True,
        }
        try:
            found = _search_dates(query, languages=["en"], settings=parser_settings)
        except Exception as e:
            logger.warning(
                "dateparser failed (%s) on query %r — falling back to 60-min window",
                type(e).__name__,
                query,
            )
            found = None
    if not found:
        return _last_n_minutes(60), _now_iso(), query
    phrase, dt = found[0]
    # Normalise to UTC so the ISO strings compare correctly in SQLite.
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=timezone.utc)
    else:
        dt = dt.astimezone(timezone.utc)
    half_window = timedelta(minutes=30)
    since = (dt - half_window).isoformat()
    until = (dt + half_window).isoformat()
    stripped = query.replace(phrase, " ").strip()
    keywords = re.sub(r"\s{2,}", " ", stripped)
    return since, until, keywords or query
 def diagnose(
    db_path: Path,
    query: str,
    since: str | None = None,
    until: str | None = None,
    source_filter: str | None = None,
    llm_url: str | None = None,
    llm_model: str | None = None,
    llm_api_key: str | None = None,
 ) -> dict[str, Any]:
    """Search logs for *query* inside a time window and summarise the hits.
    When both ``since`` and ``until`` are supplied they are used as-is and the
    whole query is treated as keywords. Otherwise the window is parsed from
    the query text and only fills in whichever bound is missing.
    Keyword hits and a per-source sample of the window are merged,
    de-duplicated by entry_id, sorted by (timestamp, sequence) and capped at
    200 entries. An LLM summary is requested only when both ``llm_url`` and
    ``llm_model`` are given.
    Returns:
        A dict with ``summary`` (total, window bounds, time_detected,
        by_severity, by_source), ``reasoning`` (LLM text or None) and
        ``entries`` (the merged result list).
    """
    if since is not None and until is not None:
        time_detected = True
        keywords = query
    else:
        parsed_since, parsed_until, keywords = parse_time_window(query)
        since = since or parsed_since
        until = until or parsed_until
        # A stripped time phrase is the signal that the query carried a window.
        time_detected = keywords != query
    window = {"since": since, "until": until, "source_filter": source_filter}
    keyword_hits = search(db_path, query=keywords, limit=150, or_mode=True, **window)
    window_hits = entries_in_window(db_path, limit=50, per_source_cap=15, **window)
    # Keep the first occurrence of each entry_id; keyword hits come first.
    first_by_id: dict[str, SearchResult] = {}
    for hit in keyword_hits + window_hits:
        first_by_id.setdefault(hit.entry_id, hit)
    # Entries without a timestamp sort after timestamped ones ("\xff" key).
    ordered = sorted(
        first_by_id.values(),
        key=lambda hit: (hit.timestamp_iso or "\xff", hit.sequence),
    )
    combined = ordered[:200]
    by_severity: dict[str, int] = dict.fromkeys(("CRITICAL", "ERROR", "WARN", "INFO"), 0)
    by_source: dict[str, int] = {}
    for hit in combined:
        level = (hit.severity or "INFO").upper()
        # Severities outside the four known levels are not counted.
        if level in by_severity:
            by_severity[level] += 1
        by_source[hit.source_id] = by_source.get(hit.source_id, 0) + 1
    reasoning: str | None = None
    if llm_url and llm_model:
        reasoning = summarize(
            query, combined, llm_url=llm_url, llm_model=llm_model, api_key=llm_api_key
        )
    summary = {
        "total": len(combined),
        "window_start": since,
        "window_end": until,
        "time_detected": time_detected,
        "by_severity": by_severity,
        "by_source": by_source,
    }
    return {"summary": summary, "reasoning": reasoning, "entries": combined}
 async def diagnose_stream(
    db_path: Path,
    query: str,
    since: str | None = None,
    until: str | None = None,
    source_filter: str | None = None,
    llm_url: str | None = None,
    llm_model: str | None = None,
    llm_api_key: str | None = None,
    context_db_path: Path | None = None,
    incidents_db_path: Path | None = None,
    tech_level: str = "sysadmin",
    pattern_domain: dict[str, str] | None = None,
 ) -> AsyncGenerator[dict[str, Any], None]:
    """Async generator yielding SSE event dicts for the diagnose pipeline.

    Two modes:
      * keyword search — ``query`` carries keywords; when ``since`` and
        ``until`` are not both supplied, ``parse_time_window`` is asked to
        extract a window (and the remaining keywords) from the query text.
      * source browse — ``query`` is blank and ``source_filter`` is set; the
        window defaults to the last 24 hours and keyword search is skipped.

    Yields events in order:
      {"type":"status","message":"…"}  — pipeline progress
      {"type":"context","facts":…,"chunks":…} — retrieved environment context
      {"type":"summary","data":{…}}    — window + severity counts (fast, from DB)
      {"type":"entries","data":[…]}    — log entries (fast, from DB)
      {"type":"reasoning","text":"…"}  — LLM analysis (slow, optional)
      {"type":"done"}

    When ``MULTI_AGENT_ENABLED`` is true, every event after "entries" comes
    from ``run_pipeline`` (which emits its own "done"). Otherwise the legacy
    single ``summarize()`` call runs, and only when ``llm_url``, ``llm_model``
    and at least one entry are available.

    Args:
        db_path:           Log database searched for entries.
        query:             Free-text query; may be blank in source-browse mode.
        since / until:     Optional explicit window bounds (ISO strings).
        source_filter:     Restrict results to one source.
        llm_url / llm_model / llm_api_key: LLM backend settings; reasoning is
                           skipped in the legacy path when url or model is unset.
        context_db_path:   Database for context retrieval; defaults to ``db_path``.
        incidents_db_path: Forwarded to ``run_pipeline``.
        tech_level:        Forwarded to ``run_pipeline``.
        pattern_domain:    Maps pattern tag → domain name for the ``by_domain``
                           summary counts; omitted/empty yields an empty dict.
    """
    keywords = query.strip()
    source_browse = not keywords and source_filter is not None
    if source_browse:
        # No keyword — browsing a source directly. Use 24h window; skip FTS entirely.
        yield {"type": "status", "message": f"Loading {source_filter}…"}
        since = since or _last_n_minutes(60 * 24)
        until = until or _now_iso()
        time_detected = False
    else:
        yield {"type": "status", "message": "Parsing time window…"}
        time_detected = since is not None and until is not None
        if not time_detected:
            parsed_since, parsed_until, keywords = await asyncio.to_thread(
                parse_time_window, query
            )
            # Explicit bounds from the caller win over parsed ones.
            since = since or parsed_since
            until = until or parsed_until
            # The parser rewrote the query, so it found a time expression.
            time_detected = keywords != query
    yield {"type": "status", "message": "Loading environment context…"}
    _ctx_db = context_db_path or db_path
    ctx = await asyncio.to_thread(lambda: retrieve_context(_ctx_db, query))
    yield {
        "type": "context",
        "facts": ctx.facts,
        "chunks": ctx.chunks,
    }
    yield {"type": "status", "message": "Searching logs…"}
    if source_browse:
        keyword_hits: list[SearchResult] = []
        window_hits = await asyncio.to_thread(
            lambda: entries_in_window(
                db_path,
                since,
                until,
                source_filter=source_filter,
                limit=200,
            )
        )
    else:
        # Keyword search and the plain window sample run concurrently in threads.
        keyword_hits, window_hits = await asyncio.gather(
            asyncio.to_thread(
                lambda: search(
                    db_path,
                    keywords,
                    source_filter=source_filter,
                    since=since,
                    until=until,
                    limit=150,
                    or_mode=True,
                    semantic=True,
                )
            ),
            asyncio.to_thread(
                lambda: entries_in_window(
                    db_path,
                    since,
                    until,
                    source_filter=source_filter,
                    limit=50,
                    per_source_cap=15,
                )
            ),
        )
    # Merge both result sets, keeping the first occurrence of each entry.
    seen: set[str] = set()
    merged: list[SearchResult] = []
    for r in keyword_hits + window_hits:
        if r.entry_id not in seen:
            seen.add(r.entry_id)
            merged.append(r)
    # Chronological order; entries without a timestamp sort last ("\xff").
    combined = sorted(merged, key=lambda r: (r.timestamp_iso or "\xff", r.sequence))[
        :200
    ]
    by_severity: dict[str, int] = {"CRITICAL": 0, "ERROR": 0, "WARN": 0, "INFO": 0}
    by_source: dict[str, int] = {}
    for r in combined:
        sev = (r.severity or "INFO").upper()
        if sev in by_severity:
            by_severity[sev] += 1
        by_source[r.source_id] = by_source.get(r.source_id, 0) + 1
    by_domain: dict[str, int] = {}
    if pattern_domain:
        for r in combined:
            # Count each domain at most once per entry, however many of the
            # entry's patterns map to it. Uses its own name so the entry-id
            # set above is not re-declared.
            entry_domains: set[str] = set()
            for tag in (r.matched_patterns or []):
                d = pattern_domain.get(tag, "")
                if d and d not in entry_domains:
                    entry_domains.add(d)
                    by_domain[d] = by_domain.get(d, 0) + 1
    yield {
        "type": "summary",
        "data": {
            "total": len(combined),
            "window_start": since,
            "window_end": until,
            "time_detected": time_detected,
            "by_severity": by_severity,
            "by_source": by_source,
            "by_domain": by_domain,
        },
    }
    yield {"type": "entries", "data": [dataclasses.asdict(r) for r in combined]}
    if MULTI_AGENT_ENABLED:
        async for event in run_pipeline(
            db_path=db_path,
            entries=combined,
            ctx=ctx,
            query=query,
            since=since,
            until=until,
            llm_url=llm_url,
            llm_model=llm_model,
            llm_api_key=llm_api_key,
            tech_level=tech_level,
            incidents_db_path=incidents_db_path,
        ):
            yield event
        return  # pipeline emits its own "done" event
    if llm_url and llm_model and combined:
        # Only compute context_block in the legacy path — pipeline uses ctx directly.
        context_block = format_context_block(ctx)
        yield {"type": "status", "message": "Analyzing with LLM…"}
        reasoning = await asyncio.to_thread(
            lambda: summarize(
                query,
                combined,
                llm_url,
                llm_model,
                llm_api_key,
                context_block=context_block,
            )
        )
        if reasoning:
            yield {"type": "reasoning", "text": reasoning}
    yield {"type": "done"}
 def _now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    current = datetime.now(timezone.utc)
    return current.isoformat()
 def _last_n_minutes(n: int) -> str:
    """Return the UTC instant *n* minutes before now as an ISO-8601 string."""
    lookback = timedelta(minutes=n)
    moment = datetime.now(timezone.utc) - lookback
    return moment.isoformat()
 # Public API of this module.
 __all__ = [
    "diagnose",
    "diagnose_stream",
    "parse_time_window",
 ]
 # Feature flag for Task 6: when TURNSTONE_MULTI_AGENT_DIAGNOSE is "true"
 # (case-insensitive), diagnose_stream() hands off to run_pipeline() after the
 # "entries" event instead of making the single summarize() call.
 # Evaluated once at import time; any other value (or unset) leaves it off.
 MULTI_AGENT_ENABLED = (
    os.getenv("TURNSTONE_MULTI_AGENT_DIAGNOSE", "false").lower() == "true"
 )
--- a/app/services/diagnose/_llm_client.py
+++ b/app/services/diagnose/_llm_client.py
@ -1,174 +0,0 @@
 """Shared LLM client for the multi-agent diagnose pipeline.
 Both Stage 3 (RootCauseHypothesizer) and Stage 5 (SummarySynthesizer) send
 messages to the same LLM backend using the same two-step pattern:
  1. Try the cf-orch task endpoint  → product-scoped inference routing.
  2. Fall back to OpenAI-compat     → direct model call by name.
 Centralising here means changes to auth headers, timeouts, retry logic, or
 cf-orch payload structure only need to be made once.
 """
 from __future__ import annotations
 import logging
 import re
 import httpx
 logger = logging.getLogger(__name__)
 # Regex that strips ```json … ``` or ``` … ``` fences from LLM output.
 # MULTILINE makes ^ and $ match at every line boundary, so a fence marker on
 # any line is removed — not only one at the very start or end of the text.
 # The optional language tag is matched case-sensitively (lowercase "json" only).
 _JSON_FENCE_RE = re.compile(
    r"^```(?:json)?\s*|\s*```$",
    re.MULTILINE,
 )
 # Reasoning models (DeepSeek-R1, Qwen QwQ, Llama thinking variants) embed
 # chain-of-thought inside <think>…</think> tags in the content field.
 # Strip them so only the final response reaches the UI.
 # The closing tag is optional in the pattern: when generation is cut off
 # (e.g. by max_tokens) inside the thinking block there is no </think>, and
 # everything from <think> to the end of the text is still reasoning.
 _THINK_TAG_RE = re.compile(r"<think>.*?(?:</think>|\Z)", re.DOTALL | re.IGNORECASE)
 def _strip_thinking(text: str) -> str:
    """Remove <think>…</think> blocks and trim surrounding whitespace.

    Tags are matched case-insensitively and may span multiple lines. An
    unterminated ``<think>`` block is removed through the end of *text*, so a
    truncated response yields only what preceded the reasoning (possibly "").
    """
    return _THINK_TAG_RE.sub("", text).strip()
 def extract_content(resp_json: dict) -> str | None:
    """Pull text content from an OpenAI-compat chat completion response.

    Strips reasoning-model thinking tags before returning.

    Returns None when the response has no usable text: no choices, a first
    choice that is not an object, a missing or null ``message``, or content
    that is missing, null, not a string, or empty after stripping. Malformed
    shapes are reported as None rather than raising, so callers can treat
    "no content" uniformly.
    """
    choices = resp_json.get("choices")
    if not isinstance(choices, list) or not choices:
        return None
    first = choices[0]
    if not isinstance(first, dict):
        return None
    # "message" may be present but null; .get("message", {}) would return None
    # in that case and the chained .get() would raise AttributeError.
    message = first.get("message")
    if not isinstance(message, dict):
        return None
    content = message.get("content")
    if not isinstance(content, str):
        return None
    raw = content.strip()
    if not raw:
        return None
    return _strip_thinking(raw) or None
 def strip_json_fences(raw: str) -> str:
    """Drop the markdown code-fence markers some LLMs put around JSON output.

    Every match of the module's fence pattern is deleted and the result is
    trimmed of leading/trailing whitespace.
    Example: '```json\\n[...]\\n```' → '[...]'
    """
    without_fences = _JSON_FENCE_RE.sub("", raw)
    return without_fences.strip()
 def extract_first_json_array(raw: str) -> str:
    """Return the first balanced ``[...]`` span found in *raw*.

    Reasoning models (e.g. foundation-sec-8b) sometimes emit valid JSON and
    then repeat it inside a markdown fence, which makes json.loads() choke on
    the combined text. The scan starts at the first '[' and tracks bracket
    depth until the matching ']', ignoring brackets that appear inside
    double-quoted strings (backslash escapes are honoured).

    When there is no '[' at all, or the brackets never balance, *raw* is
    returned unchanged so the caller's json.loads() fails with its usual
    error message.
    """
    opening = raw.find("[")
    if opening < 0:
        return raw
    nesting = 0
    inside_string = False
    skip_char = False
    for pos in range(opening, len(raw)):
        char = raw[pos]
        if skip_char:
            # Character following a backslash inside a string: never structural.
            skip_char = False
        elif inside_string:
            if char == "\\":
                skip_char = True
            elif char == '"':
                inside_string = False
        elif char == '"':
            inside_string = True
        elif char == "[":
            nesting += 1
        elif char == "]":
            nesting -= 1
            if nesting == 0:
                return raw[opening : pos + 1]
    return raw  # unbalanced — hand back the input so the caller sees the error
 def call_llm(
    llm_url: str,
    llm_model: str,
    llm_api_key: str | None,
    messages: list[dict],
    task_name: str = "log_analysis",
    timeout: float = 120.0,
    max_tokens: int = 2048,
 ) -> str | None:
    """Send *messages* to the LLM backend and return the reply text.

    The cf-orch task endpoint (``/api/inference/task``) is tried first so the
    request is routed per product/task. The direct OpenAI-compat
    ``/v1/chat/completions`` call is used as a fallback when:
      - the task endpoint answers 404 (no assignment for this task), or
      - the task request raises for any reason (connection error, timeout,
        error status, unparsable body, …).

    Args:
        llm_url:     Base URL of the LLM backend (e.g. ``http://<YOUR_HOST_IP>:7700``).
        llm_model:   Model identifier sent in the OpenAI-compat fallback call.
        llm_api_key: Optional bearer token; sent as ``Authorization`` when set.
        messages:    OpenAI-style message list (system + user turns).
        task_name:   cf-orch task name for product-routed inference.
        timeout:     Per-request timeout in seconds.
        max_tokens:  Generation cap forwarded on both paths, so a low backend
                     default does not truncate the output mid-sentence.

    Returns:
        The text extracted by ``extract_content``, or None when the fallback
        call fails or the response carries no content.
    """
    base_url = llm_url.rstrip("/")
    headers: dict[str, str] = (
        {"Authorization": f"Bearer {llm_api_key}"} if llm_api_key else {}
    )
    # --- Path 1: cf-orch task endpoint ---
    task_request = {
        "product": "turnstone",
        "task": task_name,
        "payload": {"messages": messages, "stream": False, "max_tokens": max_tokens},
    }
    try:
        task_resp = httpx.post(
            f"{base_url}/api/inference/task",
            json=task_request,
            headers=headers,
            timeout=timeout,
        )
        if task_resp.status_code == 200:
            return extract_content(task_resp.json())
        if task_resp.status_code != 404:
            # Error statuses raise here and are handled by the except below.
            task_resp.raise_for_status()
        logger.debug(
            "No task assignment for turnstone.%s — falling back to direct model",
            task_name,
        )
    except Exception as exc:  # noqa: BLE001
        # Deliberately broad: whatever goes wrong on the task path, the
        # direct-model path below still gets its chance.
        logger.debug(
            "Task endpoint unavailable (%s) — falling back to direct model", exc
        )
    # --- Path 2: OpenAI-compat fallback ---
    chat_request = {
        "model": llm_model,
        "messages": messages,
        "stream": False,
        "max_tokens": max_tokens,
    }
    try:
        chat_resp = httpx.post(
            f"{base_url}/v1/chat/completions",
            json=chat_request,
            headers=headers,
            timeout=timeout,
        )
        chat_resp.raise_for_status()
        return extract_content(chat_resp.json())
    except Exception as exc:  # noqa: BLE001
        logger.warning("LLM call failed (%s): %s", type(exc).__name__, exc)
        return None
--- a/app/services/diagnose/classifier.py
+++ b/app/services/diagnose/classifier.py
@ -1,274 +0,0 @@
 """Stage 2: Severity Classifier — ML with two fallback levels.
 Classification strategy (in priority order):
  Path A — ML: Hugging Face text-classification pipeline, loaded lazily.
  Path B — pattern_tags: Map cluster.pattern_tags through the loaded pattern
            severity dict; pick the highest severity across matching tags.
  Path C — regex: Call detect_severity() from app.glean.base on the cluster's
            representative_text.
 Each cluster is classified independently. The ``classifier_used`` field on the
 returned ``ClassifiedTimeline`` reflects the primary path (the one that governed
 the overall classification session, not individual cluster fallbacks).
 """
 from __future__ import annotations
 import logging
 import os
 from pathlib import Path
 from typing import Any
 from types import MappingProxyType
 from app.services.diagnose.models import (
    ClassifiedTimeline,
    EventCluster,
    SeverityLabel,
    TimelineResult,
 )
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Module-level ML singleton — reset to None between tests via the fixture
 # ---------------------------------------------------------------------------
 _ml_classifier: Any | None = None
 # (pipeline, model_id, device) recorded when _get_ml_classifier() itself built
 # the cached pipeline. Lets a later call with different arguments detect that
 # the cache is stale, while a pipeline assigned to _ml_classifier from outside
 # (e.g. by a test) is recognised as foreign and left untouched.
 _ml_classifier_origin: tuple[Any, str, str] | None = None
 def _get_ml_classifier(model_id: str, device: str) -> Any:
    """Return the cached HF pipeline, loading it on first call.

    The pipeline is rebuilt when this function loaded the cached one for a
    different ``(model_id, device)`` pair; previously the first pipeline was
    returned regardless of the arguments. ``transformers`` is imported lazily
    so the dependency is only needed when ML classification is enabled.
    """
    global _ml_classifier, _ml_classifier_origin  # noqa: PLW0603
    loaded_here = (
        _ml_classifier_origin is not None
        and _ml_classifier_origin[0] is _ml_classifier
    )
    stale = loaded_here and _ml_classifier_origin[1:] != (model_id, device)
    if _ml_classifier is None or stale:
        from transformers import pipeline as hf_pipeline  # type: ignore[import-untyped]
        _ml_classifier = hf_pipeline(
            "text-classification", model=model_id, device=device
        )
        _ml_classifier_origin = (_ml_classifier, model_id, device)
    return _ml_classifier
 # ---------------------------------------------------------------------------
 # Label mapping
 # ---------------------------------------------------------------------------
 # Standard vocabulary → SeverityLabel. Keys are upper-case; callers upper-case
 # the raw label before looking it up.
 _LABEL_MAP: dict[str, SeverityLabel] = {
    "ERROR": "ERROR",
    "WARNING": "WARN",
    "WARN": "WARN",
    "INFO": "INFO",
    "DEBUG": "DEBUG",
    "CRITICAL": "CRITICAL",
 }
 # Label shim for krishnas4415/log-anomaly-detection-models (Hybrid-BERT, MIT).
 # Maps the model's 7-class output vocabulary to Turnstone SeverityLabel.
 # Checked against the model config.json — labels confirmed in turnstone#41.
 # _map_label consults this map before _LABEL_MAP.
 _HYBRID_BERT_LABEL_MAP: dict[str, SeverityLabel] = {
    "NORMAL": "INFO",
    "SECURITY_ANOMALY": "ERROR",
    "SYSTEM_FAILURE": "CRITICAL",
    "PERFORMANCE_ISSUE": "WARN",
    "NETWORK_ANOMALY": "WARN",
    "CONFIG_ERROR": "ERROR",
    "HARDWARE_ISSUE": "CRITICAL",
 }
 # Keywords that promote a high-confidence ERROR to CRITICAL in _map_label.
 # NOTE(review): they are matched as plain substrings of the lower-cased text,
 # so short entries such as "oom" also hit inside longer words (e.g. "room").
 _CRITICAL_KEYWORDS: frozenset[str] = frozenset(
    {
        "panic",
        "oom",
        "fatal",
        "critical",
        "kernel panic",
        "out of memory",
        "segfault",
        "segmentation fault",
    }
 )
 # Rank used by _highest_from_tags to pick the most severe label; higher wins.
 # Keys are upper-case. None (tag has no severity entry) ranks lowest.
 _SEVERITY_ORDER: dict[str | None, int] = {
    "CRITICAL": 5,
    "ERROR": 4,
    "WARN": 3,
    "WARNING": 3,
    "INFO": 2,
    "DEBUG": 1,
    None: 0,
 }
 def _map_label(label: str, score: float, text: str) -> SeverityLabel:
    """Convert a raw classifier label into a Turnstone SeverityLabel.

    Two model vocabularies are understood (case-insensitively):
    - Standard (ERROR/WARN/INFO/CRITICAL/DEBUG) — byviz/bylastic_classification_logs
    - Hybrid-BERT (normal/security_anomaly/…) — krishnas4415/log-anomaly-detection-models
    Labels found in neither map resolve to "UNKNOWN".

    Two adjustments are layered on the base mapping:
    - ERROR with score above 0.95 becomes CRITICAL when *text* contains one of
      the critical keywords;
    - INFO with score below 0.4 is demoted to DEBUG.
    """
    key = label.upper()
    # Hybrid-BERT vocabulary takes precedence, then the standard one.
    hybrid_hit = _HYBRID_BERT_LABEL_MAP.get(key)
    if hybrid_hit:
        resolved: SeverityLabel = hybrid_hit
    else:
        resolved = _LABEL_MAP.get(key, "UNKNOWN")  # type: ignore[assignment]
    if resolved == "INFO":
        return "DEBUG" if score < 0.4 else "INFO"
    if resolved == "ERROR" and score > 0.95:
        lowered = text.lower()
        for keyword in _CRITICAL_KEYWORDS:
            if keyword in lowered:
                return "CRITICAL"
    return resolved
 def _highest_from_tags(
    tags: tuple[str, ...], severity_map: dict[str, str]
 ) -> SeverityLabel | None:
    """Return the highest severity among the pattern_tags found in severity_map.

    Severity strings are compared case-insensitively, so ``"error"`` and
    ``"ERROR"`` rank the same. Tags that are absent from *severity_map*, or
    whose severity is not a recognised level, are ignored. ``"WARNING"`` is
    reported as ``"WARN"``.

    Returns None when no tag yields a recognised severity, which lets the
    caller fall through to its next classification path.
    """
    best: str | None = None
    best_rank = 0
    for tag in tags:
        raw = severity_map.get(tag)
        if not isinstance(raw, str):
            continue
        # Normalise before ranking: the rank table is keyed by upper-case names.
        label = raw.upper()
        rank = _SEVERITY_ORDER.get(label, 0)
        if rank > best_rank:
            best_rank = rank
            best = label
    if best is None:
        return None
    return "WARN" if best == "WARNING" else best  # type: ignore[return-value]
 # ---------------------------------------------------------------------------
 # SeverityClassifier
 # ---------------------------------------------------------------------------
 class SeverityClassifier:
    """Classify each EventCluster's severity using ML, patterns, or regex fallback.

    Per cluster, the paths are tried in order — ML (when a model is
    configured), pattern tags (when a pattern file was loaded), then regex —
    and the first one that produces a label wins.

    Parameters
    ----------
    model_id:
        Hugging Face model identifier. When empty (default), ML is skipped.
    device:
        Torch device string passed to the HF pipeline (e.g. ``"cpu"`` or ``"cuda:0"``).
    pattern_file:
        Path to the YAML pattern file. When ``None`` the classifier reads
        ``TURNSTONE_PATTERNS`` env var (same logic as ``app/rest.py``).
    """
    def __init__(
        self,
        model_id: str = "",
        device: str = "cpu",
        pattern_file: Path | None = None,
    ) -> None:
        self._model_id = model_id
        self._device = device
        self._pattern_file: Path | None = pattern_file
        # pattern name -> severity string, filled lazily by _ensure_patterns_loaded
        self._pattern_severity: dict[str, str] = {}
        self._patterns_loaded = False
    # ------------------------------------------------------------------
    # Lazy loaders
    # ------------------------------------------------------------------
    def _resolve_pattern_file(self) -> Path | None:
        """Resolve pattern file from constructor arg or env var.

        The constructor argument wins; otherwise ``TURNSTONE_PATTERNS`` is
        treated as a directory containing ``default.yaml``. Returns None when
        neither is set.
        """
        if self._pattern_file is not None:
            return self._pattern_file
        env_dir = os.environ.get("TURNSTONE_PATTERNS")
        if env_dir:
            return Path(env_dir) / "default.yaml"
        return None
    def _ensure_patterns_loaded(self) -> None:
        """Populate _pattern_severity from the pattern YAML file (once)."""
        if self._patterns_loaded:
            return
        # Marked before loading, so a failing load is not retried on this instance.
        self._patterns_loaded = True
        path = self._resolve_pattern_file()
        if path is None:
            return
        from app.glean.base import load_patterns
        patterns = load_patterns(path)
        self._pattern_severity = {p.name: p.severity for p in patterns}
    # ------------------------------------------------------------------
    # Per-cluster classification helpers
    # ------------------------------------------------------------------
    def _classify_cluster_ml(self, cluster: EventCluster) -> SeverityLabel | None:
        """Attempt ML classification.

        Returns None — so the caller falls back to pattern tags / regex — when
        loading or inference raises, when the pipeline returns no result, or
        when the model's label is outside every known vocabulary ("UNKNOWN").
        Accepting "UNKNOWN" here would short-circuit the fallback chain.
        """
        try:
            pipe = _get_ml_classifier(self._model_id, self._device)
            results = pipe(cluster.representative_text)
            if not results:
                return None
            hit = results[0]
            label = _map_label(hit["label"], hit["score"], cluster.representative_text)
        except Exception as exc:  # noqa: BLE001
            # Broad on purpose (missing dependency, bad model id, tokenizer
            # limits, …) but the cause is logged so the failure is diagnosable.
            logger.warning(
                "ML inference failed for cluster %s (%s: %s) — falling back",
                cluster.cluster_id,
                type(exc).__name__,
                exc,
            )
            return None
        if label == "UNKNOWN":
            logger.debug(
                "ML label %r not recognised for cluster %s — falling back",
                hit["label"],
                cluster.cluster_id,
            )
            return None
        return label
    def _classify_cluster_pattern_tags(
        self, cluster: EventCluster
    ) -> SeverityLabel | None:
        """Derive severity from the cluster's pattern_tags. Returns None if no match."""
        return _highest_from_tags(cluster.pattern_tags, self._pattern_severity)
    def _classify_cluster_regex(self, cluster: EventCluster) -> SeverityLabel:
        """Classify by scanning representative_text with the severity regex.

        Last-resort path: always returns a label, defaulting to "INFO" when
        nothing is detected or the detected level is not in the label map.
        """
        from app.glean.base import detect_severity
        raw = detect_severity(cluster.representative_text)
        if raw is None:
            return "INFO"
        return _LABEL_MAP.get(raw.upper(), "INFO")  # type: ignore[return-value]
    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def classify(self, timeline: TimelineResult) -> ClassifiedTimeline:
        """Classify every cluster in *timeline* and return a ``ClassifiedTimeline``.

        ``classifier_used`` on the result names the primary path configured
        for the session ("ml", "pattern_tags" or "regex"); individual clusters
        may still have been labelled by a later fallback path.
        """
        self._ensure_patterns_loaded()
        # Determine which primary path governs this session
        ml_available = bool(self._model_id)
        patterns_available = bool(self._pattern_severity)
        if ml_available:
            classifier_used: str = "ml"
        elif patterns_available:
            classifier_used = "pattern_tags"
        else:
            classifier_used = "regex"
        cluster_severities: dict[str, SeverityLabel] = {}
        for cluster in timeline.clusters:
            severity: SeverityLabel | None = None
            if ml_available:
                severity = self._classify_cluster_ml(cluster)
            if severity is None and patterns_available:
                severity = self._classify_cluster_pattern_tags(cluster)
            if severity is None:
                severity = self._classify_cluster_regex(cluster)
            cluster_severities[cluster.cluster_id] = severity
        return ClassifiedTimeline(
            timeline=timeline,
            # Read-only view, matching the frozen dataclass it is stored in.
            cluster_severities=MappingProxyType(cluster_severities),
            classifier_used=classifier_used,  # type: ignore[arg-type]
            model_id=self._model_id if ml_available else None,
        )
--- a/app/services/diagnose/hypothesizer.py
+++ b/app/services/diagnose/hypothesizer.py
@ -1,167 +0,0 @@
 """Stage 3: Root-Cause Hypothesizer — LLM + RAG context."""
 from __future__ import annotations
 import json
 import logging
 from uuid import uuid4
 from app.context.retriever import RetrievedContext
 from app.services.diagnose._llm_client import call_llm, extract_first_json_array, strip_json_fences
 from app.services.diagnose.models import (
    ClassifiedTimeline,
    EventCluster,
    Hypothesis,
    SeverityLabel,
 )
 logger = logging.getLogger(__name__)
 # Severity strings accepted from the LLM by _validate_severity; anything else
 # is replaced with "ERROR". Includes DEBUG even though the prompt below only
 # offers CRITICAL/ERROR/WARN/INFO.
 _VALID_SEVERITIES: frozenset[str] = frozenset({"CRITICAL", "ERROR", "WARN", "INFO", "DEBUG"})
 # System message for the hypothesis request. It fixes the JSON schema whose
 # keys _parse_response reads (title, description, confidence, severity,
 # supporting_clusters).
 _SYSTEM_PROMPT = (
    "You are a Linux sysadmin log analyst. Analyze the following clustered log timeline "
    "and generate 2-4 root cause hypotheses as a JSON array.\n\n"
    "Each hypothesis must follow this exact JSON schema:\n"
    '{"title": str (≤80 chars), "description": str (2-4 sentences), '
    '"confidence": float (0.0-1.0), "severity": str (one of: CRITICAL, ERROR, WARN, INFO), '
    '"supporting_clusters": [str list of cluster IDs]}\n\n'
    "Return ONLY a valid JSON array. No prose, no markdown, no explanation outside the JSON."
 )
 def _coerce_float(val: object, default: float) -> float:
    """Safely coerce LLM output to a finite float, returning default on failure.

    *default* is returned when *val* cannot be converted (wrong type, unparsable
    string, integer too large for a float) or converts to NaN / ±infinity.
    Python's ``json.loads`` accepts ``NaN`` and ``Infinity`` literals, and such
    values would not survive re-serialisation as strict JSON.
    """
    import math
    try:
        number = float(val)  # type: ignore[arg-type]
    except (TypeError, ValueError, OverflowError):
        return default
    return number if math.isfinite(number) else default
 def _validate_severity(s: str) -> SeverityLabel:
    """Normalise a raw severity string to a SeverityLabel.

    Matching is case-insensitive; "WARNING" is reported as "WARN" and any
    value outside the accepted set falls back to "ERROR".
    """
    normalised = s.upper()
    if normalised == "WARNING":
        return "WARN"
    if normalised in _VALID_SEVERITIES:
        return normalised  # type: ignore[return-value]
    return "ERROR"
 def _cluster_summary(cluster: EventCluster, severity: str) -> str:
    """Build a condensed single-line summary of a cluster for the prompt.

    The line carries the cluster ID so the LLM can cite real IDs in
    ``supporting_clusters`` (the prompt asks for them). It also includes the
    severity, start time ("unknown" when absent), up to three source IDs, the
    first 200 characters of the representative text and up to five pattern tags.
    Whitespace runs in the text, including newlines, are collapsed to single
    spaces so that each cluster really occupies one line of the prompt.
    """
    sources = ", ".join(list(cluster.source_ids)[:3])
    patterns = ", ".join(list(cluster.pattern_tags)[:5])
    # Collapse before truncating so the 200-char budget is spent on content.
    text_preview = " ".join(cluster.representative_text.split())[:200]
    summary = (
        f"[{severity}] id={cluster.cluster_id} {cluster.start_iso or 'unknown'} "
        f"({sources}) — {text_preview}"
    )
    if patterns:
        summary += f" [patterns: {patterns}]"
    return summary
 class RootCauseHypothesizer:
    """Generate ranked root-cause hypotheses from a classified log timeline."""
    def __init__(self, max_hypotheses: int = 4) -> None:
        # Upper bound requested from the LLM and enforced when parsing its reply.
        self._max_hypotheses = max_hypotheses
    def hypothesize(
        self,
        classified: ClassifiedTimeline,
        ctx: RetrievedContext,
        query: str,
        llm_url: str | None = None,
        llm_model: str | None = None,
        llm_api_key: str | None = None,
    ) -> list[Hypothesis]:
        """Generate hypotheses from a classified timeline and RAG context.

        Builds one prompt line per cluster (using the classifier's severity,
        falling back to the cluster's own), adds up to five context chunks
        truncated to 300 characters each, and asks the LLM for a JSON array.

        Returns an empty list when no LLM is configured, there are no
        clusters to analyse, the LLM call fails, or its reply cannot be parsed.
        """
        if not llm_url or not llm_model:
            return []
        clusters = classified.timeline.clusters
        if not clusters:
            return []
        cluster_lines = [
            _cluster_summary(c, classified.cluster_severities.get(c.cluster_id, c.severity))
            for c in clusters
        ]
        cluster_block = "\n".join(cluster_lines)
        context_parts: list[str] = []
        for chunk in ctx.chunks[:5]:
            filename = chunk.get("filename", "unknown")
            text = chunk.get("text", "")[:300]
            context_parts.append(f"[{filename}] {text}")
        context_block = "\n".join(context_parts) if context_parts else "(none)"
        user_message = (
            f"Query: {query}\n\n"
            f"Context from runbooks and known patterns:\n{context_block}\n\n"
            f"Log timeline (clustered, {len(clusters)} clusters):\n{cluster_block}\n\n"
            f"Generate up to {self._max_hypotheses} hypotheses. Return JSON array only."
        )
        messages = [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ]
        raw_response = call_llm(
            llm_url=llm_url,
            llm_model=llm_model,
            llm_api_key=llm_api_key,
            messages=messages,
            max_tokens=1024,  # JSON array of 2-4 hypotheses; 1024 is sufficient
        )
        if raw_response is None:
            return []
        return self._parse_response(raw_response)
    def _parse_response(self, raw: str) -> list[Hypothesis]:
        """Parse the LLM JSON response into a list of Hypothesis objects.

        Strips markdown code fences before parsing — some LLMs wrap JSON in
        triple-backtick fences despite being instructed not to.

        LLM output is treated as untrusted:
          - non-object array items are skipped;
          - ``confidence`` is coerced to float (0.5 when unusable) and clamped
            to the 0.0–1.0 range the schema promises;
          - ``supporting_clusters`` given as a bare string is taken as one ID
            rather than being split into characters;
          - at most ``max_hypotheses`` items are read from the array.

        Returns an empty list when the text is not valid JSON or not an array.
        """
        try:
            # extract_first_json_array handles reasoning models that emit valid
            # JSON then repeat it inside a markdown fence block.
            data = json.loads(extract_first_json_array(strip_json_fences(raw)))
        except json.JSONDecodeError:
            logger.warning(
                "Hypothesizer: invalid JSON from LLM (truncated): %.120s", raw
            )
            return []
        if not isinstance(data, list):
            logger.warning(
                "Hypothesizer: expected JSON array, got %s", type(data).__name__
            )
            return []
        hypotheses: list[Hypothesis] = []
        for item in data[: self._max_hypotheses]:
            if not isinstance(item, dict):
                continue
            severity_raw = item.get("severity", "ERROR")
            severity = _validate_severity(str(severity_raw))
            confidence = _coerce_float(item.get("confidence"), 0.5)
            # Keep the value inside the documented range (e.g. 85 → 1.0).
            confidence = min(1.0, max(0.0, confidence))
            supporting = item.get("supporting_clusters") or []
            if isinstance(supporting, str):
                # Iterating a string would yield one "ID" per character.
                supporting = [supporting]
            hypothesis = Hypothesis(
                hypothesis_id=str(uuid4()),
                title=str(item.get("title", "Unknown"))[:80],
                description=str(item.get("description", "")),
                confidence=confidence,
                supporting_cluster_ids=tuple(str(x) for x in supporting),
                runbook_refs=(),
                severity=severity,
            )
            hypotheses.append(hypothesis)
        return hypotheses
--- a/app/services/diagnose/models.py
+++ b/app/services/diagnose/models.py
@ -1,77 +0,0 @@
 """Pipeline data types for the multi-agent diagnose pipeline."""
 from __future__ import annotations
 from dataclasses import dataclass
 from types import MappingProxyType
 from typing import Literal
 SeverityLabel = Literal["CRITICAL", "ERROR", "WARN", "INFO", "DEBUG", "UNKNOWN"]
@dataclass(frozen=True)
 class EventCluster:
    """A time-correlated group of log entries within the timeline."""
    # Identifier used as the key in ClassifiedTimeline.cluster_severities.
    cluster_id: str
    entries: tuple[str, ...]  # entry_id refs
    # Cluster start time; the hypothesizer prompt shows "unknown" when None.
    start_iso: str | None
    end_iso: str | None
    duration_seconds: float
    # Sources contributing to the cluster; the prompt lists the first three.
    source_ids: tuple[str, ...]
    # Pattern names; looked up in the pattern-severity map by the classifier.
    pattern_tags: tuple[str, ...]
    # Severity assigned when the cluster was built; used by the hypothesizer
    # only when the classifier has no entry for this cluster.
    severity: SeverityLabel
    # NOTE(review): burst / gap_before_seconds are set by the timeline stage,
    # which is not visible here — confirm thresholds and units there.
    burst: bool
    gap_before_seconds: float
    # Text handed to the ML and regex classifiers and previewed in the prompt.
    representative_text: str
@dataclass(frozen=True)
 class TimelineResult:
    """Structured timeline of event clusters built from log entries."""
    # Clusters iterated by the classifier and hypothesizer stages.
    clusters: tuple[EventCluster, ...]
    total_entries: int
    window_start: str | None
    window_end: str | None
    gap_count: int
    # Reported in the stage-1 progress message as the number of bursts.
    burst_count: int
    # NOTE(review): presumably the most frequent source IDs — confirm in the
    # timeline stage, which is not visible here.
    dominant_sources: tuple[str, ...]
@dataclass(frozen=True)
 class ClassifiedTimeline:
    """Timeline annotated with ML-assigned severity per cluster.
    ``cluster_severities`` is a ``MappingProxyType`` so the mapping is
    fully immutable — consistent with the ``frozen=True`` intent.
    """
    # The unmodified stage-1 result the severities refer to.
    timeline: TimelineResult
    # Keyed by EventCluster.cluster_id.
    cluster_severities: MappingProxyType  # MappingProxyType[str, SeverityLabel]
    # Primary path configured for the session; individual clusters may have
    # been labelled by a later fallback path.
    classifier_used: Literal["ml", "pattern_tags", "regex"]
    # Hugging Face model ID when ML classification was enabled, else None.
    model_id: str | None
@dataclass(frozen=True)
 class Hypothesis:
    """A root-cause hypothesis generated by Stage 3."""
    # Random UUID string generated when the LLM reply is parsed.
    hypothesis_id: str
    # Truncated to 80 characters by the parser.
    title: str
    description: str
    # LLM-reported confidence coerced to float; 0.5 when missing or unusable.
    confidence: float
    # Cluster IDs as cited by the LLM; not checked against the timeline here.
    supporting_cluster_ids: tuple[str, ...]
    # Always empty when created by the Stage 3 parser.
    runbook_refs: tuple[str, ...]
    severity: SeverityLabel
@dataclass(frozen=True)
 class RankedHypothesis:
    """A hypothesis enriched by Stage 4 false-positive suppression."""
    hypothesis: Hypothesis
    # NOTE(review): per the suppressor module docstring, 1.0 is used when no
    # embedding backend is available — confirm the scale in the suppressor.
    novelty_score: float
    # NOTE(review): presumably the best cosine similarity to a resolved
    # incident — confirm in the suppressor.
    similarity_to_known: bool if False else float
    # True when the hypothesis is counted as suppressed in the stage-4 message.
    suppress: bool
    suppression_reason: str | None
--- a/app/services/diagnose/pipeline.py
+++ b/app/services/diagnose/pipeline.py
@ -1,173 +0,0 @@
 """Multi-agent diagnose pipeline orchestrator — Stage 1–5 wiring."""
 from __future__ import annotations
 import asyncio
 import dataclasses
 import logging
 import os
 from collections.abc import AsyncGenerator
 from pathlib import Path
 from typing import Any
 # Optional ML classifier model for Stage 2.
 # When empty (default), Stage 2 falls back to pattern_tags then regex.
 # Set TURNSTONE_CLASSIFIER_MODEL to a HuggingFace model ID to enable ML classification.
 # Recommended: byviz/bylastic_classification_logs (DistilBERT, ~300MB)
 _CLASSIFIER_MODEL: str = os.environ.get("TURNSTONE_CLASSIFIER_MODEL", "")
 from app.context.retriever import RetrievedContext
 from app.services.diagnose.classifier import SeverityClassifier
 from app.services.diagnose.hypothesizer import RootCauseHypothesizer
 from app.services.diagnose.suppressor import FalsePositiveSuppressor
 from app.services.diagnose.synthesizer import SummarySynthesizer
 from app.services.diagnose.timeline import TimelineReconstructor
 from app.services.search import SearchResult
 logger = logging.getLogger(__name__)
 async def run_pipeline(
    db_path: Path,
    entries: list[SearchResult],
    ctx: RetrievedContext,
    query: str,
    since: str | None,   # reserved for future range-filtering in stage queries (#29 follow-up)
    until: str | None,   # reserved for future range-filtering in stage queries (#29 follow-up)
    llm_url: str | None,
    llm_model: str | None,
    llm_api_key: str | None,
    tech_level: str = "sysadmin",
    incidents_db_path: Path | None = None,
 ) -> AsyncGenerator[dict[str, Any], None]:
    """Async generator that runs all 5 pipeline stages and yields SSE event dicts.

    Each stage runs in a worker thread via ``asyncio.to_thread`` and the
    stages run strictly one after another.

    Stages:
      1. TimelineReconstructor  — cluster log entries by time
      2. SeverityClassifier     — annotate clusters with severity
      3. RootCauseHypothesizer  — generate hypotheses via LLM
      4. FalsePositiveSuppressor — rank and suppress known patterns
      5. SummarySynthesizer     — produce a narrative diagnosis

    Yields events in order:
      {"type": "status", "message": "Building timeline…"}
      {"type": "pipeline_stage", "stage": 1, ...}
      {"type": "pipeline_stage", "stage": 2, ...}
      {"type": "pipeline_stage", "stage": 3, ...}
      {"type": "pipeline_stage", "stage": 4, ...}
      {"type": "hypotheses", "data": [...]}
      {"type": "status", "message": "Synthesizing…"}
      {"type": "reasoning", "text": "..."}   — only when synthesis produces text
      {"type": "done"}

    Failure handling: if any stage raises, the exception is logged with its
    traceback, an {"type": "error", "message": "Pipeline error in stage N (…)"}
    event is yielded, followed by {"type": "done"}, and the generator stops.
    The stream therefore always ends with a "done" event.

    ``since`` and ``until`` are accepted but not used by the current body.
    ``incidents_db_path`` defaults to ``db_path`` for the suppressor stage.
    """
    # Stage 1: Timeline reconstruction
    yield {"type": "status", "message": "Building timeline…"}
    try:
        timeline = await asyncio.to_thread(
            TimelineReconstructor().reconstruct, entries
        )
    except Exception as exc:
        logger.exception("Stage 1 (timeline) failed: %s", exc)
        yield {"type": "error", "message": "Pipeline error in stage 1 (timeline)"}
        yield {"type": "done"}
        return
    n_clusters = len(timeline.clusters)
    burst = timeline.burst_count
    yield {
        "type": "pipeline_stage",
        "stage": 1,
        "name": "timeline",
        "message": f"Built {n_clusters} clusters, {burst} bursts",
    }
    # Stage 2: Severity classification
    try:
        # A fresh classifier per run; the model ID comes from the environment
        # variable read at import time.
        classified = await asyncio.to_thread(
            SeverityClassifier(model_id=_CLASSIFIER_MODEL).classify, timeline
        )
    except Exception as exc:
        logger.exception("Stage 2 (classifier) failed: %s", exc)
        yield {"type": "error", "message": "Pipeline error in stage 2 (classifier)"}
        yield {"type": "done"}
        return
    # Tally clusters per severity for the progress message.
    sev_counts: dict[str, int] = {}
    for sev in classified.cluster_severities.values():
        sev_counts[sev] = sev_counts.get(sev, 0) + 1
    counts_str = ", ".join(f"{k}:{v}" for k, v in sorted(sev_counts.items()))
    yield {
        "type": "pipeline_stage",
        "stage": 2,
        "name": "classifier",
        "message": f"{classified.classifier_used} classifier: {counts_str}",
    }
    # Stage 3: Root-cause hypotheses
    try:
        hypotheses = await asyncio.to_thread(
            RootCauseHypothesizer().hypothesize,
            classified,
            ctx,
            query,
            llm_url,
            llm_model,
            llm_api_key,
        )
    except Exception as exc:
        logger.exception("Stage 3 (hypothesizer) failed: %s", exc)
        yield {"type": "error", "message": "Pipeline error in stage 3 (hypothesizer)"}
        yield {"type": "done"}
        return
    yield {
        "type": "pipeline_stage",
        "stage": 3,
        "name": "hypotheses",
        "message": f"{len(hypotheses)} hypotheses generated",
    }
    # Stage 4: False-positive suppression
    _incidents_db = incidents_db_path or db_path
    try:
        ranked = await asyncio.to_thread(
            FalsePositiveSuppressor().suppress, hypotheses, _incidents_db
        )
    except Exception as exc:
        logger.exception("Stage 4 (suppressor) failed: %s", exc)
        yield {"type": "error", "message": "Pipeline error in stage 4 (suppressor)"}
        yield {"type": "done"}
        return
    suppressed = sum(1 for rh in ranked if rh.suppress)
    active = len(ranked) - suppressed
    yield {
        "type": "pipeline_stage",
        "stage": 4,
        "name": "suppressor",
        "message": f"{suppressed} suppressed, {active} active",
    }
    # Suppressed hypotheses are included too; each item carries its own flag.
    yield {
        "type": "hypotheses",
        "data": [dataclasses.asdict(rh) for rh in ranked],
    }
    # Stage 5: Summary synthesis
    yield {"type": "status", "message": "Synthesizing…"}
    try:
        synthesis_text = await asyncio.to_thread(
            SummarySynthesizer().synthesize,
            ranked,
            timeline,
            ctx,
            query,
            llm_url,
            llm_model,
            llm_api_key,
            tech_level,
        )
    except Exception as exc:
        logger.exception("Stage 5 (synthesizer) failed: %s", exc)
        yield {"type": "error", "message": "Pipeline error in stage 5 (synthesizer)"}
        yield {"type": "done"}
        return
    if synthesis_text:
        yield {"type": "reasoning", "text": synthesis_text}
    yield {"type": "done"}
--- a/app/services/diagnose/suppressor.py
+++ b/app/services/diagnose/suppressor.py
@ -1,275 +0,0 @@
 """Stage 4: False-Positive Suppressor — embedding cosine similarity.
 Compares each hypothesis against a corpus of resolved incidents using
 embedding cosine similarity. Hypotheses that closely match a previously
 resolved incident are suppressed as likely false positives.
 When no embedding model is configured or the service is unavailable, all
 hypotheses pass through with novelty_score=1.0 (full novelty assumed).
 """
 from __future__ import annotations
 import logging
 import sqlite3
 from pathlib import Path
 from typing import Any
 from app.services.diagnose.models import Hypothesis, RankedHypothesis
 logger = logging.getLogger(__name__)
 # Module-level corpus cache: db_path_str -> (corpus_texts, embeddings)
 # Invalidated when the corpus text list changes between calls.
 # NOTE(review): nothing in this module evicts entries, and the key is the DB
 # path only — it does not include the embedding model, so two models used
 # against the same DB would share cached vectors. Confirm this is acceptable.
 _corpus_cache: dict[str, tuple[list[str], Any]] = {}
 # ---------------------------------------------------------------------------
 # Cosine similarity helpers
 # ---------------------------------------------------------------------------
 try:
    import numpy as np
    def _cosine_similarities(
        query_emb: list[float], corpus_embs: list[list[float]]
    ) -> list[float]:
        """Batch cosine similarity of one query embedding against all corpus embeddings."""
        q = np.array(query_emb, dtype=np.float32)
        c = np.array(corpus_embs, dtype=np.float32)
        q_norm = q / (np.linalg.norm(q) + 1e-10)
        c_norm = c / (np.linalg.norm(c, axis=1, keepdims=True) + 1e-10)
        return list(c_norm @ q_norm)
    _HAS_NUMPY = True
 except ImportError:  # pragma: no cover
    import math
    _HAS_NUMPY = False
    def _dot(a: list[float], b: list[float]) -> float:
        return sum(x * y for x, y in zip(a, b))
    def _norm(a: list[float]) -> float:
        return math.sqrt(sum(x * x for x in a)) + 1e-10
    def _cosine(a: list[float], b: list[float]) -> float:
        return _dot(a, b) / (_norm(a) * _norm(b))
    def _cosine_similarities(
        query_emb: list[float], corpus_embs: list[list[float]]
    ) -> list[float]:
        return [_cosine(query_emb, c) for c in corpus_embs]
 # ---------------------------------------------------------------------------
 # DB helpers
 # ---------------------------------------------------------------------------
 def _fetch_resolved_incidents(incidents_db_path: Path) -> list[str]:
    """Fetch resolved incident texts from the incidents database.
    Returns a list of non-empty combined strings for each resolved incident.
    Returns an empty list on any error (missing table, connection failure, etc.).
    """
    try:
        with sqlite3.connect(str(incidents_db_path), timeout=30.0) as conn:
            cursor = conn.execute(
                "SELECT label, notes FROM incidents WHERE ended_at IS NOT NULL LIMIT 200"
            )
            rows = cursor.fetchall()
    except sqlite3.OperationalError as exc:
        logger.warning("Could not query resolved incidents (%s) — treating as empty corpus", exc)
        return []
    except sqlite3.Error as exc:
        # Catches all remaining SQLite-family errors (IntegrityError, DatabaseError, etc.)
        logger.warning("Unexpected SQLite error fetching resolved incidents (%s) — treating as empty corpus", exc)
        return []
    texts: list[str] = []
    for label, notes in rows:
        label = (label or "").strip()
        notes = (notes or "").strip()
        combined = f"{label}. {notes}" if label and notes else (label or notes)
        if combined:
            texts.append(combined)
    return texts
 # ---------------------------------------------------------------------------
 # Public class
 # ---------------------------------------------------------------------------
 class FalsePositiveSuppressor:
    """Stage 4 of the multi-agent diagnose pipeline.
    Uses embedding cosine similarity to detect hypotheses that closely match
    previously resolved incidents and suppress them as likely false positives.
    When model_id is empty, the embedding service is unavailable, the resolved
    incident corpus is empty, or the corpus cannot be embedded, all hypotheses
    pass through with novelty_score=1.0 (no suppression).
    """
    def __init__(
        self,
        model_id: str = "",
        device: str = "cpu",
        similarity_threshold: float = 0.85,
    ) -> None:
        """Store the suppressor configuration.
        Args:
            model_id: Embedding model identifier; an empty string disables suppression.
            device: Stored for future use; not read anywhere else in this class.
            similarity_threshold: Cosine similarity at or above which a
                hypothesis is suppressed.
        """
        self._model_id = model_id
        self._device = device
        # _device stored for future use when get_embedder() supports device selection
        # Suppress when cosine similarity to a known resolved incident >= threshold.
        # A threshold of 0.85 means "suppress if 85%+ similar to something already resolved."
        self._similarity_threshold = similarity_threshold
    def suppress(
        self,
        hypotheses: list[Hypothesis],
        incidents_db_path: Path,
    ) -> list[RankedHypothesis]:
        """Rank hypotheses by novelty, suppressing those matching resolved incidents.
        Args:
            hypotheses: Candidate hypotheses from Stage 3.
            incidents_db_path: Path to the dedicated incidents SQLite database.
        Returns:
            List of RankedHypothesis sorted by (novelty_score * confidence) descending.
            Non-suppressed hypotheses appear first in practice.
        """
        if not hypotheses:
            return []
        # No model configured — full passthrough, rank by confidence only.
        if not self._model_id:
            return self._passthrough(hypotheses)
        # Attempt to obtain an embedder; fall back to passthrough on failure.
        embedder = self._load_embedder()
        if embedder is None:
            logger.warning(
                "Embedding service unavailable for model %r — skipping suppression",
                self._model_id,
            )
            return self._passthrough(hypotheses)
        # Fetch corpus texts from incidents DB; fall back to passthrough if empty.
        corpus_texts = _fetch_resolved_incidents(incidents_db_path)
        if not corpus_texts:
            logger.debug("No resolved incidents found — all hypotheses treated as novel")
            return self._passthrough(hypotheses)
        # Embed corpus (with caching).
        corpus_embeddings = self._get_corpus_embeddings(embedder, corpus_texts, incidents_db_path)
        if not corpus_embeddings:
            # Corpus embedding failed (already logged). There is nothing to
            # compare against, so skip the per-hypothesis embedding calls
            # instead of scoring every hypothesis against an empty corpus.
            return self._passthrough(hypotheses)
        # Score each hypothesis and sort by novelty * confidence descending.
        ranked = [
            self._score_hypothesis(h, embedder, corpus_embeddings)
            for h in hypotheses
        ]
        ranked.sort(key=lambda rh: rh.novelty_score * rh.hypothesis.confidence, reverse=True)
        return ranked
    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _score_hypothesis(
        self,
        hypothesis: Hypothesis,
        embedder: Any,
        corpus_embeddings: list[list[float]],
    ) -> RankedHypothesis:
        """Score a single hypothesis against the resolved incident corpus.
        The hypothesis title and description are embedded together and compared
        with every corpus embedding; the highest similarity decides suppression.
        Any embedding error yields a non-suppressed result with novelty_score=1.0.
        """
        try:
            query_text = f"{hypothesis.title}. {hypothesis.description}"
            h_emb = embedder.embed(query_text)
            # Convert numpy array to plain Python list for _cosine_similarities
            h_emb_list: list[float] = h_emb.tolist() if hasattr(h_emb, "tolist") else list(h_emb)
            sims = _cosine_similarities(h_emb_list, corpus_embeddings)
            max_sim = float(max(sims)) if sims else 0.0
        except Exception as exc:
            # Broad catch is intentional: catches unknown embedder runtime errors
            # (e.g. CUDA OOM, backend crashes) so one bad hypothesis never halts the pipeline.
            logger.warning("Embedding failed for hypothesis %r: %s — treating as novel", hypothesis.title, exc)
            return RankedHypothesis(
                hypothesis=hypothesis,
                novelty_score=1.0,
                similarity_to_known=0.0,
                suppress=False,
                suppression_reason=None,
            )
        # Cosine similarity spans [-1, 1] (and can overshoot 1 by rounding error);
        # clamp so novelty stays on the same [0, 1] scale the passthrough path uses.
        novelty_score = min(1.0, max(0.0, 1.0 - max_sim))
        suppress = bool(max_sim >= self._similarity_threshold)
        suppression_reason = (
            f"Similar to resolved incident (similarity {max_sim:.2f})"
            if suppress
            else None
        )
        return RankedHypothesis(
            hypothesis=hypothesis,
            novelty_score=novelty_score,
            similarity_to_known=max_sim,
            suppress=suppress,
            suppression_reason=suppression_reason,
        )
    def _load_embedder(self) -> Any | None:
        """Load the embedding service. Returns None if unavailable."""
        try:
            from app.services.embeddings import get_embedder
            return get_embedder()
        except Exception as exc:
            # Broad catch is intentional: get_embedder() may raise on import or
            # backend init failures from any number of third-party libraries.
            logger.warning("Failed to import/initialise embedding service: %s", exc)
            return None
    def _get_corpus_embeddings(
        self,
        embedder: Any,
        corpus_texts: list[str],
        incidents_db_path: Path,
    ) -> list[list[float]]:
        """Return cached corpus embeddings, re-embedding if the corpus has changed.
        The cache is keyed by the database path and reused only when the cached
        text list equals corpus_texts. Returns an empty list (and caches
        nothing) when embedding fails.
        """
        cache_key = str(incidents_db_path)
        cached = _corpus_cache.get(cache_key)
        if cached is not None:
            cached_texts, cached_embeddings = cached
            if cached_texts == corpus_texts:
                return cached_embeddings
        logger.debug("Embedding corpus of %d resolved incidents", len(corpus_texts))
        try:
            raw_embeddings = embedder.embed_batch(corpus_texts)
            # Normalise each embedding to a plain Python list for portability
            corpus_embeddings: list[list[float]] = [
                e.tolist() if hasattr(e, "tolist") else list(e)
                for e in raw_embeddings
            ]
        except Exception as exc:
            # Broad catch is intentional: embed_batch() may raise from any backend
            # (network timeout, CUDA error, etc.) — treat as empty corpus so the
            # pipeline can continue without suppression.
            logger.warning("Corpus embedding failed: %s — treating as empty corpus", exc)
            return []
        _corpus_cache[cache_key] = (corpus_texts, corpus_embeddings)
        return corpus_embeddings
    def _passthrough(self, hypotheses: list[Hypothesis]) -> list[RankedHypothesis]:
        """Return all hypotheses as non-suppressed, ranked by confidence descending."""
        ranked = [
            RankedHypothesis(
                hypothesis=h,
                novelty_score=1.0,
                similarity_to_known=0.0,
                suppress=False,
                suppression_reason=None,
            )
            for h in hypotheses
        ]
        ranked.sort(key=lambda rh: rh.hypothesis.confidence, reverse=True)
        return ranked
--- a/app/services/diagnose/synthesizer.py
+++ b/app/services/diagnose/synthesizer.py
@ -1,203 +0,0 @@
 """Stage 5: Summary Synthesizer — deterministic narrative from ranked hypotheses.
 Streaming upgrade (async SSE chunks) is tracked as a follow-up enhancement.
 This implementation is synchronous to match the rest of the pipeline.
 """
 from __future__ import annotations
 import logging
 from app.context.retriever import RetrievedContext
 from app.services.diagnose._llm_client import call_llm
 from app.services.diagnose.models import RankedHypothesis, TimelineResult
 logger = logging.getLogger(__name__)
 # System prompts keyed by tech_level. SummarySynthesizer.synthesize() looks the
 # level up here and falls back to the "sysadmin" prompt for unknown keys.
 # Each prompt fixes the audience and the exact numbered output format.
 _SYSTEM_PROMPTS: dict[str, str] = {
    "sysadmin": (
        "You are a Linux sysadmin diagnosing a system incident. "
        "Write a concise, actionable incident diagnosis.\n\n"
        "Format your response exactly as:\n"
        "1. VERDICT: [CRITICAL|ERROR|WARN|INFO] — <what happened> (<X>% confidence)\n"
        "2. TIMELINE: <what the logs show in sequence, 2-3 sentences>\n"
        "3. ROOT CAUSES:\n"
        "   - <hypothesis 1 title> (<confidence>%)\n"
        "   - <hypothesis 2 title> (<confidence>%)\n"
        "4. RECOMMENDED ACTIONS:\n"
        "   - <action based on hypotheses>\n"
        "5. INVESTIGATE FURTHER: <open questions, if any>"
    ),
    "homelab": (
        "You are explaining a system incident to a home lab enthusiast — someone "
        "comfortable with Linux basics but not necessarily familiar with every daemon "
        "or kernel subsystem. Be clear about what each service does; spell out "
        "abbreviations; explain why each action helps.\n\n"
        "Format your response exactly as:\n"
        "1. VERDICT: [CRITICAL|ERROR|WARN|INFO] — <what happened in plain terms> (<X>% confidence)\n"
        "2. TIMELINE: <what happened in sequence, 2-3 sentences; explain what each service is>\n"
        "3. ROOT CAUSES:\n"
        "   - <hypothesis title — one sentence explaining what it means> (<confidence>%)\n"
        "4. RECOMMENDED ACTIONS:\n"
        "   - <command or step — explain what it does and why>\n"
        "5. INVESTIGATE FURTHER: <open questions in plain language>"
    ),
    "executive": (
        "You are summarizing a technical system incident for a non-technical stakeholder. "
        "Focus on what broke, what the business impact was, and what the technical team is doing about it. "
        "Use plain English. Do not use daemon names, kernel terms, log syntax, or technical jargon.\n\n"
        "Format your response exactly as:\n"
        "1. WHAT HAPPENED: <1-2 sentences describing the problem in plain English>\n"
        "2. IMPACT: <which services or users were affected, and how>\n"
        "3. CONFIDENCE: <High / Medium / Low — how certain we are of the diagnosis>\n"
        "4. ACTION NEEDED: <what the IT team is doing or should do, in plain terms>"
    ),
 }
 def _build_hypothesis_block(ranked: list[RankedHypothesis]) -> str:
    """Build the hypothesis block for the prompt (non-suppressed only, top 3)."""
    active = [rh for rh in ranked if not rh.suppress][:3]
    if not active:
        return "(none)"
    lines: list[str] = []
    for rh in active:
        h = rh.hypothesis
        conf_pct = int(h.confidence * 100)
        novelty = f"{rh.novelty_score:.2f}"
        desc = f"\n  {h.description}" if h.description else ""
        lines.append(
            f"- [{h.severity}, {conf_pct}% conf, novelty {novelty}] {h.title}{desc}"
        )
    return "\n".join(lines)
 def _build_timeline_block(timeline: TimelineResult) -> str:
    """Build a sequenced cluster block so the synthesizer can narrate what happened.
    Mirrors the format used by the hypothesizer, but adds gap information so the
    LLM can reason about silence windows between bursts.
    """
    if not timeline.clusters:
        return "(no clusters)"
    lines: list[str] = []
    for i, c in enumerate(timeline.clusters):
        ts = c.start_iso or "unknown"
        sources = ", ".join(list(c.source_ids)[:3])
        tags = ", ".join(list(c.pattern_tags)[:4])
        burst_label = " [BURST]" if c.burst else ""
        gap_label = (
            f" (+{int(c.gap_before_seconds)}s silence)"
            if c.gap_before_seconds > 30
            else ""
        )
        text_preview = c.representative_text[:200]
        line = (
            f"Cluster {i + 1}{burst_label}{gap_label} @ {ts} [{c.severity}] "
            f"({sources}) — {text_preview}"
        )
        if tags:
            line += f" [patterns: {tags}]"
        lines.append(line)
    return "\n".join(lines)
 def _build_context_block(ctx: RetrievedContext) -> str:
    """Build the runbook context block for the prompt."""
    parts: list[str] = []
    for chunk in ctx.chunks[:5]:
        filename = chunk.get("filename", "unknown")
        text = chunk.get("text", "")[:300]
        parts.append(f"[{filename}] {text}")
    return "\n".join(parts) if parts else "(none)"
 def _deterministic_fallback(
    ranked: list[RankedHypothesis],
    timeline: TimelineResult,
 ) -> str:
    """Build a deterministic fallback text when no LLM is available."""
    active = [rh for rh in ranked if not rh.suppress][:3]
    if active:
        top = active[0]
        verdict_severity = top.hypothesis.severity
        verdict_title = top.hypothesis.title
        verdict_conf = int(top.hypothesis.confidence * 100)
    elif ranked:
        top = ranked[0]
        verdict_severity = top.hypothesis.severity
        verdict_title = top.hypothesis.title
        verdict_conf = int(top.hypothesis.confidence * 100)
    else:
        verdict_severity = "UNKNOWN"
        verdict_title = "No hypotheses generated"
        verdict_conf = 0
    root_causes = ", ".join(
        rh.hypothesis.title for rh in (active or ranked[:3])
    ) or "None"
    return (
        f"VERDICT: {verdict_severity} — {verdict_title} ({verdict_conf}% confidence)\n"
        f"TIMELINE: {timeline.total_entries} entries across {len(timeline.clusters)} clusters.\n"
        f"ROOT CAUSES: {root_causes}"
    )
 class SummarySynthesizer:
    """Stage 5 of the multi-agent diagnose pipeline.
    Turns ranked hypotheses, the reconstructed timeline and retrieved runbook
    context into a human-readable incident narrative. Without an LLM URL and
    model, the deterministic fallback text is returned instead.
    """
    def synthesize(
        self,
        ranked: list[RankedHypothesis],
        timeline: TimelineResult,
        ctx: RetrievedContext,
        query: str,
        llm_url: str | None = None,
        llm_model: str | None = None,
        llm_api_key: str | None = None,
        tech_level: str = "sysadmin",
    ) -> str:
        """Return the synthesis text as a single string (synchronous).
        The deterministic fallback is returned when llm_url or llm_model is
        missing, or when call_llm() returns an empty result. Unknown
        tech_level values use the "sysadmin" system prompt.
        """
        fallback = _deterministic_fallback(ranked, timeline)
        if not (llm_url and llm_model):
            return fallback
        system_prompt = _SYSTEM_PROMPTS.get(tech_level, _SYSTEM_PROMPTS["sysadmin"])
        hypothesis_block = _build_hypothesis_block(ranked)
        timeline_block = _build_timeline_block(timeline)
        context_block = _build_context_block(ctx)
        dominant = ", ".join(timeline.dominant_sources[:5]) or "none"
        timeline_header = (
            f"Timeline ({len(timeline.clusters)} clusters, "
            f"{timeline.burst_count} bursts, "
            f"{timeline.gap_count} silence gaps; "
            f"primary sources: {dominant}):"
        )
        # Sections are separated by one blank line in the final prompt.
        sections = [
            f"Query: {query}",
            f"{timeline_header}\n{timeline_block}",
            f"Root-cause hypotheses:\n{hypothesis_block}",
            f"Context from runbooks:\n{context_block}",
        ]
        user_message = "\n\n".join(sections)
        result = call_llm(
            llm_url=llm_url,
            llm_model=llm_model,
            llm_api_key=llm_api_key,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_message},
            ],
        )
        return result or fallback
--- a/app/services/diagnose/timeline.py
+++ b/app/services/diagnose/timeline.py
@ -1,272 +0,0 @@
 """Stage 1: Timeline Reconstructor — pure Python, no ML."""
 from __future__ import annotations
 import hashlib
 import logging
 from collections import defaultdict
 from datetime import datetime, timezone
 from app.services.diagnose.models import EventCluster, TimelineResult
 from app.services.search import SearchResult
 logger = logging.getLogger(__name__)
 # Numeric rank for each severity label; a higher number is more severe.
 # "WARN" and "WARNING" share a rank, and None ranks lowest. _highest_severity()
 # gives labels missing from this table rank 0.
 _SEVERITY_ORDER: dict[str | None, int] = {
    "CRITICAL": 5,
    "ERROR": 4,
    "WARN": 3,
    "WARNING": 3,
    "INFO": 2,
    "DEBUG": 1,
    None: 0,
 }
 def _parse_iso(s: str) -> datetime | None:
    """Parse ISO 8601 string to UTC-aware datetime. Returns None on parse failure."""
    try:
        dt = datetime.fromisoformat(s)
    except ValueError:
        logger.warning("Unparseable timestamp in log entry, treating as None: %r", s)
        return None
    if dt.tzinfo is None:
        logger.debug("Naive timestamp treated as UTC: %s", s)
        dt = dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)
 def _sort_key(e: SearchResult) -> tuple[int, str]:
    """Sort key: timestamped entries first (ascending), then None-timestamp entries."""
    if e.timestamp_iso is None:
        return (1, "")
    return (0, e.timestamp_iso)
 def _highest_severity(entries: list[SearchResult]) -> str:
    """Return the highest severity label across all entries."""
    best: str | None = None
    best_rank = -1
    for entry in entries:
        sev = entry.severity
        rank = _SEVERITY_ORDER.get(sev, 0)
        if rank > best_rank:
            best_rank = rank
            best = sev
    # SeverityLabel requires a valid literal; fall back to "UNKNOWN" if None
    if best is None:
        return "UNKNOWN"
    # Normalise WARNING -> WARN for the output type
    if best == "WARNING":
        return "WARN"
    return best
 def _representative_text(entries: list[SearchResult]) -> str:
    """Return text of the entry with highest rank; tie-break on longest text."""
    if not entries:
        return ""
    best = max(entries, key=lambda e: (e.rank, len(e.text)))
    return best.text
 def _cluster_id(entry_ids: list[str]) -> str:
    """Compute a 12-char hex cluster ID from a sorted list of entry IDs."""
    payload = ",".join(sorted(entry_ids)).encode()
    return hashlib.sha1(payload).hexdigest()[:12]  # noqa: S324 — not used for security
 def _make_event_cluster(
    cluster_entries: list[SearchResult],
    gap_before_seconds: float,
    burst_threshold: int,
    burst_window_seconds: int,
 ) -> EventCluster:
    """Assemble an EventCluster from the entries of one raw cluster.
    Start, end and duration come from the parseable timestamps; without any,
    start and end are None and the duration is 0.0. The cluster is a burst when
    it holds at least burst_threshold entries and its duration does not exceed
    burst_window_seconds.
    """
    moments: list[datetime] = []
    for item in cluster_entries:
        if item.timestamp_iso is None:
            continue
        parsed = _parse_iso(item.timestamp_iso)
        if parsed is not None:
            moments.append(parsed)
    if moments:
        earliest = min(moments)
        latest = max(moments)
        start_iso: str | None = earliest.isoformat()
        end_iso: str | None = latest.isoformat()
        duration_seconds = (latest - earliest).total_seconds()
    else:
        start_iso = None
        end_iso = None
        duration_seconds = 0.0
    entry_ids = [item.entry_id for item in cluster_entries]
    is_burst = (
        len(cluster_entries) >= burst_threshold
        and duration_seconds <= burst_window_seconds
    )
    # Sources and pattern tags are de-duplicated and sorted.
    unique_sources = {item.source_id for item in cluster_entries}
    unique_tags = {tag for item in cluster_entries for tag in item.matched_patterns}
    return EventCluster(
        cluster_id=_cluster_id(entry_ids),
        entries=tuple(entry_ids),
        start_iso=start_iso,
        end_iso=end_iso,
        duration_seconds=duration_seconds,
        source_ids=tuple(sorted(unique_sources)),
        pattern_tags=tuple(sorted(unique_tags)),
        severity=_highest_severity(cluster_entries),  # type: ignore[arg-type]  # SeverityLabel is a Literal; _highest_severity returns a compatible str
        burst=is_burst,
        gap_before_seconds=gap_before_seconds,
        representative_text=_representative_text(cluster_entries),
    )
 class TimelineReconstructor:
    """Turn a flat list of log entries into a structured timeline of event clusters.
    Pure Python with no ML or LLM calls; Stage 1 of the multi-agent diagnose
    pipeline.
    """
    def __init__(
        self,
        cluster_window_seconds: int = 30,
        burst_threshold: int = 10,
        burst_window_seconds: int = 5,
        gap_significance_seconds: int = 30,
    ) -> None:
        """Store the clustering, burst and gap thresholds."""
        self._cluster_window = cluster_window_seconds
        self._burst_threshold = burst_threshold
        self._burst_window = burst_window_seconds
        self._gap_significance_seconds: int = gap_significance_seconds
    def _sort_entries(self, entries: list[SearchResult]) -> list[SearchResult]:
        """Return a sorted copy: timestamped entries ascending, then untimestamped ones."""
        ordered = list(entries)
        ordered.sort(key=_sort_key)
        return ordered
    def _group_into_raw_clusters(
        self, sorted_entries: list[SearchResult]
    ) -> list[list[SearchResult]]:
        """Split sorted entries into clusters anchored on each cluster's first timestamp.
        An entry whose timestamp is missing or unparseable joins the current
        cluster. A timestamped entry starts a new cluster when it lies more than
        the cluster window after the current anchor.
        """
        groups: list[list[SearchResult]] = []
        bucket: list[SearchResult] = []
        anchor: datetime | None = None
        for item in sorted_entries:
            raw = item.timestamp_iso
            stamp = _parse_iso(raw) if raw is not None else None
            if stamp is None:
                # Missing or malformed timestamp: stay in the current cluster.
                bucket.append(item)
            elif anchor is None:
                # First usable timestamp of this cluster becomes its anchor.
                anchor = stamp
                bucket.append(item)
            elif (stamp - anchor).total_seconds() > self._cluster_window:
                groups.append(bucket)
                bucket = [item]
                anchor = stamp
            else:
                bucket.append(item)
        if bucket:
            groups.append(bucket)
        return groups
    def _build_cluster(
        self,
        cluster_entries: list[SearchResult],
        prev_end_iso: str | None,
    ) -> EventCluster:
        """Build an EventCluster, measuring the gap since the previous cluster ended.
        The gap stays 0.0 when there is no previous end, when this cluster has
        no parseable timestamp, or when the previous end cannot be parsed.
        """
        gap = 0.0
        if prev_end_iso is not None:
            stamps: list[datetime] = []
            for item in cluster_entries:
                if item.timestamp_iso is None:
                    continue
                parsed = _parse_iso(item.timestamp_iso)
                if parsed is not None:
                    stamps.append(parsed)
            # The previous end is parsed only when this cluster has a start time.
            previous_end = _parse_iso(prev_end_iso) if stamps else None
            if previous_end is not None:
                gap = (min(stamps) - previous_end).total_seconds()
        return _make_event_cluster(
            cluster_entries,
            gap_before_seconds=gap,
            burst_threshold=self._burst_threshold,
            burst_window_seconds=self._burst_window,
        )
    def _dominant_sources_tuple(self, entries: list[SearchResult]) -> tuple[str, ...]:
        """Return source ids ordered by entry count, most frequent first."""
        tally: dict[str, int] = defaultdict(int)
        for item in entries:
            tally[item.source_id] += 1
        # The stable sort keeps first-seen order among sources with equal counts.
        return tuple(sorted(tally, key=tally.__getitem__, reverse=True))
    def reconstruct(self, entries: list[SearchResult]) -> TimelineResult:
        """Build a structured timeline from a flat list of log entries."""
        if not entries:
            return TimelineResult(
                clusters=(),
                total_entries=0,
                window_start=None,
                window_end=None,
                gap_count=0,
                burst_count=0,
                dominant_sources=(),
            )
        ordered = self._sort_entries(entries)
        built: list[EventCluster] = []
        last_end: str | None = None
        for group in self._group_into_raw_clusters(ordered):
            cluster = self._build_cluster(group, last_end)
            built.append(cluster)
            last_end = cluster.end_iso
        clusters = tuple(built)
        significant_gaps = [
            c for c in clusters if c.gap_before_seconds > self._gap_significance_seconds
        ]
        bursts = [c for c in clusters if c.burst]
        return TimelineResult(
            clusters=clusters,
            total_entries=len(entries),
            window_start=clusters[0].start_iso if clusters else None,
            window_end=clusters[-1].end_iso if clusters else None,
            gap_count=len(significant_gaps),
            burst_count=len(bursts),
            dominant_sources=self._dominant_sources_tuple(entries),
        )
--- a/app/services/discover.py
+++ b/app/services/discover.py
@ -1,285 +0,0 @@
 """Environment auto-discovery for the onboarding wizard.
 All checks are best-effort — every function returns an empty list on failure
 so the wizard degrades gracefully in containers, VMs, and minimal environments.
 """
 from __future__ import annotations
 import json
 import logging
 import os
 import re
 import shutil
 import subprocess
 import time
 from pathlib import Path
 from typing import Any
 logger = logging.getLogger(__name__)
 # Common log file candidates: (id, path, description)
 _KNOWN_PATHS: list[tuple[str, str, str]] = [
    ("syslog",       "/var/log/syslog",              "System syslog (Debian/Ubuntu)"),
    ("syslog",       "/var/log/messages",             "System messages (RHEL/Rocky)"),
    ("auth",         "/var/log/auth.log",             "Auth log"),
    ("kern",         "/var/log/kern.log",             "Kernel log"),
    ("nginx-access", "/var/log/nginx/access.log",     "Nginx access log"),
    ("nginx-error",  "/var/log/nginx/error.log",      "Nginx error log"),
    ("apache",       "/var/log/apache2/access.log",   "Apache access log"),
    ("apache-error", "/var/log/apache2/error.log",    "Apache error log"),
    ("caddy",        "/var/log/caddy/access.log",     "Caddy access log"),
    ("docker-daemon","/var/log/docker.log",           "Docker daemon log"),
    ("fail2ban",     "/var/log/fail2ban.log",         "Fail2ban log"),
    ("ufw",          "/var/log/ufw.log",              "UFW firewall log"),
 ]
 def _run(cmd: list[str], timeout: float = 5.0) -> str | None:
    """Run a command and return stdout, or None on any error."""
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
        return result.stdout if result.returncode == 0 else None
    except Exception:
        return None
 def discover_journald() -> list[dict[str, Any]]:
    """Return a single journald candidate when ``journalctl`` is on PATH, else []."""
    if not shutil.which("journalctl"):
        return []
    # Fall back to "localhost" when the hostname command fails or prints nothing.
    host = (_run(["hostname"]) or "localhost").strip()
    candidate: dict[str, Any] = {
        "type": "journald",
        "id": f"journal:{host}",
        "label": f"System journal ({host})",
        "description": "All systemd journal output from this host",
        "available": True,
    }
    return [candidate]
 def discover_docker() -> list[dict[str, Any]]:
    """Return one candidate per running container from Docker or Podman.
    Runtimes are tried in order (docker, then podman); the first runtime that
    reports at least one usable container wins. The check is best-effort: a
    missing runtime, a failing ``ps`` call, and output lines that are not a
    JSON object with a usable name are all skipped.
    Returns:
        A list of candidate dicts, or an empty list when nothing was found.
    """
    for runtime in ("docker", "podman"):
        if not shutil.which(runtime):
            continue
        out = _run([runtime, "ps", "--format", "{{json .}}"])
        if out is None:
            continue
        containers: list[dict[str, Any]] = []
        for line in out.splitlines():
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (bare string, number, list)
            # carries no container fields; skip it instead of crashing on .get().
            if not isinstance(obj, dict):
                continue
            name = obj.get("Names") or obj.get("Name") or obj.get("ID", "unknown")
            # podman returns a list for Names
            if isinstance(name, list):
                name = name[0] if name else "unknown"
            # Anything that still is not a string cannot be used as a container name.
            if not isinstance(name, str):
                continue
            name = name.lstrip("/")
            containers.append({
                "type": "docker",
                "id": f"{runtime}:{name}",
                "label": f"{runtime.capitalize()} — {name}",
                "description": f"Container log stream for {name}",
                "container": name,
                "runtime": runtime,
                "available": True,
            })
        if containers:
            return containers
    return []
 def discover_files() -> list[dict[str, Any]]:
    """Return a file candidate for every entry of ``_KNOWN_PATHS`` that exists on disk.
    When several existing paths share one source id (syslog vs. messages), only
    the first one listed is reported.
    """
    results: list[dict[str, Any]] = []
    claimed: set[str] = set()
    for source_id, path, description in _KNOWN_PATHS:
        if not os.path.exists(path) or source_id in claimed:
            continue
        claimed.add(source_id)
        candidate = {
            "type": "file",
            "id": source_id,
            "label": description,
            "path": path,
            "description": f"Read from {path}",
            "available": True,
        }
        results.append(candidate)
    return results
 def discover_all() -> dict[str, Any]:
    """Run the journald, container and file checks and summarise what was found.
    Returns a dict with the combined ``candidates`` list (journald first, then
    containers, then files) plus one ``has_*`` boolean per candidate type.
    """
    candidates: list[dict[str, Any]] = [
        *discover_journald(),
        *discover_docker(),
        *discover_files(),
    ]
    def _has(kind: str) -> bool:
        return any(c["type"] == kind for c in candidates)
    return {
        "candidates": candidates,
        "has_journald": _has("journald"),
        "has_docker": _has("docker"),
        "has_files": _has("file"),
    }
 def build_sources_yaml(selected: list[dict[str, Any]]) -> str:
    """Render the selected candidates as the text of a sources.yaml file.
    Each item should carry ``type`` and ``id`` plus its type-specific fields:
    ``unit`` (journald, optional), ``runtime`` / ``container`` (docker) or
    ``path`` (anything else). Missing values fall back to defaults: type
    "file", id "unknown", runtime "docker", container taken from the part of
    the id after the last colon, and an empty path. Values are written as-is,
    without YAML quoting.
    """
    out = [
        "# Turnstone log sources — generated by the setup wizard.",
        "# Edit this file to add, remove, or modify sources.",
        "sources:",
    ]
    for src in selected:
        kind = src.get("type", "file")
        ident = src.get("id", "unknown")
        if kind == "journald":
            out.append(f"  - id: {ident}")
            out.append("    type: journald")
            unit = src.get("unit")
            if unit:
                out.append(f"    unit: {unit}")
            continue
        if kind == "docker":
            runtime = src.get("runtime", "docker")
            container = src.get("container", ident.split(":")[-1])
            out.extend([
                f"  - id: {ident}",
                "    type: docker",
                f"    runtime: {runtime}",
                f"    container: {container}",
            ])
            continue
        # Every other type is written as a plain file entry (no explicit type line).
        path = src.get("path", "")
        out.append(f"  - id: {ident}")
        out.append(f"    path: {path}")
    return "\n".join(out) + "\n"
 def validate_source(src: dict[str, Any]) -> str | None:
    """Check a source definition and describe the first problem found.
    Every source needs a non-empty ``id``. A file source (the default type)
    also needs ``path`` and a docker source needs ``container``; other types
    have no further requirements. Returns None when the definition is valid.
    """
    ident = src.get("id")
    if not ident:
        return "Source is missing 'id'"
    kind = src.get("type", "file")
    if kind == "file":
        if not src.get("path"):
            return f"File source '{ident}' is missing 'path'"
    elif kind == "docker":
        if not src.get("container"):
            return f"Docker source '{ident}' is missing 'container'"
    return None
 # Extensions considered as log files in the filesystem scanner.
 _LOG_EXTENSIONS = {"", ".log", ".txt", ".out", ".err"}
 # Max file size to consider (500 MB).
 _MAX_SIZE = 500 * 1024 * 1024
 # Recency window in days — despite the name this is a linear cut-off, not a half-life: the recency score falls from 1 to 0 over this many days.
 _RECENCY_HALFLIFE_DAYS = 30
 def _path_to_source_id(path: Path) -> str:
    """Convert an absolute path to a kebab-case source ID."""
    raw = re.sub(r"[^a-zA-Z0-9]+", "-", str(path)).strip("-").lower()
    return raw[:64]
 def scan_log_directories(
    query: str | None = None,
    dirs: list[str] | None = None,
    max_depth: int = 4,
    max_results: int = 25,
 ) -> list[dict[str, Any]]:
    """Scan filesystem directories for log files ranked by recency and keyword match.
    Scoring components (each 0-1):
    - Recency: 1.0 for a file modified just now, falling linearly to 0.0 at
      ``_RECENCY_HALFLIFE_DAYS`` days old (mtimes in the future clamp to 1.0)
    - Size: 1.0 for 1 KB – 50 MB, 0.3 below 1 KB, linear decay down to 0.1
      between 50 MB and ``_MAX_SIZE``
    - Keyword: fraction of query words (3+ characters) found in the lower-cased path
    With a query the total is 0.4 recency + 0.2 size + 0.4 keyword; without one
    it is 0.7 recency + 0.3 size.
    Hidden (dot-prefixed) entries, symlinks, empty files, files larger than
    ``_MAX_SIZE``, unreadable files and entries that cannot be inspected are skipped.
    Args:
        query: Optional free text; its words of 3+ characters drive the keyword score.
        dirs: Root directories to scan. Defaults to ``/var/log`` and ``/opt``.
        max_depth: Deepest directory level to descend into; each root is depth 0.
        max_results: Maximum number of candidates to return.
    Returns:
        Up to *max_results* candidate dicts sorted by descending score.
    """
    if dirs is None:
        dirs = ["/var/log", "/opt"]
    now = time.time()
    query_stems: list[str] = []
    if query:
        query_stems = [w.lower() for w in re.split(r"\W+", query) if len(w) >= 3]
    candidates: list[dict[str, Any]] = []
    def _walk(root: Path, depth: int) -> None:
        if depth > max_depth:
            return
        try:
            entries = list(root.iterdir())
        except OSError:
            return
        for entry in entries:
            if entry.name.startswith("."):
                continue
            try:
                # pathlib's type checks can raise OSError (e.g. EACCES for a
                # directory that is listable but not searchable); one bad
                # entry must not abort the whole best-effort scan.
                if entry.is_symlink():
                    continue
                is_directory = entry.is_dir()
                if not is_directory and not entry.is_file():
                    continue
            except OSError:
                continue
            if is_directory:
                _walk(entry, depth + 1)
                continue
            if entry.suffix.lower() not in _LOG_EXTENSIONS:
                continue
            # Skip compressed archives (defensive: the suffix filter above
            # already excludes them unless _LOG_EXTENSIONS is extended).
            if entry.name.endswith((".gz", ".bz2", ".xz", ".zst")):
                continue
            try:
                stat = entry.stat()
            except OSError:
                continue
            if stat.st_size == 0 or stat.st_size > _MAX_SIZE:
                continue
            if not os.access(entry, os.R_OK):
                continue
            age_days = (now - stat.st_mtime) / 86400
            # Clamp to [0, 1]: a file written after `now` was captured (or one
            # with a skewed clock) has a negative age and would score above 1.
            recency = min(1.0, max(0.0, 1.0 - age_days / _RECENCY_HALFLIFE_DAYS))
            if stat.st_size < 1024:
                size_score = 0.3
            elif stat.st_size <= 50 * 1024 * 1024:
                size_score = 1.0
            else:
                # Large files: linear decay from 50 MB to 500 MB
                size_score = max(0.1, 1.0 - (stat.st_size - 50 * 1024 * 1024) / _MAX_SIZE)
            keyword_score = 0.0
            if query_stems:
                path_lower = str(entry).lower()
                matches = sum(1 for stem in query_stems if stem in path_lower)
                keyword_score = min(1.0, matches / max(len(query_stems), 1))
            if query_stems:
                total = recency * 0.4 + size_score * 0.2 + keyword_score * 0.4
            else:
                total = recency * 0.7 + size_score * 0.3
            candidates.append({
                "type": "file",
                "id": _path_to_source_id(entry),
                "path": str(entry),
                "label": entry.name,
                "size_bytes": stat.st_size,
                "mtime": stat.st_mtime,
                "score": round(total, 3),
                "available": True,
            })
    for d in dirs:
        _walk(Path(d), depth=0)
    candidates.sort(key=lambda c: c["score"], reverse=True)
    return candidates[:max_results]
--- a/app/services/embeddings.py
+++ b/app/services/embeddings.py
@ -1,229 +0,0 @@
 """Configurable embedding service — BSL licensed.
 Backends:
  sentence_transformers — local in-process inference (default, no server needed)
  ollama               — HTTP to a running Ollama instance
 Configuration (env vars):
  TURNSTONE_EMBED_BACKEND   sentence_transformers | ollama  (default: sentence_transformers)
  TURNSTONE_EMBED_MODEL     model name/path                 (backend-specific default)
  TURNSTONE_EMBED_DEVICE    cpu | cuda                      (default: cpu; ST backend only)
  TURNSTONE_LLM_URL         Ollama base URL                 (default: http://localhost:11434)
 When no backend is importable/reachable, EMBEDDING_AVAILABLE is False and all
 embed calls return empty arrays — callers must handle this gracefully.
 """
 from __future__ import annotations
 import logging
 import os
 import struct
 from typing import Protocol, runtime_checkable
 import numpy as np
 logger = logging.getLogger(__name__)
 # ── Public availability flag ──────────────────────────────────────────────────
 EMBEDDING_AVAILABLE: bool = False
 # ── Config ────────────────────────────────────────────────────────────────────
 _BACKEND = os.environ.get("TURNSTONE_EMBED_BACKEND", "sentence_transformers").lower()
 _DEVICE  = os.environ.get("TURNSTONE_EMBED_DEVICE", "cpu").lower()
 _LLM_URL = os.environ.get("TURNSTONE_LLM_URL", "http://localhost:11434")
 # BAAI/bge-small-en-v1.5: 33MB, MIT, 49M downloads/month, 384-dim, 512-token max.
 # Benchmarked as the best quality-to-size ratio in the field (MTEB 62.17).
 # all-MiniLM-L6-v2 is a viable lighter alternative (23MB, 256-token max) if
 # inference speed is the primary constraint.
 _DEFAULT_MODEL: dict[str, str] = {
    "sentence_transformers": "BAAI/bge-small-en-v1.5",
    "ollama":                "nomic-embed-text",
 }
 _MODEL = os.environ.get(
    "TURNSTONE_EMBED_MODEL",
    _DEFAULT_MODEL.get(_BACKEND, "sentence-transformers/all-MiniLM-L6-v2"),
 )
 # ── Protocol ──────────────────────────────────────────────────────────────────
@runtime_checkable
 class Embedder(Protocol):
    """Minimal interface all embedding backends must satisfy.
    Structural (duck-typed) protocol: any object providing these four members
    qualifies, no inheritance required. Because it is ``runtime_checkable``,
    ``isinstance(obj, Embedder)`` works, but such a check only verifies that
    the members exist, not their signatures or return types.
    """
    @property
    def dim(self) -> int:
        """Embedding dimension produced by this model."""
        ...
    @property
    def model_name(self) -> str:
        """Human-readable model identifier."""
        ...
    def embed(self, text: str) -> np.ndarray:
        """Embed a single string. Returns 1-D float32 array of length dim."""
        ...
    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed a list of strings. Returns list of 1-D float32 arrays, one per input."""
        ...
 # ── sentence-transformers backend ─────────────────────────────────────────────
 class SentenceTransformerEmbedder:
    """In-process embedding backend built on the sentence-transformers library.
    The model is downloaded from HuggingFace on first instantiation and cached
    at ~/.cache/huggingface/. Subsequent starts use the local cache.
    """
    def __init__(self, model_name: str = _MODEL, device: str = _DEVICE) -> None:
        # Imported here so this module still imports when the package is absent.
        from sentence_transformers import SentenceTransformer  # type: ignore[import]
        logger.info("Loading embedding model %r on device %r ...", model_name, device)
        self._model_name = model_name
        self._model = SentenceTransformer(model_name, device=device)
        # Measure the dimension from a real embedding rather than hard-coding it.
        probe = self._model.encode("test")
        self._dim: int = int(probe.shape[0])
        logger.info("Embedding model ready — dim=%d", self._dim)
    @property
    def dim(self) -> int:
        """Embedding length measured when the model was loaded."""
        return self._dim
    @property
    def model_name(self) -> str:
        """Model name or path this embedder was created with."""
        return self._model_name
    def embed(self, text: str) -> np.ndarray:
        """Return the normalised float32 embedding of a single string."""
        encoded = self._model.encode(
            text, convert_to_numpy=True, normalize_embeddings=True
        )
        return encoded.astype(np.float32)
    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Return one normalised float32 embedding per input string ([] for no input)."""
        if not texts:
            return []
        matrix = self._model.encode(
            texts, convert_to_numpy=True, normalize_embeddings=True, batch_size=32
        )
        return [row.astype(np.float32) for row in matrix]
 # ── Ollama backend ────────────────────────────────────────────────────────────
 class OllamaEmbedder:
    """Embedding backend that calls the ``/api/embeddings`` endpoint of an Ollama server."""
    def __init__(
        self,
        model_name: str = _MODEL,
        llm_url: str = _LLM_URL,
        timeout: float = 30.0,
    ) -> None:
        import httpx  # already a project dependency
        self._model_name = model_name
        base_url = llm_url.rstrip("/")
        self._url = f"{base_url}/api/embeddings"
        self._timeout = timeout
        self._client = httpx.Client(timeout=timeout)
        # One request up front to learn the embedding length.
        self._dim = self._probe_dim()
    def _probe_dim(self) -> int:
        """Embed a probe string and return its length; fall back to 768 on any error."""
        try:
            return len(self._raw_embed("probe"))
        except Exception as exc:
            logger.warning("Ollama dim probe failed (%s) — defaulting to 768", exc)
            return 768
    def _raw_embed(self, text: str) -> list[float]:
        """POST *text* to the server and return the ``embedding`` list ([] if absent)."""
        payload = {"model": self._model_name, "prompt": text}
        response = self._client.post(self._url, json=payload)
        response.raise_for_status()
        return response.json().get("embedding") or []
    @property
    def dim(self) -> int:
        """Embedding length found by the start-up probe (768 if the probe failed)."""
        return self._dim
    @property
    def model_name(self) -> str:
        """Model name sent with every embedding request."""
        return self._model_name
    def embed(self, text: str) -> np.ndarray:
        """Return the server's embedding of *text* as a float32 array."""
        return np.array(self._raw_embed(text), dtype=np.float32)
    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        """Embed each string with its own request, preserving input order."""
        return list(map(self.embed, texts))
 # ── Singleton factory ─────────────────────────────────────────────────────────
 _embedder: Embedder | None = None
 def get_embedder() -> Embedder | None:
    """Return the shared embedder instance, creating it on the first call.
    The backend is selected by ``_BACKEND``. If it cannot be constructed
    (package missing, load failure, unknown backend name) a warning is logged
    and None is returned; nothing is cached in that case, so a later call tries
    again. ``EMBEDDING_AVAILABLE`` becomes True once an embedder exists.
    """
    global _embedder, EMBEDDING_AVAILABLE
    if _embedder is not None:
        return _embedder
    if _BACKEND not in ("sentence_transformers", "ollama"):
        logger.warning("Unknown TURNSTONE_EMBED_BACKEND %r — embeddings disabled", _BACKEND)
        return _embedder
    if _BACKEND == "ollama":
        try:
            _embedder = OllamaEmbedder(_MODEL, _LLM_URL)
            EMBEDDING_AVAILABLE = True
        except Exception as exc:
            logger.warning("Ollama embedder init failed: %s", exc)
        return _embedder
    try:
        _embedder = SentenceTransformerEmbedder(_MODEL, _DEVICE)
        EMBEDDING_AVAILABLE = True
    except ImportError:
        logger.warning(
            "sentence-transformers not installed — embeddings disabled. "
            "Install with: pip install sentence-transformers"
        )
    except Exception as exc:
        logger.warning("Failed to load sentence-transformers model %r: %s", _MODEL, exc)
    return _embedder
 # ── BLOB serialisation helpers ────────────────────────────────────────────────
 def pack_vector(vec: np.ndarray) -> bytes:
    """Serialise a numpy vector to a SQLite BLOB of float32 values.
    The result holds 4 bytes per element in native byte order, the same layout
    ``struct.pack(f"{n}f", ...)`` produces, so ``unpack_vector`` reads it back.
    """
    # tobytes() copies the float32 buffer in one step instead of boxing every
    # element into a Python float and handing them to struct.pack as varargs.
    return np.asarray(vec, dtype=np.float32).tobytes()
 def unpack_vector(blob: bytes) -> np.ndarray:
    """Rebuild a float32 numpy vector from a BLOB of packed 4-byte floats."""
    count = len(blob) // 4  # each float32 occupies 4 bytes
    values = struct.unpack(f"{count}f", blob)
    return np.array(values, dtype=np.float32)
 def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity of two vectors.
    The inputs need not be pre-normalised: the dot product is divided by the
    product of both L2 norms. Returns 0.0 when either vector has zero norm.
    """
    magnitude_a, magnitude_b = np.linalg.norm(a), np.linalg.norm(b)
    if magnitude_a == 0.0 or magnitude_b == 0.0:
        return 0.0
    similarity = np.dot(a, b) / (magnitude_a * magnitude_b)
    return float(similarity)
--- a/app/services/incidents.py
+++ b/app/services/incidents.py
@ -2,31 +2,16 @@
 from __future__ import annotations
 import json
-import re
+import sqlite3
 import uuid
 from pathlib import Path
-from app.db import get_conn, resolve_tenant_id
+from app.ingest.base import now_iso
-from app.glean.base import now_iso
+from app.services.models import Incident, ReceivedBundle
 from app.services.models import Incident, ReceivedBundle, SentBundle
 from app.services.search import SearchResult, entries_in_window, search
 _REDACT_PATTERNS: list[tuple[re.Pattern, str]] = [
    (re.compile(r"\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b"), "[IP]"),
    (re.compile(r"[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}"), "[EMAIL]"),
    (re.compile(r"(?i)\b(user(?:name)?|uid)\s*[=:]\s*\S+"), r"\1=[USER]"),
    (re.compile(r"(?i)\bhost\s*[=:]\s*\S+"), "host=[HOST]"),
    (re.compile(r"(?i)\bpassword\s*[=:]\s*\S+"), "password=[REDACTED]"),
 ]
-
+def _row_to_incident(row: sqlite3.Row) -> Incident:
 def _redact_text(text: str) -> str:
    for pattern, replacement in _REDACT_PATTERNS:
        text = pattern.sub(replacement, text)
    return text
 def _row_to_incident(row) -> Incident:
    return Incident(
        id=row["id"],
        label=row["label"],
@ -39,7 +24,7 @@ def _row_to_incident(row) -> Incident:
    )
-def _row_to_bundle(row) -> ReceivedBundle:
+def _row_to_bundle(row: sqlite3.Row) -> ReceivedBundle:
    return ReceivedBundle(
        id=row["id"],
        source_host=row["source_host"],
@ -62,7 +47,6 @@ def create_incident(
    notes: str = "",
    severity: str = "medium",
 ) -> Incident:
    tid = resolve_tenant_id()
    incident = Incident(
        id=str(uuid.uuid4()),
        label=label,
@ -73,45 +57,47 @@ def create_incident(
        created_at=now_iso(),
        severity=severity,
    )
-    with get_conn(db_path) as conn:
+    conn = sqlite3.connect(str(db_path))
-        conn.execute(
+    conn.execute("PRAGMA journal_mode=WAL")
-            "INSERT INTO incidents (id, tenant_id, label, issue_type, started_at, ended_at, notes, created_at, severity) "
+    conn.execute(
-            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
+        "INSERT INTO incidents (id, label, issue_type, started_at, ended_at, notes, created_at, severity) "
-            (incident.id, tid, incident.label, incident.issue_type, incident.started_at,
+        "VALUES (?, ?, ?, ?, ?, ?, ?, ?)",
-             incident.ended_at, incident.notes, incident.created_at, incident.severity),
+        (incident.id, incident.label, incident.issue_type, incident.started_at,
-        )
+         incident.ended_at, incident.notes, incident.created_at, incident.severity),
-        conn.commit()
+    )
    conn.commit()
    conn.close()
    return incident
 def list_incidents(db_path: Path) -> list[Incident]:
-    tid = resolve_tenant_id()
+    conn = sqlite3.connect(str(db_path))
-    with get_conn(db_path) as conn:
+    conn.execute("PRAGMA journal_mode=WAL")
-        rows = conn.execute(
+    conn.row_factory = sqlite3.Row
-            "SELECT * FROM incidents WHERE (tenant_id = ? OR tenant_id = '') ORDER BY created_at DESC",
+    rows = conn.execute(
-            (tid,),
+        "SELECT * FROM incidents ORDER BY created_at DESC"
-        ).fetchall()
+    ).fetchall()
    conn.close()
    return [_row_to_incident(r) for r in rows]
 def get_incident(db_path: Path, incident_id: str) -> Incident | None:
-    tid = resolve_tenant_id()
+    conn = sqlite3.connect(str(db_path))
-    with get_conn(db_path) as conn:
+    conn.execute("PRAGMA journal_mode=WAL")
-        row = conn.execute(
+    conn.row_factory = sqlite3.Row
-            "SELECT * FROM incidents WHERE id = ? AND (tenant_id = ? OR tenant_id = '')",
+    row = conn.execute(
-            (incident_id, tid),
+        "SELECT * FROM incidents WHERE id = ?", (incident_id,)
-        ).fetchone()
+    ).fetchone()
    conn.close()
    return _row_to_incident(row) if row else None
 def delete_incident(db_path: Path, incident_id: str) -> bool:
-    tid = resolve_tenant_id()
+    conn = sqlite3.connect(str(db_path))
-    with get_conn(db_path) as conn:
+    conn.execute("PRAGMA journal_mode=WAL")
-        cur = conn.execute(
+    cur = conn.execute("DELETE FROM incidents WHERE id = ?", (incident_id,))
-            "DELETE FROM incidents WHERE id = ? AND (tenant_id = ? OR tenant_id = '')",
+    conn.commit()
-            (incident_id, tid),
+    conn.close()
        )
        conn.commit()
    return cur.rowcount > 0
@ -156,7 +142,6 @@ def build_bundle(
    incident: Incident,
    source_host: str,
    limit: int = 200,
    sanitize: bool = False,
 ) -> dict:
    """Assemble a labeled bundle: incident metadata + related log entries."""
    entries = get_incident_entries(db_path, incident, limit=limit)
@ -164,7 +149,6 @@ def build_bundle(
        "bundle_version": 1,
        "source_host": source_host,
        "bundled_at": now_iso(),
        "sanitized": sanitize,
        "incident": {
            "id": incident.id,
            "label": incident.label,
@ -180,7 +164,7 @@ def build_bundle(
                "source_id": e.source_id,
                "timestamp_iso": e.timestamp_iso,
                "severity": e.severity,
-                "text": _redact_text(e.text) if sanitize else e.text,
+                "text": e.text,
                "matched_patterns": list(e.matched_patterns),
            }
            for e in entries
@ -188,52 +172,8 @@ def build_bundle(
    }
 def record_sent_bundle(db_path: Path, incident_id: str, bundle: dict, sanitized: bool) -> SentBundle:
    """Log an outgoing bundle export to the sent_bundles table."""
    tid = resolve_tenant_id()
    record = SentBundle(
        id=str(uuid.uuid4()),
        incident_id=incident_id,
        exported_at=now_iso(),
        sanitized=sanitized,
        entry_count=len(bundle.get("log_entries", [])),
        bundle_json=json.dumps(bundle),
    )
    with get_conn(db_path) as conn:
        conn.execute(
            "INSERT INTO sent_bundles (id, tenant_id, incident_id, exported_at, sanitized, entry_count, bundle_json) "
            "VALUES (?, ?, ?, ?, ?, ?, ?)",
            (record.id, tid, record.incident_id, record.exported_at,
             int(record.sanitized), record.entry_count, record.bundle_json),
        )
        conn.commit()
    return record
 def list_sent_bundles(db_path: Path) -> list[SentBundle]:
    tid = resolve_tenant_id()
    with get_conn(db_path) as conn:
        rows = conn.execute(
            "SELECT id, incident_id, exported_at, sanitized, entry_count, bundle_json "
            "FROM sent_bundles WHERE (tenant_id = ? OR tenant_id = '') ORDER BY exported_at DESC",
            (tid,),
        ).fetchall()
    return [
        SentBundle(
            id=r["id"],
            incident_id=r["incident_id"],
            exported_at=r["exported_at"],
            sanitized=bool(r["sanitized"]),
            entry_count=r["entry_count"],
            bundle_json=r["bundle_json"],
        )
        for r in rows
    ]
 def store_bundle(db_path: Path, bundle: dict) -> ReceivedBundle:
    """Store an incoming bundle from a remote Turnstone instance."""
    tid = resolve_tenant_id()
    inc = bundle.get("incident", {})
    record = ReceivedBundle(
        id=str(uuid.uuid4()),
@ -246,34 +186,38 @@ def store_bundle(db_path: Path, bundle: dict) -> ReceivedBundle:
        entry_count=len(bundle.get("log_entries", [])),
        bundle_json=json.dumps(bundle),
    )
-    with get_conn(db_path) as conn:
+    conn = sqlite3.connect(str(db_path))
-        conn.execute(
+    conn.execute("PRAGMA journal_mode=WAL")
-            "INSERT INTO received_bundles "
+    conn.execute(
-            "(id, tenant_id, source_host, issue_type, label, severity, started_at, bundled_at, entry_count, bundle_json) "
+        "INSERT INTO received_bundles "
-            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+        "(id, source_host, issue_type, label, severity, started_at, bundled_at, entry_count, bundle_json) "
-            (record.id, tid, record.source_host, record.issue_type, record.label,
+        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
-             record.severity, record.started_at, record.bundled_at, record.entry_count, record.bundle_json),
+        (record.id, record.source_host, record.issue_type, record.label,
-        )
+         record.severity, record.started_at, record.bundled_at, record.entry_count, record.bundle_json),
-        conn.commit()
+    )
    conn.commit()
    conn.close()
    return record
 def list_bundles(db_path: Path) -> list[ReceivedBundle]:
-    tid = resolve_tenant_id()
+    conn = sqlite3.connect(str(db_path))
-    with get_conn(db_path) as conn:
+    conn.execute("PRAGMA journal_mode=WAL")
-        rows = conn.execute(
+    conn.row_factory = sqlite3.Row
-            "SELECT id, source_host, issue_type, label, severity, started_at, bundled_at, entry_count, bundle_json "
+    rows = conn.execute(
-            "FROM received_bundles WHERE (tenant_id = ? OR tenant_id = '') ORDER BY bundled_at DESC",
+        "SELECT id, source_host, issue_type, label, severity, started_at, bundled_at, entry_count, bundle_json "
-            (tid,),
+        "FROM received_bundles ORDER BY bundled_at DESC"
-        ).fetchall()
+    ).fetchall()
    conn.close()
    return [_row_to_bundle(r) for r in rows]
 def get_bundle(db_path: Path, bundle_id: str) -> ReceivedBundle | None:
-    tid = resolve_tenant_id()
+    conn = sqlite3.connect(str(db_path))
-    with get_conn(db_path) as conn:
+    conn.execute("PRAGMA journal_mode=WAL")
-        row = conn.execute(
+    conn.row_factory = sqlite3.Row
-            "SELECT * FROM received_bundles WHERE id = ? AND (tenant_id = ? OR tenant_id = '')",
+    row = conn.execute(
-            (bundle_id, tid),
+        "SELECT * FROM received_bundles WHERE id = ?", (bundle_id,)
-        ).fetchone()
+    ).fetchone()
    conn.close()
    return _row_to_bundle(row) if row else None
--- a/app/services/llm.py
+++ b/app/services/llm.py
@ -73,7 +73,7 @@ def summarize(
            json={
                "product": "turnstone",
                "task": "log_analysis",
-                "payload": {"messages": messages, "stream": False, "max_tokens": 1024},
+                "payload": {"messages": messages, "stream": False},
            },
            headers=headers,
            timeout=timeout,
@ -88,11 +88,11 @@ def summarize(
        logger.debug("Task endpoint unavailable (%s) — falling back to direct model", exc)
    # Fallback: OpenAI-compat endpoint with explicit model name (local instances,
-    # or any cf-orch node that doesn't have task assignments loaded).
+    # xanderland, or any cf-orch that doesn't have task assignments loaded).
    try:
        resp = httpx.post(
            f"{llm_url.rstrip('/')}/v1/chat/completions",
-            json={"model": llm_model, "messages": messages, "stream": False, "max_tokens": 1024},
+            json={"model": llm_model, "messages": messages, "stream": False},
            headers=headers,
            timeout=timeout,
        )
--- a/app/services/models.py
+++ b/app/services/models.py
@ -10,7 +10,7 @@ class RetrievedEntry:
    entry_id: str
    source_id: str          # log file path or service name
-    sequence: int           # original line number — glean order, not wall-clock order
+    sequence: int           # original line number — ingest order, not wall-clock order
    timestamp_raw: str | None       # timestamp as it appeared in the log
    timestamp_iso: str | None       # parsed to ISO 8601 for sorting; None if unparseable
    ingest_time: str                # when Turnstone indexed this entry (wall clock)
@ -25,13 +25,12 @@ class RetrievedEntry:
@dataclass(frozen=True)
 class LogPattern:
-    """A named regex pattern for tagging entries at glean time."""
+    """A named regex pattern for tagging entries at ingest time."""
    name: str           # e.g. "device_disconnect", "auth_failure"
    pattern: str        # regex string
    severity: str       # suggested severity if not present in log line
    description: str    # human-readable explanation for the UI
    domain: str = ""    # service health domain (networking, storage, auth, etc.)
@dataclass(frozen=True)
@ -61,15 +60,3 @@ class ReceivedBundle:
    bundled_at: str
    entry_count: int
    bundle_json: str           # full bundle serialized as JSON string
@dataclass(frozen=True)
 class SentBundle:
    """A record of a bundle exported or sent from this instance."""
    id: str
    incident_id: str
    exported_at: str
    sanitized: bool
    entry_count: int
    bundle_json: str
--- a/app/services/nl_source.py
+++ b/app/services/nl_source.py
@ -1,134 +0,0 @@
 """Natural-language log source interpretation (LLM path for #53).
 BSL-gated feature: the structured form fallback is MIT; the LLM interpretation
 requires the LLM service to be configured. The caller always validates the
 output against the source schema before writing anything.
 """
 from __future__ import annotations
 import json
 import logging
 import re
 from typing import Any
 import httpx
 logger = logging.getLogger(__name__)
 # System prompt sent as the first chat message by interpret(). It pins the
 # model to a bare JSON object so that _extract_json() can parse the reply.
 # NOTE(review): the "runtime" schema line below ends with an unbalanced
 # double quote; it is runtime prompt text, so it is left untouched here.
 _SYSTEM_PROMPT = """\
 You are a Turnstone log-source configuration assistant.
 The operator will describe a log source in plain English.
 Respond ONLY with a JSON object matching this schema — no prose, no markdown:
 {
  "id":        "short-kebab-case identifier",
  "type":      "file" | "journald" | "docker",
  "path":      "/absolute/path  (file type only)",
  "container": "container-name  (docker type only)",
  "runtime":   "docker" | "podman"  (docker type only, default docker)",
  "unit":      "service.service  (journald type only, omit for all-journal)",
  "label":     "Human-readable name for the UI"
 }
 Rules:
 - For well-known apps (nginx, apache, caddy, sonarr, radarr, qbittorrent, plex, jellyfin),
  use the conventional default log path.
 - If the operator mentions a Docker/Podman container, use type=docker.
 - If the operator mentions journald or a systemd service, use type=journald.
 - If uncertain, use type=file with the most likely path.
 - The "id" must be lowercase, hyphens only (no spaces, slashes, dots).
 - Never include trailing commas or comments in your JSON.
 """
 # Well-known path lookup for common apps — used as a deterministic fallback
 _KNOWN_APPS: dict[str, dict[str, Any]] = {
    "nginx":        {"id": "nginx-access",  "type": "file", "path": "/var/log/nginx/access.log"},
    "apache":       {"id": "apache",        "type": "file", "path": "/var/log/apache2/access.log"},
    "caddy":        {"id": "caddy",         "type": "file", "path": "/var/log/caddy/access.log"},
    "sonarr":       {"id": "sonarr",        "type": "file", "path": "/var/log/sonarr/sonarr.0.txt"},
    "radarr":       {"id": "radarr",        "type": "file", "path": "/var/log/radarr/radarr.0.txt"},
    "qbittorrent":  {"id": "qbittorrent",   "type": "file", "path": "/var/log/qbittorrent/qbittorrent.log"},
    "plex":         {"id": "plex",          "type": "file", "path": "/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs/Plex Media Server.log"},
    "jellyfin":     {"id": "jellyfin",      "type": "file", "path": "/var/log/jellyfin/jellyfin.log"},
    "syslog":       {"id": "syslog",        "type": "file", "path": "/var/log/syslog"},
    "auth":         {"id": "auth",          "type": "file", "path": "/var/log/auth.log"},
    "fail2ban":     {"id": "fail2ban",      "type": "file", "path": "/var/log/fail2ban.log"},
    "docker":       {"id": "docker-daemon", "type": "file", "path": "/var/log/docker.log"},
    "journal":      {"id": "journal",       "type": "journald"},
    "journald":     {"id": "journal",       "type": "journald"},
    "systemd":      {"id": "journal",       "type": "journald"},
 }
 def _keyword_match(description: str) -> dict[str, Any] | None:
    """Try a simple keyword match before spending an LLM call."""
    lower = description.lower()
    for keyword, template in _KNOWN_APPS.items():
        if keyword in lower:
            result = dict(template)
            result.setdefault("label", keyword.capitalize() + " log")
            return result
    return None
 def _extract_json(text: str) -> dict[str, Any] | None:
    """Pull the first {...} block out of an LLM response."""
    match = re.search(r"\{[^{}]+\}", text, re.DOTALL)
    if not match:
        return None
    try:
        return json.loads(match.group())
    except json.JSONDecodeError:
        return None
 def interpret(
    description: str,
    llm_url: str | None,
    llm_model: str | None,
    api_key: str | None = None,
    timeout: float = 30.0,
 ) -> dict[str, Any] | None:
    """Turn a plain-English log-source description into a source dict.

    Well-known apps are resolved by keyword without any network traffic.
    Otherwise the description is sent to the chat-completions endpoint at
    *llm_url* and the JSON object found in the reply is returned, with
    ``label`` defaulting to the first 60 characters of the description.

    Returns None when no LLM is configured, when the call fails, or when no
    JSON can be extracted from the reply. The result is NOT validated here:
    the caller must run discover.validate_source() on it before writing
    anything to disk.
    """
    # Step 1: deterministic shortcut for well-known apps.
    shortcut = _keyword_match(description)
    if shortcut:
        logger.debug("NL source: keyword match for %r", description)
        return shortcut

    # Step 2: the LLM path needs both an endpoint and a model name.
    if not (llm_url and llm_model):
        logger.debug("NL source: no LLM configured, returning None")
        return None

    payload = {
        "model": llm_model,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user",   "content": description},
        ],
        "stream": False,
        "max_tokens": 256,
    }
    request_headers = {"Content-Type": "application/json"}
    if api_key:
        request_headers["Authorization"] = f"Bearer {api_key}"

    try:
        reply = httpx.post(
            f"{llm_url.rstrip('/')}/v1/chat/completions",
            json=payload,
            headers=request_headers,
            timeout=timeout,
        )
        reply.raise_for_status()
        source = _extract_json(reply.json()["choices"][0]["message"]["content"])
        if not source:
            logger.warning("NL source: could not extract JSON from LLM response")
        else:
            source.setdefault("label", description[:60])
            return source
    except Exception as exc:
        # Best effort: any transport, HTTP or payload-shape error ends up here.
        logger.warning("NL source: LLM call failed (%s): %s", type(exc).__name__, exc)
    return None
--- a/app/services/orchard.py
+++ b/app/services/orchard.py
@ -1,327 +0,0 @@
 """The Orchard — auto-enrollment of new Turnstone branch nodes.
 A "branch" is an external Turnstone instance that submits pattern-matched log
 entries to a central harvest receiver (harvest.circuitforge.tech). Grafting
 provisions the receiving infrastructure for a new branch:
  1. Creates a data dir at ORCHARD_DATA_ROOT/<slug>/
  2. Starts a new turnstone-submissions-<slug> Docker container
  3. Injects a handle_path block into the Caddyfile marker section
  4. Restarts caddy-proxy to activate the route
  5. Persists the branch registry to orchard-branches.yaml
 Admin auth: the graft/deactivate endpoints require
  Authorization: Bearer <TURNSTONE_ORCHARD_ADMIN_KEY>
 Set TURNSTONE_ORCHARD_ADMIN_KEY in the environment on the harvest instance.
 If unset, the endpoints return 501 Not Implemented (feature is off).
 Anonymization: a separate pass (run_anonymization) replaces IPs, hostnames,
 and usernames in branch DBs with stable pseudonyms before Avocet reads them.
 """
 from __future__ import annotations
 import hashlib
 import hmac
 import ipaddress
 import json
 import logging
 import os
 import re
 import secrets
 import sqlite3
 import subprocess
 import time
 from pathlib import Path
 from typing import Any
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Config (read from env on the harvest instance)
 # ---------------------------------------------------------------------------
 # Each ORCHARD_* value is read from a TURNSTONE_ORCHARD_* environment variable
 # once, at import time, and falls back to the literal default shown.
 ORCHARD_DATA_ROOT = Path(os.environ.get("TURNSTONE_ORCHARD_DATA_ROOT", "/devl/docker/turnstone-submissions"))
 ORCHARD_CADDYFILE = Path(os.environ.get("TURNSTONE_ORCHARD_CADDYFILE", "/devl/caddy-proxy/Caddyfile"))
 ORCHARD_CADDY_CONTAINER = os.environ.get("TURNSTONE_ORCHARD_CADDY_CONTAINER", "caddy-proxy")
 ORCHARD_HARVEST_HOST = os.environ.get("TURNSTONE_ORCHARD_HARVEST_HOST", "https://harvest.circuitforge.tech")
 ORCHARD_IMAGE = os.environ.get("TURNSTONE_ORCHARD_IMAGE", "localhost/turnstone:latest")
 # Ports for submission containers start here and scan upward.
 ORCHARD_PORT_BASE = int(os.environ.get("TURNSTONE_ORCHARD_PORT_BASE", "8538"))
 # Branch registry file; it lives directly under the data root.
 _REGISTRY_FILE = ORCHARD_DATA_ROOT / "orchard-branches.yaml"
 # Marker lines that delimit the auto-managed section of the Caddyfile.
 _CADDY_BRANCH_START = "# --- ORCHARD BRANCHES: auto-managed by POST /api/orchard/graft, do not edit manually ---"
 _CADDY_BRANCH_END = "# --- END ORCHARD BRANCHES ---"
 # Slug shape: 3-32 characters of [a-z0-9-], alphanumeric at both ends.
 _SLUG_RE = re.compile(r"^[a-z0-9][a-z0-9-]{1,30}[a-z0-9]$")
 # ---------------------------------------------------------------------------
 # Branch registry
 # ---------------------------------------------------------------------------
 def _load_registry() -> list[dict[str, Any]]:
    """Load the branch registry from ``_REGISTRY_FILE``.

    Returns the list stored under the top-level ``branches`` key. An empty
    list is returned when the file does not exist, cannot be read or parsed,
    is not a mapping, or has no list under ``branches``. Read and parse
    failures are logged rather than swallowed silently, because callers
    re-save whatever this function returns.
    """
    if not _REGISTRY_FILE.exists():
        return []
    import yaml as _yaml
    try:
        data = _yaml.safe_load(_REGISTRY_FILE.read_text())
    except Exception:
        logger.exception(
            "Could not read Orchard registry %s; treating it as empty", _REGISTRY_FILE
        )
        return []
    if data is None:
        # Empty file.
        return []
    if not isinstance(data, dict):
        logger.warning(
            "Orchard registry %s is not a mapping; treating it as empty", _REGISTRY_FILE
        )
        return []
    branches = data.get("branches")
    # "branches: null" (or any non-list value) must not leak out as None.
    return branches if isinstance(branches, list) else []
 def _save_registry(branches: list[dict[str, Any]]) -> None:
    """Write *branches* to ``_REGISTRY_FILE`` as block-style YAML.

    The parent directory is created if needed and the file is overwritten
    with a single top-level ``branches`` key.
    """
    import yaml as _yaml
    registry_dir = _REGISTRY_FILE.parent
    registry_dir.mkdir(parents=True, exist_ok=True)
    serialized = _yaml.dump({"branches": branches}, default_flow_style=False)
    _REGISTRY_FILE.write_text(serialized)
 def list_branches() -> list[dict[str, Any]]:
    """Return every branch record currently stored in the registry."""
    registry = _load_registry()
    return registry
 # ---------------------------------------------------------------------------
 # Port allocation
 # ---------------------------------------------------------------------------
 def _next_free_port() -> int:
    """Return the lowest port >= ORCHARD_PORT_BASE not recorded in the registry.

    Only ports listed in the registry are considered taken; the host's real
    port usage is not probed.
    """
    taken = set()
    for record in _load_registry():
        if "port" in record:
            taken.add(record["port"])
    candidate = ORCHARD_PORT_BASE
    while True:
        if candidate not in taken:
            return candidate
        candidate += 1
 # ---------------------------------------------------------------------------
 # Caddy route injection
 # ---------------------------------------------------------------------------
 def _build_branch_block(slug: str, port: int) -> str:
    return (
        f"    handle_path /{slug}/* {{\n"
        f"        reverse_proxy http://host.docker.internal:{port} {{\n"
        f"            header_up X-Real-IP {{remote_host}}\n"
        f"            header_up X-Forwarded-Proto {{scheme}}\n"
        f"            flush_interval -1\n"
        f"            transport http {{\n"
        f"                response_header_timeout 0\n"
        f"                read_timeout 0\n"
        f"            }}\n"
        f"        }}\n"
        f"    }}"
    )
 def _rewrite_caddy_branches(branches: list[dict[str, Any]]) -> None:
    """Replace the auto-managed section in the Caddyfile with current branches.

    Everything from the START marker through the END marker is regenerated
    from the branches whose ``active`` flag is true (a missing flag counts
    as active). Text outside the markers is preserved as-is.

    Raises:
        RuntimeError: if the Caddyfile does not exist, or if it lacks a START
            marker followed by an END marker.
    """
    if not ORCHARD_CADDYFILE.exists():
        raise RuntimeError(f"Caddyfile not found at {ORCHARD_CADDYFILE}")
    text = ORCHARD_CADDYFILE.read_text()
    start_idx = text.find(_CADDY_BRANCH_START)
    end_idx = -1
    if start_idx != -1:
        # Look for END only after START: an END marker that precedes START
        # would otherwise make the splice below duplicate part of the file.
        end_idx = text.find(_CADDY_BRANCH_END, start_idx + len(_CADDY_BRANCH_START))
    if start_idx == -1 or end_idx == -1:
        raise RuntimeError("Caddyfile is missing the ORCHARD BRANCHES marker section")
    active = [b for b in branches if b.get("active", True)]
    blocks = "\n".join(_build_branch_block(b["slug"], b["port"]) for b in active)
    replacement = f"{_CADDY_BRANCH_START}\n{blocks}\n    {_CADDY_BRANCH_END}"
    new_text = text[:start_idx] + replacement + text[end_idx + len(_CADDY_BRANCH_END):]
    ORCHARD_CADDYFILE.write_text(new_text)
    logger.info("Caddyfile updated with %d active branch routes", len(active))
 def _reload_caddy() -> None:
    """Restart the Caddy container via ``docker restart``.

    Raises:
        RuntimeError: if the docker command exits with a non-zero status.
    """
    restart_cmd = ["docker", "restart", ORCHARD_CADDY_CONTAINER]
    proc = subprocess.run(restart_cmd, capture_output=True, text=True, timeout=30)
    if proc.returncode:
        raise RuntimeError(f"docker restart {ORCHARD_CADDY_CONTAINER} failed: {proc.stderr}")
    logger.info("Restarted %s", ORCHARD_CADDY_CONTAINER)
 # ---------------------------------------------------------------------------
 # Container provisioning
 # ---------------------------------------------------------------------------
 def _start_branch_container(slug: str, port: int, data_dir: Path) -> None:
    """Create the branch data dir and start its submissions container.

    ``data_dir`` and ``data_dir/patterns`` are created, the default pattern
    files are copied in when absent, any container with the same name is
    force-removed, and a fresh ``turnstone-submissions-<slug>`` container is
    started with host *port* mapped to container port 8534.

    Raises:
        RuntimeError: if ``docker run`` exits with a non-zero status.
        subprocess.TimeoutExpired: if a docker command exceeds its timeout.
    """
    patterns_dir = data_dir / "patterns"
    patterns_dir.mkdir(parents=True, exist_ok=True)
    data_dir.mkdir(parents=True, exist_ok=True)
    # Seed default patterns if not already present
    repo_patterns = Path(__file__).parent.parent.parent / "patterns"
    for yaml_file in ("default.yaml", "sources-example.yaml"):
        src = repo_patterns / yaml_file
        dst = patterns_dir / yaml_file
        if src.exists() and not dst.exists():
            dst.write_text(src.read_text())
    container_name = f"turnstone-submissions-{slug}"
    cmd = [
        "docker", "run", "-d",
        "--name", container_name,
        "--restart", "unless-stopped",
        "-p", f"{port}:8534",
        "-v", f"{data_dir}:/data",
        "-v", f"{patterns_dir}:/patterns",
        "-e", "TURNSTONE_DB=/data/turnstone.db",
        "-e", f"TURNSTONE_SOURCE_HOST={slug}",
        "-e", "PYTHONUNBUFFERED=1",
        "-e", "TZ=America/Los_Angeles",
        ORCHARD_IMAGE,
    ]
    # Remove any stale container with the same name first. The exit status is
    # ignored on purpose (there is usually nothing to remove); the timeout
    # matches _stop_branch_container so a wedged daemon cannot hang the graft.
    subprocess.run(["docker", "rm", "-f", container_name], capture_output=True, timeout=30)
    result = subprocess.run(cmd, capture_output=True, text=True, timeout=60)
    if result.returncode != 0:
        raise RuntimeError(f"docker run for {container_name} failed: {result.stderr}")
    logger.info("Started container %s on port %d", container_name, port)
 def _stop_branch_container(slug: str) -> None:
    """Force-remove the submissions container belonging to *slug*.

    The docker exit status is not inspected, so the log line is emitted
    whether or not a container was actually removed.
    """
    target = f"turnstone-submissions-{slug}"
    remove_cmd = ["docker", "rm", "-f", target]
    subprocess.run(remove_cmd, capture_output=True, timeout=30)
    logger.info("Removed container %s", target)
 # ---------------------------------------------------------------------------
 # Public API
 # ---------------------------------------------------------------------------
 def graft(slug: str, contact_email: str, agreed_to_terms: bool) -> dict[str, Any]:
    """Provision a new Orchard branch and return connection details.

    Steps, in order: validate the input, allocate a port, start the branch
    container, persist the registry, rewrite the Caddyfile and restart
    Caddy. Only the SHA-256 hash of the generated API key is stored; the
    plaintext key appears solely in the return value.

    Returns:
        A dict with ``slug``, ``submit_endpoint``, ``api_key`` and ``port``.

    Raises:
        ValueError: if the terms were not accepted, the slug is malformed,
            or a branch with that slug already exists.
        RuntimeError: propagated from the container / Caddy helpers.
    """
    if not agreed_to_terms:
        raise ValueError("agreed_to_terms must be true")
    if not _SLUG_RE.match(slug):
        # _SLUG_RE needs one leading char, 1-30 middle chars and one trailing
        # char, so the real minimum length is 3 (the message used to say 2).
        raise ValueError(
            f"Invalid slug {slug!r}: must be 3-32 lowercase alphanumeric/hyphen, "
            "cannot start or end with a hyphen"
        )
    branches = _load_registry()
    if any(b["slug"] == slug for b in branches):
        raise ValueError(f"Branch {slug!r} already exists")
    port = _next_free_port()
    data_dir = ORCHARD_DATA_ROOT / slug
    api_key = secrets.token_urlsafe(32)
    branch: dict[str, Any] = {
        "slug": slug,
        "port": port,
        "contact_email": contact_email,
        "api_key_hash": hashlib.sha256(api_key.encode()).hexdigest(),
        "grafted_at": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        "active": True,
    }
    _start_branch_container(slug, port, data_dir)
    branches.append(branch)
    _save_registry(branches)
    _rewrite_caddy_branches(branches)
    _reload_caddy()
    submit_endpoint = f"{ORCHARD_HARVEST_HOST}/{slug}"
    logger.info("Grafted branch %r at %s", slug, submit_endpoint)
    return {
        "slug": slug,
        "submit_endpoint": submit_endpoint,
        "api_key": api_key,
        "port": port,
    }
 def deactivate(slug: str) -> dict[str, Any]:
    """Take a branch offline.

    Removes the branch container, marks the registry record inactive,
    regenerates the Caddy routes and restarts Caddy.

    Raises:
        KeyError: if no branch with that slug is registered.
    """
    registry = _load_registry()
    target = None
    for record in registry:
        if record["slug"] == slug:
            target = record
            break
    if target is None:
        raise KeyError(f"Branch {slug!r} not found")

    _stop_branch_container(slug)
    # The record is kept (not deleted); only its flag changes.
    target["active"] = False
    _save_registry(registry)
    _rewrite_caddy_branches(registry)
    _reload_caddy()
    return {"slug": slug, "deactivated": True}
 def verify_api_key(slug: str, key: str) -> bool:
    """Return True when *key* hashes to the stored hash of an active branch.

    The first registry record whose slug matches and whose ``active`` flag is
    truthy is used; when there is none the result is False. Digests are
    compared with ``hmac.compare_digest``.
    """
    for record in _load_registry():
        if record["slug"] == slug and record.get("active"):
            stored_hash = record.get("api_key_hash", "")
            candidate_hash = hashlib.sha256(key.encode()).hexdigest()
            return hmac.compare_digest(stored_hash, candidate_hash)
    return False
 # ---------------------------------------------------------------------------
 # Anonymization worker
 # ---------------------------------------------------------------------------
 _IP_RE = re.compile(
    r"\b(?:(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d\d?)\b"
 )
 _USERNAME_RE = re.compile(r"\bfor\s+(\w+)\b|\buser\s+(\w+)\b|\bsession\s+opened\s+for\s+(\w+)\b", re.IGNORECASE)
 def _pseudonym(value: str, salt: bytes, prefix: str) -> str:
    digest = hmac.new(salt, value.encode(), "sha256").hexdigest()[:10]
    return f"{prefix}-{digest}"
 def _anonymize_text(text: str, salt: bytes) -> str:
    def replace_ip(m: re.Match) -> str:
        return _pseudonym(m.group(), salt, "ip")
    def replace_user(m: re.Match) -> str:
        user = next(g for g in m.groups() if g)
        return m.group().replace(user, _pseudonym(user, salt, "user"))
    text = _IP_RE.sub(replace_ip, text)
    text = _USERNAME_RE.sub(replace_user, text)
    return text
 def run_anonymization(slug: str) -> dict[str, Any]:
    """Anonymize IPs and usernames in a branch DB in-place.

    Uses a stable per-branch salt so pseudonyms are consistent across runs
    but not reversible without the salt. Rows whose ``anonymized`` column is
    NULL or 0 are processed and then flagged with ``anonymized = 1``.

    Returns:
        ``{"slug", "anonymized", "total_processed"}``, or just
        ``{"slug", "anonymized": 0}`` when the branch has no database yet.

    Raises:
        KeyError: if no branch with that slug is registered.
    """
    branch = next((b for b in _load_registry() if b["slug"] == slug), None)
    if branch is None:
        raise KeyError(f"Branch {slug!r} not found")
    db_path = ORCHARD_DATA_ROOT / slug / "turnstone.db"
    if not db_path.exists():
        return {"slug": slug, "anonymized": 0}
    # Per-branch salt derived from api_key_hash for stability
    salt = branch["api_key_hash"].encode()[:32].ljust(32, b"0")
    conn = sqlite3.connect(str(db_path), timeout=30)
    try:
        conn.execute("PRAGMA journal_mode=WAL")
        rows = conn.execute("SELECT id, text FROM log_entries WHERE anonymized IS NULL OR anonymized = 0").fetchall()
        updated = 0
        for row_id, text in rows:
            clean = _anonymize_text(text or "", salt)
            if clean != text:
                conn.execute("UPDATE log_entries SET text = ?, anonymized = 1 WHERE id = ?", (clean, row_id))
                updated += 1
            else:
                conn.execute("UPDATE log_entries SET anonymized = 1 WHERE id = ?", (row_id,))
        conn.commit()
    finally:
        # Always release the handle; closing without a commit discards the
        # partial batch, so a failed run leaves the rows unflagged for retry.
        conn.close()
    logger.info("Anonymized %d/%d entries in branch %r", updated, len(rows), slug)
    return {"slug": slug, "anonymized": updated, "total_processed": len(rows)}
--- a/app/services/search.py
+++ b/app/services/search.py
@ -1,8 +1,4 @@
-"""FTS-based log search with optional hybrid BM25 + vector re-ranking.
+"""FTS5-based log search with severity, source, and pattern filters."""
 SQLite backend: FTS5 virtual table with Porter stemmer.
 Postgres backend: tsvector column with GIN index + websearch_to_tsquery.
 """
 from __future__ import annotations
 import json
@ -10,11 +6,8 @@ import logging
 import re
 import sqlite3
 from dataclasses import dataclass
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from app.db import BACKEND, Backend, frag, get_conn, resolve_tenant_id
 logger = logging.getLogger(__name__)
@ -35,47 +28,48 @@ class SearchResult:
 def build_fts_index(db_path: Path) -> None:
    """Build (or rebuild) the FTS5 index from log_entries. Safe to re-run.
-    For Postgres, the tsvector column is maintained by a trigger — this is a no-op.
+    Drops and recreates the table if the schema is stale (missing sequence column).
    """
-    if BACKEND == Backend.POSTGRES:
+    conn = sqlite3.connect(str(db_path))
-        return
+    conn.execute("PRAGMA journal_mode=WAL")
-    with get_conn(db_path) as conn:
+    # Check whether existing table has the sequence column; rebuild if not.
-        needs_rebuild = False
+    needs_rebuild = False
-        try:
+    try:
-            conn.execute("SELECT sequence FROM log_fts LIMIT 0")
+        conn.execute("SELECT sequence FROM log_fts LIMIT 0")
-        except Exception:
+    except sqlite3.OperationalError:
-            needs_rebuild = True
+        needs_rebuild = True
-        if needs_rebuild:
+    if needs_rebuild:
-            conn.execute("DROP TABLE IF EXISTS log_fts")
+        conn.execute("DROP TABLE IF EXISTS log_fts")
            conn.commit()
-        conn.execute("""
+    conn.executescript("""
-            CREATE VIRTUAL TABLE IF NOT EXISTS log_fts USING fts5(
+        CREATE VIRTUAL TABLE IF NOT EXISTS log_fts USING fts5(
-                text,
+            text,
-                entry_id      UNINDEXED,
+            entry_id      UNINDEXED,
-                source_id     UNINDEXED,
+            source_id     UNINDEXED,
-                sequence      UNINDEXED,
+            sequence      UNINDEXED,
-                severity      UNINDEXED,
+            severity      UNINDEXED,
-                timestamp_iso UNINDEXED,
+            timestamp_iso UNINDEXED,
-                matched_patterns UNINDEXED,
+            matched_patterns UNINDEXED,
-                repeat_count  UNINDEXED,
+            repeat_count  UNINDEXED,
-                out_of_order  UNINDEXED,
+            out_of_order  UNINDEXED,
-                tokenize = 'porter ascii'
+            tokenize = 'porter ascii'
-            )
+        );
-        """)
+    """)
-        conn.execute("""
+    # Only insert rows not already indexed
-            INSERT INTO log_fts(text, entry_id, source_id, sequence, severity,
+    conn.execute("""
-                                timestamp_iso, matched_patterns,
+        INSERT INTO log_fts(text, entry_id, source_id, sequence, severity,
-                                repeat_count, out_of_order)
+                            timestamp_iso, matched_patterns,
-            SELECT e.text, e.id, e.source_id, e.sequence, e.severity,
+                            repeat_count, out_of_order)
-                   e.timestamp_iso, e.matched_patterns,
+        SELECT e.text, e.id, e.source_id, e.sequence, e.severity,
-                   e.repeat_count, e.out_of_order
+               e.timestamp_iso, e.matched_patterns,
-            FROM log_entries e
+               e.repeat_count, e.out_of_order
-            WHERE e.id NOT IN (SELECT entry_id FROM log_fts WHERE entry_id IS NOT NULL)
+        FROM log_entries e
-        """)
+        WHERE e.id NOT IN (SELECT entry_id FROM log_fts WHERE entry_id IS NOT NULL)
-        conn.commit()
+    """)
    conn.commit()
    conn.close()
 def _sanitize_fts_query(raw: str, or_mode: bool = False) -> str:
@ -102,263 +96,55 @@ def search(
    limit: int = 20,
    include_repeats: bool = False,
    or_mode: bool = False,
    semantic: bool = False,
 ) -> list[SearchResult]:
-    """Full-text search with optional filters. Returns results ranked by relevance.
+    """Full-text search with optional filters. Returns results ranked by relevance."""
    conn = sqlite3.connect(str(db_path))
    conn.execute("PRAGMA journal_mode=WAL")
    conn.row_factory = sqlite3.Row
    When ``semantic=True`` and an embedding backend is configured, the BM25
    candidate pool is re-ranked using hybrid scoring (BM25 + cosine similarity).
    Falls back silently to pure BM25 when the embedder is unavailable.
    """
    if semantic:
        return _hybrid_search(
            db_path, query, severity=severity, source_filter=source_filter,
            pattern_filter=pattern_filter, since=since, until=until, limit=limit,
            include_repeats=include_repeats, or_mode=or_mode,
        )
    return _bm25_search(
        db_path, query, severity=severity, source_filter=source_filter,
        pattern_filter=pattern_filter, since=since, until=until, limit=limit,
        include_repeats=include_repeats, or_mode=or_mode,
    )
 def _hybrid_search(
    db_path: Path,
    query: str,
    severity: str | None = None,
    source_filter: str | None = None,
    pattern_filter: str | None = None,
    since: str | None = None,
    until: str | None = None,
    limit: int = 20,
    include_repeats: bool = False,
    or_mode: bool = False,
    alpha: float = 0.6,
    beta: float = 0.4,
 ) -> list[SearchResult]:
    """Late-fusion hybrid search: BM25 candidates re-ranked with embeddings.

    A candidate pool of ``max(limit * 5, 100)`` BM25 hits is fetched, the
    query and every candidate text are embedded, and each candidate gets

        hybrid_score = alpha * bm25_normalized + beta * max(cosine_sim, 0)

    where ``bm25_normalized`` is ``abs(rank)`` divided by the largest
    ``abs(rank)`` in the pool. The top *limit* candidates by hybrid score
    are returned.

    When embeddings are unavailable, no embedder is configured, or embedding
    raises, the first *limit* BM25 candidates are returned instead.
    """
    from app.services.embeddings import EMBEDDING_AVAILABLE, cosine_similarity, get_embedder

    # Oversized pool so that re-ranking has something to reorder.
    pool = _bm25_search(
        db_path,
        query,
        severity=severity,
        source_filter=source_filter,
        pattern_filter=pattern_filter,
        since=since,
        until=until,
        limit=max(limit * 5, 100),
        include_repeats=include_repeats,
        or_mode=or_mode,
    )
    if not pool:
        return []

    bm25_only = pool[:limit]
    if not EMBEDDING_AVAILABLE:
        return bm25_only
    embedder = get_embedder()
    if embedder is None:
        return bm25_only

    try:
        query_vec = embedder.embed(query)
        pool_vecs = embedder.embed_batch([hit.text for hit in pool])
    except Exception as exc:
        logger.warning("Hybrid search embedding failed (%s) — falling back to BM25", exc)
        return bm25_only

    # Scale |rank| by the pool maximum; "or 1.0" guards an all-zero pool.
    magnitudes = [abs(hit.rank) for hit in pool]
    largest = max(magnitudes) or 1.0

    scored: list[tuple[float, SearchResult]] = [
        (
            alpha * (magnitude / largest)
            + beta * max(cosine_similarity(query_vec, vec), 0.0),
            hit,
        )
        for hit, magnitude, vec in zip(pool, magnitudes, pool_vecs)
    ]
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return [hit for _, hit in scored[:limit]]
 def _bm25_search(
    db_path: Path,
    query: str,
    severity: str | None = None,
    source_filter: str | None = None,
    pattern_filter: str | None = None,
    since: str | None = None,
    until: str | None = None,
    limit: int = 20,
    include_repeats: bool = False,
    or_mode: bool = False,
 ) -> list[SearchResult]:
    """Run the full-text search on whichever backend is configured.

    Resolves the tenant id, then delegates to ``_pg_fts_search`` when
    ``BACKEND`` is Postgres and to ``_sqlite_fts_search`` otherwise.
    ``or_mode`` is forwarded to the SQLite path only.
    """
    tenant = resolve_tenant_id()
    # Filters understood by both backends.
    shared_filters = {
        "severity": severity,
        "source_filter": source_filter,
        "pattern_filter": pattern_filter,
        "since": since,
        "until": until,
        "limit": limit,
        "include_repeats": include_repeats,
    }
    if BACKEND == Backend.POSTGRES:
        return _pg_fts_search(db_path, query, tenant, **shared_filters)
    return _sqlite_fts_search(db_path, query, tenant, or_mode=or_mode, **shared_filters)
 def _sqlite_fts_search(
    db_path: Path,
    query: str,
    tid: str,
    severity: str | None,
    source_filter: str | None,
    pattern_filter: str | None,
    since: str | None,
    until: str | None,
    limit: int,
    include_repeats: bool,
    or_mode: bool,
 ) -> list[SearchResult]:
    fts_query = _sanitize_fts_query(query, or_mode=or_mode)
-    conditions = [
+    conditions = ["log_fts MATCH ?"]
-        "log_fts MATCH ?",
+    params: list = [fts_query]
        "(e.tenant_id = ? OR e.tenant_id = '')",
    ]
    params: list = [fts_query, tid]
    if severity:
-        conditions.append("f.severity = ?")
+        conditions.append("severity = ?")
        params.append(severity.upper())
    if source_filter:
-        conditions.append("f.source_id LIKE ?")
+        conditions.append("source_id LIKE ?")
        params.append(f"%{source_filter}%")
    if pattern_filter:
-        conditions.append("f.matched_patterns LIKE ?")
+        conditions.append("matched_patterns LIKE ?")
        params.append(f'%"{pattern_filter}"%')
    if since:
-        conditions.append("f.timestamp_iso >= ?")
+        conditions.append("timestamp_iso >= ?")
        params.append(since)
    if until:
-        conditions.append("f.timestamp_iso <= ?")
+        conditions.append("timestamp_iso <= ?")
        params.append(until)
    if not include_repeats:
        conditions.append("f.repeat_count = 1")
    where = " AND ".join(conditions)
    params.append(limit)
    raw = sqlite3.connect(str(db_path), timeout=30.0)
    raw.row_factory = sqlite3.Row
    try:
        rows = raw.execute(
            f"""
            SELECT f.entry_id, f.source_id, f.sequence, f.timestamp_iso, f.severity,
                   f.repeat_count, f.out_of_order, f.matched_patterns, f.text, f.rank
            FROM log_fts f
            JOIN log_entries e ON e.id = f.entry_id
            WHERE {where}
            ORDER BY f.rank
            LIMIT ?
            """,
            params,
        ).fetchall()
    except sqlite3.OperationalError as exc:
        logger.warning("FTS query failed (%s) — index may not be built yet", exc)
        return []
    finally:
        raw.close()
    return [
        SearchResult(
            entry_id=r["entry_id"],
            source_id=r["source_id"],
            sequence=r["sequence"],
            timestamp_iso=r["timestamp_iso"],
            severity=r["severity"],
            repeat_count=r["repeat_count"],
            out_of_order=bool(r["out_of_order"]),
            matched_patterns=json.loads(r["matched_patterns"] or "[]"),
            text=r["text"],
            rank=float(r["rank"]),
        )
        for r in rows
    ]
 def _pg_fts_search(
    db_path: Path,
    query: str,
    tid: str,
    severity: str | None,
    source_filter: str | None,
    pattern_filter: str | None,
    since: str | None,
    until: str | None,
    limit: int,
    include_repeats: bool,
 ) -> list[SearchResult]:
    """Postgres FTS via tsvector column and websearch_to_tsquery."""
    tsq = "websearch_to_tsquery('english', %s)"
    conditions = [
        f"text_tsv @@ {tsq}",
        "(tenant_id = %s OR tenant_id = '')",
    ]
    params: list = [query, tid]
    if severity:
        conditions.append("severity = %s")
        params.append(severity.upper())
    if source_filter:
        conditions.append("source_id LIKE %s")
        params.append(f"%{source_filter}%")
    if pattern_filter:
        conditions.append("matched_patterns LIKE %s")
        params.append(f'%"{pattern_filter}"%')
    if since:
        conditions.append("timestamp_iso >= %s")
        params.append(since)
    if until:
        conditions.append("timestamp_iso <= %s")
        params.append(until)
    if not include_repeats:
        conditions.append("repeat_count = 1")
    where = " AND ".join(conditions)
-    # ts_rank needs the tsquery again — append it then the limit
+    params.append(limit)
    params.extend([query, limit])
-    with get_conn(db_path) as conn:
+    try:
        rows = conn.execute(
            f"""
-            SELECT id AS entry_id, source_id, sequence, timestamp_iso, severity,
+            SELECT entry_id, source_id, sequence, timestamp_iso, severity,
-                   repeat_count, out_of_order, matched_patterns, text,
+                   repeat_count, out_of_order, matched_patterns, text, rank
-                   ts_rank(text_tsv, {tsq}) AS rank
+            FROM log_fts
            FROM log_entries
            WHERE {where}
-            ORDER BY rank DESC
+            ORDER BY rank
-            LIMIT %s
+            LIMIT ?
            """,
            params,
        ).fetchall()
    except sqlite3.OperationalError as e:
        logger.warning("FTS query failed (%s) — index may not be built yet", e)
        conn.close()
        return []
-    return [
+    results = [
        SearchResult(
            entry_id=r["entry_id"],
            source_id=r["source_id"],
@ -369,10 +155,12 @@ def _pg_fts_search(
            out_of_order=bool(r["out_of_order"]),
            matched_patterns=json.loads(r["matched_patterns"] or "[]"),
            text=r["text"],
-            rank=float(r["rank"]),
+            rank=r["rank"],
        )
        for r in rows
    ]
    conn.close()
    return results
 def entries_in_window(
@ -393,12 +181,12 @@ def entries_in_window(
    (e.g. network-syslog) don't crowd out lower-volume but more interesting ones.
    Errors/warnings are ranked first within each source partition.
    """
-    tid = resolve_tenant_id()
+    conn = sqlite3.connect(str(db_path))
-    conditions: list[str] = [
+    conn.execute("PRAGMA journal_mode=WAL")
-        "repeat_count = 1",
+    conn.row_factory = sqlite3.Row
-        "(tenant_id = ? OR tenant_id = '')",
+
-    ]
+    conditions: list[str] = ["repeat_count = 1"]
-    params: list = [tid]
+    params: list = []
    if since:
        conditions.append("timestamp_iso >= ?")
@ -416,7 +204,8 @@ def entries_in_window(
    where = " AND ".join(conditions)
    if per_source_cap is not None:
-        sql = f"""
+        # Use a window function to cap rows per source, errors/warnings first.
        query = f"""
            WITH ranked AS (
                SELECT id as entry_id, source_id, sequence, timestamp_iso, severity,
                       repeat_count, out_of_order, matched_patterns, text, 0.0 as rank,
@ -443,7 +232,7 @@ def entries_in_window(
        """
        params.extend([per_source_cap, limit])
    else:
-        sql = f"""
+        query = f"""
            SELECT id as entry_id, source_id, sequence, timestamp_iso, severity,
                   repeat_count, out_of_order, matched_patterns, text, 0.0 as rank
            FROM log_entries
@ -453,8 +242,8 @@ def entries_in_window(
        """
        params.append(limit)
-    with get_conn(db_path) as conn:
+    rows = conn.execute(query, params).fetchall()
-        rows = conn.execute(sql, params).fetchall()
+    conn.close()
    return [
        SearchResult(
@ -467,7 +256,7 @@ def entries_in_window(
            out_of_order=bool(r["out_of_order"]),
            matched_patterns=json.loads(r["matched_patterns"] or "[]"),
            text=r["text"],
-            rank=float(r["rank"]),
+            rank=r["rank"],
        )
        for r in rows
    ]
@ -486,14 +275,16 @@ def recent_source_errors(
    Bypasses FTS ranking so text content doesn't affect which errors surface.
    Used by diagnose when FTS keyword search returns nothing for a known source.
    """
-    tid = resolve_tenant_id()
+    conn = sqlite3.connect(str(db_path))
    conn.execute("PRAGMA journal_mode=WAL")
    conn.row_factory = sqlite3.Row
    conditions = [
        "source_id LIKE ?",
        "severity = ?",
        "repeat_count = 1",
        "(tenant_id = ? OR tenant_id = '')",
    ]
-    params: list = [f"%{source_filter}%", severity.upper(), tid]
+    params: list = [f"%{source_filter}%", severity.upper()]
    if since:
        conditions.append("timestamp_iso >= ?")
@ -505,18 +296,18 @@ def recent_source_errors(
    params.append(limit)
    where = " AND ".join(conditions)
-    with get_conn(db_path) as conn:
+    rows = conn.execute(
-        rows = conn.execute(
+        f"""
-            f"""
+        SELECT id as entry_id, source_id, sequence, timestamp_iso, severity,
-            SELECT id as entry_id, source_id, sequence, timestamp_iso, severity,
+               repeat_count, out_of_order, matched_patterns, text, 0.0 as rank
-                   repeat_count, out_of_order, matched_patterns, text, 0.0 as rank
+        FROM log_entries
-            FROM log_entries
+        WHERE {where}
-            WHERE {where}
+        ORDER BY timestamp_iso DESC
-            ORDER BY timestamp_iso DESC
+        LIMIT ?
-            LIMIT ?
+        """,
-            """,
+        params,
-            params,
+    ).fetchall()
-        ).fetchall()
+    conn.close()
    return [
        SearchResult(
@ -529,49 +320,35 @@ def recent_source_errors(
            out_of_order=bool(r["out_of_order"]),
            matched_patterns=json.loads(r["matched_patterns"] or "[]"),
            text=r["text"],
-            rank=float(r["rank"]),
+            rank=r["rank"],
        )
        for r in rows
    ]
 def list_sources(db_path: Path) -> list[dict]:
-    """Return sources with entry counts, grouped by prefix:host stem.
+    """Return distinct sources with entry counts and time ranges."""
-
+    conn = sqlite3.connect(str(db_path))
-    source_ids with three or more colon-separated segments (e.g.
+    conn.execute("PRAGMA journal_mode=WAL")
-    ``muninn-journal:Muninn:ssh.service``) are collapsed to their first two
+    rows = conn.execute("""
-    segments (``muninn-journal:Muninn``).  Single- or two-segment IDs are
+        SELECT
-    returned as-is.  ``unit_count`` reports how many distinct sub-units were
+            source_id,
-    merged into each row.
+            COUNT(*) as entry_count,
-    """
+            MIN(timestamp_iso) as earliest,
-    tid = resolve_tenant_id()
+            MAX(timestamp_iso) as latest,
-    group_expr = frag.source_group_expr("source_id")
+            SUM(CASE WHEN severity IN ('ERROR','CRITICAL','EMERGENCY','ALERT') THEN 1 ELSE 0 END) as error_count
-    with get_conn(db_path) as conn:
+        FROM log_entries
-        rows = conn.execute(
+        GROUP BY source_id
-            f"""
+        ORDER BY entry_count DESC
-            SELECT
+    """).fetchall()
-                {group_expr} AS group_id,
+    conn.close()
                COUNT(DISTINCT source_id) AS unit_count,
                COUNT(*) AS entry_count,
                MIN(timestamp_iso) AS earliest,
                MAX(timestamp_iso) AS latest,
                SUM(CASE WHEN severity IN ('ERROR','CRITICAL','EMERGENCY','ALERT')
                         THEN 1 ELSE 0 END) AS error_count
            FROM log_entries
            WHERE (tenant_id = ? OR tenant_id = '')
            GROUP BY group_id
            ORDER BY entry_count DESC
            """,
            (tid,),
        ).fetchall()
    return [
        {
-            "source_id": r["group_id"],
+            "source_id": r[0],
-            "unit_count": r["unit_count"],
+            "entry_count": r[1],
-            "entry_count": r["entry_count"],
+            "earliest": r[2],
-            "earliest": r["earliest"],
+            "latest": r[3],
-            "latest": r["latest"],
+            "error_count": r[4],
            "error_count": r["error_count"],
        }
        for r in rows
    ]
@ -603,83 +380,43 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis
    Queries plain log_entries (not FTS) so it works even before the index is built.
    """
    rules = _compile_overrides(severity_overrides or [])
    tid = resolve_tenant_id()
    group_expr = frag.source_group_expr("source_id")
    since_iso = (
        datetime.now(timezone.utc) - timedelta(hours=window_hours)
    ).strftime("%Y-%m-%dT%H:%M:%S")
-    with get_conn(db_path) as conn:
+    conn = sqlite3.connect(str(db_path))
-        row = conn.execute(
+    conn.execute("PRAGMA journal_mode=WAL")
-            """
+    conn.row_factory = sqlite3.Row
            SELECT
                COUNT(*) AS total,
                SUM(CASE WHEN severity = 'CRITICAL' THEN 1 ELSE 0 END) AS criticals,
                SUM(CASE WHEN severity IN ('ERROR','CRITICAL','EMERGENCY','ALERT') THEN 1 ELSE 0 END) AS errors
            FROM log_entries
            WHERE timestamp_iso >= ?
              AND repeat_count = 1
              AND (tenant_id = ? OR tenant_id = '')
            """,
            (since_iso, tid),
        ).fetchone()
        total_24h = int(row["total"] or 0)
        criticals_24h = int(row["criticals"] or 0)
        errors_24h = int(row["errors"] or 0)
-        source_rows = conn.execute(
+    since_expr = f"strftime('%Y-%m-%dT%H:%M:%S', 'now', '-{window_hours} hours')"
            f"""
            SELECT
                {group_expr} AS group_id,
                COUNT(*) AS entry_count,
                SUM(CASE WHEN severity IN ('ERROR','CRITICAL','EMERGENCY','ALERT') THEN 1 ELSE 0 END) AS error_count,
                MAX(timestamp_iso) AS latest
            FROM log_entries
            WHERE timestamp_iso >= ?
              AND repeat_count = 1
              AND (tenant_id = ? OR tenant_id = '')
            GROUP BY group_id
            ORDER BY error_count DESC, entry_count DESC
            """,
            (since_iso, tid),
        ).fetchall()
-        crit_rows = conn.execute(
+    # Overall counts in window
-            """
+    row = conn.execute(f"""
-            SELECT id as entry_id, source_id, timestamp_iso, severity, text
+        SELECT
-            FROM log_entries
+            COUNT(*) AS total,
-            WHERE severity = 'CRITICAL'
+            SUM(CASE WHEN severity = 'CRITICAL' THEN 1 ELSE 0 END) AS criticals,
-              AND repeat_count = 1
+            SUM(CASE WHEN severity IN ('ERROR','CRITICAL','EMERGENCY','ALERT') THEN 1 ELSE 0 END) AS errors
-              AND (tenant_id = ? OR tenant_id = '')
+        FROM log_entries
-            ORDER BY timestamp_iso DESC
+        WHERE timestamp_iso >= {since_expr}
-            LIMIT 25
+          AND repeat_count = 1
-            """,
+    """).fetchone()
-            (tid,),
+    total_24h = int(row["total"] or 0)
-        ).fetchall()
+    criticals_24h = int(row["criticals"] or 0)
-
+    errors_24h = int(row["errors"] or 0)
        timeline_rows = conn.execute(
            """
            SELECT id as entry_id, source_id, timestamp_iso, severity, text
            FROM log_entries
            WHERE severity IN ('CRITICAL','ERROR','WARN','WARNING','EMERGENCY','ALERT')
              AND timestamp_iso >= ?
              AND timestamp_iso IS NOT NULL
              AND repeat_count = 1
              AND (tenant_id = ? OR tenant_id = '')
            ORDER BY timestamp_iso DESC
            LIMIT 300
            """,
            (since_iso, tid),
        ).fetchall()
        last_row = conn.execute(
            "SELECT MAX(ingest_time) AS t FROM log_entries WHERE (tenant_id = ? OR tenant_id = '')",
            (tid,),
        ).fetchone()
    # Per-source breakdown
    source_rows = conn.execute(f"""
        SELECT
            source_id,
            COUNT(*) AS entry_count,
            SUM(CASE WHEN severity IN ('ERROR','CRITICAL','EMERGENCY','ALERT') THEN 1 ELSE 0 END) AS error_count,
            MAX(timestamp_iso) AS latest
        FROM log_entries
        WHERE timestamp_iso >= {since_expr}
          AND repeat_count = 1
        GROUP BY source_id
        ORDER BY error_count DESC, entry_count DESC
    """).fetchall()
    source_health = [
        {
-            "source_id": r["group_id"],
+            "source_id": r["source_id"],
            "entry_count": int(r["entry_count"]),
            "error_count": int(r["error_count"]),
            "latest": r["latest"],
@ -687,6 +424,16 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis
        for r in source_rows
    ]
    # Fetch candidate criticals (fetch more so filtering doesn't leave us with too few)
    crit_rows = conn.execute("""
        SELECT id as entry_id, source_id, timestamp_iso, severity, text
        FROM log_entries
        WHERE severity = 'CRITICAL' AND repeat_count = 1
        ORDER BY timestamp_iso DESC
        LIMIT 25
    """).fetchall()
    # Apply overrides: skip entries whose effective severity is no longer CRITICAL
    suppressed = 0
    recent_criticals = []
    for r in crit_rows:
@ -704,18 +451,11 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis
        else:
            suppressed += 1
-    timeline_events = [
+    # When did we last ingest anything?
-        {
+    last_row = conn.execute("SELECT MAX(ingest_time) AS t FROM log_entries").fetchone()
-            "entry_id": r["entry_id"],
+    last_ingested: str | None = last_row["t"] if last_row else None
            "source_id": r["source_id"],
            "timestamp_iso": r["timestamp_iso"],
            "severity": r["severity"],
            "text": r["text"],
        }
        for r in timeline_rows
    ]
-    last_gleaned: str | None = last_row["t"] if last_row else None
+    conn.close()
    return {
        "window_hours": window_hours,
@ -725,8 +465,7 @@ def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: lis
        "source_health": source_health,
        "recent_criticals": recent_criticals,
        "suppressed_criticals": suppressed,
-        "last_gleaned": last_gleaned,
+        "last_ingested": last_ingested,
        "timeline_events": timeline_events,
    }
--- a/app/services/ssh_targets.py
+++ b/app/services/ssh_targets.py
@ -1,265 +0,0 @@
 """SSH target registry — persisted in the main SQLite DB.
 Targets are stored as path references only. The private key is never
 read into the database, logged, or returned by any API response.
 """
 from __future__ import annotations
 import os
 import sqlite3
 import stat
 import time
 import uuid
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any
@dataclass
 class SshTarget:
    # One registered SSH target, mirroring the columns selected from the
    # ssh_targets table (same order as the SELECT lists in this module).
    id: str
    label: str
    host: str
    port: int
    user: str
    # Filesystem path to the private key file; only the path is held here.
    key_path: str
    # Timestamp string of the most recent recorded connection test, if any.
    last_tested: str | None
    # None when the stored value is NULL; otherwise the stored flag as a bool.
    last_ok: bool | None
    last_error: str | None
    created_at: str
    updated_at: str
 def _row_to_target(row: tuple) -> SshTarget:
    """Build an SshTarget from a positional ``ssh_targets`` row.

    Columns are read by index in the order id, label, host, port, user,
    key_path, last_tested, last_ok, last_error, created_at, updated_at.
    A NULL ``last_ok`` stays ``None``; any other value is coerced to bool.
    """
    stored_ok = row[7]
    tested_ok = None if stored_ok is None else bool(stored_ok)
    return SshTarget(
        id=row[0],
        label=row[1],
        host=row[2],
        port=row[3],
        user=row[4],
        key_path=row[5],
        last_tested=row[6],
        last_ok=tested_ok,
        last_error=row[8],
        created_at=row[9],
        updated_at=row[10],
    )
 def _now() -> str:
    return time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())
 # ---------------------------------------------------------------------------
 # CRUD
 # ---------------------------------------------------------------------------
 def list_targets(db_path: Path) -> list[SshTarget]:
    """Return every row of ``ssh_targets`` as an SshTarget, ordered by label.

    Args:
        db_path: Path to the SQLite database file.

    Raises:
        sqlite3.Error: Propagated unchanged if the query fails.
    """
    conn = sqlite3.connect(str(db_path), timeout=10)
    try:
        rows = conn.execute(
            "SELECT id, label, host, port, user, key_path, last_tested, last_ok, last_error, created_at, updated_at "
            "FROM ssh_targets ORDER BY label"
        ).fetchall()
    finally:
        # Close even when the query raises so the handle is not leaked.
        conn.close()
    return [_row_to_target(r) for r in rows]
 def get_target(db_path: Path, target_id: str) -> SshTarget | None:
    """Fetch a single SSH target by id.

    Args:
        db_path: Path to the SQLite database file.
        target_id: Value of the ``id`` column to look up.

    Returns:
        The matching SshTarget, or ``None`` when no row has that id.

    Raises:
        sqlite3.Error: Propagated unchanged if the query fails.
    """
    conn = sqlite3.connect(str(db_path), timeout=10)
    try:
        row = conn.execute(
            "SELECT id, label, host, port, user, key_path, last_tested, last_ok, last_error, created_at, updated_at "
            "FROM ssh_targets WHERE id = ?",
            (target_id,),
        ).fetchone()
    finally:
        # Close even when the query raises so the handle is not leaked.
        conn.close()
    return _row_to_target(row) if row else None
 def create_target(
    db_path: Path,
    label: str,
    host: str,
    port: int,
    user: str,
    key_path: str,
 ) -> SshTarget:
    """Validate the key path, insert a new SSH target and return the stored row.

    A fresh UUID4 string is used as the id, and ``created_at`` / ``updated_at``
    are both set to the same current-UTC timestamp.

    Args:
        db_path: Path to the SQLite database file.
        label: Display label for the target.
        host: Remote host name or address.
        port: Remote SSH port.
        user: Remote user name.
        key_path: Path to the private key file; ``~`` is expanded.

    Raises:
        ValueError: If the key path does not exist or is not a regular file.
        sqlite3.Error: Propagated unchanged if the insert fails.
    """
    resolved = _validate_key_path(key_path)
    now = _now()
    target_id = str(uuid.uuid4())
    conn = sqlite3.connect(str(db_path), timeout=10)
    try:
        conn.execute(
            "INSERT INTO ssh_targets (id, label, host, port, user, key_path, created_at, updated_at) "
            "VALUES (?,?,?,?,?,?,?,?)",
            (target_id, label, host, port, user, str(resolved), now, now),
        )
        conn.commit()
    finally:
        # Close even when the insert or commit raises so the handle is not leaked.
        conn.close()
    # Re-read so the returned object reflects exactly what was stored.
    return get_target(db_path, target_id)  # type: ignore[return-value]
 def update_target(
    db_path: Path,
    target_id: str,
    *,
    label: str | None = None,
    host: str | None = None,
    port: int | None = None,
    user: str | None = None,
    key_path: str | None = None,
 ) -> SshTarget | None:
    """Update selected fields of an SSH target and return the stored row.

    Fields passed as ``None`` keep their current value. A falsy ``key_path``
    (``None`` or empty string) also keeps the current key path; any other
    value is validated before it is stored.

    Args:
        db_path: Path to the SQLite database file.
        target_id: Id of the target to update.
        label, host, port, user, key_path: New values, or ``None`` to keep.

    Returns:
        The updated SshTarget, or ``None`` when no target has that id.

    Raises:
        ValueError: If a new key path does not exist or is not a regular file.
        sqlite3.Error: Propagated unchanged if the update fails.
    """
    existing = get_target(db_path, target_id)
    if existing is None:
        return None
    resolved_key = str(_validate_key_path(key_path)) if key_path else existing.key_path
    conn = sqlite3.connect(str(db_path), timeout=10)
    try:
        conn.execute(
            "UPDATE ssh_targets SET label=?, host=?, port=?, user=?, key_path=?, updated_at=? WHERE id=?",
            (
                label if label is not None else existing.label,
                host if host is not None else existing.host,
                port if port is not None else existing.port,
                user if user is not None else existing.user,
                resolved_key,
                _now(),
                target_id,
            ),
        )
        conn.commit()
    finally:
        # Close even when the update or commit raises so the handle is not leaked.
        conn.close()
    return get_target(db_path, target_id)
 def delete_target(db_path: Path, target_id: str) -> bool:
    """Delete the SSH target with the given id.

    Args:
        db_path: Path to the SQLite database file.
        target_id: Id of the target to remove.

    Returns:
        ``True`` if a row was deleted, ``False`` if no row had that id.

    Raises:
        sqlite3.Error: Propagated unchanged if the delete fails.
    """
    conn = sqlite3.connect(str(db_path), timeout=10)
    try:
        cur = conn.execute("DELETE FROM ssh_targets WHERE id = ?", (target_id,))
        conn.commit()
        deleted = cur.rowcount > 0
    finally:
        # Close even when the delete or commit raises so the handle is not leaked.
        conn.close()
    return deleted
 # ---------------------------------------------------------------------------
 # Test connection
 # ---------------------------------------------------------------------------
 def test_connection(db_path: Path, target_id: str) -> dict[str, Any]:
    """Attempt an SSH no-op against a stored target and record the result.

    Runs ``true`` on the remote host, so no data is pulled. The outcome is
    written back to the target's row via ``_record_test``.

    Args:
        db_path: Path to the SQLite database file.
        target_id: Id of the target to test.

    Returns:
        ``{"ok": bool, "error": str | None, "tested_at": str}``.

    Raises:
        KeyError: If no target has the given id.
    """
    target = get_target(db_path, target_id)
    if target is None:
        raise KeyError(f"SSH target {target_id!r} not found")
    # Lazy import — paramiko is optional
    try:
        from paramiko import SSHClient, AutoAddPolicy, AuthenticationException, SSHException
    except ImportError:
        _record_test(db_path, target_id, ok=False, error="paramiko not installed")
        return {"ok": False, "error": "paramiko not installed — run: pip install paramiko", "tested_at": _now()}
    key_path = str(Path(target.key_path).expanduser())
    error: str | None = None
    ok = False
    try:
        client = SSHClient()
        try:
            # NOTE(review): AutoAddPolicy accepts unknown host keys without
            # verification; confirm this is acceptable for these targets.
            client.set_missing_host_key_policy(AutoAddPolicy())
            client.connect(
                hostname=target.host,
                port=target.port,
                username=target.user,
                key_filename=key_path,
                timeout=10,
                banner_timeout=10,
            )
            _stdin, stdout, _stderr = client.exec_command("true", timeout=10)
            exit_code = stdout.channel.recv_exit_status()
        finally:
            # Always release the client, including when connect/exec raises.
            client.close()
        ok = exit_code == 0
        if not ok:
            error = f"Remote command exited with code {exit_code}"
    except AuthenticationException:
        error = "Authentication failed — check key path and remote authorized_keys"
    except SSHException as exc:
        error = f"SSH error: {exc}"
    except OSError as exc:
        error = f"Connection failed: {exc}"
    except Exception as exc:
        # Deliberate catch-all: any failure is reported as a test result
        # instead of propagating to the caller.
        error = f"Unexpected error: {exc}"
    tested_at = _now()
    _record_test(db_path, target_id, ok=ok, error=error, tested_at=tested_at)
    return {"ok": ok, "error": error, "tested_at": tested_at}
 def _record_test(
    db_path: Path,
    target_id: str,
    *,
    ok: bool,
    error: str | None,
    tested_at: str | None = None,
 ) -> None:
    """Store the outcome of a connection test on the target's row.

    Args:
        db_path: Path to the SQLite database file.
        target_id: Id of the target that was tested.
        ok: Whether the test succeeded; stored as 1 or 0.
        error: Error message to store, or ``None``.
        tested_at: Timestamp to store as ``last_tested``; defaults to now.

    Raises:
        sqlite3.Error: Propagated unchanged if the update fails.
    """
    if tested_at is None:
        tested_at = _now()
    conn = sqlite3.connect(str(db_path), timeout=10)
    try:
        conn.execute(
            "UPDATE ssh_targets SET last_tested=?, last_ok=?, last_error=?, updated_at=? WHERE id=?",
            (tested_at, 1 if ok else 0, error, _now(), target_id),
        )
        conn.commit()
    finally:
        # Close even when the update or commit raises so the handle is not leaked.
        conn.close()
 # ---------------------------------------------------------------------------
 # Validation
 # ---------------------------------------------------------------------------
 def _validate_key_path(raw: str) -> Path:
    """Resolve and validate the SSH key path.
    Returns the resolved Path. Raises ValueError with a user-readable message
    on any problem (does not raise on world-readable — just returns a warning
    to the caller so the UI can display it non-blocking).
    """
    p = Path(raw).expanduser()
    if not p.exists():
        raise ValueError(f"Key file not found: {p}")
    if not p.is_file():
        raise ValueError(f"Key path is not a file: {p}")
    return p
 def key_path_warning(key_path: str) -> str | None:
    """Return a warning when the key file is group/other readable or writable.

    Returns None when the mode is tight enough or when the file cannot be
    stat'ed (any OSError is treated as "no warning").
    """
    too_open_bits = stat.S_IRGRP | stat.S_IWGRP | stat.S_IROTH | stat.S_IWOTH
    try:
        p = Path(key_path).expanduser()
        file_mode = p.stat().st_mode
    except OSError:
        return None
    if not file_mode & too_open_bits:
        return None
    perms = oct(file_mode & 0o777)
    return f"Key file permissions are too open ({perms}). SSH may refuse to use it — run: chmod 600 {p}"
 def target_to_dict(t: SshTarget, include_warning: bool = False) -> dict[str, Any]:
    """Serialize a target for API responses. Never includes key contents.

    With ``include_warning`` set, a ``key_warning`` entry is added holding the
    result of ``key_path_warning`` for the target's key path.
    """
    exported_fields = (
        "id",
        "label",
        "host",
        "port",
        "user",
        "key_path",
        "last_tested",
        "last_ok",
        "last_error",
        "created_at",
        "updated_at",
    )
    payload: dict[str, Any] = {name: getattr(t, name) for name in exported_fields}
    if include_warning:
        payload["key_warning"] = key_path_warning(t.key_path)
    return payload
--- a/app/services/ticket_export.py
+++ b/app/services/ticket_export.py
@ -1,213 +0,0 @@
 """Incident ticket export — push Turnstone incidents to external trackers.
 Supported targets: "notion", "jira"
 Each exporter receives the incident dict and a list of log entry dicts,
 and returns {"url": str, "ticket_id": str}.
 """
 from __future__ import annotations
 import json
 from typing import Any
 import httpx
 # ---------------------------------------------------------------------------
 # Notion exporter
 # ---------------------------------------------------------------------------
 def _notion_export(
    incident: dict[str, Any],
    entries: list[dict[str, Any]],
    token: str,
    database_id: str,
 ) -> dict[str, str]:
    """Create a Notion page in *database_id* from an incident.
    Notion block types used: heading_2, bulleted_list_item, paragraph.
    Rich text max length is 2000 chars per block.
    """
    if not token or not database_id:
        raise ValueError("Notion not configured — set notion_token and notion_database_id in Settings")
    def _text(s: str, bold: bool = False) -> dict:
        chunk: dict[str, Any] = {"type": "text", "text": {"content": s[:2000]}}
        if bold:
            chunk["annotations"] = {"bold": True}
        return chunk
    log_blocks: list[dict] = []
    for e in entries[:50]:  # Notion has page size limits
        line = f"[{e.get('severity') or '?'}] {e.get('source_id', '')} — {e.get('text', '')[:160]}"
        log_blocks.append({"object": "block", "type": "bulleted_list_item",
                            "bulleted_list_item": {"rich_text": [_text(line)]}})
    sev = incident.get("severity", "medium").upper()
    issue_type = incident.get("issue_type") or "—"
    window = f"{incident.get('started_at') or '?'} → {incident.get('ended_at') or 'ongoing'}"
    children: list[dict] = [
        {"object": "block", "type": "heading_2",
         "heading_2": {"rich_text": [_text("Incident Details", bold=True)]}},
        {"object": "block", "type": "paragraph",
         "paragraph": {"rich_text": [
             _text("Severity: ", bold=True), _text(sev),
             _text("   Type: ", bold=True), _text(issue_type),
             _text("   Window: ", bold=True), _text(window),
         ]}},
    ]
    if incident.get("notes"):
        children.append({"object": "block", "type": "paragraph",
                          "paragraph": {"rich_text": [_text("Notes: ", bold=True), _text(incident["notes"])]}})
    children.append({"object": "block", "type": "heading_2",
                      "heading_2": {"rich_text": [_text("Log Evidence")]}})
    children.extend(log_blocks)
    payload = {
        "parent": {"database_id": database_id},
        "properties": {
            "title": {"title": [_text(incident.get("label", "Unnamed Incident"))]},
        },
        "children": children,
    }
    resp = httpx.post(
        "https://api.notion.com/v1/pages",
        headers={
            "Authorization": f"Bearer {token}",
            "Notion-Version": "2022-06-28",
            "Content-Type": "application/json",
        },
        json=payload,
        timeout=15,
    )
    if not resp.is_success:
        raise RuntimeError(f"Notion API error {resp.status_code}: {resp.text[:300]}")
    page = resp.json()
    page_id = page["id"]
    url = page.get("url") or f"https://notion.so/{page_id.replace('-', '')}"
    return {"url": url, "ticket_id": page_id}
 # ---------------------------------------------------------------------------
 # Jira exporter
 # ---------------------------------------------------------------------------
 def _jira_export(
    incident: dict[str, Any],
    entries: list[dict[str, Any]],
    jira_url: str,
    email: str,
    api_token: str,
    project_key: str,
    issue_type: str = "Bug",
 ) -> dict[str, str]:
    """Create a Jira issue via REST API v3 (cloud or Server 8.4+)."""
    if not jira_url or not email or not api_token or not project_key:
        raise ValueError("Jira not configured — set jira_url, jira_email, jira_api_token, and jira_project_key in Settings")
    base = jira_url.rstrip("/")
    sev = incident.get("severity", "medium").upper()
    inc_type = incident.get("issue_type") or "incident"
    window = f"{incident.get('started_at') or '?'} → {incident.get('ended_at') or 'ongoing'}"
    log_lines = "\n".join(
        f"[{e.get('severity') or '?'}] {e.get('source_id', '')} — {e.get('text', '')[:160]}"
        for e in entries[:40]
    )
    description = (
        f"*Severity:* {sev}  |  *Type:* {inc_type}  |  *Window:* {window}\n\n"
        + (f"*Notes:* {incident['notes']}\n\n" if incident.get("notes") else "")
        + "h2. Log Evidence\n\n{{code}}\n" + log_lines + "\n{{code}}"
    )
    # Jira REST v3 uses Atlassian Document Format for description
    adf_body = {
        "type": "doc",
        "version": 1,
        "content": [
            {"type": "paragraph", "content": [{"type": "text", "text": description}]},
        ],
    }
    payload = {
        "fields": {
            "project": {"key": project_key},
            "summary": incident.get("label", "Unnamed Incident"),
            "issuetype": {"name": issue_type},
            "description": adf_body,
        }
    }
    import base64 as _b64
    creds = _b64.b64encode(f"{email}:{api_token}".encode()).decode()
    resp = httpx.post(
        f"{base}/rest/api/3/issue",
        headers={
            "Authorization": f"Basic {creds}",
            "Content-Type": "application/json",
            "Accept": "application/json",
        },
        json=payload,
        timeout=15,
    )
    if not resp.is_success:
        raise RuntimeError(f"Jira API error {resp.status_code}: {resp.text[:300]}")
    data = resp.json()
    issue_key = data["key"]
    url = f"{base}/browse/{issue_key}"
    return {"url": url, "ticket_id": issue_key}
 # ---------------------------------------------------------------------------
 # Public API
 # ---------------------------------------------------------------------------
 # Supported ticket targets, keyed by the name callers pass as ``target``.
 # The keys drive available_targets() and the unknown-target check in
 # export_incident().
 _EXPORTERS = {
    "notion": _notion_export,
    "jira": _jira_export,
 }
 def available_targets() -> list[str]:
    """Return the names of all registered ticket targets, in registry order."""
    return [target_name for target_name in _EXPORTERS]
 def export_incident(
    target: str,
    incident: dict[str, Any],
    entries: list[dict[str, Any]],
    config: dict[str, str],
 ) -> dict[str, str]:
    """Send an incident to the exporter registered under *target*.

    *config* carries the relevant settings; each exporter argument is looked
    up by key and defaults to an empty string (``"Bug"`` for the Jira issue
    type) when absent, so this function holds no state of its own.

    Returns {"url": str, "ticket_id": str}.
    Raises ValueError for unknown target or missing config.
    Raises RuntimeError on API-level failures.
    """
    if target not in _EXPORTERS:
        raise ValueError(f"Unknown ticket target: {target!r}. Supported: {list(_EXPORTERS)}")
    if target == "notion":
        notion_settings = {
            "token": config.get("notion_token", ""),
            "database_id": config.get("notion_database_id", ""),
        }
        return _notion_export(incident, entries, **notion_settings)
    if target == "jira":
        jira_settings = {
            "jira_url": config.get("jira_url", ""),
            "email": config.get("jira_email", ""),
            "api_token": config.get("jira_api_token", ""),
            "project_key": config.get("jira_project_key", ""),
            "issue_type": config.get("jira_issue_type", "Bug"),
        }
        return _jira_export(incident, entries, **jira_settings)
    # Only reachable if a target is registered without a branch above.
    raise ValueError(f"Unhandled target: {target!r}")
--- a/app/tasks/init.py
+++ b/app/tasks/init.py
--- a/app/tasks/anomaly_scorer.py
+++ b/app/tasks/anomaly_scorer.py
@ -1,114 +0,0 @@
 """Background anomaly scoring task.
 Runs score_unscored() after each glean cycle (triggered by glean_scheduler)
 or on its own interval when TURNSTONE_ANOMALY_INTERVAL is set.
 Set TURNSTONE_ANOMALY_MODEL to a HuggingFace model ID to activate.
 When the env var is empty (default) the scorer is a no-op.
 """
 from __future__ import annotations
 import asyncio
 import logging
 import os
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from app.services.anomaly import ScoringResult, score_unscored
 # Module-level logger named after this module.
 logger = logging.getLogger(__name__)
 # Value of TURNSTONE_ANOMALY_INTERVAL parsed as an int at import time
 # ("0" when unset); a non-integer value raises ValueError on import.
 _DEFAULT_INTERVAL = int(os.environ.get("TURNSTONE_ANOMALY_INTERVAL", "0"))
 # Held by run_once() for the duration of a scoring pass so passes never overlap.
 _lock = asyncio.Lock()
@dataclass
 class ScorerState:
    # Status of the anomaly scorer, updated in place by run_once() and
    # scorer_loop().
    last_run_at: str | None = None  # ISO timestamp of when the latest pass started
    last_duration_s: float | None = None  # wall-clock seconds of the latest pass, rounded to 2 places
    last_scored: int = 0
    last_detections: int = 0
    last_error: str | None = None
    run_count: int = 0  # incremented for both successful and failed passes
    next_run_at: str | None = None  # set by scorer_loop(); reset to None on cancellation
    running: bool = False
    total_scored: int = 0
    total_detections: int = 0
 # Single module-level status object shared by run_once() and scorer_loop().
 _state = ScorerState()
 def get_state() -> ScorerState:
    """Return the module-level ScorerState (the live object, not a copy)."""
    return _state
 async def run_once(
    db_path: Path,
    model_id: str = "",
    device: str = "cpu",
    batch_size: int = 256,
    threshold: float = 0.75,
 ) -> ScoringResult:
    """Run one scoring pass in the default executor and record its outcome.

    Returns a skipped result straight away when another pass holds the lock.
    Exceptions raised by the pass are logged, stored in the module state and
    returned as an error result instead of propagating.
    NOTE(review): the "model not configured" no-op is presumably handled
    inside score_unscored — confirm against that function.
    """
    if _lock.locked():
        return ScoringResult(skipped=True, error="scorer already running")

    def _score_blocking() -> ScoringResult:
        return score_unscored(db_path, model_id, device, batch_size, threshold)

    async with _lock:
        _state.running = True
        began = datetime.now(tz=timezone.utc)
        try:
            event_loop = asyncio.get_running_loop()
            outcome: ScoringResult = await event_loop.run_in_executor(None, _score_blocking)
            elapsed = (datetime.now(tz=timezone.utc) - began).total_seconds()
            _state.last_run_at = began.isoformat()
            _state.last_duration_s = round(elapsed, 2)
            _state.last_scored = outcome.scored
            _state.last_detections = outcome.detections
            _state.last_error = outcome.error
            _state.run_count += 1
            _state.total_scored += outcome.scored
            _state.total_detections += outcome.detections
            if not outcome.skipped:
                logger.info(
                    "Anomaly scorer: %d scored, %d detections in %.1fs",
                    outcome.scored, outcome.detections, elapsed,
                )
            return outcome
        except Exception as exc:
            elapsed = (datetime.now(tz=timezone.utc) - began).total_seconds()
            failure_text = str(exc)
            _state.last_run_at = began.isoformat()
            _state.last_duration_s = round(elapsed, 2)
            _state.last_error = failure_text
            _state.run_count += 1
            logger.error("Anomaly scorer failed: %s", exc)
            return ScoringResult(error=failure_text)
        finally:
            # Clear the flag on every exit path, including cancellation.
            _state.running = False
 async def scorer_loop(
    db_path: Path,
    model_id: str,
    device: str,
    interval_s: int,
    batch_size: int = 256,
    threshold: float = 0.75,
 ) -> None:
    """Run a scoring pass, sleep *interval_s* seconds, and repeat until cancelled.

    The expected time of the next pass is published in the module state before
    each sleep. A cancellation that arrives during the sleep clears that value
    and is re-raised.
    """
    logger.info("Anomaly scorer loop started — interval %ds, model: %s", interval_s, model_id)
    pause = timedelta(seconds=interval_s)
    while True:
        await run_once(db_path, model_id, device, batch_size, threshold)
        wake_at = datetime.now(tz=timezone.utc) + pause
        _state.next_run_at = wake_at.isoformat()
        try:
            await asyncio.sleep(interval_s)
        except asyncio.CancelledError:
            logger.info("Anomaly scorer loop cancelled")
            _state.next_run_at = None
            raise
--- a/app/tasks/cybersec_scorer.py
+++ b/app/tasks/cybersec_scorer.py
@ -1,84 +0,0 @@
 """Background task wrapper for the cybersec zero-shot scoring pipeline."""
 from __future__ import annotations
 import asyncio
 import logging
 from dataclasses import dataclass, field
 from datetime import datetime, timezone
 from pathlib import Path
 from app.services.cybersec import score_security_entries
 logger = logging.getLogger(__name__)
 _lock = asyncio.Lock()
@dataclass
 class CybersecState:
    """Status record for the cybersec scoring task, updated in place by ``run_once``."""
    last_run_at: str | None = None  # ISO-8601 UTC start time of the most recent pass
    last_duration_s: float | None = None  # wall-clock seconds the most recent pass took
    last_scored: int = 0  # ``result.scored`` from the most recent pass
    last_detections: int = 0  # ``result.detections`` from the most recent pass
    last_error: str | None = None  # ``result.error`` from the most recent pass
    run_count: int = 0  # number of passes recorded so far
    running: bool = False  # True while a pass is in progress
    total_scored: int = 0  # running sum of ``result.scored`` over all passes
    total_detections: int = 0  # running sum of ``result.detections`` over all passes
 _state = CybersecState()
 def get_state() -> dict:
    """Return a plain-dict snapshot of the cybersec scorer's status fields."""
    exported = (
        "last_run_at",
        "last_duration_s",
        "last_scored",
        "last_detections",
        "last_error",
        "run_count",
        "running",
        "total_scored",
        "total_detections",
    )
    # Keys are emitted in the order listed above.
    return {name: getattr(_state, name) for name in exported}
 async def run_once(
    db_path: Path,
    model_id: str,
    device: str = "cpu",
    batch_size: int = 32,
    threshold: float = 0.60,
 ) -> None:
    """Run a single cybersec scoring pass and record its outcome on ``_state``.
    The call is a no-op when ``model_id`` is empty or another pass already holds the
    lock. Scoring runs in the default executor so the event loop is not blocked.
    A failure raised by the scoring call is logged and stored in ``_state.last_error``
    instead of propagating, so a broken model cannot abort the caller (for example the
    glean scheduler, which awaits this after every glean). Cancellation still propagates.
    """
    if not model_id or _lock.locked():
        return
    async with _lock:
        _state.running = True
        started = datetime.now(tz=timezone.utc)
        try:
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(
                None,
                lambda: score_security_entries(db_path, model_id, device, batch_size, threshold),
            )
        except Exception as exc:
            # Record the failed pass so status consumers can see it, then swallow it.
            _state.last_run_at = started.isoformat()
            _state.last_duration_s = (datetime.now(tz=timezone.utc) - started).total_seconds()
            _state.last_error = str(exc)
            _state.run_count += 1
            logger.exception("cybersec scorer failed: %s", exc)
        else:
            elapsed = (datetime.now(tz=timezone.utc) - started).total_seconds()
            _state.last_run_at = started.isoformat()
            _state.last_duration_s = elapsed
            _state.last_scored = result.scored
            _state.last_detections = result.detections
            _state.last_error = result.error
            _state.run_count += 1
            _state.total_scored += result.scored
            _state.total_detections += result.detections
            if result.error:
                logger.error("cybersec scorer error: %s", result.error)
            elif not result.skipped:
                logger.info(
                    "cybersec scorer: scored=%d detections=%d in %.1fs",
                    result.scored, result.detections, elapsed,
                )
        finally:
            _state.running = False
--- a/app/tasks/glean_scheduler.py
+++ b/app/tasks/glean_scheduler.py
@ -1,237 +0,0 @@
 """Periodic batch glean scheduler with optional CF submission.
 Runs glean_sources on a configurable interval (TURNSTONE_GLEAN_INTERVAL env var,
 default 900s / 15 min). Set to 0 to disable.
 When TURNSTONE_SUBMIT_ENDPOINT is set, pushes pattern-matched entries to a remote
 Turnstone instance (the CF receiving store) after each glean run.
 """
 from __future__ import annotations
 import asyncio
 import json
 import logging
 from app.db import get_conn, resolve_tenant_id
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Any
 import httpx
 from app.glean.pipeline import glean_sources
 from app.tasks.anomaly_scorer import run_once as _run_scorer
 from app.tasks.cybersec_scorer import run_once as _run_cybersec
 from app.tasks.incident_detector import run_once as _run_incident_detector
 logger = logging.getLogger(__name__)
 _lock = asyncio.Lock()
@dataclass
 class IngestState:
    """Status record for the batch glean scheduler and its optional submission step."""
    last_run_at: str | None = None  # ISO-8601 UTC start time of the most recent glean
    last_duration_s: float | None = None  # seconds the most recent glean took, rounded to 2 places
    last_stats: dict[str, int] = field(default_factory=dict)  # stats returned by the last successful glean
    last_error: str | None = None  # message of the last failed glean; None after a success
    run_count: int = 0  # gleans attempted, successful or not
    next_run_at: str | None = None  # ISO-8601 UTC time of the next scheduled glean; None once cancelled
    running: bool = False  # True while a glean is in progress
    last_submitted_at: str | None = None  # ISO-8601 UTC time of the last successful submission
    last_submit_count: int = 0  # entry count reported for the last successful submission
    last_submit_error: str | None = None  # message of the last failed submission; None after a success
 _state = IngestState()
 def get_state() -> IngestState:
    """Return the module-level ``IngestState`` instance itself (not a copy)."""
    return _state
 def _query_matched_since(db_path: Path, since: str | None) -> list[dict]:
    """Fetch up to 5000 pattern-matched log entries visible to the current tenant.
    With ``since`` set, only rows whose ``ingest_time`` is greater than it are returned,
    oldest first. Without it, the most recently ingested rows are returned, newest first.
    Rows with an empty ``tenant_id`` are always included.
    """
    tenant = resolve_tenant_id()
    if since:
        sql = """
                SELECT id, source_id, sequence, timestamp_raw, timestamp_iso,
                       ingest_time, severity, repeat_count, out_of_order,
                       matched_patterns, text
                FROM log_entries
                WHERE matched_patterns != '[]'
                  AND ingest_time > ?
                  AND (tenant_id = ? OR tenant_id = '')
                ORDER BY ingest_time
                LIMIT 5000
                """
        params = (since, tenant)
    else:
        sql = """
                SELECT id, source_id, sequence, timestamp_raw, timestamp_iso,
                       ingest_time, severity, repeat_count, out_of_order,
                       matched_patterns, text
                FROM log_entries
                WHERE matched_patterns != '[]'
                  AND (tenant_id = ? OR tenant_id = '')
                ORDER BY ingest_time DESC
                LIMIT 5000
                """
        params = (tenant,)
    with get_conn(db_path) as conn:
        matched = conn.execute(sql, params).fetchall()
    return [dict(row) for row in matched]
 async def submit_matched(
    db_path: Path,
    submit_endpoint: str,
    source_host: str,
    since: str | None = None,
 ) -> dict[str, Any]:
    """POST pattern-matched entries to a remote receiving instance.
    The entries are read in the default executor. If there are none, nothing is sent and
    the result carries ``"skipped": True``. Otherwise the batch goes to
    ``<submit_endpoint>/turnstone/api/glean/batch``. On success the submission fields of
    ``_state`` are updated; on any failure the error is stored in
    ``_state.last_submit_error`` and returned as ``{"ok": False, "error": ...}``.
    """
    entries = await asyncio.get_running_loop().run_in_executor(
        None, lambda: _query_matched_since(db_path, since)
    )
    if not entries:
        return {"ok": True, "submitted": 0, "skipped": True}
    url = f"{submit_endpoint.rstrip('/')}/turnstone/api/glean/batch"
    body = {"source_host": source_host, "entries": entries}
    try:
        async with httpx.AsyncClient(timeout=30.0) as client:
            resp = await client.post(url, json=body)
            resp.raise_for_status()
        # Use the receiver's own count when it reports one, else the batch size.
        accepted = resp.json().get("gleaned", len(entries))
        _state.last_submitted_at = datetime.now(tz=timezone.utc).isoformat()
        _state.last_submit_count = accepted
        _state.last_submit_error = None
        logger.info("Submitted %d matched entries to %s", accepted, submit_endpoint)
        return {"ok": True, "submitted": accepted}
    except Exception as exc:
        # Submission is best-effort: remember the failure and report it to the caller.
        _state.last_submit_error = str(exc)
        logger.warning("Submission to %s failed: %s", submit_endpoint, exc)
        return {"ok": False, "error": str(exc)}
 async def run_once(
    sources_file: Path,
    db_path: Path,
    pattern_file: Path | None = None,
    submit_endpoint: str | None = None,
    source_host: str = "unknown",
    force: bool = False,
    anomaly_model: str = "",
    anomaly_device: str = "cpu",
    anomaly_threshold: float = 0.75,
    cybersec_model: str = "",
    cybersec_device: str = "cpu",
    cybersec_threshold: float = 0.60,
    incidents_db_path: Path | None = None,
    auto_incident: bool = True,
 ) -> dict[str, Any]:
    """Glean every configured source once, then run the optional follow-up steps.
    Returns immediately with ``"skipped": True`` if a glean is already in progress.
    ``force=True`` is passed through to ``glean_sources``. A failed glean is recorded on
    ``_state`` and reported as ``{"ok": False, "error": ...}`` without running any
    follow-up step. After a successful glean, and outside the glean lock, the enabled
    steps run in order: submission, anomaly scoring, cybersec scoring, incident detection.
    """
    if _lock.locked():
        return {"ok": False, "error": "glean already running", "skipped": True}
    async with _lock:
        _state.running = True
        began = datetime.now(tz=timezone.utc)
        try:
            # glean_sources is blocking, so it runs in the default executor.
            stats = await asyncio.get_running_loop().run_in_executor(
                None,
                lambda: glean_sources(sources_file, db_path, pattern_file, force=force),
            )
            elapsed = (datetime.now(tz=timezone.utc) - began).total_seconds()
            _state.last_run_at = began.isoformat()
            _state.last_duration_s = round(elapsed, 2)
            _state.last_stats = stats
            _state.last_error = None
            _state.run_count += 1
            logger.info("Batch glean complete in %.1fs — %s", elapsed, stats)
        except Exception as exc:
            elapsed = (datetime.now(tz=timezone.utc) - began).total_seconds()
            _state.last_run_at = began.isoformat()
            _state.last_duration_s = round(elapsed, 2)
            _state.last_error = str(exc)
            _state.run_count += 1
            logger.error("Batch glean failed: %s", exc)
            return {"ok": False, "error": str(exc)}
        finally:
            # Runs on every exit path, including the early return above.
            _state.running = False
    if submit_endpoint:
        await submit_matched(db_path, submit_endpoint, source_host, since=_state.last_submitted_at)
    if anomaly_model:
        await _run_scorer(db_path, anomaly_model, anomaly_device, threshold=anomaly_threshold)
    if cybersec_model:
        await _run_cybersec(db_path, cybersec_model, cybersec_device, threshold=cybersec_threshold)
    if auto_incident and incidents_db_path:
        # Only look at entries ingested since this glean started.
        detection = await _run_incident_detector(
            db_path, incidents_db_path, since=_state.last_run_at
        )
        if detection["created"]:
            logger.info("Incident detector: %d incident(s) auto-created", detection["created"])
    return {"ok": True, "stats": _state.last_stats, "duration_s": _state.last_duration_s}
 async def scheduler_loop(
    sources_file: Path,
    db_path: Path,
    pattern_file: Path | None,
    interval_s: int,
    submit_endpoint: str | None = None,
    source_host: str = "unknown",
    anomaly_model: str = "",
    anomaly_device: str = "cpu",
    anomaly_threshold: float = 0.75,
    cybersec_model: str = "",
    cybersec_device: str = "cpu",
    cybersec_threshold: float = 0.60,
    incidents_db_path: Path | None = None,
    auto_incident: bool = True,
 ) -> None:
    """Run glean + optional submission + optional anomaly/cybersec scoring every interval_s seconds.
    The loop runs until the task is cancelled. An exception escaping a single run is
    logged and the loop carries on with the next interval, so one failing follow-up step
    cannot stop scheduled gleaning for good. Cancellation is always re-raised, after
    clearing ``_state.next_run_at`` when it arrives during the sleep.
    """
    logger.info("Ingest scheduler started — interval %ds, sources: %s", interval_s, sources_file)
    if submit_endpoint:
        logger.info("Submission enabled — endpoint: %s", submit_endpoint)
    if anomaly_model:
        logger.info("Anomaly scoring enabled — model: %s", anomaly_model)
    if cybersec_model:
        logger.info("Cybersec scoring enabled — model: %s", cybersec_model)
    if auto_incident and incidents_db_path:
        logger.info("Auto-incident detection enabled")
    while True:
        try:
            await run_once(
                sources_file, db_path, pattern_file, submit_endpoint, source_host,
                anomaly_model=anomaly_model,
                anomaly_device=anomaly_device,
                anomaly_threshold=anomaly_threshold,
                cybersec_model=cybersec_model,
                cybersec_device=cybersec_device,
                cybersec_threshold=cybersec_threshold,
                incidents_db_path=incidents_db_path,
                auto_incident=auto_incident,
            )
        except asyncio.CancelledError:
            # Never swallow cancellation, even on Pythons where it subclasses Exception.
            raise
        except Exception:
            # Follow-up steps can raise after a successful glean; keep the loop alive.
            logger.exception("Ingest scheduler run failed; retrying in %ds", interval_s)
        next_run = datetime.now(tz=timezone.utc) + timedelta(seconds=interval_s)
        _state.next_run_at = next_run.isoformat()
        try:
            await asyncio.sleep(interval_s)
        except asyncio.CancelledError:
            logger.info("Ingest scheduler cancelled")
            _state.next_run_at = None
            raise
--- a/app/tasks/incident_detector.py
+++ b/app/tasks/incident_detector.py
@ -1,188 +0,0 @@
 """Post-glean automatic incident detection.
 After each batch glean, scan entries ingested since the last run for
 ERROR/CRITICAL clusters. If a source produces >= threshold errors within
 window_s seconds, auto-create an incident unless one already exists for
 that source in that time window.
 Environment variables (all optional):
    TURNSTONE_AUTO_INCIDENT_THRESHOLD   integer, default 5
    TURNSTONE_AUTO_INCIDENT_WINDOW      seconds, default 600 (10 min)
 """
 from __future__ import annotations
 import asyncio
 import logging
 import os
 from collections import defaultdict
 from datetime import datetime, timezone
 from pathlib import Path
 from app.db import get_conn, resolve_tenant_id
 from app.services.incidents import create_incident
 logger = logging.getLogger(__name__)
 _THRESHOLD = int(os.environ.get("TURNSTONE_AUTO_INCIDENT_THRESHOLD", "5"))
 _WINDOW_S  = int(os.environ.get("TURNSTONE_AUTO_INCIDENT_WINDOW",    "600"))
 # Severity rank — used to pick the cluster's worst severity
 _SEV_RANK = {"CRITICAL": 3, "ERROR": 2, "WARN": 1, "INFO": 0, "DEBUG": 0}
 def _query_recent_errors(db_path: Path, since: str | None) -> list[dict]:
    """Fetch ERROR/CRITICAL log entries visible to the current tenant.
    With ``since`` set, every matching row whose ``ingest_time`` is greater than it is
    returned. Without it, at most 10000 rows are returned. Rows come back ordered by
    ``source_id`` and then ``timestamp_iso``, and carry only ``source_id``,
    ``timestamp_iso`` and ``severity``.
    """
    tenant = resolve_tenant_id()
    if since:
        sql = """
                SELECT source_id, timestamp_iso, severity
                FROM log_entries
                WHERE severity IN ('ERROR', 'CRITICAL')
                  AND ingest_time > ?
                  AND (tenant_id = ? OR tenant_id = '')
                ORDER BY source_id, timestamp_iso ASC
                """
        params = (since, tenant)
    else:
        sql = """
                SELECT source_id, timestamp_iso, severity
                FROM log_entries
                WHERE severity IN ('ERROR', 'CRITICAL')
                  AND (tenant_id = ? OR tenant_id = '')
                ORDER BY source_id, timestamp_iso ASC
                LIMIT 10000
                """
        params = (tenant,)
    with get_conn(db_path) as conn:
        found = conn.execute(sql, params).fetchall()
    return [dict(row) for row in found]
 def _parse_ts(iso: str | None) -> float | None:
    """Parse ISO timestamp to epoch seconds; return None on failure."""
    if not iso:
        return None
    try:
        dt = datetime.fromisoformat(iso.replace("Z", "+00:00"))
        return dt.timestamp()
    except (ValueError, TypeError):
        return None
 def _find_clusters(
    events: list[dict], window_s: int, threshold: int
 ) -> list[tuple[str, str, str]]:
    """Find bursts of at least ``threshold`` events within ``window_s`` seconds.
    Events whose ``timestamp_iso`` cannot be parsed are ignored. Each burst is reported
    as ``(first_timestamp_iso, last_timestamp_iso, worst_severity)``. The window is
    anchored on the burst's first event and bursts never overlap.
    """
    parsed = ((_parse_ts(ev["timestamp_iso"]), ev) for ev in events)
    timeline = sorted(
        (epoch, ev["timestamp_iso"], ev["severity"])
        for epoch, ev in parsed
        if epoch is not None
    )
    total = len(timeline)
    found: list[tuple[str, str, str]] = []
    head = 0
    while head < total:
        anchor = timeline[head][0]
        # Advance tail past every event that falls inside the window opened at head.
        tail = head
        while tail < total and timeline[tail][0] - anchor <= window_s:
            tail += 1
        if tail - head < threshold:
            head += 1
            continue
        burst = timeline[head:tail]
        worst = max(burst, key=lambda item: _SEV_RANK.get(item[2], 0))[2]
        found.append((burst[0][1], burst[-1][1], worst))
        # Resume after the burst so the next one cannot overlap it.
        head = tail
    return found
 def _incident_exists_for_cluster(
    incidents_db_path: Path, source_id: str, started_at: str, ended_at: str
 ) -> bool:
    """Tell whether an ``auto:<source_id>`` incident already overlaps the given window.
    Returns False when either bound of the window cannot be parsed. Stored incidents
    with unparseable bounds are ignored. Intervals are closed, so touching endpoints
    count as an overlap.
    """
    issue_type = f"auto:{source_id}"
    start_ts = _parse_ts(started_at)
    end_ts = _parse_ts(ended_at)
    if start_ts is None or end_ts is None:
        return False
    tid = resolve_tenant_id()
    with get_conn(incidents_db_path) as conn:
        existing = conn.execute(
            """
            SELECT started_at, ended_at FROM incidents
            WHERE issue_type = ?
              AND (tenant_id = ? OR tenant_id = '')
            """,
            (issue_type, tid),
        ).fetchall()
    def _overlaps(row) -> bool:
        ex_start = _parse_ts(row["started_at"])
        ex_end = _parse_ts(row["ended_at"])
        if ex_start is None or ex_end is None:
            return False
        # [a, b] and [c, d] overlap exactly when a <= d and b >= c.
        return ex_start <= end_ts and ex_end >= start_ts
    return any(_overlaps(row) for row in existing)
 def detect_and_create(
    db_path: Path,
    incidents_db_path: Path,
    since: str | None,
    threshold: int = _THRESHOLD,
    window_s: int = _WINDOW_S,
 ) -> dict[str, int]:
    """Create an incident for every new error cluster and return ``{"created": N}``.
    Recent ERROR/CRITICAL entries are grouped by source and clustered per source. A
    cluster is skipped when an auto-incident for that source already overlaps it. The
    incident label carries the source's total error count for this query, not the size
    of the individual cluster.
    """
    errors = _query_recent_errors(db_path, since)
    if not errors:
        return {"created": 0}
    grouped: dict[str, list[dict]] = defaultdict(list)
    for row in errors:
        grouped[row["source_id"]].append(row)
    made = 0
    for source_id, events in grouped.items():
        n = len(events)  # all errors for this source, shared by each of its clusters
        for started_at, ended_at, worst_sev in _find_clusters(events, window_s, threshold):
            if _incident_exists_for_cluster(incidents_db_path, source_id, started_at, ended_at):
                continue
            if worst_sev == "CRITICAL":
                sev_label = "critical"
            else:
                sev_label = "high"
            create_incident(
                incidents_db_path,
                label=f"Auto: {source_id} — {n} errors",
                issue_type=f"auto:{source_id}",
                started_at=started_at,
                ended_at=ended_at,
                notes="Auto-detected error cluster. Review and label as needed.",
                severity=sev_label,
            )
            logger.info(
                "Auto-incident created: source=%s window=[%s, %s] severity=%s",
                source_id, started_at, ended_at, sev_label,
            )
            made += 1
    if made:
        logger.info("Incident detector: %d new incident(s) created", made)
    return {"created": made}
 async def run_once(
    db_path: Path,
    incidents_db_path: Path,
    since: str | None,
    threshold: int = _THRESHOLD,
    window_s: int = _WINDOW_S,
 ) -> dict[str, int]:
    """Await ``detect_and_create`` in the default executor and return its result.
    Detection does blocking database work, so it is kept off the event loop.
    """
    def _detect() -> dict[str, int]:
        return detect_and_create(db_path, incidents_db_path, since, threshold, window_s)
    return await asyncio.get_running_loop().run_in_executor(None, _detect)
--- a/app/watch/watcher.py
+++ b/app/watch/watcher.py
@ -1,4 +1,4 @@
-"""Live watch: tail active log sources and glean entries in near-real-time.
+"""Live watch: tail active log sources and ingest entries in near-real-time.
 Each WatchSource runs a subprocess (journalctl -f, podman/docker logs -f)
 in a daemon thread and pipes lines through the existing ingestors into SQLite.
@ -8,6 +8,7 @@ from __future__ import annotations
 import json
 import logging
 import sqlite3
 import subprocess
 import threading
 from dataclasses import dataclass, field
@ -17,19 +18,20 @@ from typing import Iterator
 import yaml
-from app.glean import journald as journald_parser, syslog as syslog_parser
+from app.ingest import journald as journald_parser, syslog as syslog_parser
-from app.glean import plaintext as plaintext_parser, servarr as servarr_parser, plex as plex_parser
+from app.ingest import plaintext as plaintext_parser, servarr as servarr_parser, plex as plex_parser
-from app.glean import qbittorrent as qbit_parser, caddy as caddy_parser
+from app.ingest import qbittorrent as qbit_parser, caddy as caddy_parser
-from app.db import get_conn
+from app.ingest.pipeline import _detect_format
-from app.db.schema import ensure_schema
+from app.ingest.base import _compile, load_patterns, now_iso
-from app.glean.pipeline import _detect_format, _write_batch
+from app.ingest.pipeline import _write_batch, _SCHEMA
-from app.glean.base import _compile, load_patterns, now_iso
+from app.services.search import build_fts_index
 from app.services.models import RetrievedEntry
 logger = logging.getLogger(__name__)
 FLUSH_INTERVAL_SEC = 10
 FLUSH_BATCH_SIZE = 100
 FTS_SYNC_EVERY_N_FLUSHES = 3  # sync FTS every ~30s under normal load
 # ── Config ────────────────────────────────────────────────────────────────────
@ -83,7 +85,7 @@ class WatchSource:
            "source_id": self.config.source_id,
            "type": self.config.source_type,
            "running": self._thread is not None and self._thread.is_alive(),
-            "entries_gleaned": self._entry_count,
+            "entries_ingested": self._entry_count,
            "last_event": self._last_event,
            "error": self._error,
        }
@ -109,7 +111,10 @@ class WatchSource:
        patterns = load_patterns(self.pattern_file)
        compiled = _compile(patterns)
-        ensure_schema(self.db_path)
+        conn = sqlite3.connect(str(self.db_path))
        conn.execute("PRAGMA journal_mode=WAL")
        conn.executescript(_SCHEMA)
        conn.commit()
        try:
            cmd = self._build_command()
@ -122,10 +127,12 @@ class WatchSource:
                text=True,
                bufsize=1,
            )
-            self._drain(compiled)
+            self._drain(conn, compiled)
        except Exception as exc:
            self._error = str(exc)
            logger.error("Watch source %r crashed: %s", self.config.source_id, exc)
        finally:
            conn.close()
    def _build_command(self) -> list[str] | None:
        t = self.config.source_type
@ -186,7 +193,7 @@ class WatchSource:
        return []
-    def _drain(self, compiled) -> None:
+    def _drain(self, conn: sqlite3.Connection, compiled) -> None:
        """Read lines from the subprocess and flush to DB periodically."""
        assert self._proc is not None
        buffer: list[str] = []
@ -214,28 +221,29 @@ class WatchSource:
            should_flush = len(buffer) >= FLUSH_BATCH_SIZE or elapsed >= FLUSH_INTERVAL_SEC
            if buffer and should_flush:
-                flush_count = self._flush(buffer, compiled, flush_count)
+                flush_count = self._flush(conn, buffer, compiled, flush_count)
                buffer.clear()
                last_flush = datetime.now(tz=timezone.utc)
        # Flush remainder
        if buffer:
-            self._flush(buffer, compiled, flush_count)
+            self._flush(conn, buffer, compiled, flush_count)
-    def _flush(self, lines: list[str], compiled, flush_count: int) -> int:
+    def _flush(self, conn: sqlite3.Connection, lines: list[str], compiled, flush_count: int) -> int:
        ingest_time = now_iso()
        try:
            entries = self._parse_lines(lines, ingest_time, compiled)
            if entries:
-                with get_conn(self.db_path) as conn:
+                _write_batch(conn, entries)
-                    _write_batch(conn, entries)
+                conn.commit()
                    conn.commit()
                self._entry_count += len(entries)
                self._last_event = now_iso()
                if entries:
                    self._last_event = entries[-1].timestamp_iso or self._last_event
            flush_count += 1
            if flush_count % FTS_SYNC_EVERY_N_FLUSHES == 0:
                build_fts_index(self.db_path)
        except Exception as exc:
            logger.warning("Flush error for %r: %s", self.config.source_id, exc)
        return flush_count
--- a/docker-compose.submissions.yml
+++ b/docker-compose.submissions.yml
@ -1,74 +0,0 @@
 # Turnstone — CF receiving instances for external node submissions.
 #
 # These are SEPARATE instances from the main Turnstone deployment. Each node
 # that has TURNSTONE_SUBMIT_ENDPOINT configured pushes pattern-matched entries
 # here. Each instance has its own isolated database. Avocet reads these
 # databases for training data.
 #
 # Ports:
 #   8536 → submissions-contrib1  (harvest.circuitforge.tech/contrib1/*)
 #   8537 → submissions-contrib2  (harvest.circuitforge.tech/contrib2/*)
 #
 # Deploy on Heimdall:
 #   docker compose -f docker-compose.submissions.yml up -d
 #
 # Database locations:
 #   /devl/docker/turnstone-submissions/contrib1/turnstone.db
 #   /devl/docker/turnstone-submissions/contrib2/turnstone.db
 #
 # These instances have TURNSTONE_INGEST_INTERVAL=0 — they only receive POSTs,
 # they do not run their own scheduled ingest.
 services:
  submissions-contrib1:
    image: turnstone:latest
    container_name: turnstone-submissions-contrib1
    restart: unless-stopped
    ports:
      - "8536:8534"
    volumes:
      - /devl/docker/turnstone-submissions/contrib1:/data:z
      - /devl/docker/turnstone-submissions/contrib1/patterns:/patterns:ro
    environment:
      TURNSTONE_DB: /data/turnstone.db
      TURNSTONE_PATTERNS: /patterns
      TURNSTONE_SOURCE_HOST: submissions-contrib1
      TURNSTONE_INGEST_INTERVAL: "0"
      PYTHONUNBUFFERED: "1"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8534/turnstone/health"]
      interval: 30s
      timeout: 10s
      start_period: 20s
      retries: 3
    networks:
      - caddy-internal
  submissions-contrib2:
    image: turnstone:latest
    container_name: turnstone-submissions-contrib2
    restart: unless-stopped
    ports:
      - "8537:8534"
    volumes:
      - /devl/docker/turnstone-submissions/contrib2:/data:z
      - /devl/docker/turnstone-submissions/contrib2/patterns:/patterns:ro
    environment:
      TURNSTONE_DB: /data/turnstone.db
      TURNSTONE_PATTERNS: /patterns
      TURNSTONE_SOURCE_HOST: submissions-contrib2
      TURNSTONE_INGEST_INTERVAL: "0"
      PYTHONUNBUFFERED: "1"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8534/turnstone/health"]
      interval: 30s
      timeout: 10s
      start_period: 20s
      retries: 3
    networks:
      - caddy-internal
 networks:
  caddy-internal:
    name: caddy-proxy_caddy-internal
    external: true
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -1,68 +0,0 @@
 version: "3.9"
 # Turnstone with external Postgres DB.
 # Data lives in the named volume `turnstone_pgdata` — survives image rebuilds.
 # To adopt an EXISTING Postgres install, set DATABASE_URL to point at it and
 # remove the `db` service and `depends_on` blocks.
 #
 # Quick start:
 #   docker compose up -d
 #   # Then open http://localhost:8520
 services:
  db:
    image: postgres:16-alpine
    restart: unless-stopped
    environment:
      POSTGRES_DB: turnstone
      POSTGRES_USER: turnstone
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-turnstone_dev}
    volumes:
      - turnstone_pgdata:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U turnstone -d turnstone"]
      interval: 5s
      timeout: 5s
      retries: 5
  turnstone:
    build: .
    restart: unless-stopped
    ports:
      - "${TURNSTONE_PORT:-8520}:8520"
    depends_on:
      db:
        condition: service_healthy
    environment:
      # Backend selection — comment out DATABASE_URL to fall back to SQLite
      DATABASE_URL: postgresql://turnstone:${POSTGRES_PASSWORD:-turnstone_dev}@db:5432/turnstone
      TURNSTONE_TENANT_ID: ${TURNSTONE_TENANT_ID:-}
      TURNSTONE_API_KEY: ${TURNSTONE_API_KEY:-}
      TURNSTONE_GLEAN_INTERVAL: ${TURNSTONE_GLEAN_INTERVAL:-900}
      TURNSTONE_SOURCE_HOST: ${TURNSTONE_SOURCE_HOST:-}
      TURNSTONE_SUBMIT_ENDPOINT: ${TURNSTONE_SUBMIT_ENDPOINT:-}
      # --- Multi-agent diagnose pipeline ---
      TURNSTONE_MULTI_AGENT_DIAGNOSE: ${TURNSTONE_MULTI_AGENT_DIAGNOSE:-false}
      TURNSTONE_CLASSIFIER_MODEL: ${TURNSTONE_CLASSIFIER_MODEL:-}
      TURNSTONE_EMBED_BACKEND: ${TURNSTONE_EMBED_BACKEND:-}
      TURNSTONE_EMBED_MODEL: ${TURNSTONE_EMBED_MODEL:-}
      TURNSTONE_EMBED_DEVICE: ${TURNSTONE_EMBED_DEVICE:-cpu}
      # --- Cybersec scoring pipeline ---
      TURNSTONE_CYBERSEC_MODEL: ${TURNSTONE_CYBERSEC_MODEL:-}
      TURNSTONE_CYBERSEC_DEVICE: ${TURNSTONE_CYBERSEC_DEVICE:-cpu}
      TURNSTONE_CYBERSEC_THRESHOLD: ${TURNSTONE_CYBERSEC_THRESHOLD:-0.60}
      # --- Anomaly scoring pipeline ---
      TURNSTONE_ANOMALY_MODEL: ${TURNSTONE_ANOMALY_MODEL:-}
      TURNSTONE_ANOMALY_DEVICE: ${TURNSTONE_ANOMALY_DEVICE:-cpu}
      TURNSTONE_ANOMALY_THRESHOLD: ${TURNSTONE_ANOMALY_THRESHOLD:-0.75}
      TURNSTONE_ANOMALY_INTERVAL: ${TURNSTONE_ANOMALY_INTERVAL:-0}
      # --- HuggingFace model cache ---
      HF_HOME: /hf_cache
    volumes:
      - ./patterns:/app/patterns:ro
      - ./data:/app/data  # optional: persists SQLite files if DATABASE_URL unset
      - ${HF_CACHE_PATH:-/Library/Assets/LLM}:/hf_cache:ro  # shared model cache
 volumes:
  turnstone_pgdata:
    name: turnstone_pgdata
--- a/docker-standalone.sh
+++ b/docker-standalone.sh
@ -1,171 +0,0 @@
 #!/usr/bin/env bash
 # docker-standalone.sh — Turnstone Docker setup (no Compose)
 #
 # For hosts running Docker (not Podman). The container restarts automatically
 # on boot via Docker's built-in restart policy — no systemd unit needed.
 # Turnstone is a diagnostic log intelligence layer — glean service logs,
 # search by symptom, and view incidents in a lightweight web UI.
 #
 # ── Prerequisites ────────────────────────────────────────────────────────────
 #   1. Clone the repo:
 #        git clone https://git.opensourcesolarpunk.com/Circuit-Forge/turnstone.git ~/turnstone
 #        (or wherever you prefer — update REPO_DIR below)
 #
 #   2. Build the image:
 #        cd ~/turnstone && docker build -t localhost/turnstone:latest .
 #
 #   3. Create the data and patterns directories and check the config files:
 #        mkdir -p ~/turnstone/{data,patterns}
 #        # default.yaml and sources.yaml must be in ~/turnstone/patterns/. With the
 #        # clone at ~/turnstone they are already there, so no copy is needed.
 #        # Edit sources.yaml — set log paths that exist on this host.
 #
 #   4. Set any env vars (see sections below), then run this script:
 #        bash ~/turnstone/docker-standalone.sh
 #
 # ── After setup ──────────────────────────────────────────────────────────────
 #   The container starts with --restart=unless-stopped so it survives reboots.
 #   To upgrade: git pull && bash ~/turnstone/docker-standalone.sh
 #
 # ── Gleaning logs ─────────────────────────────────────────────────────────────
 #   All service logs under /opt and /var/log are accessible (read-only) inside the container.
 #   Sources are configured in patterns/sources.yaml (bind-mounted at /patterns/).
 #
 #   To glean all sources (run manually or via cron):
 #
 #     docker exec turnstone python scripts/glean_corpus.py \
 #       --sources /patterns/sources.yaml --db /data/turnstone.db
 #
 #   Example cron (every 15 minutes, add with: crontab -e):
 #     */15 * * * * docker exec turnstone python scripts/glean_corpus.py \
 #       --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-glean.log 2>&1
 #
 #   To add a new log source: edit patterns/sources.yaml — no restart needed.
 #
 # ── Adding Caddy reverse proxy ────────────────────────────────────────────────
 #   Add to /etc/caddy/Caddyfile on this host:
 #
 #     turnstone.yourdomain.tld {
 #       import protected
 #       reverse_proxy 127.0.0.1:8534
 #       import cloudflare
 #     }
 #
 #   Then: sudo systemctl reload caddy
 #
 # ── Ports ────────────────────────────────────────────────────────────────────
 #   Turnstone UI  → http://localhost:8534/turnstone/
 #
 set -euo pipefail
 # ── Paths — update to match your clone location ──────────────────────────────
 REPO_DIR="${HOME}/turnstone"
 DATA_DIR="${REPO_DIR}/data"
 PATTERNS_DIR="${REPO_DIR}/patterns"
 # HF_CACHE_DIR: override to a shared cache directory to avoid re-downloading models.
 # Example (Heimdall, where byviz/bylastic_classification_logs is already cached):
 #   export HF_CACHE_DIR=/Library/Assets/LLM
 HF_CACHE_DIR="${HF_CACHE_DIR:-${REPO_DIR}/hf-cache}"
 TZ="${TZ:-America/Los_Angeles}"
 # ── Bundle push configuration ────────────────────────────────────────────────
 # Set TURNSTONE_BUNDLE_ENDPOINT to enable the "Send Bundle" button in the
 # Incidents UI:
 #
 #   export TURNSTONE_BUNDLE_ENDPOINT=https://turnstone.circuitforge.tech/turnstone/api/bundles
 #   bash ~/turnstone/docker-standalone.sh
 #
 # ── Orchard submission (opt-in telemetry) ────────────────────────────────────
 # Set TURNSTONE_SUBMIT_ENDPOINT to push pattern-matched log entries to a CF
 # receiving instance after each glean run. Only matched entries are sent —
 # no raw log content. Used to build Avocet training data.
 #
 #   export TURNSTONE_SUBMIT_ENDPOINT=https://harvest.circuitforge.tech/your-node-id
 #   bash ~/turnstone/docker-standalone.sh
 #
 # ── Anomaly scoring pipeline (IDS / watchdog) ────────────────────────────────
 # Set TURNSTONE_ANOMALY_MODEL to enable automatic anomaly scoring after each
 # glean run.  The byviz classifier (already used by the diagnose pipeline) is
 # a good default — it's cached alongside the other models.
 #
 #   export TURNSTONE_ANOMALY_MODEL=byviz/bylastic_classification_logs
 #   export TURNSTONE_ANOMALY_THRESHOLD=0.80   # confidence floor (default 0.75)
 #   bash ~/turnstone/docker-standalone.sh
 #
 # ── Multi-agent diagnose pipeline ────────────────────────────────────────────
 # Enable the 5-stage ML pipeline to get smarter diagnose results.
 #
 # If your host has WireGuard to Heimdall's LAN:
 #   export GPU_SERVER_URL=http://<HEIMDALL_LAN_IP>:7700
 #   export TURNSTONE_MULTI_AGENT_DIAGNOSE=true
 #   bash ~/turnstone/docker-standalone.sh
 #
 # If your host has no WireGuard to Heimdall (use public cf-orch endpoint):
 #   export GPU_SERVER_URL=https://orch.circuitforge.tech
 #   export TURNSTONE_MULTI_AGENT_DIAGNOSE=true
 #   bash ~/turnstone/docker-standalone.sh
 #
 # ML models are downloaded on first diagnose run and cached in HF_CACHE_DIR.
 # First run takes a few minutes (downloading ~400MB of CPU-only models).
 # Subsequent runs are instant (models served from hf-cache/).
 #
 # ── Build image from current source ─────────────────────────────────────────
 # Rebuilt on every invocation, so re-running this script after `git pull`
 # is the upgrade path.
 echo "Building Turnstone image..."
 docker build -t localhost/turnstone:latest "${REPO_DIR}"
 # Create HF model cache dir if not present (persists across container rebuilds)
 mkdir -p "${HF_CACHE_DIR}"
 # Host-side directories that are bind-mounted at /data and /patterns below.
 mkdir -p "${DATA_DIR}" "${PATTERNS_DIR}"
 # Remove existing container if present (safe re-run).
 # `|| true` keeps `set -e` from aborting if the removal fails
 # (e.g. there is no container named "turnstone" yet).
 docker rm -f turnstone 2>/dev/null || true
 # Start the container detached:
 #   - port 8534 is published on the host
 #   - data, patterns and the HF cache are bind-mounted without :ro;
 #     /opt and /var/log are mounted read-only
 #   - HF_HOME points at the /hf-cache mount
 #   - optional settings use "${VAR:-default}" so that unset variables do not
 #     trip `set -u`; an empty default passes the variable through as ""
 docker run -d \
  --name=turnstone \
  --restart=unless-stopped \
  -p 8534:8534 \
  -v "${DATA_DIR}:/data" \
  -v "${PATTERNS_DIR}:/patterns" \
  -v "${HF_CACHE_DIR}:/hf-cache" \
  -v /opt:/opt:ro \
  -v /var/log:/var/log:ro \
  -e TURNSTONE_DB=/data/turnstone.db \
  -e TURNSTONE_SOURCE_HOST="$(hostname)" \
  -e TURNSTONE_BUNDLE_ENDPOINT="${TURNSTONE_BUNDLE_ENDPOINT:-}" \
  -e TURNSTONE_SUBMIT_ENDPOINT="${TURNSTONE_SUBMIT_ENDPOINT:-}" \
  -e PYTHONUNBUFFERED=1 \
  -e TZ="${TZ}" \
  -e TURNSTONE_MULTI_AGENT_DIAGNOSE="${TURNSTONE_MULTI_AGENT_DIAGNOSE:-false}" \
  -e GPU_SERVER_URL="${GPU_SERVER_URL:-}" \
  -e HF_HOME=/hf-cache \
  -e TURNSTONE_CLASSIFIER_MODEL="${TURNSTONE_CLASSIFIER_MODEL:-byviz/bylastic_classification_logs}" \
  -e TURNSTONE_EMBED_BACKEND="${TURNSTONE_EMBED_BACKEND:-sentence_transformers}" \
  -e TURNSTONE_EMBED_MODEL="${TURNSTONE_EMBED_MODEL:-sentence-transformers/all-MiniLM-L6-v2}" \
  -e TURNSTONE_EMBED_DEVICE="${TURNSTONE_EMBED_DEVICE:-cpu}" \
  -e TURNSTONE_CYBERSEC_MODEL="${TURNSTONE_CYBERSEC_MODEL:-}" \
  -e TURNSTONE_CYBERSEC_DEVICE="${TURNSTONE_CYBERSEC_DEVICE:-cpu}" \
  -e TURNSTONE_CYBERSEC_THRESHOLD="${TURNSTONE_CYBERSEC_THRESHOLD:-0.60}" \
  -e TURNSTONE_ANOMALY_MODEL="${TURNSTONE_ANOMALY_MODEL:-}" \
  -e TURNSTONE_ANOMALY_DEVICE="${TURNSTONE_ANOMALY_DEVICE:-cpu}" \
  -e TURNSTONE_ANOMALY_THRESHOLD="${TURNSTONE_ANOMALY_THRESHOLD:-0.75}" \
  -e TURNSTONE_ANOMALY_INTERVAL="${TURNSTONE_ANOMALY_INTERVAL:-0}" \
  localhost/turnstone:latest
 # Post-start hints only — nothing below verifies that the container came up
 # healthy; use the printed `docker ps` / `docker logs` commands to check.
 echo ""
 echo "Turnstone is starting up."
 echo "  UI: http://localhost:8534/turnstone/"
 echo ""
 echo "Check container health with:"
 echo "  docker ps"
 echo "  docker logs turnstone"
 echo ""
 echo "To glean all sources now:"
 echo "  docker exec turnstone python scripts/glean_corpus.py \\"
 echo "    --sources /patterns/sources.yaml --db /data/turnstone.db"
 echo ""
 echo "To add a new source: edit ${PATTERNS_DIR}/sources.yaml — no restart needed."
--- a/docs/air-gapped-deployment.md
+++ b/docs/air-gapped-deployment.md
@ -1,129 +0,0 @@
 # Air-Gapped Deployment Guide
 Turnstone can run entirely without internet access. This guide covers pre-downloading
 all model weights, configuring offline mode, and verifying that no outbound connections
 are made at runtime.
 ## What requires network access by default
 | Component | When | What it downloads |
 |-----------|------|------------------|
 | Stage 2 ML classifier | First diagnose run (if `TURNSTONE_CLASSIFIER_MODEL` is set) | HuggingFace model weights (~300 MB) |
 | Stage 4 sentence-transformers embedder | First diagnose run (if `TURNSTONE_EMBED_BACKEND=sentence_transformers`) | Embedding model (~130 MB) |
 | LLM inference | Every diagnose run | Nothing — calls your configured `GPU_SERVER_URL` only |
 | Log glean | Every glean run | Nothing — reads local files or SSH sources |
 If neither the classifier nor the sentence-transformers embedder is enabled, Turnstone
 makes no outbound network calls at runtime other than requests to your configured LLM
 endpoint and connections to any SSH log sources; everything else is local SQLite reads/writes.
 ## Step 1 — Pre-download models (on an internet-connected machine)
 Run these commands in the `cf` conda environment before moving to the air-gapped host:
 ```bash
 # Stage 2 ML classifier (only needed if TURNSTONE_CLASSIFIER_MODEL is set)
 conda run -n cf python -c "
 from transformers import pipeline
 pipeline('text-classification', model='byviz/bylastic_classification_logs')
 print('classifier cached')
 "
 # Stage 4 sentence-transformers embedder (only if TURNSTONE_EMBED_BACKEND=sentence_transformers)
 conda run -n cf python -c "
 from sentence_transformers import SentenceTransformer
 SentenceTransformer('BAAI/bge-small-en-v1.5')
 print('embedder cached')
 "
 ```
 Models are cached to `~/.cache/huggingface/`. Copy that directory to the air-gapped host
 at the same path before deployment.
 ## Step 2 — Pre-ingest your documentation corpus
 On the internet-connected machine, or before cutting the network:
 ```bash
 # Write your manifest (see scripts/manifests/example.yaml)
 # Then bulk-upload to the context DB:
 conda run -n cf python scripts/harvest_docs.py --manifest scripts/manifests/your-site.yaml
 ```
 The context DB (`turnstone-context.db`) is a plain SQLite file — copy it to the
 air-gapped host alongside `turnstone.db`.
 ## Step 3 — Set offline environment variables
 Add to your `.env` file (copy from `.env.example`):
 ```bash
 # Block all HuggingFace hub network access
 TURNSTONE_OFFLINE_MODE=1
 # Point models at the pre-downloaded cache (usually the default)
 # HF_HOME=/home/youruser/.cache/huggingface
 ```
 `TURNSTONE_OFFLINE_MODE=1` sets both `HF_HUB_OFFLINE=1` and `TRANSFORMERS_OFFLINE=1`
 before any model library loads. If the cache is missing or incomplete, the classifier
 falls back to the pattern-tag / regex path and embedding is skipped — diagnose still
 works, just without ML-assisted severity or suppression.
 ## Step 4 — Configure a local LLM endpoint
 Turnstone's LLM reasoning calls your `GPU_SERVER_URL`. On an air-gapped host this
 must be a local endpoint — either Ollama or a local cf-orch coordinator:
 ```bash
 # Local Ollama
 GPU_SERVER_URL=http://localhost:11434
 # Local cf-orch coordinator
 GPU_SERVER_URL=http://localhost:7700
 ```
 Pull the Ollama model before cutting network access:
 ```bash
 ollama pull llama3.1:8b
 ```
 ## Step 5 — Verify no outbound connections at runtime
 Start Turnstone and run a diagnose query, then check for unexpected outbound connections:
 ```bash
 # Watch for any connection to HuggingFace, PyPI, or other external hosts
 ss -tp | grep python
 # or
 lsof -i -n -P | grep python | grep ESTABLISHED
 ```
 Expected: only connections to your `GPU_SERVER_URL` and any SSH log sources.
 No connections to `huggingface.co`, `cdn-lfs.huggingface.co`, or `pypi.org`.
 ## Deployment checklist
 - [ ] `~/.cache/huggingface/` copied to air-gapped host (if using ML classifier or embedder)
 - [ ] `TURNSTONE_OFFLINE_MODE=1` set in `.env`
 - [ ] `GPU_SERVER_URL` points to a local inference endpoint
 - [ ] Ollama model pulled locally (if using Ollama)
 - [ ] Context DB pre-populated with runbooks via `harvest_docs.py`
 - [ ] Verified with `ss -tp` during a diagnose run that no unexpected outbound connections are made
 - [ ] `TURNSTONE_API_KEY` set if the host is accessible over the network (see API auth docs)
 ## Troubleshooting
 **"OSError: We couldn't connect to huggingface.co…"**
 The model is not in the local cache. Either download it on a connected machine and copy
 `~/.cache/huggingface/`, or unset `TURNSTONE_CLASSIFIER_MODEL` to fall back to the
 pattern-based classifier.
 **Diagnose still works but no ML severity in pipeline stages**
 Expected when running offline without a pre-cached model. Stage 2 falls back to
 `pattern_tags` → regex severity detection automatically.
 **LLM reasoning missing from diagnose output**
 Check that `GPU_SERVER_URL` is reachable from the air-gapped host and that your local
 Ollama/vLLM has the configured model pulled.
--- a/docs/compliance/checklist.md
+++ b/docs/compliance/checklist.md
@ -1,154 +0,0 @@
 # Turnstone Compliance Checklist
 **Last reviewed:** 2026-05-28
 **Applies to:** All deployments handling log data in compliance-sensitive environments.
 Symbols: ✅ satisfied by code, ⚙️ operator action required, ⚠️ known limitation, 🔲 not implemented.
 ---
 ## Data Isolation
 ### Source-level query isolation
 ✅ **`source_filter` enforced on all log-returning endpoints.**
 Every endpoint that returns log entries accepts a `source` parameter. Both the FTS5 keyword search path and the time-window scan path apply `source_id LIKE ?` before returning results. No cross-source data leakage is possible through the API.
 Relevant code: `app/services/search.py` — `search()` and `entries_in_window()`.
 ### FTS5 cross-source leakage
 ✅ **FTS5 index includes `source_id` as an UNINDEXED column; all queries filter on it.**
 The virtual table schema stores `source_id` alongside each entry. Query functions always join back to the base table or filter the FTS result set by `source_id`. There is no full-corpus FTS path that ignores source.
 ### SQLite file permissions
 ⚙️ **Operator responsibility — not enforced by Turnstone.**
 Turnstone does not set file permissions on the database. Recommended posture for multi-user hosts:
 ```bash
 # Restrict DB to the Turnstone process user only
 chmod 600 /devl/turnstone-cluster/data/turnstone.db
 chmod 600 /devl/turnstone-cluster/data/turnstone-context.db
 chown turnstone:turnstone /devl/turnstone-cluster/data/
 ```
 Run Turnstone as a dedicated non-root user via systemd `User=turnstone`.
 ---
 ## Audit Logging
 ### API query logging
 ✅ **Implemented as FastAPI middleware (`turnstone.audit` logger).**
 Every request to `/turnstone/api/*` is logged at INFO level with:
 - Timestamp (from the logging handler)
 - HTTP method
 - Path + query string
 - Response status code
 - Request duration (ms)
 Body content is never logged. Example output:
 ```
 2026-05-28 14:23:01 INFO turnstone.audit  GET /turnstone/api/diagnose/stream?source=heimdall-journal 200 1843ms
 ```
 To capture audit logs to a separate file, configure the `turnstone.audit` logger in your logging config:
 ```python
 # In your uvicorn startup or log config YAML:
 logging.getLogger("turnstone.audit").addHandler(
    logging.FileHandler("/var/log/turnstone/audit.log")
 )
 ```
 ### Glean operation logging
 ✅ **Glean scheduler logs source ID, entry count, and duration at INFO level.**
 Relevant logger: `app.tasks.glean_scheduler` — logs start, per-source stats, and errors.
 Log example:
 ```
 INFO app.tasks.glean_scheduler  Batch glean complete in 12.4s — {'heimdall-journal': 847, 'plex': 12}
 ```
 ### Error logging
 ✅ **Errors logged with source context but without PII in message fields.**
 Exception handlers in `rest.py` log at ERROR level with the endpoint path and error type. Raw log entry text is not included in error messages. Stack traces go to the `uvicorn.error` logger.
 ---
 ## LLM / PII Egress
 ### Multi-agent pipeline (recommended path, `TURNSTONE_MULTI_AGENT_DIAGNOSE=true`)
 ✅ **Raw log message text is NOT sent to the LLM.**
 Stage 5 (synthesizer) sends only:
 - The operator's query string
 - Timeline statistics (cluster counts, burst counts, gap counts — no entry text)
 - Hypothesis titles from Stage 3 (derived labels, not raw messages)
 - Runbook context from the operator's own uploaded documents
 No raw `MESSAGE` field content reaches the LLM in this path. Review: `app/services/diagnose/synthesizer.py`.
 ### Legacy single-call path (`TURNSTONE_MULTI_AGENT_DIAGNOSE` unset or `false`)
 ⚠️ **Raw log message text (truncated to 200 chars) IS sent to the LLM.**
 The legacy `summarize()` function in `app/services/llm.py` builds a prompt that includes up to 25 log entries with their `text` field (truncated). If log entries contain hostnames, usernames, IP addresses, or other PII, those values are included in the LLM call.
 **Operator action for PII-sensitive deployments:** Enable `TURNSTONE_MULTI_AGENT_DIAGNOSE=true` to use the pipeline path, which does not expose raw log text.
 ### Avocet harvester (corpus export)
 ✅ **Only pattern-tagged entries are exported; export can be disabled.**
 The harvester (`harvester/harvester.py`) only POSTs entries that matched at least one named pattern. It does not export the full corpus. Disable by leaving `TURNSTONE_SUBMIT_ENDPOINT` unset (the default).
 ### External telemetry
 ✅ **None.** Turnstone makes no calls to Sentry, Segment, Amplitude, or any analytics service. The only outbound network calls are:
 - Your configured `GPU_SERVER_URL` (LLM inference, operator-controlled)
 - HuggingFace Hub (model downloads — disable with `TURNSTONE_OFFLINE_MODE=1`)
 - SSH connections to configured remote log sources (operator-defined)
 ---
 ## Configuration Hardening
 For compliance deployments, set these in `.env`:
 ```bash
 # Block HuggingFace network access (model weights pre-downloaded)
 TURNSTONE_OFFLINE_MODE=1
 # Require bearer token for all API calls
 TURNSTONE_API_KEY=<strong-random-token>
 # Use multi-agent pipeline (no raw log text to LLM)
 TURNSTONE_MULTI_AGENT_DIAGNOSE=true
 # Disable Avocet corpus push if not needed
 # (leave TURNSTONE_SUBMIT_ENDPOINT unset)
 ```
 ---
 ## Outstanding Items
 🔲 **Per-user access control** — all authenticated clients share the same API key. There is no per-user identity, role separation, or per-source ACL. Track as a future enhancement.
 🔲 **Audit log retention policy** — Turnstone writes audit events to the logging system but does not manage log rotation or retention. Operator must configure log rotation (logrotate, systemd journal limits, etc.).
 🔲 **Encrypted DB at rest** — SQLite does not support transparent encryption. For encryption at rest, use full-disk encryption (LUKS) or an encrypted filesystem on the host.
 🔲 **TLS between client and Turnstone** — Turnstone binds to HTTP by default. For production, place Caddy or nginx in front and terminate TLS there. Do not expose port 8534 directly over untrusted networks.
 ---
 ## Data Subject Rights (GDPR / CCPA)
 ### Right to erasure — anonymized records
 ⚠️ **Anonymized log data cannot be selectively deleted on a per-subject basis.**
 When PII sanitization is applied to a bundle export (redacting IP addresses, usernames, hostnames), the resulting data is no longer linked to a specific data subject. As a consequence, Turnstone cannot identify which stored log entries relate to that subject and cannot fulfill a targeted deletion request for records that have already been anonymized.
 **Operators must clearly disclose this limitation to data subjects before export:**
 > "Anonymized log data exported or submitted from this system cannot be individually identified or selectively deleted. If data was exported in anonymized form, Turnstone cannot distinguish your records from others in the exported set. The right to erasure does not apply to data that is no longer personally identifiable."
 This is consistent with GDPR Recital 26, which excludes anonymized data from the regulation's scope. However, the original (pre-anonymization) records in Turnstone's local SQLite database *can* be deleted by source ID via the Sources view (Delete all entries for source) or directly via the database.
 **Recommended operator practice:**
 - Maintain a log of which bundles were exported, when, and to whom — the audit log (`turnstone.audit`) covers this.
 - Provide data subjects with the bundle export timestamp and source scope so they can verify what was shared.
 - For full erasure of pre-anonymization records: use `DELETE /api/sources/{source_id}` to purge all entries for a given source from the local DB.
--- a/docs/tautulli-setup.md
+++ b/docs/tautulli-setup.md
@ -39,7 +39,7 @@ notification agent:
 ## Webhook URL
 ```
-http://<turnstone-host>:8534/turnstone/api/glean/tautulli
+http://<turnstone-host>:8534/turnstone/api/ingest/tautulli
 ```
 Replace `<turnstone-host>` with the hostname or IP of the machine running
--- a/harvester/harvester.py
+++ b/harvester/harvester.py
@ -2,7 +2,7 @@
 """Turnstone Harvester — collect logs and ship them to a Turnstone instance.
 Subcommands:
-    push      Read sources.yaml, POST each log file to Turnstone /api/glean/upload
+    push      Read sources.yaml, POST each log file to Turnstone /api/ingest/upload
    incident  Tag an incident on the remote Turnstone instance
 Usage:
@ -97,8 +97,8 @@ def cmd_push(args: argparse.Namespace) -> int:
        logger.warning("No sources defined in %s", sources_path)
        return 0
-    upload_url = args.url.rstrip("/") + "/turnstone/api/glean/upload"
+    upload_url = args.url.rstrip("/") + "/turnstone/api/ingest/upload"
-    total_gleaned = 0
+    total_ingested = 0
    errors = 0
    for src in sources:
@ -110,9 +110,9 @@ def cmd_push(args: argparse.Namespace) -> int:
        logger.info("Pushing %s (%s) ...", src_id, src_path)
        try:
            result = _post_file(upload_url, src_path, src_id)
-            count = result.get("gleaned", 0)
+            count = result.get("ingested", 0)
-            total_gleaned += count
+            total_ingested += count
-            logger.info("  %s: %d entries gleaned", src_id, count)
+            logger.info("  %s: %d entries ingested", src_id, count)
        except urllib.error.HTTPError as exc:
            logger.error("  %s: HTTP %d — %s", src_id, exc.code, exc.read().decode(errors="replace"))
            errors += 1
@ -120,7 +120,7 @@ def cmd_push(args: argparse.Namespace) -> int:
            logger.error("  %s: %s", src_id, exc)
            errors += 1
-    logger.info("Done. Total gleaned: %d entries, errors: %d", total_gleaned, errors)
+    logger.info("Done. Total ingested: %d entries, errors: %d", total_ingested, errors)
    return 1 if errors else 0
--- a/harvester/sources.example.yaml
+++ b/harvester/sources.example.yaml
@ -42,10 +42,3 @@ sources:
  # Jellyfin
  # - id: jellyfin
  #   path: /opt/jellyfin/log/jellyfin.log
  # Wazuh SIEM — alerts.json on the Wazuh manager
  # Turnstone auto-detects this format; source_id is qualified per agent automatically.
  # For push-based ingestion from Wazuh custom integrations, use:
  #   POST /api/glean/wazuh/alert  (single alert JSON body)
  # - id: wazuh
  #   path: /var/ossec/logs/alerts/alerts.json
--- a/manage.sh
+++ b/manage.sh
@ -38,15 +38,6 @@ PATTERN_DIR="${TURNSTONE_PATTERNS:-$([[ -d /devl/turnstone-cluster/patterns ]] &
 CONDA_BASE="${CONDA_BASE:-/devl/miniconda3}"
 PYTHON="${CONDA_BASE}/envs/cf/bin/python"
 # Source .env if present — loads TURNSTONE_MULTI_AGENT_DIAGNOSE, GPU_SERVER_URL, etc.
 # set -a exports each assignment; values in .env override any already set in the environment.
 if [[ -f "${SCRIPT_DIR}/.env" ]]; then
    set -a
    # shellcheck source=/dev/null
    source "${SCRIPT_DIR}/.env"
    set +a
 fi
 # ── Helpers ───────────────────────────────────────────────────────────────────
 _is_alive() {
@ -129,9 +120,9 @@ usage() {
    echo -e "    ${GREEN}dev${NC}                      uvicorn --reload (:${API_PORT}) + Vite HMR (:${VITE_PORT})"
    echo ""
    echo "  Data:"
-    echo -e "    ${GREEN}glean PATH [DB]${NC}          Glean a log file or corpus directory"
+    echo -e "    ${GREEN}ingest PATH [DB]${NC}         Ingest a log file or corpus directory"
-    echo -e "    ${GREEN}glean-plex [HOST]${NC}        Pull Plex log from Cass (or HOST) and glean"
+    echo -e "    ${GREEN}ingest-plex [HOST]${NC}       Pull Plex log from Cass (or HOST) and ingest"
-    echo -e "    ${GREEN}glean-qbit [HOST]${NC}        Pull qBittorrent log locally or from HOST via SSH"
+    echo -e "    ${GREEN}ingest-qbit [HOST]${NC}       Pull qBittorrent log locally or from HOST via SSH"
    echo -e "    ${GREEN}build-fts${NC}                Rebuild the FTS search index"
    echo ""
    echo "  Tests:"
@ -143,8 +134,8 @@ usage() {
    echo "  Examples:"
    echo "    ./manage.sh start"
    echo "    ./manage.sh dev"
-    echo "    ./manage.sh glean corpus/raw/"
+    echo "    ./manage.sh ingest corpus/raw/"
-    echo "    ./manage.sh glean corpus/raw/ data/custom.db"
+    echo "    ./manage.sh ingest corpus/raw/ data/custom.db"
    echo ""
 }
@ -240,15 +231,15 @@ case "$CMD" in
        (cd web && npm run dev -- --port "$VITE_PORT")
        ;;
-    glean)
+    ingest)
        if [[ $# -lt 1 ]]; then
-            error "Usage: ./manage.sh glean <file_or_dir> [DB_PATH]"
+            error "Usage: ./manage.sh ingest <file_or_dir> [DB_PATH]"
        fi
-        info "Gleaning $1 → ${2:-$DB}…"
+        info "Ingesting $1 → ${2:-$DB}…"
-        "$PYTHON" scripts/glean_corpus.py "$1" "${2:-$DB}"
+        "$PYTHON" scripts/ingest_corpus.py "$1" "${2:-$DB}"
        ;;
-    glean-plex)
+    ingest-plex)
        PLEX_HOST="${1:-cass}"
        PLEX_LOG_DIR="/var/lib/plexmediaserver/Library/Application Support/Plex Media Server/Logs"
        TMP_DIR="/tmp/turnstone-plex-$$"
@ -273,16 +264,16 @@ case "$CMD" in
            ssh "$PLEX_HOST" "cat '${remote_path}'" > "$local_path"
        done
-        info "Gleaning ${#REMOTE_LOGS[@]} log file(s) into ${DB}…"
+        info "Ingesting ${#REMOTE_LOGS[@]} log file(s) into ${DB}…"
        for f in "$TMP_DIR"/*.log; do
-            "$PYTHON" scripts/glean_corpus.py "$f" "$DB"
+            "$PYTHON" scripts/ingest_corpus.py "$f" "$DB"
        done
        rm -rf "$TMP_DIR"
        info "Done. Restarting server…"
        exec bash "$0" restart
        ;;
-    glean-qbit)
+    ingest-qbit)
        QBIT_HOST="${1:-}"
        # Default log locations in priority order
        QBIT_LOG_PATHS=(
@ -325,8 +316,8 @@ case "$CMD" in
            info "  ← ${LOCAL_LOG}"
        fi
-        info "Gleaning into ${DB}…"
+        info "Ingesting into ${DB}…"
-        "$PYTHON" scripts/glean_corpus.py "${TMP_DIR}"/*.log "$DB"
+        "$PYTHON" scripts/ingest_corpus.py "${TMP_DIR}"/*.log "$DB"
        rm -rf "$TMP_DIR"
        info "Done. Restarting server…"
        exec bash "$0" restart
--- a/patterns/default.yaml
+++ b/patterns/default.yaml
@ -2,101 +2,83 @@
 # Each matched pattern name is stored on RetrievedEntry.matched_patterns and
 # used to boost retrieval relevance for diagnostic queries.
 #
-# domain: groups patterns into service health domains for triage-level summaries.
+# Add domain-specific patterns here. Patterns are applied in order; multiple
-# Valid domains: service_health | networking | auth | storage | memory |
+# can match a single entry.
 #                kernel | power | web_proxy | media | gpu | audio
 #
 # Patterns are applied in order; multiple can match a single entry.
 patterns:
  - name: service_restart
    pattern: "(restarting|restart requested|service.*start)"
    severity: WARN
    domain: service_health
    description: Service restart detected
  - name: connection_lost
    pattern: "(connection (lost|dropped|refused|timed? out)|disconnect(ed)?)"
    severity: ERROR
    domain: networking
    description: Network or device connection failure
  - name: auth_failure
    pattern: "(auth(entication)? (failed?|error|denied)|permission denied|unauthorized)"
    severity: ERROR
    domain: auth
    description: Authentication or authorization failure
  - name: oom
    pattern: "(out of memory|OOM|killed process|cannot allocate)"
    severity: CRITICAL
    domain: memory
    description: Out-of-memory condition
  - name: segfault
    pattern: "(segmentation fault|segfault|SIGSEGV|core dump)"
    severity: CRITICAL
    domain: kernel
    description: Process crash or memory corruption
  - name: disk_full
    pattern: "(no space left|disk full|filesystem.*full|ENOSPC)"
    severity: ERROR
    domain: storage
    description: Storage capacity exhausted
  - name: timeout
    pattern: "(timed? out|deadline exceeded|operation timed?)"
    severity: WARN
    domain: networking
    description: Operation timeout
  - name: caddy_tls_error
    pattern: "(acme|certificate|tls).*(error|fail|invalid|expired|renew)"
    severity: ERROR
    domain: web_proxy
    description: Caddy TLS or certificate error
  - name: caddy_config_error
    pattern: "(config|caddyfile|directive).*(error|invalid|unknown|unrecognized)"
    severity: ERROR
    domain: web_proxy
    description: Caddy configuration error
  - name: caddy_auth_error
    pattern: "(forward_auth|basicauth|basic_auth).*(error|fail|denied|invalid|unreachable)"
    severity: ERROR
    domain: web_proxy
    description: Caddy authentication middleware failure
  - name: caddy_upstream_error
    pattern: "(upstream|backend|reverse.proxy).*(error|fail|unreachable|refused|timeout)"
    severity: ERROR
    domain: web_proxy
    description: Caddy upstream/backend failure
  - name: service_update
    pattern: "(upgraded?|updated?|installing|dpkg|apt|package).*(caddy|nginx|apache|proxy)"
    severity: INFO
    domain: web_proxy
    description: Web server package update detected
  - name: power_failure
    pattern: "(power (fail|loss|outage|cut)|ups|battery|shutdown.*power|lost power)"
    severity: CRITICAL
    domain: power
    description: Power failure or UPS event
  - name: network_interface
    pattern: "(eth[0-9]|ens[0-9]|enp[0-9]|wlan[0-9]).*(down|up|carrier|link)"
    severity: WARN
    domain: networking
    description: Network interface state change
  - name: ip_change
    pattern: "(new ip|ip.*(changed|assigned|address)|dhcp.*(ack|offer|bound|renew))"
    severity: INFO
    domain: networking
    description: IP address change or DHCP event
  # ── System / journald patterns ─────────────────────────────────────────────
@ -104,55 +86,46 @@ patterns:
  - name: systemd_fail
    pattern: "(Failed to start|failed with result|entered failed state|start request repeated too quickly|Main process exited)"
    severity: ERROR
    domain: service_health
    description: systemd service failed to start or crashed
  - name: oom_kill
    pattern: "(Killed process|oom.kill|oom_kill_process|Out of memory: Kill|memory cgroup out of memory)"
    severity: CRITICAL
    domain: memory
    description: Kernel OOM killer terminated a process
  - name: disk_hw_error
    pattern: "(ata[0-9]|sd[a-z]|nvme[0-9]).*(error|failed|reset|timeout|exception|EH|FAILED COMMAND)"
    severity: ERROR
    domain: storage
    description: Storage device hardware error or reset
  - name: fs_error
    pattern: "(EXT4-fs error|XFS.*error|BTRFS.*error|I/O error|blk_update_request.*error|buffer I/O error)"
    severity: ERROR
    domain: storage
    description: Filesystem or block I/O error
  - name: kernel_error
    pattern: "(kernel: BUG|kernel panic|Oops:|general protection fault|Call Trace|RIP:.*[0-9a-f]{16})"
    severity: CRITICAL
    domain: kernel
    description: Kernel bug, panic, or oops — system may be unstable
  - name: ssh_brute
    pattern: "(Failed password|Invalid user|authentication failure|Connection closed by authenticating user).*(sshd|ssh)"
    severity: WARN
    domain: auth
    description: SSH authentication failure — possible brute force
  - name: container_crash
    pattern: "(container.*exited|oci runtime.*error|podman.*error|docker.*error|container.*killed|OCI.*failed)"
    severity: ERROR
    domain: service_health
    description: Container runtime error or unexpected exit
  - name: smart_error
    pattern: "(smartd|SMART.*error|reallocated sector|pending sector|uncorrectable sector|Current_Pending_Sector)"
    severity: CRITICAL
    domain: storage
    description: SMART disk health warning — potential drive failure
  - name: nfs_error
    pattern: "(nfs.*error|nfs.*timeout|RPC.*timed out|nfs4.*server.*not responding|mount.*nfs.*failed)"
    severity: ERROR
    domain: networking
    description: NFS mount or RPC timeout
  # Add device/service-specific patterns below this line:
@ -160,156 +133,49 @@ patterns:
  - name: qbit_tracker_error
    pattern: "(tracker|announce).*(not working|error|fail|unreachable|timeout|refused|invalid)"
    severity: WARN
    domain: media
    description: qBittorrent tracker connection or announce failure
  - name: qbit_port_bind
    pattern: "(couldn't? listen|bind.*fail|port.*in use|listening.*fail)"
    severity: CRITICAL
    domain: media
    description: qBittorrent failed to bind listen port — firewall or port conflict
  - name: qbit_disk_error
    pattern: "(cannot (write|open|create)|disk.*error|i/o error|file.*fail|write.*fail)"
    severity: ERROR
    domain: media
    description: qBittorrent disk write or file access failure
  - name: qbit_hash_fail
    pattern: "(hash.*(check|fail|mismatch)|recheck|piece.*fail)"
    severity: WARN
    domain: media
    description: qBittorrent torrent hash verification failure — possible corrupt data
  - name: qbit_peer_ban
    pattern: "(peer.*ban|banned.*peer|blocked.*peer)"
    severity: INFO
    domain: media
    description: qBittorrent peer banned (encryption enforcement or bad actor)
  - name: qbit_download_complete
    pattern: "(download.*complet|torrent.*finish|has finished downloading)"
    severity: INFO
    domain: media
    description: qBittorrent torrent download completed
  - name: qbit_ratio_limit
    pattern: "(ratio.*reach|seeding.*limit|stop.*seeding|upload.*limit)"
    severity: INFO
    domain: media
    description: qBittorrent seeding ratio or time limit reached
  - name: qbit_session_error
    pattern: "(session.*error|couldn't? resume|resume.*fail|torrent.*error)"
    severity: ERROR
    domain: media
    description: qBittorrent session or resume data error
  - name: plex_eae_failure
    pattern: "(EAE timeout|EAE not running|eac3_eae.*error reading output|Error submitting packet to decoder.*I/O error)"
    severity: ERROR
    domain: media
    description: Plex EasyAudioEncoder (EAC3 Dolby audio transcoder) crashed — service restart required
-  # - name: ext_device_error
+  # - name: avcx_device_error
  #   pattern: "ERR-\d{4}"
  #   severity: ERROR
-  #   description: vendor device structured error code
+  #   description: AVCX device error code
  # ── VPN / tunnel patterns ──────────────────────────────────────────────────
  - name: vpn_tunnel_fail
    pattern: "(wg-quick@|wireguard|spirit-city-tunnel|cf-orch-tunnel|cf-tunnel|openvpn|vpn).*(failed|error|exit.code|timeout|connection reset)"
    severity: ERROR
    domain: networking
    description: VPN or WireGuard tunnel service failed — remote node may be unreachable
  - name: vpn_handshake
    pattern: "(handshake|peer.*allowed|WireGuard|wg-quick).*(initiating|complete|timeout|fail|retrying)"
    severity: WARN
    domain: networking
    description: WireGuard peer handshake event — track for timeout/retry patterns
  - name: dns_degraded
    pattern: "(degraded feature set|DNS.*fall.?back|resolver.*fail|NXDOMAIN|DNS.*timeout|SERVFAIL)"
    severity: WARN
    domain: networking
    description: DNS resolver degradation or fallback — often precedes connectivity failures
  # ── GPU / NVIDIA driver patterns ───────────────────────────────────────────
  - name: nvidia_api_mismatch
    pattern: "(NVRM: API mismatch|nvidia.*version mismatch|driver.*mismatch|kernel module.*mismatch)"
    severity: ERROR
    domain: gpu
    description: NVIDIA kernel module version does not match userspace driver — GPU ops will fail until driver reinstalled
  - name: nvidia_xid
    pattern: "(NVRM: Xid|Xid.*(error|critical)|GPU.*Xid)"
    severity: CRITICAL
    domain: gpu
    description: NVIDIA Xid error — GPU hardware fault or driver crash (check nvidia-smi error code)
  - name: nvidia_gpu_reset
    pattern: "(nvidia.*reset|GPU.*reset|NVRM.*reset|nvml.*error|NVLink.*fail)"
    severity: ERROR
    domain: gpu
    description: NVIDIA GPU reset or NVLink fault — possible hardware instability
  # ── Power / thermal patterns ───────────────────────────────────────────────
  - name: acpi_error
    pattern: "(ACPI.*failed|ACPI.*error|ACPI.*_DSM|acpi.*_PPC|ACPI BIOS Error)"
    severity: WARN
    domain: kernel
    description: ACPI firmware evaluation failure — often harmless but can indicate BIOS/power management issues
  - name: thermal_throttle
    pattern: "(CPU.*throttl|thermal throttl|Package temp|TjMax|temperature.*critical|No RAPL|RAPL.*not available)"
    severity: WARN
    domain: power
    description: CPU/GPU thermal throttling or thermal management subsystem unavailable
  - name: undervoltage
    pattern: "(under.?voltage|brownout|voltage.*(low|critical)|power supply.*insufficient)"
    severity: ERROR
    domain: power
    description: Undervoltage event — instability risk, check PSU and cable connections
  # ── Audio / PipeWire / ALSA ──────────────────────────────────────────────────
  - name: pipewire_overflow
    pattern: "(OVERFLOW channel|stream.*OVERFLOW|protocol.pulse.*OVERFLOW)"
    severity: WARN
    domain: audio
    description: PipeWire-Pulse stream buffer overflow — client not draining audio fast enough; usually indicates a quantum/period-size mismatch or CPU scheduling issue
  - name: pipewire_underrun
    pattern: "(pw\\.node.*underrun|spa\\.alsa.*underrun|alsa.*underrun|UNDERRUN)"
    severity: WARN
    domain: audio
    description: PipeWire/ALSA buffer underrun (xrun) — audio thread missed its deadline; increase quantum or period-size for the affected device
  - name: alsa_xrun
    pattern: "(ALSA.*[Xx][Rr][Uu][Nn]|alsa.*xrun|snd_pcm.*xrun|pcm.*underrun|pcm.*overrun)"
    severity: WARN
    domain: audio
    description: ALSA xrun (hardware buffer overrun/underrun) — increase api.alsa.period-size via WirePlumber rule or raise clock.min-quantum
  - name: pipewire_quantum_mismatch
    pattern: "(quantum.*mismatch|rate.*mismatch|sample.rate.*mismatch|resampl.*fail|can.*t adapt quantum)"
    severity: WARN
    domain: audio
    description: PipeWire quantum or sample-rate mismatch between nodes — check for mixed 44100/48000 streams; may need per-device WirePlumber rules
  - name: pipewire_node_error
    pattern: "(pw\\.node.*error|node.*ERROR|pipewire.*failed to set|spa\\.alsa.*error|alsa_sink.*error|alsa_source.*error)"
    severity: ERROR
    domain: audio
    description: PipeWire node error — device may be unavailable or misconfigured
  - name: pipewire_jackdbus_missing
    pattern: "(jackdbus.*reply|jackaudio.*service.*not.*provided|org\\.jackaudio\\.service)"
    severity: INFO
    domain: audio
    description: PipeWire JACK D-Bus probe — JACK not running; benign on non-JACK systems, fires once per PipeWire restart
--- a/patterns/sources-cluster.yaml
+++ b/patterns/sources-cluster.yaml
@ -1,15 +1,15 @@
-# Turnstone log sources — Heimdall cluster glean.
+# Turnstone log sources — Heimdall cluster ingest.
 # Covers: Heimdall (local), Navi, Sif, Cass, Strahl (SSH-collected),
 #         Docker services on Heimdall, and network device syslog.
 #
-# Collected by scripts/collect_cluster_logs.sh before each glean run.
+# Collected by scripts/collect_cluster_logs.sh before each ingest run.
 # All paths are container-side (/data/ = bind-mount of /devl/turnstone-cluster/data/).
 #
-# Cron (collect + glean, every 15 min):
+# Cron (collect + ingest, every 15 min):
 #   */15 * * * * bash /Library/Development/CircuitForge/turnstone/scripts/collect_cluster_logs.sh && \
-#     docker exec turnstone-cluster python scripts/glean_corpus.py \
+#     docker exec turnstone-cluster python scripts/ingest_corpus.py \
 #       --sources /patterns/sources-cluster.yaml --db /data/turnstone.db \
-#       >> /var/log/turnstone-cluster-glean.log 2>&1
+#       >> /var/log/turnstone-cluster-ingest.log 2>&1
 sources:
  # ── Heimdall (local) ─────────────────────────────────────────────────────────
@ -48,8 +48,8 @@ sources:
  # ── Network syslog (router, switches, UniFi APs) ─────────────────────────────
  # Written by syslog-receiver.service (UDP 5140 → /devl/turnstone-cluster/data/network-syslog.txt).
  # Configure devices to send syslog to Heimdall:5140.
-  # UniFi: Settings → System → Remote Logging → Syslog Host = <YOUR_HOST_IP>:5140
+  # UniFi: Settings → System → Remote Logging → Syslog Host = 10.1.10.71:5140
-  # Ubiquiti EdgeRouter: set system syslog host <YOUR_HOST_IP> facility all level debug
+  # Ubiquiti EdgeRouter: set system syslog host 10.1.10.71 facility all level debug
-  # Managed switches: varies by vendor — target <YOUR_HOST_IP> UDP 5140
+  # Managed switches: varies by vendor — target 10.1.10.71 UDP 5140
  - id: network-syslog
    path: /data/network-syslog.txt
--- a/patterns/sources-example.yaml
+++ b/patterns/sources-example.yaml
@ -1,50 +0,0 @@
 # Turnstone log sources — example node (Docker/Podman, self-hosted media stack)
 #
 # Copy this file to your patterns directory and edit for your setup.
 # Container paths: /opt and /var/log are bind-mounted read-only.
 # journal-export.jsonl is written to /data/ by export_journal.sh (run via cron before glean).
 #
 # Add or remove sources freely. Missing paths are skipped with a warning.
 sources:
  # ── System ────────────────────────────────────────────────────────────────
  # Requires: cron job to run export_journal.sh before each glean.
  # Example cron (every 15 min — edit paths for your install):
  #   */15 * * * * /opt/turnstone/scripts/export_journal.sh \
  #     /opt/turnstone-data/
  - id: system-journal
    path: /data/journal-export.jsonl
  - id: dmesg
    path: /data/dmesg-export.txt
  # ── Servarr stack ─────────────────────────────────────────────────────────
  - id: sonarr
    path: /opt/sonarr/config/logs/sonarr.0.txt
  - id: radarr
    path: /opt/radarr/config/logs/radarr.0.txt
  - id: bazarr
    path: /opt/bazarr/config/log/bazarr.log
  - id: prowlarr
    path: /opt/prowlarr/config/logs/prowlarr.0.txt
  # ── Media server / tracking ────────────────────────────────────────────────
  - id: tautulli
    path: /opt/tautulli/config/logs/plex_websocket.log
  # ── Download automation ────────────────────────────────────────────────────
  - id: autoscan
    path: /opt/autoscan/config/autoscan.log
  # ── Web / proxy ────────────────────────────────────────────────────────────
  - id: organizr-nginx
    path: /opt/organizr/log/nginx/error.log
  - id: organizr-app
    path: /opt/organizr/www/organizr/server.log
  - id: nextcloud-nginx
    path: /opt/nextcloud/config/log/nginx/error.log
--- a/patterns/sources.yaml
+++ b/patterns/sources.yaml
@ -1,8 +1,8 @@
 # Turnstone log sources — edit this file to add or remove services.
 # NOTE: the system-journal entry requires export_journal.sh to run on the HOST
-# before the container glean step. See crontab setup instructions in the README.
+# before the container ingest step. See crontab setup instructions in the README.
-# Run glean manually:
+# Run ingest manually:
-#   sudo podman exec turnstone python scripts/glean_corpus.py \
+#   sudo podman exec turnstone python scripts/ingest_corpus.py \
 #     --sources /patterns/sources.yaml --db /data/turnstone.db
 #
 # Paths here are container-side paths under the /opt bind mount.
@ -12,7 +12,7 @@
 sources:
  # ── System (exported by export_journal.sh on the host) ───────────────────
  # journal-export.jsonl and dmesg-export.txt are written to /opt/turnstone/data/
-  # by the export script before each glean run.
+  # by the export script before each ingest run.
  - id: system-journal
    path: /data/journal-export.jsonl
@ -70,27 +70,3 @@ sources:
  - id: jellyseerr
    path: /opt/jellyseerr/config/logs/jellyseerr.log
  # ── MQTT / IoT (live — subscribe mode, no path needed) ───────────────────
  # Requires: pip install circuitforge-core[mqtt]
  # These sources are handled by the live MQTT subscriber task (not batch glean).
  # Uncomment and configure to enable.
  #
  # Meshtastic MQTT bridge (node must have MQTT uplink enabled):
  # - id: meshtastic-home
  #   type: mqtt
  #   broker_host: 10.1.10.5   # IP of your local MQTT broker (e.g. Mosquitto on Huginn)
  #   broker_port: 1883
  #   topics:
  #     - msh/#                # all Meshtastic regions; use msh/us-east/# to narrow
  #
  # Generic IoT sensors:
  # - id: iot-home
  #   type: mqtt
  #   broker_host: localhost
  #   broker_port: 1883
  #   topics:
  #     - home/+/temperature
  #     - home/+/humidity
  #     - home/+/motion
  #   severity: INFO
--- a/podman-standalone.sh
+++ b/podman-standalone.sh
@ -2,7 +2,7 @@
 # podman-standalone.sh — Turnstone rootful Podman setup (no Compose)
 #
 # For hosts running system Podman (non-rootless) with systemd.
-# Turnstone is a diagnostic log intelligence layer — glean service logs,
+# Turnstone is a diagnostic log intelligence layer — ingest service logs,
 # search by symptom, and view incidents in a lightweight web UI.
 #
 # ── Prerequisites ────────────────────────────────────────────────────────────
@ -28,25 +28,25 @@
 #   sudo systemctl daemon-reload
 #   sudo systemctl enable --now turnstone
 #
-# ── Gleaning logs ─────────────────────────────────────────────────────────────
+# ── Ingesting logs ────────────────────────────────────────────────────────────
 #   All service logs under /opt are accessible inside the container.
 #   Sources are configured in patterns/sources.yaml (bind-mounted at /patterns/).
 #
-#   To glean all sources (run manually or via cron):
+#   To ingest all sources (run manually or via cron):
 #
-#     sudo podman exec turnstone python scripts/glean_corpus.py \
+#     sudo podman exec turnstone python scripts/ingest_corpus.py \
 #       --sources /patterns/sources.yaml --db /data/turnstone.db
 #
 #   Example cron (every 15 minutes, add to root's crontab with: sudo crontab -e):
-#     */15 * * * * podman exec turnstone python scripts/glean_corpus.py \
+#     */15 * * * * podman exec turnstone python scripts/ingest_corpus.py \
-#       --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-glean.log 2>&1
+#       --sources /patterns/sources.yaml --db /data/turnstone.db >> /var/log/turnstone-ingest.log 2>&1
 #
 #   To add a new log source: edit /opt/turnstone/patterns/sources.yaml — no restart needed.
 #
 # ── Adding Caddy reverse proxy ────────────────────────────────────────────────
 #   Add to /etc/caddy/Caddyfile:
 #
-#     turnstone.your-domain.example {
+#     turnstone.xanderland.tv {
 #       import protected
 #       reverse_proxy 10.0.0.10:8534
 #       import cloudflare
@ -59,14 +59,10 @@
 #
 set -euo pipefail
-# Auto-detect repo from script location — works whether cloned to /opt/turnstone
+REPO_DIR=/opt/turnstone
-# or to /Library/Development/CircuitForge/turnstone or any other path.
+DATA_DIR=/opt/turnstone/data
-REPO_DIR="${TURNSTONE_REPO_DIR:-$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)}"
+PATTERNS_DIR=/opt/turnstone/patterns
-# Data and patterns live OUTSIDE the repo so they survive git pulls.
+TZ=America/Los_Angeles
 DATA_DIR="${TURNSTONE_DATA_DIR:-/opt/turnstone-data}"
 PATTERNS_DIR="${TURNSTONE_PATTERNS_DIR:-${DATA_DIR}/patterns}"
 HF_CACHE_DIR="${TURNSTONE_HF_CACHE:-${DATA_DIR}/hf-cache}"
 TZ="${TZ:-America/Los_Angeles}"
 # ── Bundle push configuration ────────────────────────────────────────────────
 # Set TURNSTONE_BUNDLE_ENDPOINT before running this script to enable the
@ -75,35 +71,7 @@ TZ="${TZ:-America/Los_Angeles}"
 #   export TURNSTONE_BUNDLE_ENDPOINT=https://turnstone.circuitforge.tech/turnstone/api/bundles
 #   bash /opt/turnstone/podman-standalone.sh
 #
 # ── Orchard submission (opt-in telemetry) ────────────────────────────────────
 # Set TURNSTONE_SUBMIT_ENDPOINT to push pattern-matched log entries to a CF
 # receiving instance after each glean run. Only matched entries are sent —
 # no raw log content. Used to build Avocet training data.
 #
 #   export TURNSTONE_SUBMIT_ENDPOINT=https://harvest.circuitforge.tech/your-node-id
 #   bash /opt/turnstone/podman-standalone.sh
 #
 # TURNSTONE_SOURCE_HOST is auto-detected from `hostname` — override if needed.
 #
 # ── Multi-agent diagnose pipeline ────────────────────────────────────────────
 # The 5-stage ML pipeline requires three env vars and a writable HF cache dir:
 #
 #   TURNSTONE_MULTI_AGENT_DIAGNOSE=true   — enable the pipeline
 #   GPU_SERVER_URL=http://<orch-host>:7700 — cf-orch coordinator or Ollama base URL
 #
 # ML models are downloaded on first diagnose run and cached in HF_CACHE_DIR.
 # On a CPU-only host (no GPU) set TURNSTONE_EMBED_DEVICE=cpu (default).
 #
 # If your host has no WireGuard to Heimdall — use the public cf-orch endpoint:
 #   export GPU_SERVER_URL=https://orch.circuitforge.tech
 #   export TURNSTONE_MULTI_AGENT_DIAGNOSE=true
 #   sudo bash /opt/turnstone/podman-standalone.sh
 #
 # For Docker hosts connected via WireGuard (which reaches the Heimdall LAN directly),
 # use docker-standalone.sh instead of this script:
 #   export GPU_SERVER_URL=http://<YOUR_HOST_IP>:7700
 #   export TURNSTONE_MULTI_AGENT_DIAGNOSE=true
 #   bash ~/turnstone/docker-standalone.sh
 # ── Turnstone container ───────────────────────────────────────────────────────
 # Image is built locally — no registry auto-update label.
@ -116,22 +84,6 @@ TZ="${TZ:-America/Los_Angeles}"
 # Must be run as root (sudo bash podman-standalone.sh) — rootful Podman only.
 #
 # Bootstrap data and patterns dirs if this is a first run
 mkdir -p "${DATA_DIR}" "${PATTERNS_DIR}" "${HF_CACHE_DIR}"
 # Copy default patterns if the dir is empty (first run only)
 if [ -z "$(ls -A "${PATTERNS_DIR}")" ]; then
  cp "${REPO_DIR}/patterns/default.yaml" "${PATTERNS_DIR}/"
  # Copy host-specific sources if present, otherwise copy the generic template
  HOST_SOURCES="${REPO_DIR}/patterns/sources-$(hostname).yaml"
  if [ -f "${HOST_SOURCES}" ]; then
    cp "${HOST_SOURCES}" "${PATTERNS_DIR}/sources.yaml"
    echo "==> Installed host-specific sources: ${HOST_SOURCES}"
  else
    cp "${REPO_DIR}/patterns/sources.yaml" "${PATTERNS_DIR}/"
    echo "==> Installed default sources.yaml — edit ${PATTERNS_DIR}/sources.yaml for this host"
  fi
 fi
 # Build image from current source (bakes app/ code into the image)
 echo "Building Turnstone image..."
 podman build -t localhost/turnstone:latest "${REPO_DIR}"
@ -145,25 +97,13 @@ podman run -d \
  --net=host \
  -v "${DATA_DIR}:/data:Z" \
  -v "${PATTERNS_DIR}:/patterns:Z" \
  -v "${HF_CACHE_DIR}:/hf-cache:Z" \
  -v /opt:/opt:ro \
  -v /var/log:/var/log:ro \
  -e TURNSTONE_DB=/data/turnstone.db \
  -e TURNSTONE_SOURCE_HOST="$(hostname)" \
  -e TURNSTONE_BUNDLE_ENDPOINT="${TURNSTONE_BUNDLE_ENDPOINT:-}" \
  -e TURNSTONE_SUBMIT_ENDPOINT="${TURNSTONE_SUBMIT_ENDPOINT:-}" \
  -e PYTHONUNBUFFERED=1 \
  -e TZ="${TZ}" \
  -e TURNSTONE_MULTI_AGENT_DIAGNOSE="${TURNSTONE_MULTI_AGENT_DIAGNOSE:-false}" \
  -e GPU_SERVER_URL="${GPU_SERVER_URL:-}" \
  -e HF_HOME=/hf-cache \
  -e TURNSTONE_AUTO_INCIDENT="${TURNSTONE_AUTO_INCIDENT:-true}" \
  -e TURNSTONE_AUTO_INCIDENT_THRESHOLD="${TURNSTONE_AUTO_INCIDENT_THRESHOLD:-5}" \
  -e TURNSTONE_AUTO_INCIDENT_WINDOW="${TURNSTONE_AUTO_INCIDENT_WINDOW:-600}" \
  -e TURNSTONE_CLASSIFIER_MODEL="${TURNSTONE_CLASSIFIER_MODEL:-byviz/bylastic_classification_logs}" \
  -e TURNSTONE_EMBED_BACKEND="${TURNSTONE_EMBED_BACKEND:-sentence_transformers}" \
  -e TURNSTONE_EMBED_MODEL="${TURNSTONE_EMBED_MODEL:-sentence-transformers/all-MiniLM-L6-v2}" \
  -e TURNSTONE_EMBED_DEVICE="${TURNSTONE_EMBED_DEVICE:-cpu}" \
  --health-cmd="curl -f http://localhost:8534/turnstone/health || exit 1" \
  --health-interval=30s \
  --health-timeout=10s \
@ -193,8 +133,8 @@ echo "Check container health with:"
 echo "  sudo podman ps"
 echo "  sudo podman logs turnstone"
 echo ""
-echo "To glean all sources now:"
+echo "To ingest all sources now:"
-echo "  sudo podman exec turnstone python scripts/glean_corpus.py \\"
+echo "  sudo podman exec turnstone python scripts/ingest_corpus.py \\"
 echo "    --sources /patterns/sources.yaml --db /data/turnstone.db"
 echo ""
 echo "To add a new source: edit /opt/turnstone/patterns/sources.yaml — no restart needed."
--- a/requirements.txt
+++ b/requirements.txt
@ -1,20 +1,8 @@
 fastapi>=0.110.0
 uvicorn[standard]>=0.27.0
 # Postgres backend — optional; SQLite is used when DATABASE_URL is unset
 psycopg[binary,pool]>=3.1.0
 pydantic>=2.0.0
 pyyaml>=6.0
 aiofiles>=23.0.0
 python-multipart>=0.0.9
 dateparser>=1.2.0
 httpx>=0.27.0
 paramiko
 # Multi-agent diagnose pipeline — ML deps
 # classifier.py and suppressor.py have ImportError guards and fall back gracefully,
 # but these are included unconditionally so container images are fully capable.
 # Install CPU-only torch to avoid pulling the ~2GB CUDA wheel into the image.
 --extra-index-url https://download.pytorch.org/whl/cpu
 torch>=2.2.0
 transformers>=4.40.0
 sentence-transformers>=3.0.0
--- a/scripts/build_fts_index.py
+++ b/scripts/build_fts_index.py
@ -1,4 +1,4 @@
-"""CLI: build (or update) the FTS5 full-text search index after glean."""
+"""CLI: build (or update) the FTS5 full-text search index after ingest."""
 from __future__ import annotations
 import sys
@ -13,7 +13,7 @@ if __name__ == "__main__":
    if not db_path.exists():
        print(f"ERROR: database not found: {db_path}", file=sys.stderr)
-        print("Run glean first: python scripts/glean_corpus.py", file=sys.stderr)
+        print("Run ingest first: python scripts/ingest_corpus.py", file=sys.stderr)
        sys.exit(1)
    print(f"Building FTS index for {db_path} ...")
--- a/scripts/collect_cluster_logs.sh
+++ b/scripts/collect_cluster_logs.sh
@ -18,9 +18,9 @@ DATA_DIR=/devl/turnstone-cluster/data
 WINDOW="20 minutes ago"
 SSH_OPTS="-o ConnectTimeout=5 -o BatchMode=yes -o StrictHostKeyChecking=no"
 PYTHON=/devl/miniconda3/envs/cf/bin/python
-INGEST="${PYTHON} /Library/Development/CircuitForge/turnstone/scripts/glean_corpus.py"
+INGEST="${PYTHON} /Library/Development/CircuitForge/turnstone/scripts/ingest_corpus.py"
 DB=/devl/turnstone-cluster/data/turnstone.db
-LOG=/devl/turnstone-cluster/data/glean.log
+LOG=/devl/turnstone-cluster/data/ingest.log
 mkdir -p "${DATA_DIR}"
@ -48,7 +48,6 @@ declare -A NODES=(
  [sif]="${DATA_DIR}/sif-journal.jsonl"
  [cass]="${DATA_DIR}/cass-journal.jsonl"
  [strahl]="${DATA_DIR}/strahl-journal.jsonl"
  [muninn]="${DATA_DIR}/muninn-journal.jsonl"
 )
 for node in "${!NODES[@]}"; do
@ -99,21 +98,6 @@ else
  echo "navi: unreachable, skipping docker logs"
 fi
 # ── Navi qBittorrent app logs (volume-mounted files, not in docker logs) ──────
 # qBit writes rich per-torrent events to a file inside the compose volume.
 # These are NOT captured by `docker logs` — must be pulled directly.
 QBIT_LOG_BASE="/opt/containers/arr"
 for instance in qbit-tb0 qbit-tb1 qbit-tb2; do
  remote_log="${QBIT_LOG_BASE}/${instance}/qBittorrent/logs/qbittorrent.log"
  local_out="${NAVI_DIR}/${instance}-app.log"
  if ssh ${SSH_OPTS} navi "test -f '${remote_log}'" 2>/dev/null; then
    ssh ${SSH_OPTS} navi "cat '${remote_log}'" > "${local_out}" 2>/dev/null || : > "${local_out}"
  else
    : > "${local_out}"
  fi
 done
 echo "navi qbit app logs: $(cat "${NAVI_DIR}"/qbit-tb*.log 2>/dev/null | wc -l) lines"
 # ── Strahl Docker containers ──────────────────────────────────────────────────
 STRAHL_DIR="${DATA_DIR}/docker-strahl"
 mkdir -p "${STRAHL_DIR}"
@ -157,7 +141,7 @@ fi
  # Remote journals (explicit source IDs via YAML)
  ${INGEST} --sources /devl/turnstone-cluster/patterns/sources-cluster.yaml --db "${DB}"
-  # Docker and Plex logs (source IDs derived from filenames by directory glean)
+  # Docker and Plex logs (source IDs derived from filenames by directory ingest)
  for dir in "${HEIMDALL_DIR}" "${NAVI_DIR}" "${STRAHL_DIR}" "${PLEX_DIR}"; do
    [[ -d "${dir}" ]] && ls "${dir}"/*.jsonl "${dir}"/*.log 2>/dev/null | grep -q . && \
      ${INGEST} "${dir}" "${DB}" || true
--- a/scripts/export_journal.sh
+++ b/scripts/export_journal.sh
@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Export recent system messages to files the Turnstone container can glean.
+# Export recent system messages to files the Turnstone container can ingest.
 #
 # Exports:
 #   journal-export.jsonl  — journald (if journalctl is available)
@ -11,11 +11,11 @@
 # Usage (standalone):
 #   sudo bash /opt/turnstone/scripts/export_journal.sh
 #
-# Cron (combined with glean):
+# Cron (combined with ingest):
 #   */15 * * * * bash /opt/turnstone/scripts/export_journal.sh && \
 #     podman exec turnstone python scripts/ingest_corpus.py \
 #       --sources /patterns/sources.yaml --db /data/turnstone.db \
-#       >> /var/log/turnstone-glean.log 2>&1
+#       >> /var/log/turnstone-ingest.log 2>&1
 set -euo pipefail
--- a/scripts/gen_corpus.py
+++ b/scripts/gen_corpus.py
@ -1,383 +0,0 @@
 """Synthetic log corpus generator.
 Produces realistic-but-entirely-artificial log files for demos, load tests,
 and parser regression suites — no production data required.
 Usage:
    python scripts/gen_corpus.py --days 7 --out /tmp/demo-corpus/
    python scripts/gen_corpus.py --days 1 --out /tmp/test-run/ --seed 42 --error-rate 0.15
    python scripts/gen_corpus.py --help
 Output tree:
    <out>/journald/system.jsonl   — systemd/kernel journald JSON
    <out>/docker/services.jsonl   — containerised app stdout
    <out>/qbittorrent/qbt.log     — hotio-format qBittorrent log
    <out>/ext_device/device.log    — vendor device plaintext log
 """
 from __future__ import annotations
 import argparse
 import json
 import random
 import sys
 from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from typing import Callable
 # ── Severity distribution ──────────────────────────────────────────────────────
 # Severity name -> numeric level, kept as a string.
 # NOTE(review): the values look like syslog/journald PRIORITY levels
 # (2 = critical ... 7 = debug) — confirm against the code that consumes this table.
 _SYSLOG_PRIORITY = {
    "CRITICAL": "2",
    "ERROR":    "3",
    "WARN":     "4",
    "INFO":     "6",
    "DEBUG":    "7",
 }
 # Baseline relative weight of each severity; the five values sum to 1.0.
 # _pick_severity() copies this table and shifts weight between tiers before sampling.
 _SEVERITY_WEIGHTS = {
    "INFO":     0.70,
    "DEBUG":    0.10,
    "WARN":     0.12,
    "ERROR":    0.06,
    "CRITICAL": 0.02,
 }
 def _pick_severity(rng: random.Random, error_rate: float) -> str:
    """Draw one severity label at random, skewed towards errors by *error_rate*.

    Starts from the ``_SEVERITY_WEIGHTS`` baseline, moves ``error_rate * 0.08``
    of weight onto ERROR (and half of that onto CRITICAL) at the expense of
    INFO and DEBUG, clamps negative weights to zero, and samples once from *rng*.
    """
    shift = error_rate * 0.08  # extra weight handed to the error tiers
    deltas = {
        "ERROR": shift,
        "CRITICAL": shift / 2,
        "INFO": -(shift * 1.2),
        "DEBUG": -(shift * 0.3),
    }
    labels = []
    clamped = []
    for label, base in _SEVERITY_WEIGHTS.items():
        labels.append(label)
        # Severities without an adjustment (WARN) keep their baseline weight.
        clamped.append(max(0.0, base + deltas.get(label, 0.0)))
    (picked,) = rng.choices(labels, weights=clamped, k=1)
    return picked
 # ── Timestamp helpers ──────────────────────────────────────────────────────────
 def _ts_seq(start: datetime, end: datetime, rng: random.Random) -> list[datetime]:
    """Return a sorted list of random timestamps between start and end."""
    total_seconds = (end - start).total_seconds()
    # Roughly 1 event every ~4 seconds on average across all sources
    count = int(total_seconds / 4)
    offsets = sorted(rng.uniform(0, total_seconds) for _ in range(count))
    return [start + timedelta(seconds=o) for o in offsets]
 def _micros(dt: datetime) -> str:
    """Journald __REALTIME_TIMESTAMP: microseconds since epoch, as string."""
    return str(int(dt.timestamp() * 1_000_000))
 # ── Message libraries ──────────────────────────────────────────────────────────
 # Pool of systemd unit names; _fill() picks one at random for the {unit} placeholder.
 _JOURNALD_UNITS = [
    "sshd.service", "nginx.service", "docker.service", "systemd-resolved.service",
    "cron.service", "systemd-journald.service", "NetworkManager.service",
    "turnstone.service", "podman.service", "fail2ban.service",
 ]
 # Journald message templates, keyed by severity (all five severities present).
 # Tokens in braces ({unit}, {ip}, {port}, ...) are placeholders — presumably
 # expanded by _fill(); confirm against the journald generator.
 _JOURNALD_MESSAGES: dict[str, list[str]] = {
    "INFO": [
        "Started {unit}.",
        "Listening on {port}/tcp.",
        "Reloaded configuration for {unit}.",
        "New connection from {ip}:{port}",
        "Session opened for user {user} by (uid=0)",
        "Accepted publickey for {user} from {ip} port {port}",
        "System time synchronized from NTP server {ip}",
        "Unit {unit} entered active state.",
        "Loaded kernel module {module}.",
        "DNS query resolved: {host} -> {ip}",
    ],
    "DEBUG": [
        "Polling interval set to {n}ms",
        "Cache hit for key '{key}'",
        "Heartbeat OK from {host}",
        "Timer {n} fired",
        "Worker {n} idle",
    ],
    "WARN": [
        "High memory usage on {unit}: {pct}% used",
        "Slow DNS response ({ms}ms) for {host}",
        "Deprecated option '{key}' in config — will be removed in next release",
        "Retrying connection to {host} (attempt {n}/5)",
        "Journal size limit reached, rotating",
        "Disk usage at {pct}% on /dev/sda1",
    ],
    "ERROR": [
        "Failed to start {unit}: exit code {n}",
        "Connection refused to {host}:{port}",
        "Segmentation fault in {unit} (core dumped)",
        "Authentication failure for user {user} from {ip}",
        "Timeout waiting for {unit} to become ready",
        "Failed to bind {port}/tcp: address already in use",
    ],
    "CRITICAL": [
        "Kernel panic — not syncing: {msg}",
        "Out of memory: killed process {n} ({unit})",
        "Hardware error on /dev/sda1: I/O error",
        "Disk quota exceeded on /home for user {user}",
        "Critical service {unit} failed; system may be unstable",
    ],
 }
 # Names of the simulated containerised services.
 # NOTE(review): the consumer is outside this view — presumably the docker log generator.
 _DOCKER_SERVICES = [
    "caddy", "postgres", "redis", "turnstone", "avocet",
    "prometheus", "grafana", "loki", "minio", "vllm",
 ]
 # Container stdout message templates, keyed by severity (all five severities present).
 # Mixes logfmt-style ("level=... msg=...") and plain "LEVEL: text" lines; tokens in
 # braces are placeholders — presumably expanded by _fill(); confirm against the caller.
 _DOCKER_MESSAGES: dict[str, list[str]] = {
    "INFO": [
        "level=info msg=\"Server listening on 0.0.0.0:{port}\"",
        "level=info msg=\"Connected to database at {host}:5432\"",
        'level=info msg="GET /api/health 200 {ms}ms" user={user}',
        'level=info msg="POST /api/v1/jobs 201 {ms}ms"',
        "INFO: Worker pool size: {n}",
        "INFO: Cache warmed — {n} entries loaded",
        "INFO: Startup complete in {ms}ms",
        "INFO: Scheduled job '{key}' executed successfully",
    ],
    "DEBUG": [
        "DEBUG: SQL query took {ms}ms: SELECT * FROM {key}",
        "DEBUG: Redis HIT for key {key}",
        "level=debug msg=\"span {key} completed\" duration={ms}ms",
        "DEBUG: Trace ID {key}: handler returned 200",
    ],
    "WARN": [
        "level=warn msg=\"Slow query ({ms}ms) on table {key}\"",
        "WARN: Connection pool at {pct}% capacity",
        "WARN: Rate limit approaching for client {ip}",
        "WARN: Deprecated endpoint /v1/{key} called by {ip}",
        "level=warn msg=\"GC pause {ms}ms — possible memory pressure\"",
    ],
    "ERROR": [
        "level=error msg=\"Unhandled exception in handler '{key}'\" err={msg}",
        "ERROR: Database connection lost: {msg}",
        "level=error msg=\"Failed to acquire lock on {key} after {ms}ms\"",
        "ERROR: HTTP 500 POST /api/v1/{key}: internal server error",
        "ERROR: Redis NOAUTH: authentication required",
    ],
    "CRITICAL": [
        "level=critical msg=\"Panic: nil pointer dereference in {key}\"",
        "CRITICAL: Fatal: cannot open database: {msg}",
        "CRITICAL: OOM killer invoked — process {n} terminated",
    ],
 }
 # qBittorrent message templates, keyed by severity.
 # NOTE(review): only INFO, WARN and CRITICAL are defined here, while _pick_severity()
 # can also return DEBUG and ERROR — verify the consumer handles the missing keys.
 _QBT_MESSAGES: dict[str, list[str]] = {
    "INFO": [
        "Successfully listening on IP: 0.0.0.0; port: {port}",
        "Torrent '{key}' added to download queue",
        "Download of '{key}' complete ({n} MB)",
        "Seeding '{key}' at {n} KB/s",
        "Tracker '{host}' working, {n} seeds",
        "Peer {ip} connected to torrent '{key}'",
        "Free disk space: {n} GB",
    ],
    "WARN": [
        "Tracker '{host}' is not working (retrying)",
        "Slow download speed ({n} KB/s) for '{key}'",
        "Too many open files — reducing connection limit",
        "DHT bootstrap failed, retrying in {n}s",
    ],
    "CRITICAL": [
        "Not enough space on disk to download '{key}'",
        "File I/O error for torrent '{key}': {msg}",
        "Unable to bind listen port {port}",
    ],
 }
 # Vendor-device message templates, keyed by severity; each begins with a
 # structured "<PREFIX>-<4 digits>" code (SYS-, NET-, CFG-, HW-, ERR-).
 # NOTE(review): there is no DEBUG entry, while _pick_severity() can return DEBUG —
 # verify the consumer handles the missing key.
 _EXT_DEVICE_CODES: dict[str, list[str]] = {
    "INFO": [
        "SYS-0100 Device boot complete, firmware v{n}.{n}.{n}",
        "SYS-0101 Sensor array calibration OK",
        "NET-0200 Link established on interface eth{n}",
        "CFG-0300 Configuration loaded from flash",
        "HW-0400 Fan speed nominal: {n} RPM",
    ],
    "WARN": [
        "NET-0210 Link quality degraded: RSSI -{n} dBm",
        "HW-0410 Fan speed elevated: {n} RPM (threshold: {n} RPM)",
        "CFG-0310 Unknown config key '{key}' ignored",
        "SYS-0110 Watchdog near timeout — {n}ms remaining",
    ],
    "ERROR": [
        "ERR-1001 Sensor read failure on channel {n}: timeout",
        "ERR-1002 I2C bus {n} NACK from address 0x{key}",
        "ERR-2001 Network tx queue overflow — dropped {n} packets",
        "ERR-3001 Flash write error at sector {n}",
    ],
    "CRITICAL": [
        "ERR-9001 Thermal runaway detected — initiating shutdown",
        "ERR-9002 Supply voltage out of range: {n}mV",
        "ERR-9003 Memory parity error at address 0x{key}",
    ],
 }
 # ── Template substitution ──────────────────────────────────────────────────────
 # Value pools drawn from by _fill(): {host} -> _HOSTS, {user} -> _USERS,
 # {module} -> _MODULES. gen_journald() also picks its _HOSTNAME from _HOSTS.
 _HOSTS  = ["node1", "node2", "node3", "node4", "gateway", "remotehost"]
 _USERS  = ["alan", "root", "deployer", "backup", "nobody"]
 _MODULES = ["btrfs", "xfs", "nf_conntrack", "ip6table_filter", "overlay"]
 def _fill(template: str, rng: random.Random) -> str:
    """Replace {placeholder} tokens with plausible random values."""
    def _sub(m: re.Match) -> str:
        import re
        key = m.group(1)
        if key == "ip":     return f"10.{rng.randint(0,255)}.{rng.randint(0,255)}.{rng.randint(1,254)}"
        if key == "port":   return str(rng.randint(1024, 65535))
        if key == "n":      return str(rng.randint(1, 9999))
        if key == "pct":    return str(rng.randint(50, 99))
        if key == "ms":     return str(rng.randint(1, 5000))
        if key == "unit":   return rng.choice(_JOURNALD_UNITS)
        if key == "user":   return rng.choice(_USERS)
        if key == "host":   return rng.choice(_HOSTS)
        if key == "module": return rng.choice(_MODULES)
        if key == "msg":    return rng.choice(["unexpected EOF", "connection reset", "no such file"])
        if key == "key":    return rng.choice(["auth", "jobs", "cache", "index", "sessions", "queue"])
        return m.group(0)
    import re
    return re.sub(r"\{(\w+)\}", _sub, template)
 def _pick_msg(library: dict[str, list[str]], severity: str, rng: random.Random) -> str:
    """Choose a random template for *severity* from *library* and expand it.

    When *library* has no (or an empty) list for *severity*, the "INFO" list
    is used instead; if that is missing too, the literal "log entry" is used.
    """
    templates = library.get(severity)
    if not templates:
        templates = library.get("INFO", ["log entry"])
    chosen = rng.choice(templates)
    return _fill(chosen, rng)
 # ── Per-format generators ──────────────────────────────────────────────────────
 def gen_journald(path: Path, start: datetime, end: datetime, rng: random.Random, error_rate: float) -> int:
    """Write journald-style JSON lines (``-o json`` format) to *path*.

    A single host name is drawn for the whole file; each timestamp yielded by
    ``_ts_seq`` becomes one JSON object on its own line.
    Returns the number of lines written.
    """
    host = rng.choice(_HOSTS)
    written = 0
    with path.open("w") as out:
        for written, stamp in enumerate(_ts_seq(start, end, rng), start=1):
            # Keep the draw order severity -> unit -> message so seeded runs match.
            level = _pick_severity(rng, error_rate)
            unit_name = rng.choice(_JOURNALD_UNITS)
            text = _pick_msg(_JOURNALD_MESSAGES, level, rng)
            record = {
                "__REALTIME_TIMESTAMP": _micros(stamp),
                "MESSAGE": text,
                "PRIORITY": _SYSLOG_PRIORITY.get(level, "6"),
                "_HOSTNAME": host,
                "_SYSTEMD_UNIT": unit_name,
                "SYSLOG_IDENTIFIER": unit_name.replace(".service", ""),
            }
            out.write(json.dumps(record) + "\n")
    return written
 def gen_docker(path: Path, start: datetime, end: datetime, rng: random.Random, error_rate: float) -> int:
    """Write Docker-format JSON lines (a SOURCE + MESSAGE envelope) to *path*.

    One line is written per timestamp yielded by ``_ts_seq``; the timestamp
    itself is not part of the envelope. Returns the number of lines written.
    """
    written = 0
    with path.open("w") as out:
        for written, _stamp in enumerate(_ts_seq(start, end, rng), start=1):
            # Keep the draw order severity -> service -> message so seeded runs match.
            level = _pick_severity(rng, error_rate)
            svc = rng.choice(_DOCKER_SERVICES)
            text = _pick_msg(_DOCKER_MESSAGES, level, rng)
            envelope = {"SOURCE": f"docker:{svc}", "MESSAGE": text}
            out.write(json.dumps(envelope) + "\n")
    return written
 def gen_qbittorrent(path: Path, start: datetime, end: datetime, rng: random.Random, error_rate: float) -> int:
    """Emit hotio-format qBittorrent plaintext log.

    Each line has the form ``(<code>) <YYYY-MM-DDTHH:MM:SS> - <message>`` where
    the code is N, W or C. Returns the number of lines written.
    """
    _CODE = {"INFO": "N", "WARN": "W", "CRITICAL": "C", "ERROR": "C", "DEBUG": "N"}
    lines = 0
    # Templates contain non-ASCII punctuation ("—"); do not rely on the
    # platform default encoding.
    with path.open("w", encoding="utf-8") as fh:
        for dt in _ts_seq(start, end, rng):
            severity = _pick_severity(rng, error_rate)
            code     = _CODE.get(severity, "N")
            # _QBT_MESSAGES may lack a list for this severity (e.g. "ERROR").
            # _pick_msg would then fall back to INFO text, giving a "(C)" line
            # with an informational message; use the CRITICAL templates instead.
            library_key = severity
            if code == "C" and not _QBT_MESSAGES.get(severity):
                library_key = "CRITICAL"
            msg      = _pick_msg(_QBT_MESSAGES, library_key, rng)
            ts_str   = dt.strftime("%Y-%m-%dT%H:%M:%S")
            fh.write(f"({code}) {ts_str} - {msg}\n")
            lines += 1
    return lines
 def gen_ext_device(path: Path, start: datetime, end: datetime, rng: random.Random, error_rate: float) -> int:
    """Emit vendor device plaintext log (ISO timestamp + level + ERR/SYS/NET code + message).

    Each line has the form ``<YYYY-MM-DDTHH:MM:SS> [<severity>] <message>``.
    Returns the number of lines written.
    """
    lines = 0
    # Templates contain non-ASCII punctuation ("—"); do not rely on the
    # platform default encoding.
    with path.open("w", encoding="utf-8") as fh:
        for dt in _ts_seq(start, end, rng):
            severity = _pick_severity(rng, error_rate)
            msg      = _pick_msg(_EXT_DEVICE_CODES, severity, rng)
            ts_str   = dt.strftime("%Y-%m-%dT%H:%M:%S")
            fh.write(f"{ts_str} [{severity}] {msg}\n")
            lines += 1
    return lines
 # ── Orchestration ──────────────────────────────────────────────────────────────
 # Registry of log sources: (output subdirectory, output filename, generator function).
 # generate() walks this list in order and writes each file to <out>/<subdir>/<filename>.
 _GENERATORS: list[tuple[str, str, Callable]] = [
    ("journald",    "system.jsonl",  gen_journald),
    ("docker",      "services.jsonl", gen_docker),
    ("qbittorrent", "qbt.log",       gen_qbittorrent),
    ("ext_device",  "device.log",    gen_ext_device),
 ]
 def generate(
    out: Path,
    days: int,
    seed: int | None,
    error_rate: float,
    reference_time: datetime | None = None,
 ) -> dict[str, int]:
    """Run every registered generator and report how many lines each wrote.

    The time window ends at *reference_time* (or the current UTC time when it
    is not given) and starts *days* days earlier. Returns a mapping from each
    output path, relative to *out*, to its line count.
    """
    master = random.Random(seed)
    window_end = reference_time or datetime.now(tz=timezone.utc)
    window_start = window_end - timedelta(days=days)
    counts: dict[str, int] = {}
    for folder, name, writer in _GENERATORS:
        target = out / folder / name
        target.parent.mkdir(parents=True, exist_ok=True)
        # Derive a separate RNG per source from the master stream so the
        # sources do not share one sequence of draws.
        source_rng = random.Random(master.randint(0, 2**31))
        n_lines = writer(target, window_start, window_end, source_rng, error_rate)
        rel = target.relative_to(out)
        counts[str(rel)] = n_lines
        print(f"  {rel}: {n_lines:,} lines")
    return counts
 # ── CLI ────────────────────────────────────────────────────────────────────────
 def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the synthetic corpus generator.

    *argv* defaults to ``sys.argv[1:]`` (argparse behaviour) when ``None``.
    Returns the process exit status: 0 on success, 1 when an argument is out
    of range. Arguments are validated before anything is created on disk.
    """
    parser = argparse.ArgumentParser(
        description="Generate a synthetic Turnstone log corpus for demos and testing."
    )
    parser.add_argument("--days",       type=int,   default=7,    help="Days of history to generate (default: 7)")
    parser.add_argument("--out",        type=Path,  required=True, help="Output directory")
    parser.add_argument("--seed",       type=int,   default=None, help="RNG seed for reproducibility")
    parser.add_argument("--error-rate", type=float, default=0.05, help="Error injection rate 0.0-1.0 (default: 0.05)")
    args = parser.parse_args(argv)
    # A negative window would put the start of the time range after its end.
    if args.days < 0:
        print("ERROR: --days must be 0 or greater", file=sys.stderr)
        return 1
    if not 0.0 <= args.error_rate <= 1.0:
        print("ERROR: --error-rate must be between 0.0 and 1.0", file=sys.stderr)
        return 1
    args.out.mkdir(parents=True, exist_ok=True)
    print(f"Generating {args.days}-day corpus → {args.out}  (seed={args.seed}, error_rate={args.error_rate})")
    totals = generate(args.out, args.days, args.seed, args.error_rate)
    total_lines = sum(totals.values())
    print(f"Done — {total_lines:,} total log lines across {len(totals)} files")
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/harvest_docs.py
+++ b/scripts/harvest_docs.py
@ -1,266 +0,0 @@
 #!/usr/bin/env python3
 """harvest_docs.py — Bulk-upload documentation into Turnstone's context RAG.
 Reads a YAML manifest that describes which files or directories to upload,
 then POSTs each file to the Turnstone /api/context/docs endpoint.
 Usage:
    # From a manifest file
    python harvest_docs.py --manifest manifests/my-cluster.yaml
    # Explicit files (no manifest needed)
    python harvest_docs.py --base-url http://localhost:8534 file1.md dir/file2.yaml
    # Dry run — show what would be uploaded without sending
    python harvest_docs.py --manifest manifests/my-cluster.yaml --dry-run
 Manifest format (YAML):
    base_url: http://localhost:8534     # optional; overridden by --base-url
    sources:
      - path: /absolute/path/to/file.md
        label: friendly-name            # optional; overrides filename in DB
      - path: /absolute/path/to/dir/
        include: ["*.md", "*.yaml"]    # glob patterns; default: see INCLUDE_EXTS
        exclude: ["CLAUDE*", "SESSION_*", "*_keys*"]
        recursive: false               # default false
 """
 from __future__ import annotations
 import argparse
 import fnmatch
 import sys
 import urllib.request
 import urllib.error
 from pathlib import Path
 try:
    import yaml
    _HAS_YAML = True
 except ImportError:
    _HAS_YAML = False
 # File extensions included when walking a directory with no explicit `include`.
 # Compared against the lower-cased file suffix in _collect_from_dir().
 INCLUDE_EXTS = {".md", ".yaml", ".yml", ".txt", ".conf", ".rst"}
 # Default exclude patterns applied to every directory source (unless overridden).
 # These are fnmatch-style globs tested against individual path components.
 DEFAULT_EXCLUDES = [
    "CLAUDE*",
    "SESSION_*",
    "HANDOFF_*",
    "*.key",
    "*.pem",
    "*.crt",
    "node_modules",
    ".git",
    "__pycache__",
 ]
 # URL path appended to the base URL by upload_file().
 # NOTE(review): the module docstring names /api/context/docs, while this
 # constant carries a /turnstone prefix — confirm which one the server expects.
 UPLOAD_PATH = "/turnstone/api/context/docs"
 # ---------------------------------------------------------------------------
 # File collection
 # ---------------------------------------------------------------------------
 def _matches_any(name: str, patterns: list[str]) -> bool:
    return any(fnmatch.fnmatch(name, p) for p in patterns)
 def _collect_from_dir(
    root: Path,
    include: list[str],
    exclude: list[str],
    recursive: bool,
 ) -> list[Path]:
    """Return the sorted list of files under *root* that pass the filters.

    Args:
        root: Directory to scan.
        include: fnmatch patterns tested against the file name; when empty,
            files are kept if their lower-cased suffix is in INCLUDE_EXTS.
        exclude: fnmatch patterns; a file is dropped when any component of
            its path *below root* matches one of them.
        recursive: Walk subdirectories when True, otherwise only direct children.
    """
    pattern = "**/*" if recursive else "*"
    candidates: list[Path] = []
    for p in root.glob(pattern):
        if not p.is_file():
            continue
        # Only look at components below root: matching the absolute path would
        # let an ancestor directory (e.g. /srv/SESSION_archive/docs) silently
        # exclude every file of a source the user asked for explicitly.
        rel_parts = p.relative_to(root).parts
        if any(_matches_any(part, exclude) for part in rel_parts):
            continue
        if include:
            if not _matches_any(p.name, include):
                continue
        else:
            if p.suffix.lower() not in INCLUDE_EXTS:
                continue
        candidates.append(p)
    return sorted(candidates)
 def resolve_sources(sources: list[dict]) -> list[tuple[Path, str]]:
    """Return list of (path, label) pairs from a manifest sources list.

    File entries yield one pair, labelled with the entry's ``label`` or the
    file name. Directory entries yield one pair per collected file, labelled
    with the file name. Entries without a ``path``, and paths that do not
    exist, are reported on stderr and skipped.
    """
    results: list[tuple[Path, str]] = []
    for entry in sources:
        raw_path = entry.get("path") or ""
        if not raw_path:
            # Path("") resolves to the current directory; never harvest the
            # CWD just because a manifest entry forgot its path.
            print("  [WARN] source entry has no path, skipping", file=sys.stderr)
            continue
        p = Path(raw_path).expanduser().resolve()
        label: str = entry.get("label") or ""
        if not p.exists():
            print(f"  [WARN] path not found, skipping: {p}", file=sys.stderr)
            continue
        if p.is_file():
            results.append((p, label or p.name))
        elif p.is_dir():
            # A YAML `include:` / `exclude:` with no value loads as None.
            include: list[str] = entry.get("include") or []
            exclude = entry.get("exclude")
            if exclude is None:
                exclude = DEFAULT_EXCLUDES
            recursive: bool = entry.get("recursive", False)
            found = _collect_from_dir(p, include, exclude, recursive)
            for f in found:
                results.append((f, f.name))
        else:
            print(f"  [WARN] not a file or directory, skipping: {p}", file=sys.stderr)
    return results
 # ---------------------------------------------------------------------------
 # Upload
 # ---------------------------------------------------------------------------
 def _build_multipart(boundary: bytes, filename: str, content: bytes) -> bytes:
    """Build a minimal multipart/form-data body for a single file field."""
    lines: list[bytes] = [
        b"--" + boundary,
        f'Content-Disposition: form-data; name="file"; filename="{filename}"'.encode(),
        b"Content-Type: application/octet-stream",
        b"",
        content,
        b"--" + boundary + b"--",
        b"",
    ]
    return b"\r\n".join(lines)
 def upload_file(base_url: str, path: Path, label: str) -> dict:
    """POST a file to Turnstone's context doc endpoint. Returns response dict.

    Never raises for read, transport or HTTP failures: those are reported as
    ``{"error": "<description>"}`` so the caller can count them and continue.
    On success the decoded JSON object from the server is returned.
    """
    import json
    import uuid
    url = base_url.rstrip("/") + UPLOAD_PATH
    try:
        content = path.read_bytes()
    except OSError as exc:
        return {"error": f"cannot read {path}: {exc}"}
    filename = label or path.name
    # A per-request random suffix keeps the delimiter from colliding with the
    # file's own bytes, which a fixed boundary string cannot guarantee.
    boundary = f"----TurnstoneHarvest{uuid.uuid4().hex}".encode()
    body = _build_multipart(boundary, filename, content)
    content_type = f"multipart/form-data; boundary={boundary.decode()}"
    req = urllib.request.Request(
        url,
        data=body,
        headers={"Content-Type": content_type},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=30) as resp:
            payload = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        body_text = e.read().decode(errors="replace")
        return {"error": f"HTTP {e.code}: {body_text[:200]}"}
    except Exception as exc:
        # Deliberate catch-all: one bad upload must not abort the whole batch.
        return {"error": str(exc)}
    if not isinstance(payload, dict):
        # Callers use `"error" in result` and `.get()`, so only hand back dicts.
        return {"error": f"unexpected response: {payload!r:.200}"}
    return payload
 # ---------------------------------------------------------------------------
 # CLI
 # ---------------------------------------------------------------------------
 def main() -> None:
    """Command-line entry point: collect files from a manifest and/or explicit
    paths, then upload them (or list them with --dry-run).

    The base URL is resolved as: --base-url if given, else the manifest's
    ``base_url``, else http://localhost:8534. Exits with status 1 when the
    manifest cannot be used or when any upload failed.
    """
    default_url = "http://localhost:8534"
    parser = argparse.ArgumentParser(
        description="Bulk-upload docs into Turnstone context RAG.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--manifest", "-m",
        metavar="FILE",
        help="YAML manifest describing sources to upload",
    )
    parser.add_argument(
        "--base-url", "-u",
        # None (not the URL itself) so an explicit --base-url can be told
        # apart from "not given", even when it equals the default.
        default=None,
        metavar="URL",
        help="Turnstone base URL (default: http://localhost:8534)",
    )
    parser.add_argument(
        "--dry-run", "-n",
        action="store_true",
        help="Show files that would be uploaded without actually uploading",
    )
    parser.add_argument(
        "files",
        nargs="*",
        metavar="FILE",
        help="Explicit files to upload (alternative to --manifest)",
    )
    args = parser.parse_args()
    base_url = args.base_url or default_url
    sources: list[tuple[Path, str]] = []
    if args.manifest:
        if not _HAS_YAML:
            print("ERROR: PyYAML is required for --manifest. Run: pip install pyyaml", file=sys.stderr)
            sys.exit(1)
        manifest_path = Path(args.manifest).expanduser().resolve()
        if not manifest_path.exists():
            print(f"ERROR: manifest not found: {manifest_path}", file=sys.stderr)
            sys.exit(1)
        # safe_load returns None for an empty document.
        data = yaml.safe_load(manifest_path.read_text(encoding="utf-8")) or {}
        if not isinstance(data, dict):
            print(f"ERROR: manifest must be a YAML mapping: {manifest_path}", file=sys.stderr)
            sys.exit(1)
        if args.base_url is None:
            base_url = data.get("base_url") or default_url
        sources = resolve_sources(data.get("sources") or [])
    for raw in args.files:
        p = Path(raw).expanduser().resolve()
        if not p.exists():
            print(f"  [WARN] not found, skipping: {p}", file=sys.stderr)
            continue
        if p.is_file():
            sources.append((p, p.name))
        else:
            print(f"  [WARN] {p} is a directory; use a manifest with recursive:true for directory sources", file=sys.stderr)
    if not sources:
        print("No files to upload. Pass --manifest or explicit file paths.")
        sys.exit(0)
    print(f"Turnstone: {base_url}")
    print(f"Files to upload: {len(sources)}")
    if args.dry_run:
        print("\n[DRY RUN] Would upload:")
    print()
    ok = 0
    failed = 0
    for path, label in sources:
        size_kb = path.stat().st_size / 1024
        if args.dry_run:
            print(f"  {label}  ({size_kb:.1f} KB)  ← {path}")
            ok += 1
            continue
        print(f"  Uploading {label} ({size_kb:.1f} KB)…", end=" ", flush=True)
        result = upload_file(base_url, path, label)
        if "error" in result:
            print(f"FAILED — {result['error']}")
            failed += 1
        else:
            # The server may report the chunk count under either key.
            chunks = result.get("chunks_written", result.get("chunks_created", "?"))
            facts = result.get("facts_written", 0)
            extra = f", {facts} facts" if facts else ""
            print(f"OK  ({chunks} chunks{extra})")
            ok += 1
    print()
    if args.dry_run:
        print(f"Dry run complete. {ok} file(s) would be uploaded.")
    else:
        print(f"Done. {ok} uploaded, {failed} failed.")
        if failed:
            sys.exit(1)
 if __name__ == "__main__":
    main()
--- a/scripts/ingest_corpus.py
+++ b/scripts/ingest_corpus.py
@ -1,15 +1,11 @@
-"""CLI: glean a log file or corpus directory into the Turnstone SQLite database.
+"""CLI: ingest a log file or corpus directory into the Turnstone SQLite database.
 Usage:
    # Single file or directory (legacy)
-    python scripts/glean_corpus.py <file_or_dir> [db_path] [--force]
+    python scripts/ingest_corpus.py <file_or_dir> [db_path]
    # Sources config (multi-service)
-    python scripts/glean_corpus.py --sources <sources.yaml> [--db <db_path>] [--force]
+    python scripts/ingest_corpus.py --sources <sources.yaml> [--db <db_path>]
 Options:
    --force     Bypass fingerprint checks and re-glean all files, re-applying
                all patterns. Use after updating patterns/default.yaml.
 """
 from __future__ import annotations
@ -21,7 +17,7 @@ logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
 sys.path.insert(0, str(Path(__file__).parent.parent))
-from app.glean.pipeline import glean_dir, glean_file, glean_sources
+from app.ingest.pipeline import ingest, ingest_file, ingest_sources
 def _print_stats(stats: dict[str, int]) -> None:
@ -37,36 +33,33 @@ if __name__ == "__main__":
    if not args:
        print(
            "Usage:\n"
-            "  glean_corpus.py <file_or_dir> [db_path] [--force]\n"
+            "  ingest_corpus.py <file_or_dir> [db_path]\n"
-            "  glean_corpus.py --sources <sources.yaml> [--db <db_path>] [--force]",
+            "  ingest_corpus.py --sources <sources.yaml> [--db <db_path>]",
            file=sys.stderr,
        )
        sys.exit(1)
    force = "--force" in args
    args = [a for a in args if a != "--force"]
    if args[0] == "--sources":
        if len(args) < 2:
-            print("Usage: glean_corpus.py --sources <sources.yaml> [--db <db_path>] [--force]", file=sys.stderr)
+            print("Usage: ingest_corpus.py --sources <sources.yaml> [--db <db_path>]", file=sys.stderr)
            sys.exit(1)
        sources_file = Path(args[1])
        db_path = Path("data/turnstone.db")
        if "--db" in args:
            db_path = Path(args[args.index("--db") + 1])
        db_path.parent.mkdir(parents=True, exist_ok=True)
-        print(f"Gleaning sources from {sources_file} → {db_path}")
+        print(f"Ingesting sources from {sources_file} → {db_path}")
-        stats = glean_sources(sources_file, db_path, force=force)
+        stats = ingest_sources(sources_file, db_path)
        _print_stats(stats)
    else:
        target = Path(args[0])
        db_path = Path(args[1]) if len(args) > 1 else Path("data/turnstone.db")
        db_path.parent.mkdir(parents=True, exist_ok=True)
-        print(f"Gleaning {target} → {db_path}")
+        print(f"Ingesting {target} → {db_path}")
        if target.is_file():
-            stats = glean_file(target, db_path, force=force)
+            stats = ingest_file(target, db_path)
        elif target.is_dir():
-            stats = glean_dir(target, db_path, force=force)
+            stats = ingest(target, db_path)
        else:
            print(f"Error: {target} is not a file or directory", file=sys.stderr)
            sys.exit(1)
--- a/scripts/manifests/example.yaml
+++ b/scripts/manifests/example.yaml
@ -1,38 +0,0 @@
 # Turnstone context doc manifest — example / template
 # Run: python scripts/harvest_docs.py --manifest scripts/manifests/example.yaml
 #
 # Copy this file, adjust paths and patterns for your environment.
 # Keep manifests in version control alongside your docs so ingestion config
 # is auditable and reproducible.
 # Turnstone URL (can be overridden with --base-url on the command line)
 base_url: http://localhost:8534
 sources:
  # ── Single file ────────────────────────────────────────────────────────────
  - path: /path/to/runbooks/service-restart.md
    label: runbook-service-restart.md      # name stored in context DB (optional)
  # ── Directory — include specific extensions, exclude sensitive patterns ─────
  - path: /path/to/runbooks/
    include: ["*.md", "*.yaml"]            # only these extensions
    exclude:                               # skip these filename patterns
      - "CLAUDE*"                          # Claude session prompts
      - "SESSION_*"                        # session summaries
      - "HANDOFF_*"                        # handoff notes
      - "*.key"                            # private keys
      - "*.pem"
    recursive: false                       # set true to walk subdirectories
  # ── Recursive directory walk ───────────────────────────────────────────────
  - path: /path/to/docs/
    include: ["*.md"]
    exclude:
      - "CLAUDE*"
      - "*.key"
      - "node_modules"
      - ".git"
    recursive: true
  # ── Minimal single-file entry (no label: stored under its own filename; include/exclude filters apply only to directory sources) -
  - path: /path/to/infrastructure.md
--- a/scripts/manifests/heimdall-devops.yaml
+++ b/scripts/manifests/heimdall-devops.yaml
@ -1,53 +0,0 @@
 # Turnstone context doc manifest — Heimdall home lab cluster
 # Run: python scripts/harvest_docs.py --manifest scripts/manifests/heimdall-devops.yaml
 #
 # Sections:
 #   infrastructure/  — network topology, machine specs, service ports
 #   runbooks/        — incident postmortems and operational procedures
 #   tdarr/           — media transcoding failure modes and recovery
 #
 # Files intentionally excluded from this manifest:
 #   - WireGuard .conf files and KEYS.txt (contain private keys)
 #   - SESSION_* and HANDOFF_* files (Claude session prompts, not operational docs)
 #   - CLAUDE.md files (Claude context prompts, not operational docs)
 #   - Raw tdarr scan data (tdarr/data/*.txt — scan output, not prose)
 #   - projects/helmet-3d, projects/mycroft-precise (unrelated to cluster ops)
 #   - collapse-stack/ (resilience planning, not daily log triage material)
 #   - bastion/sdcard-config, bastion/rpi-config (one-time setup artifacts)
 base_url: http://localhost:8534
 sources:
  # ── Service inventory (most immediately useful for log attribution) ────────
  - path: /Library/Development/CircuitForge/circuitforge-infra/inventory/services.md
    label: service-inventory.md
  # ── Infrastructure topology (partially outdated — note added at top of file)
  - path: /Library/Development/CircuitForge/circuitforge-infra/infrastructure/docs/INFRASTRUCTURE.md
    label: infrastructure-topology.md
  - path: /Library/Development/CircuitForge/circuitforge-infra/infrastructure/docs/GPU_CLUSTERING.md
    label: gpu-clustering.md
  - path: /Library/Development/CircuitForge/circuitforge-infra/infrastructure/ssh_configs/PROXYJUMP_CONFIG.md
    label: ssh-proxyjump-config.md
  # ── Runbooks ───────────────────────────────────────────────────────────────
  - path: /Library/Development/CircuitForge/circuitforge-infra/runbooks/cf-orch-coordinator.md
    label: runbook-cf-orch-coordinator.md
  - path: /Library/Development/CircuitForge/circuitforge-infra/runbooks/docker-nfs-boot-race-and-image-security.md
    label: runbook-docker-nfs-boot-race.md
  - path: /Library/Development/CircuitForge/circuitforge-infra/runbooks/PIHOLE_DNS_HANDOFF.md
    label: runbook-pihole-dns.md
  # ── Media server / Tdarr ───────────────────────────────────────────────────
  - path: /Library/Development/devl/Devops/tdarr/docs/TDARR_RECOVERY_README.md
    label: tdarr-recovery.md
  - path: /Library/Development/devl/Devops/tdarr/docs/NVENC_CORRUPTION_DETECTION.md
    label: tdarr-nvenc-corruption.md
  - path: /Library/Development/devl/Devops/tdarr/docs/TDARR_ROBUST_WORKFLOW.md
    label: tdarr-robust-workflow.md
--- a/scripts/migrate_sqlite_to_postgres.py
+++ b/scripts/migrate_sqlite_to_postgres.py
@ -1,204 +0,0 @@
 #!/usr/bin/env python3
 """One-shot migration: copy data from existing SQLite DBs into Postgres.
 Usage:
    DATABASE_URL=postgresql://... python scripts/migrate_sqlite_to_postgres.py \
        --main-db    data/turnstone.db \
        --context-db data/turnstone-context.db \
        --incidents-db data/turnstone-incidents.db \
        [--tenant-id heimdall]
 The script is idempotent: rows already present in Postgres (same id) are skipped.
 It must be run ONCE per node after deploying the shared Postgres backend.
 Prerequisites:
    pip install 'psycopg[binary,pool]'
    Set DATABASE_URL to the target Postgres connection string.
 """
 from __future__ import annotations
 import argparse
 import os
 import sqlite3
 import sys
 from pathlib import Path
 # Allow running from the project root without installing the package
 sys.path.insert(0, str(Path(__file__).parent.parent))
 def _pg_connect():
    """Open a psycopg connection to the database named by DATABASE_URL.

    Autocommit is off, so callers commit explicitly. Prints an error to
    stderr and exits with status 1 when DATABASE_URL is unset or empty.
    """
    import psycopg  # type: ignore[import]
    dsn = os.environ.get("DATABASE_URL")
    if dsn:
        return psycopg.connect(dsn, autocommit=False)
    print("ERROR: DATABASE_URL not set", file=sys.stderr)
    sys.exit(1)
 def _ensure_schema_pg() -> None:
    """Run the three schema bootstrap functions from ``app.db.schema``.

    Each is called with ``/dev/null`` as its db_path; per the original note
    that argument is ignored when the backend is Postgres.
    """
    from pathlib import Path
    from app.db.schema import ensure_context_schema, ensure_incidents_schema, ensure_schema
    placeholder = Path("/dev/null")  # db_path ignored for Postgres
    for ensure in (ensure_schema, ensure_context_schema, ensure_incidents_schema):
        ensure(placeholder)
    print("Postgres schema verified")
 def _migrate_table(
    src_conn: sqlite3.Connection,
    dst_conn,
    table: str,
    tenant_id: str,
    columns: list[str],
    conflict_cols: list[str],
 ) -> int:
    """Copy rows from SQLite table to Postgres. Returns rows inserted."""
    # Check if source table exists
    try:
        rows = src_conn.execute(f"SELECT * FROM {table} LIMIT 0").fetchall()  # noqa: S608
    except sqlite3.OperationalError:
        print(f"  {table}: not found in SQLite — skipping")
        return 0
    # Fetch all rows
    src_conn.row_factory = sqlite3.Row
    rows = src_conn.execute(f"SELECT * FROM {table}").fetchall()  # noqa: S608
    if not rows:
        print(f"  {table}: empty — skipping")
        return 0
    # Build INSERT ... ON CONFLICT DO NOTHING
    col_list = ", ".join(columns)
    placeholders = ", ".join("%s" for _ in columns)
    conflict = ", ".join(conflict_cols)
    sql = (
        f"INSERT INTO {table} ({col_list}) VALUES ({placeholders}) "  # noqa: S608
        f"ON CONFLICT ({conflict}) DO NOTHING"
    )
    inserted = 0
    with dst_conn.cursor() as cur:
        for row in rows:
            # Build values: inject tenant_id if not present in source row
            vals = []
            for col in columns:
                if col == "tenant_id":
                    try:
                        val = row["tenant_id"] or tenant_id
                    except (IndexError, KeyError):
                        val = tenant_id
                else:
                    try:
                        vals.append(row[col])
                    except (IndexError, KeyError):
                        vals.append(None)
                    continue
                vals.append(val)
            cur.execute(sql, vals)
            inserted += cur.rowcount
    dst_conn.commit()
    print(f"  {table}: {inserted}/{len(rows)} rows inserted ({len(rows) - inserted} skipped)")
    return inserted
 def main() -> None:
    """Command-line entry point: copy the three SQLite databases into Postgres.

    Fails fast (exit status 1) when DATABASE_URL is unset, before any schema
    work is attempted. A SQLite file that does not exist is reported and
    skipped. Connections are closed even when a table migration raises.
    """
    parser = argparse.ArgumentParser(description="Migrate Turnstone SQLite → Postgres")
    parser.add_argument("--main-db", default="data/turnstone.db")
    parser.add_argument("--context-db", default="data/turnstone-context.db")
    parser.add_argument("--incidents-db", default="data/turnstone-incidents.db")
    parser.add_argument("--tenant-id", default=None, help="Override tenant ID (default: socket.gethostname())")
    args = parser.parse_args()
    if args.tenant_id:
        os.environ["TURNSTONE_TENANT_ID"] = args.tenant_id
    import socket
    tenant_id = os.environ.get("TURNSTONE_TENANT_ID") or socket.gethostname()
    print(f"Migrating as tenant_id={tenant_id!r}")
    # Check the target before touching any schema: running the schema helpers
    # with an empty DATABASE_URL would not be operating on Postgres at all.
    if not os.environ.get("DATABASE_URL"):
        print("ERROR: DATABASE_URL not set", file=sys.stderr)
        sys.exit(1)
    # Ensure Postgres schema exists first
    _ensure_schema_pg()
    # (label, sqlite path, [(table, columns, conflict_cols), ...])
    plan = [
        ("main", args.main_db, [
            ("log_entries",
             ["tenant_id", "id", "source_id", "sequence", "timestamp_raw",
              "timestamp_iso", "ingest_time", "severity", "repeat_count",
              "out_of_order", "matched_patterns", "text"],
             ["tenant_id", "id"]),
            ("glean_fingerprints",
             ["tenant_id", "path", "mtime", "size", "gleaned_at"],
             ["tenant_id", "path"]),
            ("blocklist_candidates",
             ["id", "tenant_id", "domain_or_ip", "source_device_ip", "source_device_name",
              "first_seen", "last_seen", "hit_count", "status", "pushed_at",
              "log_evidence", "matched_rule", "llm_score", "llm_reason"],
             ["id"]),
        ]),
        ("context", args.context_db, [
            ("context_facts",
             ["id", "tenant_id", "category", "key", "value", "source", "created_at"],
             ["id"]),
            ("context_documents",
             ["id", "tenant_id", "filename", "doc_type", "full_text", "file_size", "uploaded_at"],
             ["id"]),
            ("context_chunks",
             ["id", "tenant_id", "document_id", "chunk_index", "text"],
             ["id"]),
        ]),
        ("incidents", args.incidents_db, [
            ("incidents",
             ["id", "tenant_id", "label", "issue_type", "started_at", "ended_at",
              "notes", "created_at", "severity"],
             ["id"]),
            ("received_bundles",
             ["id", "tenant_id", "source_host", "issue_type", "label", "severity",
              "started_at", "bundled_at", "entry_count", "bundle_json"],
             ["id"]),
            ("sent_bundles",
             ["id", "tenant_id", "incident_id", "exported_at", "sanitized",
              "entry_count", "bundle_json"],
             ["id"]),
        ]),
    ]
    pg = _pg_connect()
    total = 0
    try:
        for label, raw_path, tables in plan:
            db_path = Path(raw_path)
            if not db_path.exists():
                print(f"{label.capitalize()} DB not found at {db_path} — skipping")
                continue
            print(f"\nMigrating {label} DB: {db_path}")
            src = sqlite3.connect(str(db_path))
            try:
                for table, columns, conflict_cols in tables:
                    total += _migrate_table(src, pg, table, tenant_id,
                        columns=columns, conflict_cols=conflict_cols)
            finally:
                src.close()
    finally:
        pg.close()
    print(f"\nDone. Total rows inserted: {total}")
 if __name__ == "__main__":
    main()
--- a/scripts/update.sh
+++ b/scripts/update.sh
@ -6,10 +6,8 @@
 #   sudo bash /opt/turnstone/scripts/update.sh feat/live-watch   # test a branch
 #
 # Local files preserved across updates:
-#   patterns/watch.yaml              — site-specific watch source config
+#   patterns/watch.yaml   — site-specific watch source config
-#   data/corpus_watermark.txt        — corpus export watermark (last exported rowid)
+#   data/                 — database and live journal files (bind-mounted, untouched)
 #   data/incident_watermark.txt      — incident export watermark (last exported timestamp)
 #   data/                            — database and live journal files (bind-mounted, untouched)
 set -euo pipefail
@ -23,9 +21,7 @@ echo "==> Turnstone update: branch=$BRANCH"
 # ── Preserve site-local config ────────────────────────────────────────────────
 # watch.yaml is tracked in git as a template but overridden per-host.
-# Corpus watermarks track the last exported entry/incident — must survive updates
+# Back it up before the pull and restore it after.
 # or the next export run will re-push everything from the beginning.
 # Back them up before the pull and restore after.
 WATCH_YAML="$REPO_DIR/patterns/watch.yaml"
 WATCH_BACKUP=""
 if [ -f "$WATCH_YAML" ]; then
@ -33,19 +29,6 @@ if [ -f "$WATCH_YAML" ]; then
  cp "$WATCH_YAML" "$WATCH_BACKUP"
 fi
 CORPUS_WM="$REPO_DIR/data/corpus_watermark.txt"
 INCIDENT_WM="$REPO_DIR/data/incident_watermark.txt"
 CORPUS_WM_BACKUP=""
 INCIDENT_WM_BACKUP=""
 if [ -f "$CORPUS_WM" ]; then
  CORPUS_WM_BACKUP=$(mktemp /tmp/corpus-wm.XXXXXX)
  cp "$CORPUS_WM" "$CORPUS_WM_BACKUP"
 fi
 if [ -f "$INCIDENT_WM" ]; then
  INCIDENT_WM_BACKUP=$(mktemp /tmp/incident-wm.XXXXXX)
  cp "$INCIDENT_WM" "$INCIDENT_WM_BACKUP"
 fi
 # ── Pull ──────────────────────────────────────────────────────────────────────
 git fetch --all --tags --quiet
@ -67,16 +50,6 @@ if [ -n "$WATCH_BACKUP" ]; then
  rm -f "$WATCH_BACKUP"
  echo "==> Restored patterns/watch.yaml"
 fi
 if [ -n "$CORPUS_WM_BACKUP" ]; then
  cp "$CORPUS_WM_BACKUP" "$CORPUS_WM"
  rm -f "$CORPUS_WM_BACKUP"
  echo "==> Restored data/corpus_watermark.txt"
 fi
 if [ -n "$INCIDENT_WM_BACKUP" ]; then
  cp "$INCIDENT_WM_BACKUP" "$INCIDENT_WM"
  rm -f "$INCIDENT_WM_BACKUP"
  echo "==> Restored data/incident_watermark.txt"
 fi
 # ── Build ─────────────────────────────────────────────────────────────────────
 echo "==> Building $IMAGE ..."
--- a/tests/context/test_diagnose_context.py
+++ b/tests/context/test_diagnose_context.py
@ -4,7 +4,6 @@ import sqlite3
 from pathlib import Path
 from unittest.mock import patch
 import pytest
 from app.db.schema import ensure_schema, ensure_context_schema
 from app.services.llm import summarize
 from app.services.search import SearchResult
@ -65,14 +64,36 @@ def test_summarize_without_context_block_unchanged():
@pytest.fixture
 def db_with_facts(tmp_path):
    db_path = tmp_path / "t.db"
    ensure_schema(db_path)
    ensure_context_schema(db_path)
    conn = sqlite3.connect(str(db_path))
-    conn.execute(
+    conn.executescript("""
-        "INSERT INTO context_facts(id, tenant_id, category, key, value, source, created_at) "
+        CREATE TABLE log_entries (
-        "VALUES (?,?,?,?,?,?,?)",
+            id TEXT PRIMARY KEY, source_id TEXT NOT NULL, sequence INTEGER NOT NULL,
-        ("f1", "", "service", "plex", "port:32400", "wizard", "2026-05-13T00:00:00+00:00"),
+            timestamp_raw TEXT, timestamp_iso TEXT, ingest_time TEXT NOT NULL,
-    )
+            severity TEXT, repeat_count INTEGER DEFAULT 1, out_of_order INTEGER DEFAULT 0,
            matched_patterns TEXT DEFAULT '[]', text TEXT NOT NULL
        );
        CREATE VIRTUAL TABLE IF NOT EXISTS log_fts USING fts5(
            text, entry_id UNINDEXED, source_id UNINDEXED, sequence UNINDEXED,
            severity UNINDEXED, timestamp_iso UNINDEXED, matched_patterns UNINDEXED,
            repeat_count UNINDEXED, out_of_order UNINDEXED, tokenize='porter ascii'
        );
        CREATE TABLE context_facts (
            id TEXT PRIMARY KEY, category TEXT NOT NULL, key TEXT NOT NULL,
            value TEXT NOT NULL, source TEXT, created_at TEXT NOT NULL
        );
        CREATE TABLE context_documents (
            id TEXT PRIMARY KEY, filename TEXT NOT NULL, doc_type TEXT NOT NULL,
            full_text TEXT NOT NULL, file_size INTEGER, uploaded_at TEXT NOT NULL
        );
        CREATE TABLE context_chunks (
            id TEXT PRIMARY KEY, document_id TEXT NOT NULL
                REFERENCES context_documents(id) ON DELETE CASCADE,
            chunk_index INTEGER NOT NULL, text TEXT NOT NULL, embedding BLOB
        );
        INSERT INTO context_facts VALUES (
            'f1','service','plex','port:32400','wizard','2026-05-13T00:00:00+00:00'
        );
    """)
    conn.commit()
    conn.close()
    return db_path
--- a/tests/context/test_doc_upload.py
+++ b/tests/context/test_doc_upload.py
@ -1,9 +1,9 @@
 """End-to-end upload pipeline: file bytes → DB rows."""
 import sqlite3
 import pytest
 from pathlib import Path
-from app.db.schema import ensure_context_schema
+from app.ingest.doc_upload import ingest_upload
 from app.glean.doc_upload import glean_upload
 from app.context.store import list_facts, list_documents
 from app.context.chunker import UnsupportedDocType
@ -11,7 +11,24 @@ from app.context.chunker import UnsupportedDocType
@pytest.fixture
 def db(tmp_path):
    db_path = tmp_path / "t.db"
-    ensure_context_schema(db_path)
+    conn = sqlite3.connect(str(db_path))
    conn.executescript("""
        CREATE TABLE context_facts (
            id TEXT PRIMARY KEY, category TEXT NOT NULL, key TEXT NOT NULL,
            value TEXT NOT NULL, source TEXT, created_at TEXT NOT NULL
        );
        CREATE TABLE context_documents (
            id TEXT PRIMARY KEY, filename TEXT NOT NULL, doc_type TEXT NOT NULL,
            full_text TEXT NOT NULL, file_size INTEGER, uploaded_at TEXT NOT NULL
        );
        CREATE TABLE context_chunks (
            id TEXT PRIMARY KEY, document_id TEXT NOT NULL
                REFERENCES context_documents(id) ON DELETE CASCADE,
            chunk_index INTEGER NOT NULL, text TEXT NOT NULL, embedding BLOB
        );
    """)
    conn.commit()
    conn.close()
    return db_path
@ -23,7 +40,7 @@ services:
    ports:
      - "32400:32400"
 """
-    result = glean_upload(db, "docker-compose.yml", yaml_bytes)
+    result = ingest_upload(db, "docker-compose.yml", yaml_bytes)
    assert result["doc_type"] == "yaml"
    assert result["facts_written"] >= 1
    assert result["chunks_written"] >= 1
@ -36,7 +53,7 @@ services:
 def test_ingest_markdown_no_facts(db):
    md = b"# Runbook\n\nRestart plex with `systemctl restart plex`."
-    result = glean_upload(db, "runbook.md", md)
+    result = ingest_upload(db, "runbook.md", md)
    assert result["doc_type"] == "markdown"
    assert result["facts_written"] == 0
    assert result["chunks_written"] >= 1
@ -44,4 +61,4 @@ def test_ingest_markdown_no_facts(db):
 def test_ingest_raises_on_bad_type(db):
    with pytest.raises(UnsupportedDocType):
-        glean_upload(db, "report.pdf", b"data")
+        ingest_upload(db, "report.pdf", b"data")
--- a/tests/context/test_embedder.py
+++ b/tests/context/test_embedder.py
@ -1,17 +1,13 @@
-"""Tests for app/context/embedder.py — delegates to app.services.embeddings."""
+"""Tests for app/context/embedder.py — graceful no-op without sqlite-vec."""
 import sqlite3
 import struct
 from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch
 import numpy as np
 import pytest
 from app.context import embedder as emb_mod
-@pytest.fixture()
+@pytest.fixture
-def db(tmp_path: Path) -> Path:
+def db(tmp_path):
    db_path = tmp_path / "t.db"
    conn = sqlite3.connect(str(db_path))
    conn.executescript("""
@ -24,78 +20,34 @@ def db(tmp_path: Path) -> Path:
                REFERENCES context_documents(id) ON DELETE CASCADE,
            chunk_index INTEGER NOT NULL, text TEXT NOT NULL, embedding BLOB
        );
-        INSERT INTO context_documents
+        INSERT INTO context_documents VALUES ('d1','test.md','markdown','hello',5,'2026-01-01T00:00:00+00:00');
            VALUES ('d1','test.md','markdown','hello',5,'2026-01-01T00:00:00+00:00');
        INSERT INTO context_chunks VALUES ('c1','d1',0,'hello world',NULL);
        INSERT INTO context_chunks VALUES ('c2','d1',1,'second chunk',NULL);
    """)
    conn.commit()
    conn.close()
    return db_path
-def _mock_embedder(dim: int = 3) -> MagicMock:
+def test_embed_skipped_when_extension_absent(db):
-    """Return a mock Embedder that returns constant dim-length vectors."""
+    with patch.object(emb_mod, "EMBEDDING_AVAILABLE", False):
-    m = MagicMock()
+        count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434")
-    m.dim = dim
+    assert count == 0
    m.embed_batch.return_value = [np.zeros(dim, dtype=np.float32)] * 10
    return m
-class TestEmbedChunks:
+def test_embed_calls_ollama_when_available(db):
-    def test_returns_zero_when_no_embedder(self, db: Path) -> None:
+    import httpx
        with patch("app.context.embedder.get_embedder", return_value=None):
            count = emb_mod.embed_chunks(db, "d1")
        assert count == 0
-    def test_returns_zero_when_no_unembedded_chunks(self, db: Path) -> None:
+    class FakeResponse:
-        # Pre-fill both chunks with a blob
+        status_code = 200
-        blob = struct.pack("3f", 0.1, 0.2, 0.3)
+        def raise_for_status(self): pass
-        conn = sqlite3.connect(str(db))
+        def json(self): return {"embedding": [0.1, 0.2, 0.3]}
        conn.execute("UPDATE context_chunks SET embedding=?", (blob,))
        conn.commit()
        conn.close()
-        embedder = _mock_embedder()
+    with patch.object(emb_mod, "EMBEDDING_AVAILABLE", True), \
-        with patch("app.context.embedder.get_embedder", return_value=embedder):
+         patch("app.context.embedder.httpx.post", return_value=FakeResponse()):
-            count = emb_mod.embed_chunks(db, "d1")
+        count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434")
-        assert count == 0
+    assert count == 1
-        embedder.embed_batch.assert_not_called()
+    # Verify blob was written
-
+    conn = sqlite3.connect(str(db))
-    def test_embeds_all_null_chunks(self, db: Path) -> None:
+    row = conn.execute("SELECT embedding FROM context_chunks WHERE id='c1'").fetchone()
-        embedder = _mock_embedder(dim=3)
+    conn.close()
-        with patch("app.context.embedder.get_embedder", return_value=embedder):
+    assert row[0] is not None
            count = emb_mod.embed_chunks(db, "d1")
        assert count == 2  # two chunks in fixture
    def test_blobs_written_to_db(self, db: Path) -> None:
        vec = np.array([0.1, 0.2, 0.3], dtype=np.float32)
        embedder = _mock_embedder(dim=3)
        embedder.embed_batch.return_value = [vec, vec]
        with patch("app.context.embedder.get_embedder", return_value=embedder):
            emb_mod.embed_chunks(db, "d1")
        conn = sqlite3.connect(str(db))
        rows = conn.execute(
            "SELECT embedding FROM context_chunks WHERE document_id='d1'"
        ).fetchall()
        conn.close()
        for (blob,) in rows:
            assert blob is not None
            unpacked = struct.unpack(f"{len(blob)//4}f", blob)
            assert len(unpacked) == 3
    def test_legacy_llm_url_param_accepted(self, db: Path) -> None:
        """Ensure backward-compat signature still works (llm_url ignored)."""
        embedder = _mock_embedder()
        with patch("app.context.embedder.get_embedder", return_value=embedder):
            count = emb_mod.embed_chunks(db, "d1", "http://localhost:11434", "nomic-embed-text")
        assert count == 2
    def test_embed_batch_error_returns_zero(self, db: Path) -> None:
        embedder = _mock_embedder()
        embedder.embed_batch.side_effect = RuntimeError("model exploded")
        with patch("app.context.embedder.get_embedder", return_value=embedder):
            count = emb_mod.embed_chunks(db, "d1")
        assert count == 0
--- a/tests/context/test_schema.py
+++ b/tests/context/test_schema.py
@ -1,13 +1,13 @@
-"""Verify the three context tables are created by ensure_context_schema."""
+"""Verify the three new context tables are created by ensure_schema."""
 import sqlite3
 from pathlib import Path
 import pytest
-from app.db.schema import ensure_context_schema
+from app.ingest.pipeline import ensure_schema
 def test_context_tables_created(tmp_path):
    db = tmp_path / "t.db"
-    ensure_context_schema(db)
+    ensure_schema(db)
    conn = sqlite3.connect(str(db))
    tables = {r[0] for r in conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table'"
@ -20,5 +20,5 @@ def test_context_tables_created(tmp_path):
 def test_context_schema_idempotent(tmp_path):
    db = tmp_path / "t.db"
-    ensure_context_schema(db)
+    ensure_schema(db)
-    ensure_context_schema(db)  # second call must not raise
+    ensure_schema(db)  # second call must not raise
--- a/tests/context/test_store.py
+++ b/tests/context/test_store.py
@ -2,7 +2,6 @@
 import sqlite3
 import pytest
 from pathlib import Path
 from app.db.schema import ensure_context_schema
 from app.context.store import (
    add_fact, list_facts, delete_fact,
    add_document, list_documents, delete_document,
@ -13,7 +12,24 @@ from app.context.store import (
@pytest.fixture
 def db(tmp_path):
    db_path = tmp_path / "t.db"
-    ensure_context_schema(db_path)
+    conn = sqlite3.connect(str(db_path))
    conn.executescript("""
        CREATE TABLE context_facts (
            id TEXT PRIMARY KEY, category TEXT NOT NULL, key TEXT NOT NULL,
            value TEXT NOT NULL, source TEXT, created_at TEXT NOT NULL
        );
        CREATE TABLE context_documents (
            id TEXT PRIMARY KEY, filename TEXT NOT NULL, doc_type TEXT NOT NULL,
            full_text TEXT NOT NULL, file_size INTEGER, uploaded_at TEXT NOT NULL
        );
        CREATE TABLE context_chunks (
            id TEXT PRIMARY KEY, document_id TEXT NOT NULL
                REFERENCES context_documents(id) ON DELETE CASCADE,
            chunk_index INTEGER NOT NULL, text TEXT NOT NULL, embedding BLOB
        );
    """)
    conn.commit()
    conn.close()
    return db_path
--- a/tests/context/test_wizard.py
+++ b/tests/context/test_wizard.py
@ -2,14 +2,21 @@
 import sqlite3
 import pytest
 from pathlib import Path
 from app.db.schema import ensure_context_schema
 from app.context.wizard import get_schema, advance_step, is_complete, apply_session, TOTAL_STEPS
@pytest.fixture
 def db(tmp_path):
    db_path = tmp_path / "t.db"
-    ensure_context_schema(db_path)
+    conn = sqlite3.connect(str(db_path))
    conn.executescript("""
        CREATE TABLE context_facts (
            id TEXT PRIMARY KEY, category TEXT NOT NULL, key TEXT NOT NULL,
            value TEXT NOT NULL, source TEXT, created_at TEXT NOT NULL
        );
    """)
    conn.commit()
    conn.close()
    return db_path
--- a/tests/test_anomaly.py
+++ b/tests/test_anomaly.py
@ -1,220 +0,0 @@
 """Tests for app/services/anomaly.py — anomaly scoring pipeline."""
 from __future__ import annotations
 import sqlite3
 import uuid
 from pathlib import Path
 from unittest.mock import MagicMock
 import pytest
 import app.services.anomaly as anomaly_mod
 from app.db.schema import ensure_schema
 from app.services.anomaly import (
    ScoringResult,
    acknowledge_detection,
    list_detections,
    reset_pipeline,
    score_unscored,
 )
 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------
@pytest.fixture(autouse=True)
 def _reset_pipeline():
    """Call ``reset_pipeline()`` before and after every test in this module.

    Autouse, so no test has to request it. Per the original note this is
    meant to clear the ML singleton between tests (presumably
    ``anomaly_mod._pipeline`` — confirm in ``reset_pipeline``).
    """
    reset_pipeline()  # clean state going into the test
    yield
    reset_pipeline()  # and again on teardown
@pytest.fixture
 def db(tmp_path: Path) -> Path:
    db_path = tmp_path / "t.db"
    ensure_schema(db_path)
    return db_path
 def _insert_entry(db_path: Path, text: str, entry_id: str | None = None) -> str:
    """Insert one minimal ``log_entries`` row and return its id.

    Args:
        db_path: SQLite file that already contains a ``log_entries`` table.
        text: Log line stored in the ``text`` column.
        entry_id: Row id to use; a random UUID4 string is generated when
            omitted (or empty).

    Returns:
        The id of the inserted row.

    Raises:
        sqlite3.Error: Propagated from the INSERT (e.g. a duplicate id).
    """
    eid = entry_id or str(uuid.uuid4())
    conn = sqlite3.connect(str(db_path))
    try:
        conn.execute(
            "INSERT INTO log_entries(id, tenant_id, source_id, sequence, ingest_time, text) "
            "VALUES (?,?,?,?,?,?)",
            (eid, "", "src", 1, "2026-01-01T00:00:00", text),
        )
        conn.commit()
    finally:
        # Close even when the INSERT raises so a failing test does not leak the handle.
        conn.close()
    return eid
 # ---------------------------------------------------------------------------
 # score_unscored
 # ---------------------------------------------------------------------------
 def test_score_unscored_no_model_returns_skipped(db: Path):
    """With an empty ``model_id`` the run is expected to be skipped and score nothing."""
    outcome = score_unscored(db, model_id="")
    assert outcome.skipped is True
    assert outcome.scored == 0
 def test_score_unscored_scores_entries(db: Path, monkeypatch):
    """Two unscored rows and a two-result fake pipeline should give ``scored == 2``."""
    _insert_entry(db, "kernel panic — OOM killer invoked")
    _insert_entry(db, "user login successful")
    fake_results = [
        {"label": "SYSTEM_FAILURE", "score": 0.92},
        {"label": "NORMAL", "score": 0.88},
    ]
    # Swap the module-level pipeline for a stub so no model is loaded.
    monkeypatch.setattr(anomaly_mod, "_pipeline", MagicMock(return_value=fake_results))
    outcome = score_unscored(db, model_id="fake-model", batch_size=10)
    assert outcome.skipped is False
    assert outcome.scored == 2
 def test_score_unscored_creates_detection_above_threshold(db: Path, monkeypatch):
    """A 0.95 SYSTEM_FAILURE score against a 0.80 threshold is stored as a detection."""
    _insert_entry(db, "segfault in service")
    fake_pipe = MagicMock(return_value=[{"label": "SYSTEM_FAILURE", "score": 0.95}])
    monkeypatch.setattr(anomaly_mod, "_pipeline", fake_pipe)
    outcome = score_unscored(db, model_id="fake-model", threshold=0.80)
    assert outcome.detections == 1
    stored = list_detections(db)
    assert len(stored) == 1
    first = stored[0]
    assert first["anomaly_label"] == "SYSTEM_FAILURE"
    assert first["anomaly_score"] == pytest.approx(0.95)
 def test_score_unscored_no_detection_below_threshold(db: Path, monkeypatch):
    """A 0.60 score under a 0.80 threshold is counted as scored but raises no detection."""
    _insert_entry(db, "warning: disk at 80%")
    fake_pipe = MagicMock(return_value=[{"label": "PERFORMANCE_ISSUE", "score": 0.60}])
    monkeypatch.setattr(anomaly_mod, "_pipeline", fake_pipe)
    outcome = score_unscored(db, model_id="fake-model", threshold=0.80)
    assert outcome.detections == 0
    assert outcome.scored == 1
 def test_score_unscored_normal_label_never_detection(db: Path, monkeypatch):
    """A NORMAL label must not create a detection even at 0.99 with a 0.50 threshold."""
    _insert_entry(db, "service started successfully")
    fake_pipe = MagicMock(return_value=[{"label": "NORMAL", "score": 0.99}])
    monkeypatch.setattr(anomaly_mod, "_pipeline", fake_pipe)
    outcome = score_unscored(db, model_id="fake-model", threshold=0.50)
    assert outcome.detections == 0
 def test_score_unscored_idempotent(db: Path, monkeypatch):
    """A second run must not invoke the pipeline for rows that are already scored."""
    _insert_entry(db, "first entry")
    batches_seen = []
    def _fake_pipeline(texts, **_kwargs):
        # Record every invocation so the test can count pipeline calls.
        batches_seen.append(texts)
        return [{"label": "NORMAL", "score": 0.90} for _ in texts]
    monkeypatch.setattr(anomaly_mod, "_pipeline", MagicMock(side_effect=_fake_pipeline))
    for _ in range(2):
        score_unscored(db, model_id="fake-model")
    assert len(batches_seen) == 1  # second run finds no unscored rows
 def test_score_unscored_pipeline_error_returns_error(db: Path, monkeypatch):
    """A pipeline exception is expected to surface in ``result.error``, message included."""
    _insert_entry(db, "some log line")
    exploding_pipe = MagicMock(side_effect=RuntimeError("CUDA OOM"))
    monkeypatch.setattr(anomaly_mod, "_pipeline", exploding_pipe)
    outcome = score_unscored(db, model_id="fake-model")
    assert outcome.error is not None
    assert "CUDA OOM" in outcome.error
 # ---------------------------------------------------------------------------
 # list_detections / acknowledge_detection
 # ---------------------------------------------------------------------------
 def test_list_detections_empty(db: Path):
    """With only the schema created and nothing scored, the listing is an empty list."""
    assert list_detections(db) == []
 def test_list_detections_filters_unacked(db: Path, monkeypatch):
    """A new detection appears both in the full listing and in the unacked-only one."""
    _insert_entry(db, "crash")
    fake_pipe = MagicMock(return_value=[{"label": "SYSTEM_FAILURE", "score": 0.91}])
    monkeypatch.setattr(anomaly_mod, "_pipeline", fake_pipe)
    score_unscored(db, model_id="fake-model", threshold=0.80)
    assert len(list_detections(db)) == 1
    assert len(list_detections(db, unacked_only=True)) == 1
 def test_acknowledge_detection(db: Path, monkeypatch):
    """Acknowledging a detection removes it from the unacked view and stores the notes."""
    _insert_entry(db, "network anomaly")
    fake_pipe = MagicMock(return_value=[{"label": "NETWORK_ANOMALY", "score": 0.88}])
    monkeypatch.setattr(anomaly_mod, "_pipeline", fake_pipe)
    score_unscored(db, model_id="fake-model", threshold=0.80)
    before = list_detections(db)
    assert len(before) == 1
    target_id = before[0]["id"]
    assert acknowledge_detection(db, target_id, notes="benign test traffic") is True
    # Acknowledged rows drop out of the unacked view but stay in the full listing.
    assert len(list_detections(db, unacked_only=True)) == 0
    acked = list_detections(db)[0]
    assert acked["acknowledged"] == 1
    assert acked["notes"] == "benign test traffic"
 def test_acknowledge_detection_unknown_id(db: Path):
    """Acknowledging an id that matches no detection is expected to return ``False``."""
    assert acknowledge_detection(db, "nonexistent-id") is False
 def test_list_detections_label_filter(db: Path, monkeypatch):
    """``list_detections(label=...)`` returns only, and at least one of, the requested label.

    Two entries are scored one per pipeline call (``batch_size=1``), the
    first as SYSTEM_FAILURE and the second as NETWORK_ANOMALY, both above
    the 0.80 threshold.
    """
    _insert_entry(db, "OOM kill")
    _insert_entry(db, "network timeout")
    mock_pipe = MagicMock(side_effect=[
        [{"label": "SYSTEM_FAILURE", "score": 0.93}],
        [{"label": "NETWORK_ANOMALY", "score": 0.85}],
    ])
    monkeypatch.setattr(anomaly_mod, "_pipeline", mock_pipe)
    score_unscored(db, model_id="fake-model", batch_size=1, threshold=0.80)
    score_unscored(db, model_id="fake-model", batch_size=1, threshold=0.80)
    for label in ("SYSTEM_FAILURE", "NETWORK_ANOMALY"):
        matches = list_detections(db, label=label)
        # all() over an empty list is True, so require a hit before checking labels;
        # otherwise a filter that returns nothing would pass silently.
        assert matches, f"expected at least one {label} detection"
        assert all(d["anomaly_label"] == label for d in matches)
--- a/tests/test_blocklist_endpoints.py
+++ b/tests/test_blocklist_endpoints.py
@ -9,18 +9,15 @@ from unittest.mock import MagicMock, patch
@pytest.fixture
 def client(tmp_path):
    from fastapi.testclient import TestClient
-    from app.glean.pipeline import ensure_schema
+    from app.ingest.pipeline import ensure_schema
    import app.rest as rest_module
    db = tmp_path / "test.db"
    ensure_schema(db)
    with patch.object(rest_module, "DB_PATH", db), \
         patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \
         patch.object(rest_module, "INCIDENTS_DB_PATH", tmp_path / "incidents.db"), \
         patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \
-         patch.object(rest_module, "_compiled_patterns", []), \
+         patch.object(rest_module, "_compiled_patterns", []):
         patch.object(rest_module, "_pattern_domain", {}):
        with TestClient(rest_module.app, raise_server_exceptions=True) as c:
            yield c
@ -28,7 +25,7 @@ def client(tmp_path):
@pytest.fixture
 def client_with_candidate(tmp_path):
    from fastapi.testclient import TestClient
-    from app.glean.pipeline import ensure_schema
+    from app.ingest.pipeline import ensure_schema
    import app.rest as rest_module
    import sqlite3, uuid
@ -44,11 +41,8 @@ def client_with_candidate(tmp_path):
    conn.close()
    with patch.object(rest_module, "DB_PATH", db), \
         patch.object(rest_module, "CONTEXT_DB_PATH", tmp_path / "context.db"), \
         patch.object(rest_module, "INCIDENTS_DB_PATH", tmp_path / "incidents.db"), \
         patch.object(rest_module, "PREFS_PATH", tmp_path / "prefs.json"), \
-         patch.object(rest_module, "_compiled_patterns", []), \
+         patch.object(rest_module, "_compiled_patterns", []):
         patch.object(rest_module, "_pattern_domain", {}):
        with TestClient(rest_module.app, raise_server_exceptions=True) as c:
            yield c, cid
--- a/tests/test_cybersec.py
+++ b/tests/test_cybersec.py
@ -1,233 +0,0 @@
 """Tests for the cybersec zero-shot scoring pipeline."""
 from __future__ import annotations
 import sqlite3
 import tempfile
 from pathlib import Path
 from unittest.mock import MagicMock
 import pytest
 from app.db.schema import ensure_schema
 from app.services.cybersec import (
    CybersecResult,
    CYBERSEC_LABELS,
    _NORMAL_LABEL,
    reset_pipeline,
    score_security_entries,
    list_cybersec_detections,
 )
 import app.services.cybersec as cybersec_mod
@pytest.fixture(autouse=True)
 def _reset(tmp_path):
    """Call ``reset_pipeline()`` before and after every test in this module.

    NOTE(review): ``tmp_path`` is requested but never used in the body —
    confirm whether it can be dropped.
    """
    reset_pipeline()  # clean state going into the test
    yield
    reset_pipeline()  # and again on teardown
@pytest.fixture
 def db(tmp_path) -> Path:
    path = tmp_path / "test.db"
    ensure_schema(path)
    return path
 def _insert_entry(db: Path, entry_id: str, text: str,
                  anomaly_label: str | None = None,
                  matched_patterns: str = "[]") -> None:
    """Insert one ``log_entries`` row for a test, ignoring duplicate ids.

    Args:
        db: SQLite file that already contains a ``log_entries`` table.
        entry_id: Primary key of the row; an existing id is left untouched
            because the statement is ``INSERT OR IGNORE``.
        text: Log line stored in the ``text`` column.
        anomaly_label: Value for ``anomaly_label`` (``None`` stores NULL).
        matched_patterns: String stored verbatim in ``matched_patterns``.
    """
    conn = sqlite3.connect(db)
    try:
        conn.execute(
            """INSERT OR IGNORE INTO log_entries
               (id, tenant_id, source_id, sequence, ingest_time, text,
                anomaly_label, matched_patterns)
               VALUES (?, '', 'test-src', 1, '2026-01-01T00:00:00Z', ?, ?, ?)""",
            (entry_id, text, anomaly_label, matched_patterns),
        )
        conn.commit()
    finally:
        # "with sqlite3.connect(...)" only scopes the transaction and never
        # closes the connection, so close it explicitly.
        conn.close()
 # ---------------------------------------------------------------------------
 # No model configured → skipped
 # ---------------------------------------------------------------------------
 def test_no_model_returns_skipped(db):
    """With an empty ``model_id`` the run is expected to be skipped and score nothing."""
    outcome = score_security_entries(db, model_id="")
    assert outcome.skipped is True
    assert outcome.scored == 0
 # ---------------------------------------------------------------------------
 # No eligible entries → skipped
 # ---------------------------------------------------------------------------
 def test_no_eligible_entries_skipped(db):
    """An entry with no anomaly label and no matched patterns leaves the run skipped."""
    _insert_entry(db, "e1", "Started nginx.service", anomaly_label=None, matched_patterns="[]")
    mock_pipe = MagicMock(return_value=[{"labels": [_NORMAL_LABEL], "scores": [0.99]}])
    # MonkeyPatch.context() undoes the patch on exit even when the assertion
    # fails; a hand-made MonkeyPatch() followed by undo() would skip the undo.
    with pytest.MonkeyPatch.context() as mp:
        mp.setattr(cybersec_mod, "_pipeline", mock_pipe)
        result = score_security_entries(db, model_id="fake-model")
        assert result.skipped is True
 # ---------------------------------------------------------------------------
 # Security entry gets scored
 # ---------------------------------------------------------------------------
 def test_security_entry_scored(db, monkeypatch):
    """A SECURITY_ANOMALY entry is scored, flagged, and has its ML columns filled in.

    The fake pipeline ranks the brute-force label first at 0.85, above the
    0.70 threshold, so one detection is expected and the top label/score
    should be written to ``ml_label`` / ``ml_score``.
    """
    _insert_entry(db, "e1",
                  "Failed password for root from 192.168.1.1 port 22 ssh2",
                  anomaly_label="SECURITY_ANOMALY")
    mock_pipe = MagicMock(return_value=[{
        "labels": ["authentication failure or brute force attack", _NORMAL_LABEL],
        "scores": [0.85, 0.15],
    }])
    monkeypatch.setattr(cybersec_mod, "_pipeline", mock_pipe)
    result = score_security_entries(db, model_id="fake-model", threshold=0.70)
    assert result.scored == 1
    assert result.detections == 1
    assert result.error is None
    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT ml_score, ml_label, ml_scored_at FROM log_entries WHERE id='e1'").fetchone()
    finally:
        # "with sqlite3.connect(...)" would not close the connection; do it explicitly.
        conn.close()
    assert row["ml_score"] == pytest.approx(0.85)
    assert row["ml_label"] == "authentication failure or brute force attack"
    assert row["ml_scored_at"] is not None
 # ---------------------------------------------------------------------------
 # Detection created above threshold
 # ---------------------------------------------------------------------------
 def test_detection_inserted_above_threshold(db, monkeypatch):
    """A 0.75 top score over a 0.60 threshold stores one ``scorer='cybersec'`` detection.

    The stored row is expected to carry the top label and severity CRITICAL.
    """
    _insert_entry(db, "e1", "sudo: authentication failure", anomaly_label="ERROR")
    monkeypatch.setattr(cybersec_mod, "_pipeline", MagicMock(return_value=[{
        "labels": ["privilege escalation or unauthorized access", _NORMAL_LABEL],
        "scores": [0.75, 0.25],
    }]))
    score_security_entries(db, model_id="fake-model", threshold=0.60)
    conn = sqlite3.connect(db)
    try:
        conn.row_factory = sqlite3.Row
        dets = conn.execute("SELECT * FROM detections WHERE scorer='cybersec'").fetchall()
    finally:
        # "with sqlite3.connect(...)" would not close the connection; do it explicitly.
        conn.close()
    assert len(dets) == 1
    assert dets[0]["anomaly_label"] == "privilege escalation or unauthorized access"
    assert dets[0]["severity"] == "CRITICAL"
 # ---------------------------------------------------------------------------
 # Normal label → no detection even above score threshold
 # ---------------------------------------------------------------------------
 def test_normal_label_no_detection(db, monkeypatch):
    """When the normal label ranks first (0.95) no detection is expected, threshold or not."""
    _insert_entry(db, "e1", "Started nginx.service", anomaly_label="INFO",
                  matched_patterns='["service_start"]')
    fake_response = {
        "labels": [_NORMAL_LABEL, "network intrusion or port scan"],
        "scores": [0.95, 0.05],
    }
    monkeypatch.setattr(cybersec_mod, "_pipeline", MagicMock(return_value=[fake_response]))
    outcome = score_security_entries(db, model_id="fake-model", threshold=0.60)
    assert outcome.detections == 0
 # ---------------------------------------------------------------------------
 # Below threshold → scored but no detection
 # ---------------------------------------------------------------------------
 def test_below_threshold_no_detection(db, monkeypatch):
    """A 0.45 top score under a 0.60 threshold counts as scored but raises no detection."""
    _insert_entry(db, "e1", "Some suspicious text", anomaly_label="WARN")
    fake_response = {
        "labels": ["network intrusion or port scan", _NORMAL_LABEL],
        "scores": [0.45, 0.55],
    }
    monkeypatch.setattr(cybersec_mod, "_pipeline", MagicMock(return_value=[fake_response]))
    outcome = score_security_entries(db, model_id="fake-model", threshold=0.60)
    assert outcome.scored == 1
    assert outcome.detections == 0
 # ---------------------------------------------------------------------------
 # Pattern-matched entry (not anomaly-flagged) still gets scored
 # ---------------------------------------------------------------------------
 def test_pattern_matched_entry_scored(db, monkeypatch):
    """An entry with matched patterns but no anomaly label is still scored and flagged."""
    _insert_entry(db, "e1", "SSH port forwarding conflict detected",
                  anomaly_label=None,
                  matched_patterns='["ssh_forward_conflict"]')
    fake_response = {
        "labels": ["network intrusion or port scan", _NORMAL_LABEL],
        "scores": [0.70, 0.30],
    }
    monkeypatch.setattr(cybersec_mod, "_pipeline", MagicMock(return_value=[fake_response]))
    outcome = score_security_entries(db, model_id="fake-model", threshold=0.60)
    assert outcome.scored == 1
    assert outcome.detections == 1
 # ---------------------------------------------------------------------------
 # Idempotency — re-run finds nothing unscored
 # ---------------------------------------------------------------------------
 def test_idempotent_rerun(db, monkeypatch):
    """A second run over an already-scored entry is expected to report ``skipped``."""
    _insert_entry(db, "e1", "Failed login", anomaly_label="ERROR")
    fake_response = {
        "labels": ["authentication failure or brute force attack"],
        "scores": [0.80],
    }
    monkeypatch.setattr(cybersec_mod, "_pipeline", MagicMock(return_value=[fake_response]))
    # First pass scores the entry; only the outcome of the second pass is checked.
    score_security_entries(db, model_id="fake-model", threshold=0.60)
    second_pass = score_security_entries(db, model_id="fake-model", threshold=0.60)
    assert second_pass.skipped is True
 # ---------------------------------------------------------------------------
 # list_cybersec_detections filters to scorer='cybersec'
 # ---------------------------------------------------------------------------
 def test_list_cybersec_detections(db, monkeypatch):
    """After one flagged entry the cybersec listing holds one ``scorer='cybersec'`` row."""
    _insert_entry(db, "e1", "Failed login", anomaly_label="ERROR")
    fake_response = {
        "labels": ["authentication failure or brute force attack"],
        "scores": [0.90],
    }
    monkeypatch.setattr(cybersec_mod, "_pipeline", MagicMock(return_value=[fake_response]))
    score_security_entries(db, model_id="fake-model", threshold=0.60)
    listed = list_cybersec_detections(db)
    assert len(listed) == 1
    assert listed[0]["scorer"] == "cybersec"
 # ---------------------------------------------------------------------------
 # list_detections scorer filter (anomaly service)
 # ---------------------------------------------------------------------------
 def test_list_detections_scorer_filter(db, monkeypatch):
    """The anomaly service's ``scorer`` filter separates cybersec rows from anomaly rows."""
    from app.services.anomaly import list_detections
    _insert_entry(db, "e1", "Failed login", anomaly_label="ERROR")
    fake_response = {
        "labels": ["authentication failure or brute force attack"],
        "scores": [0.90],
    }
    monkeypatch.setattr(cybersec_mod, "_pipeline", MagicMock(return_value=[fake_response]))
    score_security_entries(db, model_id="fake-model", threshold=0.60)
    # Query in the same order as before: unfiltered, cybersec, anomaly.
    unfiltered = list_detections(db)
    from_cybersec = list_detections(db, scorer="cybersec")
    from_anomaly = list_detections(db, scorer="anomaly")
    assert len(from_cybersec) == 1
    assert len(from_anomaly) == 0
    assert len(unfiltered) >= 1
--- a/tests/test_diagnose_classifier.py
+++ b/tests/test_diagnose_classifier.py
@ -1,299 +0,0 @@
 """Tests for app/services/diagnose/classifier.py — SeverityClassifier.
 All ML-path tests mock ``transformers.pipeline`` so no model weights are
 downloaded during the test suite.
 """
 from __future__ import annotations
 from dataclasses import FrozenInstanceError
 from pathlib import Path
 from typing import Any
 from unittest.mock import MagicMock, patch
 import pytest
 import app.services.diagnose.classifier as clf_module
 from app.services.diagnose.classifier import SeverityClassifier
 from app.services.diagnose.models import ClassifiedTimeline, EventCluster, TimelineResult
 # ---------------------------------------------------------------------------
 # Fixtures
 # ---------------------------------------------------------------------------
@pytest.fixture(autouse=True)
 def reset_ml_singleton():
    """Ensure the module-level ML singleton is cleared before and after each test."""
    # Setup: start every test with no classifier stored on the module.
    clf_module._ml_classifier = None
    yield
    # Teardown: clear it again so nothing set during the test carries over.
    clf_module._ml_classifier = None
 # ---------------------------------------------------------------------------
 # Test-object builders
 # ---------------------------------------------------------------------------
 def _make_cluster(
    representative_text: str = "test log",
    pattern_tags: tuple[str, ...] = (),
    severity: str = "INFO",
 ) -> EventCluster:
    """Build a single-entry EventCluster with the fixed id ``"abc123"``.
    Only the text, pattern tags and severity vary; every other field is a
    constant placeholder.
    """
    cluster = EventCluster(
        # Caller-controlled fields.
        representative_text=representative_text,
        pattern_tags=pattern_tags,
        severity=severity,  # type: ignore[arg-type]
        # Fixed placeholder fields.
        cluster_id="abc123",
        entries=("e1",),
        source_ids=("src",),
        start_iso=None,
        end_iso=None,
        duration_seconds=0.0,
        gap_before_seconds=0.0,
        burst=False,
    )
    return cluster
 def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult:
    """Wrap *clusters* in a TimelineResult whose remaining fields are zero/empty."""
    timeline = TimelineResult(
        clusters=clusters,
        dominant_sources=(),
        total_entries=0,
        gap_count=0,
        burst_count=0,
        window_start=None,
        window_end=None,
    )
    return timeline
 def _mock_hf_pipeline(label: str, score: float) -> MagicMock:
    """Return a mock HF pipeline callable that always yields one result."""
    pipe = MagicMock()
    pipe.return_value = [{"label": label, "score": score}]
    return pipe
 # ---------------------------------------------------------------------------
 # Path A — ML classification
 # ---------------------------------------------------------------------------
 class TestMLPath:
    """Severity results when ``_get_ml_classifier`` is patched to return a mock pipeline."""
    @staticmethod
    def _classify_with_ml(label: str, score: float, text: str) -> ClassifiedTimeline:
        """Classify a one-cluster timeline while the ML loader returns a canned pipeline."""
        fake_pipe = _mock_hf_pipeline(label, score)
        with patch(
            "app.services.diagnose.classifier._get_ml_classifier",
            return_value=fake_pipe,
        ):
            classifier = SeverityClassifier(model_id="fake/model")
            return classifier.classify(_make_timeline((_make_cluster(text),)))
    def test_ml_error_maps_to_error(self) -> None:
        """An ERROR label at score 0.98 yields severity ERROR via the ML path."""
        outcome = self._classify_with_ml("ERROR", 0.98, "disk error detected")
        assert outcome.cluster_severities["abc123"] == "ERROR"
        assert outcome.classifier_used == "ml"
        assert outcome.model_id == "fake/model"
    def test_ml_critical_promotion(self) -> None:
        """An ERROR label at score 0.97 on 'kernel panic' text is reported as CRITICAL."""
        outcome = self._classify_with_ml("ERROR", 0.97, "kernel panic: not syncing VFS")
        assert outcome.cluster_severities["abc123"] == "CRITICAL"
    def test_ml_debug_demotion(self) -> None:
        """An INFO label at score 0.3 is reported as DEBUG."""
        outcome = self._classify_with_ml("INFO", 0.3, "routine ping")
        assert outcome.cluster_severities["abc123"] == "DEBUG"
    def test_ml_warning_maps_to_warn(self) -> None:
        """A WARNING label is reported as WARN."""
        outcome = self._classify_with_ml("WARNING", 0.85, "low disk space")
        assert outcome.cluster_severities["abc123"] == "WARN"
 # ---------------------------------------------------------------------------
 # Path B — pattern_tags fallback
 # ---------------------------------------------------------------------------
 class TestPatternTagsPath:
    """Severity lookup driven by a cluster's pattern tags and a pattern YAML file."""
    def test_pattern_tags_resolve_error_severity(self, tmp_path: Path) -> None:
        """Tag 'service_crash_loop' resolves to ERROR from the pattern file."""
        yaml_lines = (
            "patterns:",
            "  - name: service_crash_loop",
            "    pattern: crash",
            "    severity: ERROR",
            "    description: Service crashed in a loop",
        )
        pattern_yaml = tmp_path / "default.yaml"
        pattern_yaml.write_text("\n".join(yaml_lines) + "\n")
        # Empty model_id: this test expects the pattern_tags path, not ML.
        classifier = SeverityClassifier(model_id="", pattern_file=pattern_yaml)
        tagged_cluster = _make_cluster(
            representative_text="service crashed",
            pattern_tags=("service_crash_loop",),
        )
        outcome = classifier.classify(_make_timeline((tagged_cluster,)))
        assert outcome.cluster_severities["abc123"] == "ERROR"
        assert outcome.classifier_used == "pattern_tags"
        assert outcome.model_id is None
 # ---------------------------------------------------------------------------
 # Path C — regex fallback
 # ---------------------------------------------------------------------------
 class TestRegexPath:
    """Severity results with no model id and no pattern file supplied."""
    def test_regex_detects_error(self) -> None:
        """'ERROR: disk full' is classified ERROR and reports classifier 'regex'."""
        timeline = _make_timeline((_make_cluster("ERROR: disk full"),))
        outcome = SeverityClassifier(model_id="").classify(timeline)
        assert outcome.cluster_severities["abc123"] == "ERROR"
        assert outcome.classifier_used == "regex"
    def test_regex_defaults_to_info_when_no_match(self) -> None:
        """Text without a severity keyword is classified INFO."""
        timeline = _make_timeline((_make_cluster("mount: disk mounted successfully"),))
        outcome = SeverityClassifier(model_id="").classify(timeline)
        assert outcome.cluster_severities["abc123"] == "INFO"
 # ---------------------------------------------------------------------------
 # Fallback behaviour
 # ---------------------------------------------------------------------------
 class TestImportErrorFallback:
    """Behaviour when loading the ML classifier raises ImportError."""
    def test_transformers_import_error_falls_back_to_pattern_tags(
        self, tmp_path: Path
    ) -> None:
        """An ImportError from the ML loader still yields a pattern-file severity."""
        yaml_lines = (
            "patterns:",
            "  - name: auth_failure",
            "    pattern: auth",
            "    severity: ERROR",
            "    description: Auth failure",
        )
        pattern_yaml = tmp_path / "default.yaml"
        pattern_yaml.write_text("\n".join(yaml_lines) + "\n")
        # Every call to the patched loader raises this exception.
        missing_transformers = ImportError("No module named 'transformers'")
        with patch(
            "app.services.diagnose.classifier._get_ml_classifier",
            side_effect=missing_transformers,
        ):
            classifier = SeverityClassifier(model_id="fake/model", pattern_file=pattern_yaml)
            tagged_cluster = _make_cluster(
                representative_text="auth failed",
                pattern_tags=("auth_failure",),
            )
            outcome = classifier.classify(_make_timeline((tagged_cluster,)))
        # classifier_used stays "ml" because ML was attempted, even though the
        # severity itself came from the pattern file.
        assert outcome.classifier_used == "ml"
        assert outcome.cluster_severities["abc123"] == "ERROR"
 # ---------------------------------------------------------------------------
 # Edge cases
 # ---------------------------------------------------------------------------
 class TestEdgeCases:
    """Boundary behaviour of SeverityClassifier.classify and its result type."""
    def test_empty_timeline_produces_empty_severities(self) -> None:
        """A timeline without clusters classifies to an empty severity mapping."""
        outcome = SeverityClassifier(model_id="").classify(_make_timeline())
        assert isinstance(outcome, ClassifiedTimeline)
        assert outcome.cluster_severities == {}
        assert outcome.classifier_used == "regex"
    def test_classified_timeline_is_frozen(self) -> None:
        """Assigning to a ClassifiedTimeline attribute raises FrozenInstanceError."""
        one_cluster = _make_timeline((_make_cluster(),))
        outcome = SeverityClassifier(model_id="").classify(one_cluster)
        with pytest.raises(FrozenInstanceError):
            outcome.classifier_used = "ml"  # type: ignore[misc]
 # ---------------------------------------------------------------------------
 # Hybrid-BERT label mapping shim (turnstone#41)
 # ---------------------------------------------------------------------------
 class TestHybridBertLabelMap:
    """Checks that ``_map_label`` translates Hybrid-BERT labels to severity labels."""
    def _run(self, label: str, score: float = 0.9, text: str = "log line") -> str:
        """Call ``_map_label`` with defaults for score and text."""
        from app.services.diagnose.classifier import _map_label
        mapped = _map_label(label, score, text)
        return mapped
    def test_normal_maps_to_info(self) -> None:
        mapped = self._run("normal")
        assert mapped == "INFO"
    def test_security_anomaly_maps_to_error(self) -> None:
        mapped = self._run("security_anomaly")
        assert mapped == "ERROR"
    def test_system_failure_maps_to_critical(self) -> None:
        mapped = self._run("system_failure")
        assert mapped == "CRITICAL"
    def test_performance_issue_maps_to_warn(self) -> None:
        mapped = self._run("performance_issue")
        assert mapped == "WARN"
    def test_network_anomaly_maps_to_warn(self) -> None:
        mapped = self._run("network_anomaly")
        assert mapped == "WARN"
    def test_config_error_maps_to_error(self) -> None:
        mapped = self._run("config_error")
        assert mapped == "ERROR"
    def test_hardware_issue_maps_to_critical(self) -> None:
        mapped = self._run("hardware_issue")
        assert mapped == "CRITICAL"
    def test_hybrid_bert_labels_are_case_insensitive(self) -> None:
        """Upper-case and mixed-case spellings map the same way."""
        from app.services.diagnose.classifier import _map_label
        for spelling in ("SECURITY_ANOMALY", "Security_Anomaly"):
            assert _map_label(spelling, 0.9, "x") == "ERROR"
    def test_system_failure_critical_promotion_not_doubled(self) -> None:
        """system_failure with 'kernel panic' text at high score is still CRITICAL."""
        mapped = self._run("system_failure", score=0.99, text="kernel panic")
        assert mapped == "CRITICAL"
    def test_normal_low_confidence_demotes_to_debug(self) -> None:
        """normal at score 0.2 is reported as DEBUG."""
        mapped = self._run("normal", score=0.2)
        assert mapped == "DEBUG"
    def test_standard_labels_still_work(self) -> None:
        """Standard-vocabulary labels keep their existing mapping."""
        from app.services.diagnose.classifier import _map_label
        expected_by_label = {"ERROR": "ERROR", "WARNING": "WARN", "CRITICAL": "CRITICAL"}
        for raw_label, expected in expected_by_label.items():
            assert _map_label(raw_label, 0.9, "x") == expected
--- a/tests/test_diagnose_hypothesizer.py
+++ b/tests/test_diagnose_hypothesizer.py
@ -1,486 +0,0 @@
 """Tests for app/services/diagnose/hypothesizer.py — RootCauseHypothesizer.
 All tests use mocking; no real LLM calls are made.
 """
 from __future__ import annotations
 import json
 import re
 from typing import Any
 from unittest.mock import MagicMock, patch
 import pytest
 from app.context.retriever import RetrievedContext
 from app.services.diagnose.hypothesizer import RootCauseHypothesizer
 from app.services.diagnose.models import (
    ClassifiedTimeline,
    EventCluster,
    Hypothesis,
    TimelineResult,
 )
 # ---------------------------------------------------------------------------
 # Fixture helpers
 # ---------------------------------------------------------------------------
 def _make_cluster(
    cluster_id: str = "c1",
    representative_text: str = "kernel: oom-killer invoked",
    severity: str = "ERROR",
    source_ids: tuple[str, ...] = ("syslog",),
    pattern_tags: tuple[str, ...] = ("oom",),
    start_iso: str | None = "2024-01-01T00:00:00+00:00",
 ) -> EventCluster:
    """Build a one-entry EventCluster; defaults describe an OOM event from syslog."""
    cluster = EventCluster(
        # Caller-controlled fields.
        cluster_id=cluster_id,
        representative_text=representative_text,
        severity=severity,  # type: ignore[arg-type]
        source_ids=source_ids,
        pattern_tags=pattern_tags,
        start_iso=start_iso,
        # Fixed placeholder fields.
        entries=("e1",),
        end_iso=None,
        duration_seconds=1.0,
        gap_before_seconds=0.0,
        burst=False,
    )
    return cluster
 def _make_timeline(clusters: tuple[EventCluster, ...] = ()) -> TimelineResult:
    """Wrap *clusters* in a TimelineResult with ``total_entries`` set to the cluster count."""
    cluster_count = len(clusters)
    timeline = TimelineResult(
        clusters=clusters,
        total_entries=cluster_count,
        dominant_sources=(),
        gap_count=0,
        burst_count=0,
        window_start=None,
        window_end=None,
    )
    return timeline
 def _make_classified(
    clusters: tuple[EventCluster, ...] = (),
    cluster_severities: dict | None = None,
 ) -> ClassifiedTimeline:
    """Build a ClassifiedTimeline over *clusters* labelled as 'pattern_tags' output.
    When *cluster_severities* is None, each cluster's own ``severity`` is used,
    keyed by its ``cluster_id``.
    """
    severities = (
        {cluster.cluster_id: cluster.severity for cluster in clusters}
        if cluster_severities is None
        else cluster_severities
    )
    wrapped_timeline = _make_timeline(clusters)
    return ClassifiedTimeline(
        timeline=wrapped_timeline,
        cluster_severities=severities,
        classifier_used="pattern_tags",
        model_id=None,
    )
 def _make_ctx(chunks: list[dict] | None = None) -> RetrievedContext:
    """Build a RetrievedContext with no facts and the given document chunks.
    Args:
        chunks: Chunk dicts to attach. ``None`` (the default) attaches a single
            sample runbook chunk. An explicit empty list is passed through
            unchanged, so a test can build a context with no chunks.
    Returns:
        A RetrievedContext with ``facts=[]`` and the resolved chunks.
    """
    # Compare against None rather than using ``or``: an explicit ``[]`` must
    # not be silently replaced by the sample chunk.
    if chunks is None:
        chunks = [{"text": "Memory pressure runbook.", "filename": "runbook.md"}]
    return RetrievedContext(facts=[], chunks=chunks)
 def _llm_json_response(items: list[dict[str, Any]]) -> MagicMock:
    """Build a mock httpx.Response that returns the given list as JSON."""
    mock_resp = MagicMock()
    mock_resp.status_code = 200
    mock_resp.json.return_value = {
        "choices": [{"message": {"content": json.dumps(items)}}]
    }
    return mock_resp
 # Canned hypothesis dicts that the tests below pass to _llm_json_response().
 # Two items: one CRITICAL citing cluster "c1", one ERROR citing cluster "c2".
 _SAMPLE_HYPOTHESES = [
    {
        "title": "OOM killer terminated critical process",
        "description": "The kernel invoked the OOM killer due to memory exhaustion. A process was terminated unexpectedly. This caused service disruption.",
        "confidence": 0.85,
        "severity": "CRITICAL",
        "supporting_clusters": ["c1"],
    },
    {
        "title": "Disk I/O saturation",
        "description": "High disk I/O latency was detected. Write operations stalled causing log backpressure. Check iostat for device utilisation.",
        "confidence": 0.6,
        "severity": "ERROR",
        "supporting_clusters": ["c2"],
    },
 ]
 # ---------------------------------------------------------------------------
 # Test 1: Valid JSON response returns correct Hypothesis objects
 # ---------------------------------------------------------------------------
 def test_valid_json_response_returns_hypotheses():
    """A valid JSON array from the LLM becomes Hypothesis objects with matching fields."""
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    fake_response = _llm_json_response(_SAMPLE_HYPOTHESES)
    with patch("httpx.post", return_value=fake_response):
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="why is memory failing?",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
    assert len(hypotheses) == 2
    oom, disk = hypotheses[0], hypotheses[1]
    assert isinstance(oom, Hypothesis)
    assert oom.title == "OOM killer terminated critical process"
    assert oom.confidence == pytest.approx(0.85)
    assert oom.severity == "CRITICAL"
    assert oom.supporting_cluster_ids == ("c1",)
    assert disk.title == "Disk I/O saturation"
    assert disk.severity == "ERROR"
 # ---------------------------------------------------------------------------
 # Test 2: hypothesis_id is a non-empty UUID string on each result
 # ---------------------------------------------------------------------------
 # Matches a lower-case, hyphenated UUID whose version nibble is 4 and whose
 # variant nibble is one of 8, 9, a, b (the textual form of a UUID v4).
 _UUID_RE = re.compile(
    r"^[0-9a-f]{8}-[0-9a-f]{4}-4[0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$"
 )
 def test_hypothesis_id_is_uuid():
    """Every returned Hypothesis has a non-empty, UUID v4-shaped, distinct hypothesis_id."""
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    fake_response = _llm_json_response(_SAMPLE_HYPOTHESES)
    with patch("httpx.post", return_value=fake_response):
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
    assert len(hypotheses) == 2
    for hyp in hypotheses:
        hid = hyp.hypothesis_id
        assert hid, "hypothesis_id must not be empty"
        assert _UUID_RE.match(hid), f"hypothesis_id {hid!r} is not a UUID v4"
    # No two hypotheses may share an id.
    all_ids = [hyp.hypothesis_id for hyp in hypotheses]
    assert len(set(all_ids)) == len(all_ids), "hypothesis_ids must be unique"
 # ---------------------------------------------------------------------------
 # Test 3: Malformed JSON response returns [] with a logged warning
 # ---------------------------------------------------------------------------
 def test_malformed_json_returns_empty_and_warns(caplog):
    """Non-JSON LLM content makes hypothesize() return [] and log a message mentioning JSON.
    Args:
        caplog: pytest's log-capture fixture, used to inspect emitted records.
    """
    import logging
    cluster = _make_cluster()
    classified = _make_classified(clusters=(cluster,))
    ctx = _make_ctx()
    hypothesizer = RootCauseHypothesizer()
    # Response is HTTP 200 but the message content is not parseable as JSON.
    bad_resp = MagicMock()
    bad_resp.status_code = 200
    bad_resp.json.return_value = {
        "choices": [{"message": {"content": "not valid json"}}]
    }
    with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=bad_resp):
        results = hypothesizer.hypothesize(
            classified, ctx, query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
    assert results == []
    # A single substring check is enough: any message containing
    # "invalid JSON" necessarily contains "JSON".
    assert any("JSON" in record.message for record in caplog.records)
 # ---------------------------------------------------------------------------
 # Test 4: Non-list JSON (dict) returns []
 # ---------------------------------------------------------------------------
 def test_non_list_json_returns_empty(caplog):
    """A JSON object (not an array) from the LLM yields [] and a log mentioning array/list."""
    import logging
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    # Content is valid JSON, but a dict rather than a list.
    object_response = MagicMock()
    object_response.status_code = 200
    object_response.json.return_value = {
        "choices": [{"message": {"content": '{"error": "oops"}'}}]
    }
    with caplog.at_level(logging.WARNING), patch("httpx.post", return_value=object_response):
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
    assert hypotheses == []
    assert any(
        "array" in record.message.lower() or "list" in record.message.lower()
        for record in caplog.records
    )
 # ---------------------------------------------------------------------------
 # Test 5: Empty clusters returns [] without any LLM call
 # ---------------------------------------------------------------------------
 def test_empty_clusters_returns_empty_no_llm_call():
    """With no clusters, hypothesize() returns [] and httpx.post is never called."""
    no_clusters = _make_classified(clusters=())
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    with patch("httpx.post") as post_spy:
        hypotheses = engine.hypothesize(
            no_clusters,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
    assert hypotheses == []
    post_spy.assert_not_called()
 # ---------------------------------------------------------------------------
 # Test 6: No LLM URL returns [] without any HTTP call
 # ---------------------------------------------------------------------------
 def test_no_llm_url_returns_empty_no_http_call():
    """With llm_url=None, hypothesize() returns [] and httpx.post is never called."""
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    with patch("httpx.post") as post_spy:
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="test",
            llm_url=None,
            llm_model="llama3",
        )
    assert hypotheses == []
    post_spy.assert_not_called()
 def test_empty_llm_url_returns_empty_no_http_call():
    """With llm_url="", hypothesize() returns [] and httpx.post is never called."""
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    with patch("httpx.post") as post_spy:
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="test",
            llm_url="",
            llm_model="llama3",
        )
    assert hypotheses == []
    post_spy.assert_not_called()
 def test_no_llm_model_returns_empty_no_http_call():
    """With llm_model=None, hypothesize() returns [] and httpx.post is never called."""
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    with patch("httpx.post") as post_spy:
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model=None,
        )
    assert hypotheses == []
    post_spy.assert_not_called()
 # ---------------------------------------------------------------------------
 # Test 7: max_hypotheses is respected
 # ---------------------------------------------------------------------------
 def test_max_hypotheses_respected():
    """Six items from the LLM with max_hypotheses=3 produce exactly three results."""
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer(max_hypotheses=3)
    # Every item shares these fields; only the title differs.
    shared_fields = {
        "description": "Some description. A second sentence. Third sentence here.",
        "confidence": 0.5,
        "severity": "ERROR",
        "supporting_clusters": ["c1"],
    }
    llm_items = [{"title": f"Hypothesis {n}", **shared_fields} for n in range(6)]
    fake_response = _llm_json_response(llm_items)
    with patch("httpx.post", return_value=fake_response):
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
    assert len(hypotheses) == 3
 # ---------------------------------------------------------------------------
 # Test 8: Severity validation — WARNING → WARN, garbage → ERROR
 # ---------------------------------------------------------------------------
 def test_severity_warning_maps_to_warn():
    """An LLM severity of 'WARNING' is returned as 'WARN'."""
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    warning_item = {
        "title": "A warning severity hypothesis",
        "description": "Test description. Second sentence. Third.",
        "confidence": 0.7,
        "severity": "WARNING",
        "supporting_clusters": ["c1"],
    }
    fake_response = _llm_json_response([warning_item])
    with patch("httpx.post", return_value=fake_response):
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
    assert len(hypotheses) == 1
    assert hypotheses[0].severity == "WARN"
 def test_severity_garbage_maps_to_error():
    """An unrecognised LLM severity string is returned as 'ERROR'."""
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    garbage_item = {
        "title": "A garbage severity hypothesis",
        "description": "Test description. Second sentence. Third.",
        "confidence": 0.4,
        "severity": "GARBAGE",
        "supporting_clusters": ["c1"],
    }
    fake_response = _llm_json_response([garbage_item])
    with patch("httpx.post", return_value=fake_response):
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
    assert len(hypotheses) == 1
    assert hypotheses[0].severity == "ERROR"
 # ---------------------------------------------------------------------------
 # Test 9: Confidence field works with string floats from the LLM
 # ---------------------------------------------------------------------------
 def test_confidence_string_float_coercion():
    """A confidence supplied as the string "0.8" comes back as the float 0.8."""
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    string_confidence_item = {
        "title": "String confidence test",
        "description": "Some description. Second sentence. Third.",
        "confidence": "0.8",  # deliberately a string rather than a number
        "severity": "INFO",
        "supporting_clusters": ["c1"],
    }
    fake_response = _llm_json_response([string_confidence_item])
    with patch("httpx.post", return_value=fake_response):
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
    assert len(hypotheses) == 1
    confidence = hypotheses[0].confidence
    assert isinstance(confidence, float)
    assert confidence == pytest.approx(0.8)
 # ---------------------------------------------------------------------------
 # Test 10: Non-numeric confidence string falls back to default 0.5
 # ---------------------------------------------------------------------------
 def test_non_numeric_confidence_uses_default():
    """A non-numeric confidence ('high') does not raise and comes back as 0.5."""
    classified = _make_classified(clusters=(_make_cluster(),))
    context = _make_ctx()
    engine = RootCauseHypothesizer()
    non_numeric_item = {
        "title": "t",
        "description": "d",
        "confidence": "high",
        "severity": "ERROR",
        "supporting_clusters": [],
    }
    fake_response = _llm_json_response([non_numeric_item])
    with patch("httpx.post", return_value=fake_response):
        hypotheses = engine.hypothesize(
            classified,
            context,
            query="test",
            llm_url="http://localhost:11434",
            llm_model="llama3",
        )
    assert len(hypotheses) == 1
    confidence = hypotheses[0].confidence
    assert isinstance(confidence, float)
    assert confidence == pytest.approx(0.5)
--- a/tests/test_diagnose_pipeline.py
+++ b/tests/test_diagnose_pipeline.py
@ -1,489 +0,0 @@
 """Tests for app/services/diagnose/pipeline.py and __init__.py feature flag wiring.
 All tests use mocking; no real LLM, ML, or DB calls are made.
 """
 from __future__ import annotations
 from pathlib import Path
 from typing import Any
 from unittest.mock import MagicMock, patch
 import pytest
 from app.context.retriever import RetrievedContext
 from app.services.diagnose.models import (
    ClassifiedTimeline,
    Hypothesis,
    RankedHypothesis,
    TimelineResult,
 )
 from app.services.search import SearchResult
 # ---------------------------------------------------------------------------
 # Shared helpers
 # ---------------------------------------------------------------------------
 def _make_search_result(
    entry_id: str = "e1",
    source_id: str = "syslog",
    timestamp_iso: str | None = "2026-01-01T00:00:00+00:00",
    severity: str | None = "ERROR",
    text: str = "ssh: invalid user",
 ) -> SearchResult:
    """Build a SearchResult for tests.

    The five parameters are forwarded as-is; every other field is fixed
    (sequence 1, repeat_count 1, not out of order, one ``ssh_fail`` pattern,
    rank 1.0).
    """
    # Fields the tests in this module never vary.
    fixed_fields = dict(
        sequence=1,
        repeat_count=1,
        out_of_order=False,
        matched_patterns=["ssh_fail"],
        rank=1.0,
    )
    return SearchResult(
        entry_id=entry_id,
        source_id=source_id,
        timestamp_iso=timestamp_iso,
        severity=severity,
        text=text,
        **fixed_fields,
    )
 def _make_ctx() -> RetrievedContext:
    """Return a RetrievedContext with no facts and no chunks."""
    empty_ctx = RetrievedContext(facts=[], chunks=[])
    return empty_ctx
 def _make_timeline(n_clusters: int = 2) -> TimelineResult:
    """Return a fixed one-hour TimelineResult with five entries and one burst.

    NOTE(review): ``n_clusters`` is accepted but never read; the returned
    timeline always carries an empty ``clusters`` tuple.
    """
    window_start = "2026-01-01T00:00:00+00:00"
    window_end = "2026-01-01T01:00:00+00:00"
    return TimelineResult(
        clusters=(),
        total_entries=5,
        window_start=window_start,
        window_end=window_end,
        gap_count=0,
        burst_count=1,
        dominant_sources=("syslog",),
    )
 def _make_classified(timeline: TimelineResult | None = None) -> ClassifiedTimeline:
    """Wrap a timeline in a ClassifiedTimeline that has no per-cluster severities.

    A falsy ``timeline`` (including ``None``) is replaced by ``_make_timeline()``.
    The result reports the ``"regex"`` classifier and no model id.
    """
    if not timeline:
        timeline = _make_timeline()
    return ClassifiedTimeline(
        timeline=timeline,
        cluster_severities={},
        classifier_used="regex",
        model_id=None,
    )
 def _make_hypothesis(
    hypothesis_id: str = "h1",
    title: str = "SSH flood",
    confidence: float = 0.87,
    severity: str = "CRITICAL",
 ) -> Hypothesis:
    """Build a Hypothesis supported by cluster ``c1`` with no runbook references.

    The description is fixed; id, title, confidence and severity come from
    the arguments.
    """
    hypothesis = Hypothesis(
        hypothesis_id=hypothesis_id,
        title=title,
        confidence=confidence,
        severity=severity,  # type: ignore[arg-type]
        description="Multiple failed SSH attempts.",
        supporting_cluster_ids=("c1",),
        runbook_refs=(),
    )
    return hypothesis
 def _make_ranked(hypothesis: Hypothesis | None = None, suppress: bool = False) -> RankedHypothesis:
    """Build a RankedHypothesis with novelty 0.95 and similarity 0.05.

    A falsy ``hypothesis`` is replaced by ``_make_hypothesis()``. A suppression
    reason is attached only when ``suppress`` is truthy.
    """
    if not hypothesis:
        hypothesis = _make_hypothesis()
    if suppress:
        reason = "similar to known"
    else:
        reason = None
    return RankedHypothesis(
        hypothesis=hypothesis,
        novelty_score=0.95,
        similarity_to_known=0.05,
        suppress=suppress,
        suppression_reason=reason,
    )
 # ---------------------------------------------------------------------------
 # Helper: collect all events from run_pipeline
 # ---------------------------------------------------------------------------
 async def _collect_pipeline_events(**kwargs) -> list[dict[str, Any]]:
    """Drain ``run_pipeline(**kwargs)`` and return every yielded event in order."""
    # Imported at call time rather than at module import time.
    from app.services.diagnose.pipeline import run_pipeline

    return [event async for event in run_pipeline(**kwargs)]
 def _default_pipeline_kwargs(entries=None, db_path=None) -> dict:
    """Build the keyword arguments passed to ``run_pipeline`` in these tests.

    Args:
        entries: Search results to feed the pipeline. ``None`` selects a single
            default result. An explicit empty list is passed through unchanged,
            so callers can exercise the empty-input path.
        db_path: Database path. ``None`` selects ``/tmp/fake.db``.

    Returns:
        A dict with a fixed query, a one-hour window and no LLM configured.
    """
    # Compare against None rather than truthiness: ``[] or default`` would
    # silently swap an intentionally empty entry list for the default entry.
    if entries is None:
        entries = [_make_search_result()]
    if db_path is None:
        db_path = Path("/tmp/fake.db")
    return dict(
        db_path=db_path,
        entries=entries,
        ctx=_make_ctx(),
        query="ssh brute force",
        since="2026-01-01T00:00:00+00:00",
        until="2026-01-01T01:00:00+00:00",
        llm_url=None,
        llm_model=None,
        llm_api_key=None,
    )
 # ---------------------------------------------------------------------------
 # Mock factories for all 5 stage classes
 # ---------------------------------------------------------------------------
 def _mock_all_stages(
    hypotheses=None,
    ranked=None,
    synthesis_text="VERDICT: CRITICAL — SSH flood (87% confidence)",
 ):
    """Build one mock class per pipeline stage, keyed by its patch target.

    Each mock class, when instantiated, exposes the stage's method returning a
    canned value: a default timeline, its classified form, ``hypotheses``
    (default: one hypothesis), ``ranked`` (default: one ranked hypothesis) and
    ``synthesis_text``.
    """
    def _stage_class(method_name, result):
        # Mock class whose instances return ``result`` from ``method_name``.
        stage_cls = MagicMock()
        getattr(stage_cls.return_value, method_name).return_value = result
        return stage_cls

    timeline = _make_timeline()
    classified = _make_classified(timeline)
    if hypotheses is None:
        hypotheses = [_make_hypothesis()]
    if ranked is None:
        ranked = [_make_ranked()]

    return {
        "app.services.diagnose.pipeline.TimelineReconstructor": _stage_class("reconstruct", timeline),
        "app.services.diagnose.pipeline.SeverityClassifier": _stage_class("classify", classified),
        "app.services.diagnose.pipeline.RootCauseHypothesizer": _stage_class("hypothesize", hypotheses),
        "app.services.diagnose.pipeline.FalsePositiveSuppressor": _stage_class("suppress", ranked),
        "app.services.diagnose.pipeline.SummarySynthesizer": _stage_class("synthesize", synthesis_text),
    }
 # ---------------------------------------------------------------------------
 # 1. Feature flag off: legacy summarize() path runs, not run_pipeline
 # ---------------------------------------------------------------------------
 class TestFeatureFlagOff:
    """diagnose_stream behaviour with MULTI_AGENT_ENABLED patched to False."""

    @pytest.mark.asyncio
    async def test_legacy_path_when_flag_off(self):
        """Flag off: run_pipeline is never invoked and the stream ends with done."""
        from app.services import diagnose as diagnose_module

        search_hits = [_make_search_result()]
        with (
            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", False),
            patch("app.services.diagnose.search", return_value=search_hits),
            patch("app.services.diagnose.entries_in_window", return_value=[]),
            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
            patch("app.services.diagnose.format_context_block", return_value=None),
            patch("app.services.diagnose.run_pipeline") as pipeline_mock,
            patch("app.services.diagnose.summarize", return_value=None),
        ):
            events = [
                event
                async for event in diagnose_module.diagnose_stream(
                    db_path=Path("/tmp/fake.db"),
                    query="ssh failures",
                    llm_url=None,
                    llm_model=None,
                )
            ]

        # The multi-agent pipeline must stay untouched on the legacy path.
        pipeline_mock.assert_not_called()
        # The event sequence has to finish with a done event.
        event_types = [event["type"] for event in events]
        assert "done" in event_types
        assert event_types[-1] == "done"

    @pytest.mark.asyncio
    async def test_legacy_done_event_is_last(self):
        """Flag off with no search hits: the final event is exactly the done event."""
        from app.services import diagnose as diagnose_module

        with (
            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", False),
            patch("app.services.diagnose.search", return_value=[]),
            patch("app.services.diagnose.entries_in_window", return_value=[]),
            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
            patch("app.services.diagnose.format_context_block", return_value=None),
        ):
            events = [
                event
                async for event in diagnose_module.diagnose_stream(
                    db_path=Path("/tmp/fake.db"),
                    query="check logs",
                )
            ]

        assert events[-1] == {"type": "done"}
 # ---------------------------------------------------------------------------
 # 2. Feature flag on, all stages mocked: verify SSE event sequence
 # ---------------------------------------------------------------------------
 class TestFeatureFlagOn:
    """Event-sequence checks for run_pipeline with all five stage classes mocked."""

    @staticmethod
    async def _events_with_stages(stage_mocks, kwargs):
        """Run the pipeline with every stage class patched and return its events.

        ``stage_mocks`` must be keyed by patch target, as ``_mock_all_stages``
        returns it; ``kwargs`` is forwarded to ``run_pipeline``.
        """
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", stage_mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", stage_mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", stage_mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", stage_mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", stage_mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            return await _collect_pipeline_events(**kwargs)

    @pytest.mark.asyncio
    async def test_pipeline_stage_events_in_order(self):
        """pipeline_stage events must be emitted stages 1→2→3→4 in order."""
        stage_mocks = _mock_all_stages()
        kwargs = _default_pipeline_kwargs()
        events = await self._events_with_stages(stage_mocks, kwargs)
        stages = [e["stage"] for e in events if e.get("type") == "pipeline_stage"]
        assert stages == [1, 2, 3, 4]

    @pytest.mark.asyncio
    async def test_hypotheses_event_after_stage4(self):
        """hypotheses event must appear after pipeline_stage stage=4."""
        stage_mocks = _mock_all_stages()
        kwargs = _default_pipeline_kwargs()
        events = await self._events_with_stages(stage_mocks, kwargs)
        stage4_idx = next(
            i for i, e in enumerate(events)
            if e.get("type") == "pipeline_stage" and e.get("stage") == 4
        )
        hyp_idx = next(i for i, e in enumerate(events) if e.get("type") == "hypotheses")
        assert hyp_idx > stage4_idx

    @pytest.mark.asyncio
    async def test_reasoning_event_emitted(self):
        """reasoning event must be present when synthesizer returns text."""
        stage_mocks = _mock_all_stages(synthesis_text="VERDICT: CRITICAL — SSH flood")
        kwargs = _default_pipeline_kwargs()
        events = await self._events_with_stages(stage_mocks, kwargs)
        reasoning_events = [e for e in events if e.get("type") == "reasoning"]
        assert len(reasoning_events) == 1
        assert "VERDICT" in reasoning_events[0]["text"]

    @pytest.mark.asyncio
    async def test_done_event_is_last(self):
        """done must always be the last event in the pipeline sequence."""
        stage_mocks = _mock_all_stages()
        kwargs = _default_pipeline_kwargs()
        events = await self._events_with_stages(stage_mocks, kwargs)
        assert events[-1] == {"type": "done"}

    @pytest.mark.asyncio
    async def test_pipeline_wired_from_diagnose_stream(self):
        """diagnose_stream routes through run_pipeline when flag is on."""
        from app.services import diagnose as diagnose_module

        entries = [_make_search_result()]

        async def fake_pipeline(**kwargs):
            yield {"type": "status", "message": "Building timeline…"}
            yield {"type": "pipeline_stage", "stage": 1, "name": "timeline", "message": "Built 1 clusters, 0 bursts"}
            yield {"type": "done"}

        with (
            patch.object(diagnose_module, "MULTI_AGENT_ENABLED", True),
            patch("app.services.diagnose.search", return_value=entries),
            patch("app.services.diagnose.entries_in_window", return_value=[]),
            patch("app.services.diagnose.retrieve_context", return_value=_make_ctx()),
            patch("app.services.diagnose.format_context_block", return_value=None),
            patch("app.services.diagnose.run_pipeline", side_effect=fake_pipeline),
            # Patched so the "legacy path not taken" claim is checked directly
            # instead of being inferred from the number of done events.
            patch("app.services.diagnose.summarize", return_value=None) as mock_summarize,
        ):
            events = [
                event
                async for event in diagnose_module.diagnose_stream(
                    db_path=Path("/tmp/fake.db"),
                    query="ssh failures",
                )
            ]

        types = [e["type"] for e in events]
        assert "pipeline_stage" in types
        assert types[-1] == "done"
        # Legacy summarize() must NOT have been called — done event came from pipeline
        mock_summarize.assert_not_called()
        assert types.count("done") == 1
 # ---------------------------------------------------------------------------
 # 3. Empty entries: pipeline completes with done
 # ---------------------------------------------------------------------------
 class TestEmptyEntries:
    """Pipeline runs requested with ``entries=[]`` and stages that produce nothing."""

    @staticmethod
    async def _events_with_stages(stage_mocks, kwargs):
        """Run the pipeline with every stage class patched and return its events."""
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", stage_mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", stage_mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", stage_mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", stage_mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", stage_mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            return await _collect_pipeline_events(**kwargs)

    @pytest.mark.asyncio
    async def test_empty_entries_pipeline_completes(self):
        """The run must finish and its last event must be done."""
        stage_mocks = _mock_all_stages(hypotheses=[], ranked=[])
        kwargs = _default_pipeline_kwargs(entries=[])
        events = await self._events_with_stages(stage_mocks, kwargs)
        event_types = [event["type"] for event in events]
        assert "done" in event_types
        assert event_types[-1] == "done"

    @pytest.mark.asyncio
    async def test_empty_entries_all_stage_events_present(self):
        """Exactly four pipeline_stage events must still be emitted."""
        stage_mocks = _mock_all_stages(hypotheses=[], ranked=[])
        kwargs = _default_pipeline_kwargs(entries=[])
        events = await self._events_with_stages(stage_mocks, kwargs)
        stage_events = [event for event in events if event.get("type") == "pipeline_stage"]
        assert len(stage_events) == 4
 # ---------------------------------------------------------------------------
 # 4. No LLM: Stage 3 and Stage 5 return empty/fallback; done still emitted
 # ---------------------------------------------------------------------------
 class TestNoLLM:
    """Pipeline runs with ``llm_url`` and ``llm_model`` left as None."""

    @staticmethod
    async def _events_with_stages(stage_mocks, kwargs):
        """Run the pipeline with every stage class patched and return its events."""
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", stage_mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", stage_mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", stage_mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", stage_mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", stage_mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            return await _collect_pipeline_events(**kwargs)

    @pytest.mark.asyncio
    async def test_no_llm_pipeline_completes_with_done(self):
        """Without an LLM configured the run still ends with the done event."""
        stage_mocks = _mock_all_stages(hypotheses=[], ranked=[], synthesis_text="VERDICT: UNKNOWN — no hypotheses generated")
        # The default kwargs already carry llm_url=None and llm_model=None.
        kwargs = _default_pipeline_kwargs()
        events = await self._events_with_stages(stage_mocks, kwargs)
        assert events[-1] == {"type": "done"}

    @pytest.mark.asyncio
    async def test_no_llm_no_reasoning_event_when_synthesis_empty(self):
        """An empty synthesis string must not produce a reasoning event."""
        stage_mocks = _mock_all_stages(synthesis_text="")
        kwargs = _default_pipeline_kwargs()
        events = await self._events_with_stages(stage_mocks, kwargs)
        reasoning_events = [event for event in events if event.get("type") == "reasoning"]
        assert len(reasoning_events) == 0
 # ---------------------------------------------------------------------------
 # 5. Stage 1 cluster count in pipeline_stage message
 # ---------------------------------------------------------------------------
 class TestStage1Message:
    """Content checks on the stage=1 pipeline_stage event."""

    @staticmethod
    async def _events_with_stages(stage_mocks, kwargs):
        """Run the pipeline with every stage class patched and return its events."""
        with (
            patch("app.services.diagnose.pipeline.TimelineReconstructor", stage_mocks["app.services.diagnose.pipeline.TimelineReconstructor"]),
            patch("app.services.diagnose.pipeline.SeverityClassifier", stage_mocks["app.services.diagnose.pipeline.SeverityClassifier"]),
            patch("app.services.diagnose.pipeline.RootCauseHypothesizer", stage_mocks["app.services.diagnose.pipeline.RootCauseHypothesizer"]),
            patch("app.services.diagnose.pipeline.FalsePositiveSuppressor", stage_mocks["app.services.diagnose.pipeline.FalsePositiveSuppressor"]),
            patch("app.services.diagnose.pipeline.SummarySynthesizer", stage_mocks["app.services.diagnose.pipeline.SummarySynthesizer"]),
        ):
            return await _collect_pipeline_events(**kwargs)

    @pytest.mark.asyncio
    async def test_stage1_message_contains_cluster_count(self):
        """The stage=1 message must mention the cluster count and the burst count."""
        def _stage_class(method_name, result):
            # Mock class whose instances return ``result`` from ``method_name``.
            stage_cls = MagicMock()
            getattr(stage_cls.return_value, method_name).return_value = result
            return stage_cls

        timeline = TimelineResult(
            clusters=(),
            total_entries=10,
            window_start=None,
            window_end=None,
            gap_count=0,
            burst_count=3,
            dominant_sources=("syslog",),
        )
        classified = _make_classified(timeline)
        stage_mocks = {
            "app.services.diagnose.pipeline.TimelineReconstructor": _stage_class("reconstruct", timeline),
            "app.services.diagnose.pipeline.SeverityClassifier": _stage_class("classify", classified),
            "app.services.diagnose.pipeline.RootCauseHypothesizer": _stage_class("hypothesize", []),
            "app.services.diagnose.pipeline.FalsePositiveSuppressor": _stage_class("suppress", []),
            "app.services.diagnose.pipeline.SummarySynthesizer": _stage_class("synthesize", "VERDICT: INFO — nothing found"),
        }
        kwargs = _default_pipeline_kwargs()
        events = await self._events_with_stages(stage_mocks, kwargs)
        first_stage = next(
            event for event in events
            if event.get("type") == "pipeline_stage" and event.get("stage") == 1
        )
        # The mocked timeline has no clusters and three bursts.
        assert "0" in first_stage["message"]  # cluster count
        assert "3" in first_stage["message"]  # burst count

    @pytest.mark.asyncio
    async def test_stage1_name_is_timeline(self):
        """The stage=1 event must be named 'timeline'."""
        stage_mocks = _mock_all_stages()
        kwargs = _default_pipeline_kwargs()
        events = await self._events_with_stages(stage_mocks, kwargs)
        first_stage = next(
            event for event in events
            if event.get("type") == "pipeline_stage" and event.get("stage") == 1
        )
        assert first_stage["name"] == "timeline"
--- a/tests/test_diagnose_suppressor.py
+++ b/tests/test_diagnose_suppressor.py
@ -1,432 +0,0 @@
 """Tests for app/services/diagnose/suppressor.py — FalsePositiveSuppressor.
 All tests use mocking; no real model downloads are made.
 """
 from __future__ import annotations
 import math
 import sqlite3
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 import pytest
 import app.services.diagnose.suppressor as sup_module
 from app.services.diagnose.models import Hypothesis, RankedHypothesis
 from app.services.diagnose.suppressor import FalsePositiveSuppressor
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _make_hypothesis(
    title: str = "Test",
    description: str = "A test hypothesis.",
    confidence: float = 0.8,
    severity: str = "ERROR",
 ) -> Hypothesis:
    """Build a Hypothesis with the fixed id ``test-id`` and no clusters or runbooks.

    Title, description, confidence and severity come from the arguments.
    """
    hypothesis = Hypothesis(
        hypothesis_id="test-id",
        title=title,
        description=description,
        confidence=confidence,
        severity=severity,  # type: ignore[arg-type]
        supporting_cluster_ids=(),
        runbook_refs=(),
    )
    return hypothesis
 def _make_db_with_incidents(incidents: list[tuple[str, str]], db_path: Path) -> Path:
    """Create an SQLite database at ``db_path`` holding the given incidents.

    Args:
        incidents: ``(label, notes)`` pairs; each becomes one row of the
            ``incidents`` table with ``ended_at`` set to ``2024-01-01T00:00:00``.
        db_path: Location of the database file to create.

    Returns:
        ``db_path``, unchanged.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        # ``with conn`` only scopes the transaction (commit on success,
        # rollback on error); it does not close the connection.
        with conn:
            conn.execute(
                "CREATE TABLE incidents "
                "(id INTEGER PRIMARY KEY, label TEXT, notes TEXT, ended_at TEXT)"
            )
            conn.executemany(
                "INSERT INTO incidents (label, notes, ended_at) VALUES (?, ?, ?)",
                [(label, notes, "2024-01-01T00:00:00") for label, notes in incidents],
            )
    finally:
        # Release the file handle so the database file is not left open.
        conn.close()
    return db_path
 def _make_empty_db(db_path: Path) -> Path:
    """Create an SQLite database at ``db_path`` that has no ``incidents`` table.

    Only a single ``unrelated`` table is created. Returns ``db_path`` unchanged.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        # ``with conn`` commits or rolls back; closing is done separately below.
        with conn:
            conn.execute("CREATE TABLE unrelated (id INTEGER PRIMARY KEY)")
    finally:
        # Release the file handle so the database file is not left open.
        conn.close()
    return db_path
 def _make_mock_embedder(
    embed_return: list[float] | None = None,
    embed_batch_return: list[list[float]] | None = None,
 ) -> MagicMock:
    """Create a mock embedder whose results expose ``.tolist()``.

    ``embed`` returns one array-like mock and ``embed_batch`` a list of them.
    When an argument is omitted, a 384-value vector (1.0 followed by zeros) is
    used, as the single vector or as a one-element batch respectively.
    """
    unit_vec = [1.0] + [0.0] * 383
    single_vec = unit_vec if embed_return is None else embed_return
    batch_vecs = [unit_vec] if embed_batch_return is None else embed_batch_return

    def _as_array_like(values: list[float]) -> MagicMock:
        # Stand-in for an array object: only ``tolist()`` is configured.
        return MagicMock(**{"tolist.return_value": values})

    mock_embedder = MagicMock()
    mock_embedder.embed.return_value = _as_array_like(single_vec)
    mock_embedder.embed_batch.return_value = [_as_array_like(vec) for vec in batch_vecs]
    return mock_embedder
 # ---------------------------------------------------------------------------
 # Autouse fixture: reset module-level cache between tests
 # ---------------------------------------------------------------------------
@pytest.fixture(autouse=True)
 def reset_suppressor_cache():
    """Clear ``sup_module._corpus_cache`` before and after every test.

    Declared autouse so tests get a clean module-level cache without
    requesting the fixture explicitly.
    """
    # Setup: drop anything a previous test left in the cache.
    sup_module._corpus_cache.clear()
    yield
    # Teardown: drop whatever this test put in the cache.
    sup_module._corpus_cache.clear()
 # ---------------------------------------------------------------------------
 # Test 1: No model configured — passthrough, ranked by confidence
 # ---------------------------------------------------------------------------
 def test_no_model_passthrough_ranked_by_confidence(tmp_path):
    """With model_id='' every hypothesis passes through, ordered by confidence descending."""
    hypotheses = [
        _make_hypothesis(title="Low", confidence=0.3),
        _make_hypothesis(title="High", confidence=0.9),
        _make_hypothesis(title="Mid", confidence=0.6),
    ]
    ranked = FalsePositiveSuppressor(model_id="").suppress(
        hypotheses, tmp_path / "turnstone.db"
    )
    assert len(ranked) == 3
    # Passthrough: full novelty, zero similarity, nothing suppressed.
    for item in ranked:
        assert isinstance(item, RankedHypothesis)
        assert item.novelty_score == pytest.approx(1.0)
        assert item.similarity_to_known == pytest.approx(0.0)
        assert item.suppress is False
        assert item.suppression_reason is None
    # Output order must be non-increasing in confidence.
    observed = [item.hypothesis.confidence for item in ranked]
    assert observed == sorted(observed, reverse=True)
 # ---------------------------------------------------------------------------
 # Test 2: High similarity → suppressed
 # ---------------------------------------------------------------------------
 def test_high_similarity_suppresses_hypothesis(tmp_path):
    """A hypothesis embedding that matches a corpus embedding is marked suppress=True."""
    # Query and corpus point the same way, so cosine similarity is 1.0.
    query_vec = [1.0] + [0.0] * 383
    known_vec = [1.0] + [0.0] * 383
    embedder = _make_mock_embedder(embed_return=query_vec, embed_batch_return=[known_vec])
    incidents_db = _make_db_with_incidents(
        [("OOM killer", "Memory pressure caused OOM kill")],
        tmp_path / "turnstone.db",
    )
    sut = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
    with patch.object(sut, "_load_embedder", return_value=embedder):
        ranked = sut.suppress([_make_hypothesis()], incidents_db)
    assert len(ranked) == 1
    only = ranked[0]
    assert only.suppress is True
    reason = only.suppression_reason
    assert reason is not None
    assert "Similar to resolved incident" in reason
    assert only.similarity_to_known == pytest.approx(1.0, abs=0.01)
    assert only.novelty_score == pytest.approx(0.0, abs=0.01)
 # ---------------------------------------------------------------------------
 # Test 3: Low similarity → not suppressed
 # ---------------------------------------------------------------------------
 def test_low_similarity_does_not_suppress(tmp_path):
    """A hypothesis embedding orthogonal to the corpus embedding is left unsuppressed."""
    # First-axis query vs. second-axis corpus vector: cosine similarity 0.0.
    query_vec = [1.0] + [0.0] * 383
    known_vec = [0.0, 1.0] + [0.0] * 382
    embedder = _make_mock_embedder(embed_return=query_vec, embed_batch_return=[known_vec])
    incidents_db = _make_db_with_incidents(
        [("Disk I/O", "Storage saturation caused latency")],
        tmp_path / "turnstone.db",
    )
    sut = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
    with patch.object(sut, "_load_embedder", return_value=embedder):
        ranked = sut.suppress([_make_hypothesis()], incidents_db)
    assert len(ranked) == 1
    only = ranked[0]
    assert only.suppress is False
    assert only.suppression_reason is None
    assert only.similarity_to_known == pytest.approx(0.0, abs=0.01)
    assert only.novelty_score == pytest.approx(1.0, abs=0.01)
 # ---------------------------------------------------------------------------
 # Test 3b: Borderline similarity — exactly at threshold vs. just below
 # ---------------------------------------------------------------------------
 def test_similarity_threshold_boundary(tmp_path):
    """similarity == threshold is suppressed; similarity just below threshold is not.

    This test locks down the boundary semantics: suppress when max_sim >= threshold,
    not when novelty_score < threshold (the inverted form that was the original bug).
    With threshold=0.85:
      - similarity=0.85 → suppressed (at boundary, inclusive)
      - similarity=0.84 → NOT suppressed (just below)
    """
    import math  # imported once per test run rather than on every loop iteration
    db_path = _make_db_with_incidents(
        [("Disk I/O", "Storage saturation caused latency")],
        tmp_path / "turnstone.db",
    )
    # Corpus unit vector along first axis
    corpus_vec = [1.0] + [0.0] * 383
    for sim_value, expected_suppress in [(0.85, True), (0.84, False)]:
        # Build a hypothesis embedding whose cosine similarity to corpus_vec ≈ sim_value.
        # query = [sim, sqrt(1 - sim^2), 0, ...] is a unit vector, so cosine sim = sim
        # up to floating-point rounding.
        # NOTE(review): the 0.85 case sits exactly on the threshold; confirm the
        # suppressor's comparison tolerates rounding in the computed cosine.
        hyp_vec = [sim_value, math.sqrt(max(0.0, 1.0 - sim_value ** 2))] + [0.0] * 382
        mock_embedder = _make_mock_embedder(
            embed_return=hyp_vec,
            embed_batch_return=[corpus_vec],
        )
        suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
        with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
            results = suppressor.suppress([_make_hypothesis()], db_path)
        assert len(results) == 1
        result = results[0]
        assert result.suppress is expected_suppress, (
            f"similarity={sim_value:.2f}: expected suppress={expected_suppress}, "
            f"got suppress={result.suppress} (similarity_to_known={result.similarity_to_known:.4f})"
        )
 # ---------------------------------------------------------------------------
 # Test 4: Empty hypotheses list returns []
 # ---------------------------------------------------------------------------
 def test_empty_hypotheses_returns_empty(tmp_path):
    """An empty hypothesis list yields an empty result list."""
    sut = FalsePositiveSuppressor(model_id="test-model")
    # The database file is never created; only its path is handed over.
    assert sut.suppress([], tmp_path / "turnstone.db") == []
 # ---------------------------------------------------------------------------
 # Test 5: Ranking by novelty_score * confidence
 # ---------------------------------------------------------------------------
 def test_ranking_by_novelty_times_confidence(tmp_path):
    """Results are sorted by novelty_score * confidence descending.

    The corpus holds a single embedding along the first axis. Each hypothesis
    gets a unit-vector embedding at a chosen cosine similarity to it:

      - A: similarity=0.1, confidence=0.5 → expected score ≈ 0.9 * 0.5 = 0.45
      - B: similarity=0.5, confidence=0.9 → expected score ≈ 0.5 * 0.9 = 0.45
      - C: similarity=0.2, confidence=0.9 → expected score ≈ 0.8 * 0.9 = 0.72

    A and B tie, so only C's first place and the overall descending order
    are asserted.
    """
    import math  # function-scope so the test does not rely on a module-level import
    corpus_vec = [1.0] + [0.0] * 383
    def _vec_with_similarity(sim: float) -> list[float]:
        # [cos t, sin t, 0, ...] with cos t = sim: a unit vector whose cosine
        # similarity to corpus_vec is sim.
        return [sim, math.sin(math.acos(sim))] + [0.0] * 382
    h_a = _make_hypothesis(title="A", confidence=0.5)
    h_b = _make_hypothesis(title="B", confidence=0.9)
    h_c = _make_hypothesis(title="C", confidence=0.9)
    # Served in call order (wrapping around); this presumes embed() is invoked
    # once per hypothesis in input order — verify against the suppressor.
    vecs_in_order = [
        _vec_with_similarity(0.1),  # A
        _vec_with_similarity(0.5),  # B
        _vec_with_similarity(0.2),  # C
    ]
    embed_calls = 0
    def side_effect_embed(text: str) -> MagicMock:
        nonlocal embed_calls
        m = MagicMock()
        m.tolist.return_value = vecs_in_order[embed_calls % len(vecs_in_order)]
        embed_calls += 1
        return m
    mock_embedder = MagicMock()
    batch_m = MagicMock()
    batch_m.tolist.return_value = corpus_vec
    mock_embedder.embed_batch.return_value = [batch_m]
    mock_embedder.embed.side_effect = side_effect_embed
    db_path = _make_db_with_incidents(
        [("OOM", "Memory exhaustion")],
        tmp_path / "turnstone.db",
    )
    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
    with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
        results = suppressor.suppress([h_a, h_b, h_c], db_path)
    assert len(results) == 3
    titles = [r.hypothesis.title for r in results]
    # H_C should be first (highest novelty*confidence score)
    assert titles[0] == "C", f"Expected C first, got {titles}"
    # Verify sort is descending by novelty*confidence
    scores = [r.novelty_score * r.hypothesis.confidence for r in results]
    assert scores == sorted(scores, reverse=True)
 # ---------------------------------------------------------------------------
 # Test 6: DB with no resolved incidents → novelty_score=1.0
 # ---------------------------------------------------------------------------
 def test_no_resolved_incidents_in_db_passthrough(tmp_path):
    """An incidents table with zero rows leaves the hypothesis at novelty 1.0, unsuppressed."""
    # The table is created, but no rows are inserted.
    empty_corpus_db = _make_db_with_incidents([], tmp_path / "turnstone.db")
    embedder = _make_mock_embedder()
    sut = FalsePositiveSuppressor(model_id="test-model")
    with patch.object(sut, "_load_embedder", return_value=embedder):
        ranked = sut.suppress([_make_hypothesis()], empty_corpus_db)
    assert len(ranked) == 1
    only = ranked[0]
    assert only.novelty_score == pytest.approx(1.0)
    assert only.suppress is False
    # With no corpus to compare against, embed_batch must never be invoked.
    embedder.embed_batch.assert_not_called()
 # ---------------------------------------------------------------------------
 # Test 7: DB query failure → graceful fallback, no crash
 # ---------------------------------------------------------------------------
 def test_db_query_failure_graceful_fallback(tmp_path):
    """A database lacking the incidents table yields a passthrough result instead of an error."""
    # This database only contains an unrelated table.
    broken_db = _make_empty_db(tmp_path / "turnstone.db")
    embedder = _make_mock_embedder()
    sut = FalsePositiveSuppressor(model_id="test-model")
    with patch.object(sut, "_load_embedder", return_value=embedder):
        ranked = sut.suppress([_make_hypothesis()], broken_db)
    assert len(ranked) == 1
    only = ranked[0]
    assert only.novelty_score == pytest.approx(1.0)
    assert only.suppress is False
 # ---------------------------------------------------------------------------
 # Test 8: Embedding service unavailable (returns None) → graceful fallback
 # ---------------------------------------------------------------------------
 def test_embedding_service_unavailable_passthrough(tmp_path):
    """When _load_embedder yields None, suppress() returns a passthrough result without raising."""
    incidents_db = _make_db_with_incidents(
        [("OOM", "Memory pressure")],
        tmp_path / "turnstone.db",
    )
    sut = FalsePositiveSuppressor(model_id="test-model")
    # Simulate an embedder that could not be loaded.
    with patch.object(sut, "_load_embedder", return_value=None):
        ranked = sut.suppress([_make_hypothesis(confidence=0.7)], incidents_db)
    assert len(ranked) == 1
    only = ranked[0]
    assert only.novelty_score == pytest.approx(1.0)
    assert only.suppress is False
    assert only.suppression_reason is None
 # ---------------------------------------------------------------------------
 # Test 9: Corpus cache invalidated when corpus changes
 # ---------------------------------------------------------------------------
 def test_corpus_cache_invalidated_on_corpus_change(tmp_path):
    """When the corpus changes between calls, embed_batch is called again.

    The first suppress() call embeds a one-incident corpus. A second incident
    is then inserted into the same database, and the next suppress() call is
    expected to re-embed rather than reuse the cached embeddings.
    """
    from contextlib import closing
    # Start with a single resolved incident
    db_path = _make_db_with_incidents(
        [("OOM", "Memory pressure")],
        tmp_path / "turnstone.db",
    )
    corpus_vec_1 = [1.0] + [0.0] * 383
    corpus_vec_2 = [0.0, 1.0] + [0.0] * 382
    hyp_vec = [1.0] + [0.0] * 383
    # embedder will be called twice for embed_batch (different corpus each time)
    mock_embedder = MagicMock()
    single_m = MagicMock()
    single_m.tolist.return_value = hyp_vec
    batch_m1 = MagicMock()
    batch_m1.tolist.return_value = corpus_vec_1
    batch_m2 = MagicMock()
    batch_m2.tolist.return_value = corpus_vec_2
    mock_embedder.embed.return_value = single_m
    # NOTE(review): the second batch still returns a single embedding although
    # the corpus then has two rows — presumably tolerated by the suppressor.
    mock_embedder.embed_batch.side_effect = [[batch_m1], [batch_m2]]
    suppressor = FalsePositiveSuppressor(model_id="test-model", similarity_threshold=0.85)
    with patch.object(suppressor, "_load_embedder", return_value=mock_embedder):
        # First call — populates cache
        results_1 = suppressor.suppress([_make_hypothesis()], db_path)
        assert mock_embedder.embed_batch.call_count == 1
        # Mutate the DB to add a second incident (changes corpus).
        # closing() releases the connection; sqlite3's own context manager
        # only commits or rolls back the transaction.
        with closing(sqlite3.connect(str(db_path))) as conn:
            with conn:
                conn.execute(
                    "INSERT INTO incidents (label, notes, ended_at) VALUES (?, ?, ?)",
                    ("Disk I/O", "Storage saturation", "2024-01-02T00:00:00"),
                )
        # Second call — corpus changed, should re-embed
        results_2 = suppressor.suppress([_make_hypothesis()], db_path)
        assert mock_embedder.embed_batch.call_count == 2, (
            "embed_batch should be called again when corpus changes"
        )
    assert len(results_1) == 1
    assert len(results_2) == 1
--- a/Show more
+++ b/Show more