turnstone/app/services/search.py

"""FTS5-based log search with optional hybrid BM25 + vector re-ranking."""
from __future__ import annotations

import json
import logging
import re
import sqlite3
from dataclasses import dataclass
from pathlib import Path

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class SearchResult:
    entry_id: str
    source_id: str
    sequence: int
    timestamp_iso: str | None
    severity: str | None
    repeat_count: int
    out_of_order: bool
    matched_patterns: list[str]
    text: str
    rank: float


def build_fts_index(db_path: Path) -> None:
    """Build (or rebuild) the FTS5 index from log_entries. Safe to re-run.

    Drops and recreates the table if the schema is stale (missing sequence column).
    """
    conn = sqlite3.connect(str(db_path), timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")

    # Check whether existing table has the sequence column; rebuild if not.
    needs_rebuild = False
    try:
        conn.execute("SELECT sequence FROM log_fts LIMIT 0")
    except sqlite3.OperationalError:
        needs_rebuild = True

    if needs_rebuild:
        conn.execute("DROP TABLE IF EXISTS log_fts")

    conn.executescript("""
        CREATE VIRTUAL TABLE IF NOT EXISTS log_fts USING fts5(
            text,
            entry_id      UNINDEXED,
            source_id     UNINDEXED,
            sequence      UNINDEXED,
            severity      UNINDEXED,
            timestamp_iso UNINDEXED,
            matched_patterns UNINDEXED,
            repeat_count  UNINDEXED,
            out_of_order  UNINDEXED,
            tokenize = 'porter ascii'
        );
    """)
    # Only insert rows not already indexed
    conn.execute("""
        INSERT INTO log_fts(text, entry_id, source_id, sequence, severity,
                            timestamp_iso, matched_patterns,
                            repeat_count, out_of_order)
        SELECT e.text, e.id, e.source_id, e.sequence, e.severity,
               e.timestamp_iso, e.matched_patterns,
               e.repeat_count, e.out_of_order
        FROM log_entries e
        WHERE e.id NOT IN (SELECT entry_id FROM log_fts WHERE entry_id IS NOT NULL)
    """)
    conn.commit()
    conn.close()


def _sanitize_fts_query(raw: str, or_mode: bool = False) -> str:
    """Strip FTS5 operator characters and return a safe MATCH expression.

    FTS5 reserves: " * + - ( ) ^ ~ : ?
    or_mode=True joins tokens with OR (any-of) instead of implicit AND (all-of).
    """
    cleaned = re.sub(r"[^a-zA-Z0-9 _]", " ", raw)
    tokens = cleaned.split()
    if not tokens:
        return '""'
    return (" OR " if or_mode else " ").join(tokens)


def search(
    db_path: Path,
    query: str,
    severity: str | None = None,
    source_filter: str | None = None,
    pattern_filter: str | None = None,
    since: str | None = None,
    until: str | None = None,
    limit: int = 20,
    include_repeats: bool = False,
    or_mode: bool = False,
    semantic: bool = False,
) -> list[SearchResult]:
    """Full-text search with optional filters. Returns results ranked by relevance.

    When ``semantic=True`` and an embedding backend is configured, the BM25
    candidate pool is re-ranked using hybrid scoring (BM25 + cosine similarity).
    Falls back silently to pure BM25 when the embedder is unavailable.
    """
    if semantic:
        return _hybrid_search(
            db_path, query, severity=severity, source_filter=source_filter,
            pattern_filter=pattern_filter, since=since, until=until, limit=limit,
            include_repeats=include_repeats, or_mode=or_mode,
        )
    return _bm25_search(
        db_path, query, severity=severity, source_filter=source_filter,
        pattern_filter=pattern_filter, since=since, until=until, limit=limit,
        include_repeats=include_repeats, or_mode=or_mode,
    )


def _hybrid_search(
    db_path: Path,
    query: str,
    severity: str | None = None,
    source_filter: str | None = None,
    pattern_filter: str | None = None,
    since: str | None = None,
    until: str | None = None,
    limit: int = 20,
    include_repeats: bool = False,
    or_mode: bool = False,
    alpha: float = 0.6,
    beta: float = 0.4,
) -> list[SearchResult]:
    """BM25 + vector re-ranking (late-fusion hybrid search).

    Fetches an oversized BM25 candidate pool, embeds the query and each
    candidate text in-process, then combines scores:

        hybrid_score = alpha * bm25_normalized + beta * cosine_sim

    BM25 normalization: FTS5 rank is negative (more negative = better match).
    We flip the sign and divide by the pool maximum so all BM25 scores land
    in (0, 1] — 1.0 for the top BM25 hit, approaching 0 for the weakest.

    Falls back to pure BM25 when the embedding backend is unavailable.
    """
    from app.services.embeddings import EMBEDDING_AVAILABLE, cosine_similarity, get_embedder

    # Fetch a large candidate pool — 5x limit, minimum 100 entries.
    pool_limit = max(limit * 5, 100)
    candidates = _bm25_search(
        db_path, query, severity=severity, source_filter=source_filter,
        pattern_filter=pattern_filter, since=since, until=until,
        limit=pool_limit, include_repeats=include_repeats, or_mode=or_mode,
    )

    if not candidates:
        return []

    if not EMBEDDING_AVAILABLE:
        return candidates[:limit]

    embedder = get_embedder()
    if embedder is None:
        return candidates[:limit]

    try:
        query_vec = embedder.embed(query)
        candidate_vecs = embedder.embed_batch([r.text for r in candidates])
    except Exception as exc:
        logger.warning("Hybrid search embedding failed (%s) — falling back to BM25", exc)
        return candidates[:limit]

    # Normalize BM25 ranks: FTS5 rank is negative, flip and scale to [0, 1].
    abs_ranks = [abs(r.rank) for r in candidates]
    max_rank = max(abs_ranks) or 1.0

    scored: list[tuple[float, SearchResult]] = []
    for result, abs_rank, cand_vec in zip(candidates, abs_ranks, candidate_vecs):
        bm25_norm = abs_rank / max_rank
        cos_sim = cosine_similarity(query_vec, cand_vec)
        hybrid = alpha * bm25_norm + beta * max(cos_sim, 0.0)
        scored.append((hybrid, result))

    scored.sort(key=lambda x: x[0], reverse=True)
    return [r for _, r in scored[:limit]]


def _bm25_search(
    db_path: Path,
    query: str,
    severity: str | None = None,
    source_filter: str | None = None,
    pattern_filter: str | None = None,
    since: str | None = None,
    until: str | None = None,
    limit: int = 20,
    include_repeats: bool = False,
    or_mode: bool = False,
) -> list[SearchResult]:
    """Pure BM25 FTS5 search — internal helper used by both search() and _hybrid_search()."""
    conn = sqlite3.connect(str(db_path), timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")
    conn.row_factory = sqlite3.Row

    fts_query = _sanitize_fts_query(query, or_mode=or_mode)
    conditions = ["log_fts MATCH ?"]
    params: list = [fts_query]

    if severity:
        conditions.append("severity = ?")
        params.append(severity.upper())
    if source_filter:
        conditions.append("source_id LIKE ?")
        params.append(f"%{source_filter}%")
    if pattern_filter:
        conditions.append("matched_patterns LIKE ?")
        params.append(f'%"{pattern_filter}"%')
    if since:
        conditions.append("timestamp_iso >= ?")
        params.append(since)
    if until:
        conditions.append("timestamp_iso <= ?")
        params.append(until)
    if not include_repeats:
        conditions.append("repeat_count = 1")

    where = " AND ".join(conditions)
    params.append(limit)

    try:
        rows = conn.execute(
            f"""
            SELECT entry_id, source_id, sequence, timestamp_iso, severity,
                   repeat_count, out_of_order, matched_patterns, text, rank
            FROM log_fts
            WHERE {where}
            ORDER BY rank
            LIMIT ?
            """,
            params,
        ).fetchall()
    except sqlite3.OperationalError as e:
        logger.warning("FTS query failed (%s) — index may not be built yet", e)
        conn.close()
        return []

    results = [
        SearchResult(
            entry_id=r["entry_id"],
            source_id=r["source_id"],
            sequence=r["sequence"],
            timestamp_iso=r["timestamp_iso"],
            severity=r["severity"],
            repeat_count=r["repeat_count"],
            out_of_order=bool(r["out_of_order"]),
            matched_patterns=json.loads(r["matched_patterns"] or "[]"),
            text=r["text"],
            rank=r["rank"],
        )
        for r in rows
    ]
    conn.close()
    return results


def entries_in_window(
    db_path: Path,
    since: str | None,
    until: str | None,
    severity: str | None = None,
    source_filter: str | None = None,
    limit: int = 100,
    per_source_cap: int | None = None,
) -> list[SearchResult]:
    """Return log entries within a time window using a plain SQL scan (no FTS).

    Used as a fallback when keyword search returns nothing — ensures incident
    detail always shows the raw log activity in the window even if no keywords match.

    per_source_cap: when set, limits rows per source_id so high-volume sources
    (e.g. network-syslog) don't crowd out lower-volume but more interesting ones.
    Errors/warnings are ranked first within each source partition.
    """
    conn = sqlite3.connect(str(db_path), timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")
    conn.row_factory = sqlite3.Row

    conditions: list[str] = ["repeat_count = 1"]
    params: list = []

    if since:
        conditions.append("timestamp_iso >= ?")
        params.append(since)
    if until:
        conditions.append("timestamp_iso <= ?")
        params.append(until)
    if severity:
        conditions.append("severity = ?")
        params.append(severity.upper())
    if source_filter:
        conditions.append("source_id LIKE ?")
        params.append(f"%{source_filter}%")

    where = " AND ".join(conditions)

    if per_source_cap is not None:
        # Use a window function to cap rows per source, errors/warnings first.
        query = f"""
            WITH ranked AS (
                SELECT id as entry_id, source_id, sequence, timestamp_iso, severity,
                       repeat_count, out_of_order, matched_patterns, text, 0.0 as rank,
                       ROW_NUMBER() OVER (
                           PARTITION BY source_id
                           ORDER BY
                               CASE UPPER(severity)
                                   WHEN 'CRITICAL' THEN 0
                                   WHEN 'ERROR'    THEN 1
                                   WHEN 'WARN'     THEN 2
                                   ELSE 3
                               END,
                               timestamp_iso
                       ) AS rn
                FROM log_entries
                WHERE {where}
            )
            SELECT entry_id, source_id, sequence, timestamp_iso, severity,
                   repeat_count, out_of_order, matched_patterns, text, rank
            FROM ranked
            WHERE rn <= ?
            ORDER BY timestamp_iso ASC
            LIMIT ?
        """
        params.extend([per_source_cap, limit])
    else:
        query = f"""
            SELECT id as entry_id, source_id, sequence, timestamp_iso, severity,
                   repeat_count, out_of_order, matched_patterns, text, 0.0 as rank
            FROM log_entries
            WHERE {where}
            ORDER BY timestamp_iso ASC
            LIMIT ?
        """
        params.append(limit)

    rows = conn.execute(query, params).fetchall()
    conn.close()

    return [
        SearchResult(
            entry_id=r["entry_id"],
            source_id=r["source_id"],
            sequence=r["sequence"],
            timestamp_iso=r["timestamp_iso"],
            severity=r["severity"],
            repeat_count=r["repeat_count"],
            out_of_order=bool(r["out_of_order"]),
            matched_patterns=json.loads(r["matched_patterns"] or "[]"),
            text=r["text"],
            rank=r["rank"],
        )
        for r in rows
    ]


def recent_source_errors(
    db_path: Path,
    source_filter: str,
    severity: str = "ERROR",
    limit: int = 10,
    since: str | None = None,
    until: str | None = None,
) -> list[SearchResult]:
    """Plain-SQL scan: most recent error entries from a named source.

    Bypasses FTS ranking so text content doesn't affect which errors surface.
    Used by diagnose when FTS keyword search returns nothing for a known source.
    """
    conn = sqlite3.connect(str(db_path), timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")
    conn.row_factory = sqlite3.Row

    conditions = [
        "source_id LIKE ?",
        "severity = ?",
        "repeat_count = 1",
    ]
    params: list = [f"%{source_filter}%", severity.upper()]

    if since:
        conditions.append("timestamp_iso >= ?")
        params.append(since)
    if until:
        conditions.append("timestamp_iso <= ?")
        params.append(until)

    params.append(limit)
    where = " AND ".join(conditions)

    rows = conn.execute(
        f"""
        SELECT id as entry_id, source_id, sequence, timestamp_iso, severity,
               repeat_count, out_of_order, matched_patterns, text, 0.0 as rank
        FROM log_entries
        WHERE {where}
        ORDER BY timestamp_iso DESC
        LIMIT ?
        """,
        params,
    ).fetchall()
    conn.close()

    return [
        SearchResult(
            entry_id=r["entry_id"],
            source_id=r["source_id"],
            sequence=r["sequence"],
            timestamp_iso=r["timestamp_iso"],
            severity=r["severity"],
            repeat_count=r["repeat_count"],
            out_of_order=bool(r["out_of_order"]),
            matched_patterns=json.loads(r["matched_patterns"] or "[]"),
            text=r["text"],
            rank=r["rank"],
        )
        for r in rows
    ]


def list_sources(db_path: Path) -> list[dict]:
    """Return sources with entry counts, grouped by prefix:host stem.

    source_ids with three or more colon-separated segments (e.g.
    ``muninn-journal:Muninn:ssh.service``) are collapsed to their first two
    segments (``muninn-journal:Muninn``).  Single- or two-segment IDs are
    returned as-is.  ``unit_count`` reports how many distinct sub-units were
    merged into each row.
    """
    conn = sqlite3.connect(str(db_path), timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")
    rows = conn.execute("""
        SELECT
            CASE
                WHEN INSTR(SUBSTR(source_id, INSTR(source_id, ':')+1), ':') > 0
                THEN SUBSTR(source_id, 1,
                         INSTR(source_id, ':')
                         + INSTR(SUBSTR(source_id, INSTR(source_id, ':')+1), ':')
                         - 1)
                ELSE source_id
            END AS group_id,
            COUNT(DISTINCT source_id) AS unit_count,
            COUNT(*) AS entry_count,
            MIN(timestamp_iso) AS earliest,
            MAX(timestamp_iso) AS latest,
            SUM(CASE WHEN severity IN ('ERROR','CRITICAL','EMERGENCY','ALERT')
                     THEN 1 ELSE 0 END) AS error_count
        FROM log_entries
        GROUP BY group_id
        ORDER BY entry_count DESC
    """).fetchall()
    conn.close()
    return [
        {
            "source_id": r[0],
            "unit_count": r[1],
            "entry_count": r[2],
            "earliest": r[3],
            "latest": r[4],
            "error_count": r[5],
        }
        for r in rows
    ]


def _compile_overrides(overrides: list[dict]) -> list[tuple[re.Pattern[str], str]]:
    """Return (compiled_pattern, override_severity) pairs for enabled rules."""
    compiled = []
    for rule in overrides:
        if not rule.get("enabled", True):
            continue
        try:
            compiled.append((re.compile(rule["pattern"], re.IGNORECASE), rule["override_severity"]))
        except re.error:
            pass
    return compiled


def _apply_overrides(text: str, original_severity: str, rules: list[tuple[re.Pattern[str], str]]) -> str:
    for pattern, new_sev in rules:
        if pattern.search(text):
            return new_sev
    return original_severity


def stats_summary(db_path: Path, window_hours: int = 24, severity_overrides: list[dict] | None = None) -> dict:
    """Return aggregate health stats for the dashboard.

    Queries plain log_entries (not FTS) so it works even before the index is built.
    """
    rules = _compile_overrides(severity_overrides or [])

    conn = sqlite3.connect(str(db_path), timeout=30.0)
    conn.execute("PRAGMA journal_mode=WAL")
    conn.row_factory = sqlite3.Row

    since_expr = f"strftime('%Y-%m-%dT%H:%M:%S', 'now', '-{window_hours} hours')"

    # Overall counts in window
    row = conn.execute(f"""
        SELECT
            COUNT(*) AS total,
            SUM(CASE WHEN severity = 'CRITICAL' THEN 1 ELSE 0 END) AS criticals,
            SUM(CASE WHEN severity IN ('ERROR','CRITICAL','EMERGENCY','ALERT') THEN 1 ELSE 0 END) AS errors
        FROM log_entries
        WHERE timestamp_iso >= {since_expr}
          AND repeat_count = 1
    """).fetchone()
    total_24h = int(row["total"] or 0)
    criticals_24h = int(row["criticals"] or 0)
    errors_24h = int(row["errors"] or 0)

    # Per-source breakdown — grouped by prefix:host stem (same logic as list_sources).
    source_rows = conn.execute(f"""
        SELECT
            CASE
                WHEN INSTR(SUBSTR(source_id, INSTR(source_id, ':')+1), ':') > 0
                THEN SUBSTR(source_id, 1,
                         INSTR(source_id, ':')
                         + INSTR(SUBSTR(source_id, INSTR(source_id, ':')+1), ':')
                         - 1)
                ELSE source_id
            END AS group_id,
            COUNT(*) AS entry_count,
            SUM(CASE WHEN severity IN ('ERROR','CRITICAL','EMERGENCY','ALERT') THEN 1 ELSE 0 END) AS error_count,
            MAX(timestamp_iso) AS latest
        FROM log_entries
        WHERE timestamp_iso >= {since_expr}
          AND repeat_count = 1
        GROUP BY group_id
        ORDER BY error_count DESC, entry_count DESC
    """).fetchall()
    source_health = [
        {
            "source_id": r["group_id"],
            "entry_count": int(r["entry_count"]),
            "error_count": int(r["error_count"]),
            "latest": r["latest"],
        }
        for r in source_rows
    ]

    # Fetch candidate criticals (fetch more so filtering doesn't leave us with too few)
    crit_rows = conn.execute("""
        SELECT id as entry_id, source_id, timestamp_iso, severity, text
        FROM log_entries
        WHERE severity = 'CRITICAL' AND repeat_count = 1
        ORDER BY timestamp_iso DESC
        LIMIT 25
    """).fetchall()

    # Apply overrides: skip entries whose effective severity is no longer CRITICAL
    suppressed = 0
    recent_criticals = []
    for r in crit_rows:
        effective = _apply_overrides(r["text"], r["severity"], rules)
        if effective == "CRITICAL":
            recent_criticals.append({
                "entry_id": r["entry_id"],
                "source_id": r["source_id"],
                "timestamp_iso": r["timestamp_iso"],
                "severity": r["severity"],
                "text": r["text"],
            })
            if len(recent_criticals) == 5:
                break
        else:
            suppressed += 1

    last_row = conn.execute("SELECT MAX(ingest_time) AS t FROM log_entries").fetchone()
    last_gleaned: str | None = last_row["t"] if last_row else None

    conn.close()

    return {
        "window_hours": window_hours,
        "total_24h": total_24h,
        "criticals_24h": criticals_24h,
        "errors_24h": errors_24h,
        "source_health": source_health,
        "recent_criticals": recent_criticals,
        "suppressed_criticals": suppressed,
        "last_gleaned": last_gleaned,
    }


def format_results(results: list[SearchResult], max_text: int = 300) -> str:
    """Format search results as readable text for LLM context."""
    if not results:
        return "No matching log entries found."

    lines = []
    for r in results:
        ts = r.timestamp_iso or "no-timestamp"
        sev = r.severity or "?"
        src = r.source_id
        flags = []
        if r.repeat_count > 1:
            flags.append(f"repeat×{r.repeat_count}")
        if r.out_of_order:
            flags.append("out-of-order")
        if r.matched_patterns:
            flags.append(f"[{', '.join(r.matched_patterns)}]")
        flag_str = f"  {' '.join(flags)}" if flags else ""

        text = r.text[:max_text] + ("…" if len(r.text) > max_text else "")
        lines.append(f"[{ts} | {src} | {sev}{flag_str}]\n{text}")

    return "\n\n".join(lines)