# pagepiper/app/services/bm25_index.py

"""
BM25 keyword search over the page_chunks corpus.

MIT — no tier gate. Available to all users with no Ollama required.
"""

from __future__ import annotations

import logging
import sqlite3
from dataclasses import dataclass

from rank_bm25 import BM25Okapi

logger = logging.getLogger(__name__)


@dataclass(frozen=True)
class BM25Result:
    """One ranked hit produced by a BM25 search; instances are immutable."""

    # Identifier of the matching chunk (the ``id`` column of ``page_chunks``).
    chunk_id: str
    # Identifier of the document the chunk belongs to.
    doc_id: str
    # Page number recorded for the chunk.
    page_number: int
    # Full text of the matching chunk.
    text: str
    # BM25 relevance score for the query.
    score: float


class BM25Index:
    """
    In-memory BM25 index over page_chunks. Rebuilt lazily on demand.

    A new instance starts out dirty and empty. ``ensure_fresh`` loads every
    row of ``page_chunks`` from SQLite and builds a ``BM25Okapi`` model over
    the lower-cased, whitespace-split chunk text. ``query`` only reads the
    in-memory state and never touches the database, so call ``ensure_fresh``
    before querying.

    Thread-safety note: rebuilt synchronously in the request thread. For
    single-user local deployments this is acceptable.
    """

    def __init__(self) -> None:
        # BM25 model over the tokenized chunks; None while the corpus is empty.
        self._index: BM25Okapi | None = None
        # Chunk rows as dicts, positionally aligned with the model's documents.
        self._chunks: list[dict] = []
        # True whenever the in-memory state may be stale relative to SQLite.
        self._dirty: bool = True

    @staticmethod
    def _tokenize(text: str | None) -> list[str]:
        """Lower-case *text* and split it on whitespace (``None`` -> no tokens)."""
        return (text or "").lower().split()

    def mark_dirty(self) -> None:
        """Signal that the index needs rebuilding (call after any ingest completes)."""
        self._dirty = True

    def ensure_fresh(self, db_path: str) -> None:
        """
        Rebuild the index from SQLite if it is marked dirty.

        Does nothing when the index is already fresh. If reading the database
        fails, the exception propagates, the connection is still closed, and
        the index stays dirty so the next call retries.

        Args:
            db_path: Path of the SQLite database containing ``page_chunks``.

        Raises:
            sqlite3.Error: If the database cannot be opened or queried.
        """
        if not self._dirty:
            return
        conn = sqlite3.connect(db_path)
        try:
            conn.row_factory = sqlite3.Row
            rows = conn.execute(
                "SELECT id, doc_id, page_number, text FROM page_chunks ORDER BY doc_id, page_number"
            ).fetchall()
        finally:
            # Always release the connection, even when the query raises.
            conn.close()
        self._load_chunks([dict(r) for r in rows])
        self._dirty = False
        logger.info("BM25 index rebuilt: %d chunks", len(self._chunks))

    def _load_chunks(self, chunks: list[dict]) -> None:
        """Replace the corpus with *chunks* and rebuild the BM25 model over it."""
        self._chunks = chunks
        tokenized = [self._tokenize(c["text"]) for c in chunks]
        # Only build a model when there is at least one chunk to index.
        self._index = BM25Okapi(tokenized) if tokenized else None

    def query(
        self,
        query_text: str,
        top_k: int = 10,
        doc_ids: list[str] | None = None,
    ) -> list[BM25Result]:
        """
        Search the corpus. Returns results sorted by descending BM25 score.

        Args:
            query_text: Free-text query; tokenized the same way as the corpus.
            top_k: Maximum number of results. Values <= 0 yield no results.
            doc_ids: If given, only chunks whose ``doc_id`` is in this
                collection are returned; ``None`` means no restriction.

        Returns:
            At most ``top_k`` results with a strictly positive score. Empty
            when the index has not been built, the corpus is empty, or the
            query contains no tokens.
        """
        if top_k <= 0 or self._index is None or not self._chunks:
            return []

        tokens = self._tokenize(query_text)
        if not tokens:
            # No query terms means every score would be zero.
            return []

        # Build the filter set once so membership tests are O(1) per chunk.
        allowed = None if doc_ids is None else set(doc_ids)

        scores = self._index.get_scores(tokens)
        ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)

        results: list[BM25Result] = []
        for i, score in ranked:
            if score <= 0:
                continue
            c = self._chunks[i]
            if allowed is not None and c["doc_id"] not in allowed:
                continue
            results.append(
                BM25Result(
                    chunk_id=c["id"],
                    doc_id=c["doc_id"],
                    page_number=c["page_number"],
                    text=c["text"],
                    score=float(score),
                )
            )
            if len(results) >= top_k:
                break
        return results