# pagepiper/app/api/search.py

# app/api/search.py
"""
BM25 keyword search across the document library.

MIT — no tier gate. No Ollama required.
"""
from __future__ import annotations

import logging
import os
from typing import Annotated

from fastapi import APIRouter, Depends
from pydantic import BaseModel, Field

from app.services.bm25_index import BM25Index

# Module-level logger named after this module (not referenced by the code
# visible in this file).
logger = logging.getLogger(__name__)
# Router for the search endpoints: every route declared on it is served under
# the "/api/search" prefix and tagged "search".
router = APIRouter(prefix="/api/search", tags=["search"])


class SearchRequest(BaseModel):
    # Request body accepted by the search endpoint.

    # Query text; passed unchanged to the BM25 index by the endpoint.
    query: str
    # Maximum number of hits to return. Validated to the range 1..50
    # (inclusive); defaults to 10 when omitted.
    top_k: int = Field(default=10, ge=1, le=50)
    # Optional list of document ids forwarded to the index query. Presumably
    # restricts the search to those documents — confirm against
    # BM25Index.query. None (the default) is forwarded as-is.
    doc_ids: list[str] | None = None


class SearchResult(BaseModel):
    # One hit returned by the search endpoint. Every field is copied from the
    # corresponding attribute of a hit object produced by the BM25 index.

    # Identifier of the matching chunk.
    chunk_id: str
    # Identifier of the document the chunk belongs to.
    doc_id: str
    # Page number reported by the index for this hit
    # (NOTE(review): 0- or 1-based is not visible here — confirm).
    page_number: int
    # First 300 characters of the hit's text; empty string when the hit
    # carries no text.
    text_snippet: str
    # Score reported by the index for this hit.
    bm25_score: float


def _get_bm25() -> BM25Index:
    """Return the BM25 index stored on ``app.main`` as ``_bm25``.

    Used as a FastAPI dependency by the search endpoint. ``app.main`` is
    imported inside the function rather than at module level (presumably to
    avoid a circular import between the app module and this router — confirm).

    Raises:
        RuntimeError: if ``app.main`` has no ``_bm25`` attribute, or the
            attribute is ``None``.
    """
    import app.main as app_main

    index = getattr(app_main, "_bm25", None)
    if index is not None:
        return index
    raise RuntimeError("BM25 index not initialised — app.main not loaded")


def _get_db_path() -> str:
    """Return the path of ``pagepiper.db`` inside the data directory.

    The data directory is taken from the ``PAGEPIPER_DATA_DIR`` environment
    variable, falling back to ``"data"`` when the variable is not set. The
    environment is consulted on every call rather than once at import time,
    so overrides installed by test fixtures take effect.
    """
    from pathlib import Path

    base_dir = os.getenv("PAGEPIPER_DATA_DIR", "data")
    db_file = Path(base_dir).joinpath("pagepiper.db")
    return str(db_file)


@router.post("")
def search(
    req: SearchRequest,
    bm25: Annotated[BM25Index, Depends(_get_bm25)],
) -> list[SearchResult]:
    # POST /api/search — run a BM25 keyword query and return the hits.
    #
    # NOTE: documented with comments rather than a docstring on purpose;
    # FastAPI publishes an endpoint docstring as the operation description
    # in the OpenAPI schema.
    #
    # req:  validated request body (query text, top_k, optional doc_ids).
    # bm25: index instance injected through the _get_bm25 dependency.
    #
    # Returns one SearchResult per hit, in the order the index yields them.

    # Give the index a chance to refresh itself against the current database
    # file before querying (exact semantics live in BM25Index.ensure_fresh).
    bm25.ensure_fresh(_get_db_path())

    results: list[SearchResult] = []
    for hit in bm25.query(req.query, top_k=req.top_k, doc_ids=req.doc_ids):
        results.append(
            SearchResult(
                chunk_id=hit.chunk_id,
                doc_id=hit.doc_id,
                page_number=hit.page_number,
                # Missing/empty text becomes ""; snippets are capped at
                # 300 characters.
                text_snippet=(hit.text or "")[:300],
                bm25_score=hit.score,
            )
        )
    return results