# pagepiper/app/api/search.py
#
# 67 lines
# 1.6 KiB
# Python

# app/api/search.py
"""
BM25 keyword search across the document library.
MIT — no tier gate. No Ollama required.
"""
from __future__ import annotations
import logging
import os
from typing import Annotated
from fastapi import APIRouter, Depends
from pydantic import BaseModel, Field
from app.services.bm25_index import BM25Index
# Module-level logger named after this module's import path.
logger = logging.getLogger(__name__)
# Routes declared on this router get the "/api/search" path prefix and the
# "search" tag.
router = APIRouter(prefix="/api/search", tags=["search"])
class SearchRequest(BaseModel):
    """Request body accepted by the search endpoint."""

    # Search text; passed as-is to the BM25 index query.
    query: str
    # Maximum number of hits to return; validated to the range 1..50.
    top_k: int = Field(default=10, ge=1, le=50)
    # Optional document ids forwarded to the index query. Presumably this
    # restricts the search to those documents — confirm against
    # BM25Index.query. None (the default) supplies no list.
    doc_ids: list[str] | None = None
class SearchResult(BaseModel):
    """One hit returned by the search endpoint."""

    # Identifiers and page number copied from the index hit.
    chunk_id: str
    doc_id: str
    page_number: int
    text_snippet: str  # first 300 chars of the page text
    # Score reported by the index for this hit.
    bm25_score: float
def _get_bm25() -> BM25Index:
    """Return the BM25 index stored as ``app.main._bm25``.

    Used as a FastAPI dependency by the search route. ``app.main`` is
    imported inside the function rather than at module top level;
    presumably this avoids an import cycle between ``app.main`` and this
    router module — confirm.

    Raises:
        RuntimeError: if ``app.main`` has no ``_bm25`` attribute, or the
            attribute is ``None``.
    """
    import app.main as _main

    bm25 = getattr(_main, "_bm25", None)
    if bm25 is None:
        # NOTE(review): app.main has been imported by this point, so the real
        # condition is that ``_bm25`` has not been set. The message is kept
        # unchanged in case callers or tests match on it.
        raise RuntimeError("BM25 index not initialised — app.main not loaded")
    return bm25
def _get_db_path() -> str:
    """Return the path of ``pagepiper.db`` inside the data directory.

    The data directory comes from the ``PAGEPIPER_DATA_DIR`` environment
    variable and falls back to the relative directory ``data``. The
    variable is read on every call rather than at import time, so a value
    changed at runtime (for example by a test fixture) takes effect.
    """
    import pathlib

    data_dir = pathlib.Path(os.environ.get("PAGEPIPER_DATA_DIR", "data"))
    return str(data_dir / "pagepiper.db")
@router.post("")
def search(
    req: SearchRequest,
    bm25: Annotated[BM25Index, Depends(_get_bm25)],
) -> list[SearchResult]:
    """Run a BM25 keyword query and return the matching hits.

    Args:
        req: Validated request body (query text, hit limit, optional
            document ids).
        bm25: Index instance injected through the ``_get_bm25`` dependency.

    Returns:
        One ``SearchResult`` per hit returned by the index, in the order
        the index returned them.
    """
    # Give the index the current database path before querying; presumably
    # it reloads itself when the database has changed — see
    # BM25Index.ensure_fresh.
    bm25.ensure_fresh(_get_db_path())
    hits = bm25.query(req.query, top_k=req.top_k, doc_ids=req.doc_ids)
    return [
        SearchResult(
            chunk_id=h.chunk_id,
            doc_id=h.doc_id,
            page_number=h.page_number,
            # A missing/empty text becomes ""; otherwise keep the first 300 chars.
            text_snippet=(h.text or "")[:300],
            bm25_score=h.score,
        )
        for h in hits
    ]