64 lines
1.5 KiB
Python
64 lines
1.5 KiB
Python
# app/api/search.py
|
|
"""
|
|
BM25 keyword search across the document library.
|
|
|
|
MIT — no tier gate. No Ollama required.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
from typing import Annotated
|
|
|
|
from fastapi import APIRouter, Depends
|
|
from pydantic import BaseModel, Field
|
|
|
|
from app.services.bm25_index import BM25Index
|
|
|
|
logger = logging.getLogger(__name__)
|
|
router = APIRouter(prefix="/api/search", tags=["search"])
|
|
|
|
|
|
class SearchRequest(BaseModel):
    """Request body accepted by the search endpoint."""

    # Query text handed to the BM25 index.
    query: str
    # Maximum number of hits to return; validated to 1..50, defaults to 10.
    top_k: int = Field(10, ge=1, le=50)
    # Optional document ids forwarded to the index query as ``doc_ids``;
    # presumably limits the search to those documents — TODO confirm
    # against BM25Index.query.
    doc_ids: list[str] | None = None
|
|
|
|
|
|
class SearchResult(BaseModel):
    """A single hit in the search endpoint's response list."""

    chunk_id: str
    doc_id: str
    page_number: int
    # Only the first 300 characters of the page text are returned.
    text_snippet: str
    # Score reported by the BM25 index for this hit.
    bm25_score: float
|
|
|
|
|
|
def _get_bm25() -> BM25Index:
    """Return the ``_bm25`` index object held by ``app.main``.

    Used as a FastAPI dependency. The import is done inside the function
    rather than at module top level — presumably to avoid a circular import
    with ``app.main``; TODO confirm.
    """
    from app.main import _bm25 as shared_index

    return shared_index
|
|
|
|
|
|
def _get_db_path() -> str:
    """Return the path of ``pagepiper.db`` inside the data directory.

    The directory is taken from the ``PAGEPIPER_DATA_DIR`` environment
    variable and falls back to the relative path ``data``. The lookup runs on
    every call instead of once at import time, so changes made after import
    (for example by test fixtures) take effect.
    """
    from pathlib import Path

    base_dir = os.getenv("PAGEPIPER_DATA_DIR", "data")
    return str(Path(base_dir).joinpath("pagepiper.db"))
|
|
|
|
|
|
@router.post("")
def search(
    req: SearchRequest,
    bm25: Annotated[BM25Index, Depends(_get_bm25)],
) -> list[SearchResult]:
    """Run a BM25 keyword query and return the hits as ``SearchResult`` items.

    ``bm25.ensure_fresh`` is called with the database path before the query
    runs (presumably so the index can reload when the database has changed —
    TODO confirm against ``BM25Index``). Each hit's text is cut down to its
    first 300 characters for the ``text_snippet`` field.
    """
    db_path = _get_db_path()
    bm25.ensure_fresh(db_path)

    matches = bm25.query(req.query, top_k=req.top_k, doc_ids=req.doc_ids)
    return [
        SearchResult(
            chunk_id=match.chunk_id,
            doc_id=match.doc_id,
            page_number=match.page_number,
            text_snippet=match.text[:300],  # snippet only, not the full page
            bm25_score=match.score,
        )
        for match in matches
    ]
|