# app/main.py """FastAPI application factory for pagepiper.""" from __future__ import annotations import logging import os import re import sqlite3 import threading from contextlib import asynccontextmanager from fastapi import FastAPI from app.config import DB_PATH, VEC_DB_PATH, VEC_DIMENSIONS from app.services.bm25_index import BM25Index logger = logging.getLogger("pagepiper") # Module-level BM25 singleton — shared across all requests _bm25 = BM25Index() def _apply_migrations() -> None: from scripts.db_migrate import migrate migrate(DB_PATH) def _reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None: """Re-run full ingest for a list of (doc_id, file_path) sequentially.""" for doc_id, file_path in docs: suffix = os.path.splitext(file_path)[1].lower() try: if suffix == ".epub": from scripts.ingest_epub import run else: from scripts.ingest_pdf import run logger.info("Auto re-embed: starting %s", os.path.basename(file_path)) run(doc_id=doc_id, file_path=file_path, db_path=db_path, vec_db_path=vec_db_path) except Exception as exc: logger.error("Auto re-embed failed for doc %s: %s", doc_id[:8], exc) def _check_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None: """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed. sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing models requires dropping and recreating the whole file. Catches the mismatch at startup rather than surfacing it as an obscure OperationalError mid-request. """ if not os.path.exists(vec_db_path): return try: conn = sqlite3.connect(vec_db_path) row = conn.execute( "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'" ).fetchone() conn.close() except Exception as exc: logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc) return if not row: return # table not yet created — first embed will build it with the right dims m = re.search(r'float\[(\d+)\]', row[0]) if not m: return actual_dims = int(m.group(1)) if actual_dims == expected_dims: return logger.warning( "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed", actual_dims, expected_dims, vec_db_path, ) try: os.remove(vec_db_path) except OSError as exc: logger.error( "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc ) return # Collect all ready docs so we can rebuild their embeddings in the background. try: conn = sqlite3.connect(db_path) docs = conn.execute( "SELECT id, file_path FROM documents WHERE status='ready'" ).fetchall() conn.close() except Exception as exc: logger.warning("Could not query documents for re-embed: %s", exc) return if not docs: return logger.info("Queuing re-embed for %d document(s) in background", len(docs)) threading.Thread( target=_reembed_docs, args=(docs, db_path, vec_db_path), daemon=True, name="pagepiper-reembed", ).start() @asynccontextmanager async def lifespan(app: FastAPI): _apply_migrations() embed_model = os.environ.get("PAGEPIPER_EMBED_MODEL", "nomic-embed-text") logger.info("Pagepiper starting — embed model: %s, dims: %d", embed_model, VEC_DIMENSIONS) _check_vec_schema(VEC_DB_PATH, VEC_DIMENSIONS, DB_PATH) _bm25.mark_dirty() # will rebuild on first search yield app = FastAPI(title="Pagepiper", lifespan=lifespan) # Wire BM25 dirty callback into library router from app.api import library as _lib_module # noqa: E402 _lib_module._mark_bm25_dirty = _bm25.mark_dirty # Register routers from app.api.library import router as library_router # noqa: E402 from app.api.ingest import router as ingest_router # noqa: E402 from app.api.search import router as search_router # noqa: E402 from app.api.chat import router as chat_router # noqa: E402 from app.api.feedback import router as feedback_router # noqa: E402 from app.api.feedback_attach import router as feedback_attach_router # noqa: E402 app.include_router(library_router) app.include_router(ingest_router) app.include_router(search_router) app.include_router(chat_router) app.include_router(feedback_router, prefix="/api/v1/feedback") app.include_router(feedback_attach_router, prefix="/api/v1/feedback")