# pagepiper/app/main.py

# app/main.py
"""FastAPI application factory for pagepiper."""
from __future__ import annotations

import logging
import os
import re
import sqlite3
import threading
from contextlib import asynccontextmanager

from fastapi import FastAPI

from app.config import DB_PATH, VEC_DB_PATH, VEC_DIMENSIONS
from app.services.bm25_index import BM25Index

# Logger used by every helper in this module; records are emitted under the
# "pagepiper" logger name.
logger = logging.getLogger("pagepiper")

# Module-level BM25 singleton — shared across all requests.
# It is marked dirty at startup (see lifespan) and its mark_dirty method is
# handed to the library API module as a callback further down in this file.
_bm25 = BM25Index()


def _apply_migrations() -> None:
    """Run the project's migration script against the database at ``DB_PATH``.

    The migration entry point is imported at call time rather than at module
    import time, so importing this module does not pull in ``scripts.db_migrate``.
    """
    from scripts.db_migrate import migrate as run_migrations

    run_migrations(DB_PATH)


def _reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None:
    """Re-run full ingest for a list of (doc_id, file_path) sequentially.

    Used as the target of the background re-embed thread. Every document is
    handled in isolation: any failure — including a missing or non-string
    ``file_path`` — is logged (with traceback) and the loop continues with the
    next document, so one bad row never aborts the whole batch.

    Args:
        docs: ``(doc_id, file_path)`` pairs to re-ingest.
        db_path: Path of the main SQLite database, forwarded to the ingest script.
        vec_db_path: Path of the vector database, forwarded to the ingest script.
    """
    for doc_id, file_path in docs:
        try:
            # Computed inside the try block: a NULL/invalid file_path must only
            # fail this document, not kill the thread and skip the rest.
            suffix = os.path.splitext(file_path)[1].lower()
            # Ingest scripts are imported lazily; anything that is not an EPUB
            # is routed to the PDF ingester.
            if suffix == ".epub":
                from scripts.ingest_epub import run
            else:
                from scripts.ingest_pdf import run
            logger.info("Auto re-embed: starting %s", os.path.basename(file_path))
            run(doc_id=doc_id, file_path=file_path, db_path=db_path, vec_db_path=vec_db_path)
        except Exception as exc:
            # str() keeps the handler itself from raising on non-string ids;
            # exc_info preserves the traceback, which is otherwise lost on a
            # background thread.
            logger.error(
                "Auto re-embed failed for doc %s: %s", str(doc_id)[:8], exc, exc_info=True
            )


def _read_vec_dims(vec_db_path: str) -> "int | None":
    """Return the embedding dimension found in the vec table DDL, or None if absent."""
    conn = sqlite3.connect(vec_db_path)
    try:
        row = conn.execute(
            "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'"
        ).fetchone()
    finally:
        # Always release the handle, even when the query raises; a leaked
        # connection can also prevent the file from being deleted later.
        conn.close()

    if not row or not row[0]:
        return None  # table not yet created — first embed will build it with the right dims

    m = re.search(r'float\[(\d+)\]', row[0])
    return int(m.group(1)) if m else None


def _check_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None:
    """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed.

    sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing
    models requires dropping and recreating the whole file. Catches the mismatch at
    startup rather than surfacing it as an obscure OperationalError mid-request.

    The function is best-effort and never raises for I/O problems: an unreadable
    vec DB, a failed delete, or a failed documents query is logged and the check
    returns early.

    Args:
        vec_db_path: Path of the SQLite file holding the vector table.
        expected_dims: Embedding dimension the application is configured for.
        db_path: Path of the main SQLite database; queried for documents with
            ``status='ready'`` that need their embeddings rebuilt.
    """
    if not os.path.exists(vec_db_path):
        return
    try:
        actual_dims = _read_vec_dims(vec_db_path)
    except Exception as exc:
        logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc)
        return

    # Nothing to do when the dimension is unknown (no table / unrecognised DDL)
    # or already matches the configuration.
    if actual_dims is None or actual_dims == expected_dims:
        return

    logger.warning(
        "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed",
        actual_dims, expected_dims, vec_db_path,
    )
    try:
        os.remove(vec_db_path)
    except OSError as exc:
        logger.error(
            "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc
        )
        return

    # Remove SQLite sidecar files too, so a stale WAL/journal is never paired
    # with the freshly created database. Best-effort: failures are only logged.
    for sidecar_suffix in ("-wal", "-shm", "-journal"):
        sidecar = vec_db_path + sidecar_suffix
        try:
            os.remove(sidecar)
        except FileNotFoundError:
            pass
        except OSError as exc:
            logger.warning("Could not delete stale vec DB sidecar %s: %s", sidecar, exc)

    # Collect all ready docs so we can rebuild their embeddings in the background.
    try:
        conn = sqlite3.connect(db_path)
        try:
            docs = conn.execute(
                "SELECT id, file_path FROM documents WHERE status='ready'"
            ).fetchall()
        finally:
            conn.close()
    except Exception as exc:
        logger.warning("Could not query documents for re-embed: %s", exc)
        return

    if not docs:
        return

    logger.info("Queuing re-embed for %d document(s) in background", len(docs))
    # Daemon thread: the re-embed must not keep the process alive on shutdown.
    threading.Thread(
        target=_reembed_docs,
        args=(docs, db_path, vec_db_path),
        daemon=True,
        name="pagepiper-reembed",
    ).start()


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Perform startup work, then hand control to the running application.

    Startup steps, in order: apply database migrations, log the embedding
    model name (from ``PAGEPIPER_EMBED_MODEL``) and configured dimension,
    validate the vec DB schema against ``VEC_DIMENSIONS``, and mark the BM25
    index dirty. No code runs after the ``yield``, so there is no shutdown work.
    """
    _apply_migrations()

    model_name = os.getenv("PAGEPIPER_EMBED_MODEL", "nomic-embed-text")
    logger.info("Pagepiper starting — embed model: %s, dims: %d", model_name, VEC_DIMENSIONS)

    _check_vec_schema(
        vec_db_path=VEC_DB_PATH,
        expected_dims=VEC_DIMENSIONS,
        db_path=DB_PATH,
    )

    # will rebuild on first search
    _bm25.mark_dirty()

    yield


# The ASGI application object; startup work is delegated to lifespan above.
app = FastAPI(title="Pagepiper", lifespan=lifespan)

# Wire BM25 dirty callback into library router: the library module's
# _mark_bm25_dirty attribute is replaced with the singleton's mark_dirty method.
# NOTE(review): the API modules are imported here, after `app` exists, rather
# than at the top of the file (hence noqa: E402) — presumably to avoid a
# circular import; confirm before moving these imports.
from app.api import library as _lib_module  # noqa: E402
_lib_module._mark_bm25_dirty = _bm25.mark_dirty

# Register routers
from app.api.library import router as library_router  # noqa: E402
from app.api.ingest import router as ingest_router  # noqa: E402
from app.api.search import router as search_router  # noqa: E402
from app.api.chat import router as chat_router  # noqa: E402
from app.api.feedback import router as feedback_router  # noqa: E402
from app.api.feedback_attach import router as feedback_attach_router  # noqa: E402

# These four routers are mounted without an extra prefix here.
app.include_router(library_router)
app.include_router(ingest_router)
app.include_router(search_router)
app.include_router(chat_router)
# Both feedback routers share the same URL prefix.
app.include_router(feedback_router, prefix="/api/v1/feedback")
app.include_router(feedback_attach_router, prefix="/api/v1/feedback")