# pagepiper/app/startup.py

# app/startup.py
"""DB migration and vec schema check utilities — called at startup and on first user request."""
from __future__ import annotations

import logging
import os
import re
import sqlite3
import threading

logger = logging.getLogger("pagepiper")


def apply_migrations(db_path: str) -> None:
    """Run the migration routine from ``scripts.db_migrate`` against ``db_path``.

    The import happens at call time rather than at module import, so the
    ``scripts`` package is only loaded when a migration is actually requested.
    """
    import scripts.db_migrate as db_migrate

    db_migrate.migrate(db_path)


def reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None:
    """Re-run ingestion for each ``(doc_id, file_path)`` pair, one document at a time.

    The ingest script is chosen from the file extension (case-insensitive):
    ``.epub`` -> ``scripts.ingest_epub``, ``.docx`` -> ``scripts.ingest_docx``,
    anything else -> ``scripts.ingest_pdf``.

    Best-effort: any exception raised while handling a document (an unusable
    path, a failed import, or an error inside the ingest script) is logged and
    the loop moves on to the next document.

    Args:
        docs: ``(doc_id, file_path)`` pairs to re-embed.
        db_path: Path of the main database, forwarded to the ingest script.
        vec_db_path: Path of the vector database, forwarded to the ingest script.
    """
    for doc_id, file_path in docs:
        try:
            # Derive the suffix inside the try: an unusable path (e.g. NULL from
            # the DB) must fail only this document, not abort the whole batch.
            suffix = os.path.splitext(file_path)[1].lower()
            if suffix == ".epub":
                from scripts.ingest_epub import run
            elif suffix == ".docx":
                from scripts.ingest_docx import run
            else:
                from scripts.ingest_pdf import run
            logger.info("Auto re-embed: starting %s", os.path.basename(file_path))
            run(doc_id=doc_id, file_path=file_path, db_path=db_path, vec_db_path=vec_db_path)
        except Exception as exc:
            # str() so a non-string id cannot raise from inside the handler itself.
            logger.error("Auto re-embed failed for doc %s: %s", str(doc_id)[:8], exc)

def check_and_rebuild_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None:
    """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed.

    sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing
    models requires dropping and recreating the whole file. Catches the mismatch at
    startup rather than surfacing it as an obscure OperationalError mid-request.

    Every failure along the way is logged and swallowed, so this never raises
    into the startup path.

    Args:
        vec_db_path: Path of the vector database file to inspect (and delete on mismatch).
        expected_dims: Embedding dimension the current configuration expects.
        db_path: Path of the main database whose ``documents`` table lists what
            to re-embed (rows with ``status='ready'``).
    """
    if not os.path.exists(vec_db_path):
        return

    try:
        conn = sqlite3.connect(vec_db_path)
        try:
            row = conn.execute(
                "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'"
            ).fetchone()
        finally:
            # Close on the failure path too, so the handle is never leaked.
            conn.close()
    except Exception as exc:
        logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc)
        return

    # No such table, or a schema row without DDL text: nothing to compare.
    if not row or not row[0]:
        return

    # The dimension appears in the stored DDL as e.g. ``float[384]``.
    m = re.search(r'float\[(\d+)\]', row[0])
    if not m:
        return
    actual_dims = int(m.group(1))
    if actual_dims == expected_dims:
        return

    logger.warning(
        "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed",
        actual_dims, expected_dims, vec_db_path,
    )
    try:
        os.remove(vec_db_path)
    except OSError as exc:
        logger.error(
            "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc
        )
        return

    try:
        conn = sqlite3.connect(db_path)
        try:
            docs = conn.execute(
                "SELECT id, file_path FROM documents WHERE status='ready'"
            ).fetchall()
        finally:
            conn.close()
    except Exception as exc:
        logger.warning("Could not query documents for re-embed: %s", exc)
        return

    if not docs:
        return

    # Re-embedding runs on a daemon thread so this call returns immediately.
    logger.info("Queuing re-embed for %d document(s) in background", len(docs))
    threading.Thread(
        target=reembed_docs,
        args=(docs, db_path, vec_db_path),
        daemon=True,
        name="pagepiper-reembed",
    ).start()