# app/startup.py
"""DB migration and vec schema check utilities — called at startup and on first user request."""
from __future__ import annotations

import contextlib
import logging
import os
import re
import sqlite3
import threading

logger = logging.getLogger("pagepiper")

# Extracts the embedding dimension from the vec table DDL, e.g. ``float[384]`` -> ``384``.
_VEC_DIMS_RE = re.compile(r'float\[(\d+)\]')


def apply_migrations(db_path: str) -> None:
    """Run the project's schema migrations against the database at *db_path*.

    ``scripts.db_migrate`` is imported lazily, at call time, rather than when
    this module is imported.
    """
    from scripts.db_migrate import migrate

    migrate(db_path)


def _load_ingest_runner(suffix: str):
    """Return the ``run`` entry point of the ingest script for a file suffix.

    ``.epub`` and ``.docx`` have dedicated ingesters; every other suffix falls
    back to the PDF ingester. The import is lazy so that only the ingester that
    is actually needed gets loaded.
    """
    if suffix == ".epub":
        from scripts.ingest_epub import run
    elif suffix == ".docx":
        from scripts.ingest_docx import run
    else:
        from scripts.ingest_pdf import run
    return run


def reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None:
    """Re-run ingestion for each ``(doc_id, file_path)`` pair in *docs*.

    Failures are isolated per document: any exception (including a failed
    ingester import or a malformed row) is logged with its traceback and the
    loop moves on to the next document, so one bad entry cannot abort the
    whole batch.

    Args:
        docs: ``(doc_id, file_path)`` pairs to re-embed.
        db_path: Path of the main database, passed through to the ingester.
        vec_db_path: Path of the vector database, passed through to the ingester.
    """
    for doc_id, file_path in docs:
        try:
            # Suffix detection lives inside the try: a NULL/invalid file_path
            # must fail only this document, not the entire background batch.
            suffix = os.path.splitext(file_path)[1].lower()
            run = _load_ingest_runner(suffix)
            logger.info("Auto re-embed: starting %s", os.path.basename(file_path))
            run(doc_id=doc_id, file_path=file_path, db_path=db_path, vec_db_path=vec_db_path)
        except Exception as exc:
            # str() guards the slice so a non-string id cannot raise from
            # inside the handler itself.
            logger.error(
                "Auto re-embed failed for doc %s: %s", str(doc_id)[:8], exc, exc_info=True
            )


def _read_vec_dims(vec_db_path: str) -> int | None:
    """Return the dimension recorded in the vec table DDL, or ``None`` if absent.

    ``None`` covers a missing ``page_vecs_vecs`` entry, an entry without SQL
    text, and DDL that carries no ``float[N]`` declaration.
    """
    # ``closing`` guarantees the handle is released even when the query raises;
    # sqlite3's own ``with conn:`` only manages transactions, not closing.
    with contextlib.closing(sqlite3.connect(vec_db_path)) as conn:
        row = conn.execute(
            "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'"
        ).fetchone()
    if not row or not row[0]:
        return None
    match = _VEC_DIMS_RE.search(row[0])
    return int(match.group(1)) if match else None


def _fetch_ready_docs(db_path: str) -> list[tuple[str, str]]:
    """Return ``(id, file_path)`` for every document whose status is ``'ready'``."""
    with contextlib.closing(sqlite3.connect(db_path)) as conn:
        return conn.execute(
            "SELECT id, file_path FROM documents WHERE status='ready'"
        ).fetchall()


def check_and_rebuild_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None:
    """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed.

    sqlite-vec bakes the embedding dimension into the virtual table DDL, so
    changing models requires dropping and recreating the whole file. Catches
    the mismatch at startup rather than surfacing it as an obscure
    OperationalError mid-request.

    Every failure along the way is logged and swallowed; this function never
    raises for an unreadable vec DB, an undeletable file, or a failed
    documents query.

    Args:
        vec_db_path: Path of the vector database file to inspect.
        expected_dims: Embedding dimension the current configuration expects.
        db_path: Path of the main database holding the ``documents`` table.
    """
    if not os.path.exists(vec_db_path):
        return

    try:
        actual_dims = _read_vec_dims(vec_db_path)
    except Exception as exc:
        # Deliberately broad: the schema check is best-effort at startup.
        logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc)
        return

    if actual_dims is None or actual_dims == expected_dims:
        return

    logger.warning(
        "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed",
        actual_dims, expected_dims, vec_db_path,
    )
    try:
        os.remove(vec_db_path)
    except OSError as exc:
        logger.error(
            "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc
        )
        return

    try:
        docs = _fetch_ready_docs(db_path)
    except Exception as exc:
        logger.warning("Could not query documents for re-embed: %s", exc)
        return

    if not docs:
        return

    logger.info("Queuing re-embed for %d document(s) in background", len(docs))
    # Daemon thread: re-embedding runs in the background and does not keep the
    # process alive at shutdown.
    threading.Thread(
        target=reembed_docs,
        args=(docs, db_path, vec_db_path),
        daemon=True,
        name="pagepiper-reembed",
    ).start()