pagepiper/app/startup.py
pyr0ball 8eef52a054 feat: per-user database isolation for cloud instances (closes #4)
Implements Option A from the issue design: each cloud user gets their own
data directory (DATA_DIR/users/{user_id}/) with separate pagepiper.db,
pagepiper_vecs.db, uploads/, and books/. Local mode is unchanged.

Key changes:
- app/startup.py: extract apply_migrations, reembed_docs,
  check_and_rebuild_vec_schema out of main.py (no circular imports)
- app/config.py: add LOCAL_USER_ID constant and user_data_dir() helper
- app/cloud_session.py: extract resolve_authenticated_user(); require_paid_tier
  now returns user_id (str) instead of None
- app/deps.py: add UserCtx dataclass (db_path, vec_db_path, data_dir,
  watch_dir, bm25) + get_user_ctx dependency; per-user startup guard runs
  migrations + vec schema check once per process per user
- app/main.py: _bm25 singleton -> _bm25_map dict keyed by user_id;
  add _get_bm25_for(); lifespan only runs startup checks in local mode
- app/api/library.py, search.py, chat.py: thread UserCtx through all
  endpoints; remove module-level _mark_bm25_dirty injection pattern
- tests/conftest.py: override get_user_ctx in addition to get_db so all
  endpoints get a consistent test UserCtx
2026-05-13 16:31:51 -07:00

95 lines
3 KiB
Python

# app/startup.py
"""DB migration and vec schema check utilities — called at startup and on first user request."""
from __future__ import annotations
import logging
import os
import re
import sqlite3
import threading
logger = logging.getLogger("pagepiper")
def apply_migrations(db_path: str) -> None:
from scripts.db_migrate import migrate
migrate(db_path)
def reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None:
for doc_id, file_path in docs:
suffix = os.path.splitext(file_path)[1].lower()
try:
if suffix == ".epub":
from scripts.ingest_epub import run
elif suffix == ".docx":
from scripts.ingest_docx import run
else:
from scripts.ingest_pdf import run
logger.info("Auto re-embed: starting %s", os.path.basename(file_path))
run(doc_id=doc_id, file_path=file_path, db_path=db_path, vec_db_path=vec_db_path)
except Exception as exc:
logger.error("Auto re-embed failed for doc %s: %s", doc_id[:8], exc)
def check_and_rebuild_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None:
"""Drop the vec DB if its stored dimension doesn't match config, then queue re-embed.
sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing
models requires dropping and recreating the whole file. Catches the mismatch at
startup rather than surfacing it as an obscure OperationalError mid-request.
"""
if not os.path.exists(vec_db_path):
return
try:
conn = sqlite3.connect(vec_db_path)
row = conn.execute(
"SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'"
).fetchone()
conn.close()
except Exception as exc:
logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc)
return
if not row:
return
m = re.search(r'float\[(\d+)\]', row[0])
if not m:
return
actual_dims = int(m.group(1))
if actual_dims == expected_dims:
return
logger.warning(
"Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed",
actual_dims, expected_dims, vec_db_path,
)
try:
os.remove(vec_db_path)
except OSError as exc:
logger.error(
"Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc
)
return
try:
conn = sqlite3.connect(db_path)
docs = conn.execute(
"SELECT id, file_path FROM documents WHERE status='ready'"
).fetchall()
conn.close()
except Exception as exc:
logger.warning("Could not query documents for re-embed: %s", exc)
return
if not docs:
return
logger.info("Queuing re-embed for %d document(s) in background", len(docs))
threading.Thread(
target=reembed_docs,
args=(docs, db_path, vec_db_path),
daemon=True,
name="pagepiper-reembed",
).start()