Implements Option A from the issue design: each cloud user gets their own
data directory (DATA_DIR/users/{user_id}/) with separate pagepiper.db,
pagepiper_vecs.db, uploads/, and books/. Local mode is unchanged.
Key changes:
- app/startup.py: extract apply_migrations, reembed_docs,
check_and_rebuild_vec_schema out of main.py (no circular imports)
- app/config.py: add LOCAL_USER_ID constant and user_data_dir() helper
- app/cloud_session.py: extract resolve_authenticated_user(); require_paid_tier
now returns user_id (str) instead of None
- app/deps.py: add UserCtx dataclass (db_path, vec_db_path, data_dir,
watch_dir, bm25) + get_user_ctx dependency; per-user startup guard runs
migrations + vec schema check once per process per user
- app/main.py: _bm25 singleton -> _bm25_map dict keyed by user_id;
add _get_bm25_for(); lifespan only runs startup checks in local mode
- app/api/library.py, search.py, chat.py: thread UserCtx through all
endpoints; remove module-level _mark_bm25_dirty injection pattern
- tests/conftest.py: override get_user_ctx in addition to get_db so all
endpoints get a consistent test UserCtx
95 lines
3 KiB
Python
95 lines
3 KiB
Python
# app/startup.py
|
|
"""DB migration and vec schema check utilities — called at startup and on first user request."""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import threading
|
|
|
|
logger = logging.getLogger("pagepiper")
|
|
|
|
|
|
def apply_migrations(db_path: str) -> None:
    """Run the project's migration script against the database at *db_path*.

    The migration entry point is imported inside the function body rather
    than at module import time, so loading this module does not load
    ``scripts.db_migrate``.
    """
    from scripts.db_migrate import migrate as run_migrations

    run_migrations(db_path)
|
|
|
|
|
|
def _resolve_ingest_runner(extension: str):
    """Import and return the ``run`` callable of the ingest script for *extension*."""
    if extension == ".epub":
        from scripts.ingest_epub import run
        return run
    if extension == ".docx":
        from scripts.ingest_docx import run
        return run
    # Any other extension is handed to the PDF ingester.
    from scripts.ingest_pdf import run
    return run


def reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None:
    """Re-run ingestion for every ``(doc_id, file_path)`` pair in *docs*.

    The ingest script is chosen from the lower-cased file extension
    (``.epub``, ``.docx``, otherwise the PDF ingester). Each document is
    handled independently: any exception raised while importing or running
    the ingester is logged and the loop moves on to the next document.
    """
    for document_id, source_path in docs:
        _, raw_extension = os.path.splitext(source_path)
        extension = raw_extension.lower()
        try:
            # Resolve inside the try so an import failure is logged like any
            # other per-document error instead of aborting the whole batch.
            ingest = _resolve_ingest_runner(extension)
            logger.info("Auto re-embed: starting %s", os.path.basename(source_path))
            ingest(
                doc_id=document_id,
                file_path=source_path,
                db_path=db_path,
                vec_db_path=vec_db_path,
            )
        except Exception as exc:
            logger.error("Auto re-embed failed for doc %s: %s", document_id[:8], exc)
|
|
|
|
|
|
def _read_vec_table_ddl(vec_db_path: str):
    """Return the stored ``sql`` for ``page_vecs_vecs`` in *vec_db_path*, or None if absent."""
    conn = sqlite3.connect(vec_db_path)
    try:
        row = conn.execute(
            "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'"
        ).fetchone()
    finally:
        # Close even when the query raises (e.g. the file is not a valid
        # SQLite database) so the handle is never leaked.
        conn.close()
    return row[0] if row else None


def _fetch_ready_documents(db_path: str):
    """Return ``(id, file_path)`` rows for every document whose status is 'ready'."""
    conn = sqlite3.connect(db_path)
    try:
        return conn.execute(
            "SELECT id, file_path FROM documents WHERE status='ready'"
        ).fetchall()
    finally:
        conn.close()


def check_and_rebuild_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None:
    """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed.

    sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing
    models requires dropping and recreating the whole file. Catches the mismatch at
    startup rather than surfacing it as an obscure OperationalError mid-request.

    Args:
        vec_db_path: Path of the vector database file to inspect.
        expected_dims: Embedding dimension the current configuration expects.
        db_path: Path of the main database, queried for documents to re-embed.

    Returns:
        None. The function returns without changes when the vec DB file is
        missing, unreadable, has no ``page_vecs_vecs`` entry, has no
        ``float[N]`` in its DDL, or already matches *expected_dims*.
        Otherwise the file is deleted and a daemon thread running
        ``reembed_docs`` is started for all 'ready' documents.
    """
    if not os.path.exists(vec_db_path):
        return

    try:
        ddl = _read_vec_table_ddl(vec_db_path)
    except Exception as exc:
        logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc)
        return

    # No matching schema entry, or an entry whose sql column is NULL/empty:
    # there is nothing to compare against.
    if not ddl:
        return

    m = re.search(r'float\[(\d+)\]', ddl)
    if not m:
        return
    actual_dims = int(m.group(1))
    if actual_dims == expected_dims:
        return

    logger.warning(
        "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed",
        actual_dims, expected_dims, vec_db_path,
    )
    try:
        os.remove(vec_db_path)
    except OSError as exc:
        logger.error(
            "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc
        )
        return

    try:
        docs = _fetch_ready_documents(db_path)
    except Exception as exc:
        logger.warning("Could not query documents for re-embed: %s", exc)
        return

    if not docs:
        return

    logger.info("Queuing re-embed for %d document(s) in background", len(docs))
    # Daemon thread: re-embedding runs in the background and does not keep
    # the process alive on shutdown.
    threading.Thread(
        target=reembed_docs,
        args=(docs, db_path, vec_db_path),
        daemon=True,
        name="pagepiper-reembed",
    ).start()
|