pagepiper/app/startup.py
pyr0ball 1e066cf66c feat: encryption at rest infrastructure for cloud user data (closes #5)
Implements Option B (fscrypt) from the issue design: OS-level filesystem
encryption for per-user data directories on the cloud host.

- app/startup.py: warn_if_unencrypted() checks for fscrypt at startup in
  cloud mode and logs a SECURITY warning if the users/ directory is not
  encrypted — catches misconfigured deployments before any data is stored
- app/main.py: call warn_if_unencrypted() during lifespan in cloud mode
- scripts/setup_cloud_fscrypt.sh: operator script to encrypt a user's
  data directory with fscrypt (run as root on host before container start);
  supports --list and --status subcommands

Key management note: current implementation uses pam_passphrase protector.
For unattended server boot, integrate a raw_key protector from a secrets
manager (Vault, AWS Secrets Manager, etc.) — see script comments.

SQLCipher (Option A) deferred: sqlite-vec virtual table compatibility with
SQLCipher's encrypted VFS needs investigation before committing to that path.
2026-05-13 18:35:17 -07:00

137 lines
4.4 KiB
Python

# app/startup.py
"""Startup utilities: encryption-at-rest (fscrypt) check, DB migration, and vec schema check — called at startup and on first user request."""
from __future__ import annotations
import logging
import os
import re
import sqlite3
import subprocess
import threading
# Module-wide logger; the name is the fixed string "pagepiper" rather than __name__.
logger = logging.getLogger("pagepiper")
def warn_if_unencrypted(data_dir: str) -> None:
    """Log a warning if cloud mode is running without fscrypt encryption.

    Checks whether the ``users/`` subdirectory of ``data_dir`` is
    fscrypt-encrypted by running ``fscrypt status`` on it. The directory is
    created first if it does not exist. Non-fatal: every failure path logs a
    warning and returns instead of blocking startup.

    Args:
        data_dir: Root data directory; the check targets ``<data_dir>/users``.
    """
    users_dir = os.path.join(data_dir, "users")
    try:
        os.makedirs(users_dir, exist_ok=True)
    except OSError as exc:
        # Honour the non-fatal contract: an uncreatable directory must not
        # abort startup from inside an advisory check.
        logger.warning(
            "SECURITY: could not create or access user data directory %s; "
            "encryption at rest was not verified: %s",
            users_dir, exc,
        )
        return

    if not _fscrypt_available():
        logger.warning(
            "SECURITY: fscrypt not found on this system. Cloud user data at %s is stored "
            "unencrypted. Install fscrypt and run scripts/setup_cloud_fscrypt.sh to enable "
            "encryption at rest.",
            users_dir,
        )
        return

    try:
        result = subprocess.run(
            ["fscrypt", "status", users_dir],
            capture_output=True, text=True, timeout=5,
        )
    except Exception as exc:
        # Logged at WARNING (not DEBUG): if the check itself fails, the
        # operator should know the encryption state is unknown.
        logger.warning(
            "SECURITY: fscrypt status check failed for %s (non-fatal); "
            "encryption at rest could not be verified: %s",
            users_dir, exc,
        )
        return

    # fscrypt's documented output for an encrypted directory is
    # '"<dir>" is encrypted with fscrypt.' (lowercase), so match
    # case-insensitively. An unencrypted path is reported as an error
    # ("... is not encrypted") with a non-zero exit status.
    status_text = (result.stdout or "").lower()
    is_encrypted = (
        result.returncode == 0
        and "encrypted" in status_text
        and "not encrypted" not in status_text
    )
    if not is_encrypted:
        # NOTE(review): the setup script takes a <user_id>, so encryption may
        # be applied per user directory rather than to users/ itself —
        # confirm this check targets the right directory level.
        logger.warning(
            "SECURITY: user data directory %s is not fscrypt-encrypted. "
            "Run: sudo scripts/setup_cloud_fscrypt.sh <user_id>",
            users_dir,
        )
def _fscrypt_available() -> bool:
    """Return True if the ``fscrypt`` executable can be launched.

    Runs ``fscrypt --version`` with a 2-second timeout. Only launchability is
    checked; the command's exit status is ignored.

    Returns:
        False if the binary is missing, cannot be executed (e.g. permission
        denied), or does not finish within the timeout; True otherwise.
    """
    try:
        subprocess.run(["fscrypt", "--version"], capture_output=True, timeout=2)
    except (OSError, subprocess.SubprocessError):
        # OSError covers FileNotFoundError as well as PermissionError and
        # similar launch failures; SubprocessError covers TimeoutExpired.
        # None of these should escape a best-effort availability probe.
        return False
    return True
def apply_migrations(db_path: str) -> None:
    """Run the project's migration routine against the database at ``db_path``.

    Delegates to ``scripts.db_migrate.migrate``. The import happens at call
    time rather than at module import.
    """
    from scripts import db_migrate

    db_migrate.migrate(db_path)
def reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None:
    """Re-run ingestion for each ``(doc_id, file_path)`` pair in ``docs``.

    The ingest script is selected by file extension: ``.epub`` and ``.docx``
    have dedicated scripts, and every other extension goes to the PDF script.
    A failure on one document is logged and the loop moves on to the next.

    Args:
        docs: Pairs of document id and source file path.
        db_path: Passed through to the ingest script as ``db_path``.
        vec_db_path: Passed through to the ingest script as ``vec_db_path``.
    """

    def _select_ingest(extension: str):
        # Imports are deferred so only the script that is needed gets loaded.
        if extension == ".epub":
            from scripts.ingest_epub import run as ingest
            return ingest
        if extension == ".docx":
            from scripts.ingest_docx import run as ingest
            return ingest
        from scripts.ingest_pdf import run as ingest
        return ingest

    for doc_id, file_path in docs:
        extension = os.path.splitext(file_path)[1].lower()
        try:
            ingest = _select_ingest(extension)
            logger.info("Auto re-embed: starting %s", os.path.basename(file_path))
            ingest(
                doc_id=doc_id,
                file_path=file_path,
                db_path=db_path,
                vec_db_path=vec_db_path,
            )
        except Exception as exc:
            # Only the first 8 characters of the id are logged.
            logger.error("Auto re-embed failed for doc %s: %s", doc_id[:8], exc)
def check_and_rebuild_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None:
    """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed.

    sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing
    models requires dropping and recreating the whole file. Catches the mismatch at
    startup rather than surfacing it as an obscure OperationalError mid-request.

    Every failure path logs and returns; nothing is raised to the caller.

    Args:
        vec_db_path: Path to the vector database file to inspect.
        expected_dims: Embedding dimension the current configuration expects.
        db_path: Path to the main database, queried for documents whose
            status is ``'ready'`` so they can be re-embedded.
    """
    if not os.path.exists(vec_db_path):
        return

    try:
        conn = sqlite3.connect(vec_db_path)
        try:
            row = conn.execute(
                "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'"
            ).fetchone()
        finally:
            # Close even when the query raises, so the handle is not leaked.
            conn.close()
    except Exception as exc:
        logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc)
        return

    # sqlite_master.sql can be NULL; treat that like "no schema to inspect".
    if not row or not row[0]:
        return
    m = re.search(r'float\[(\d+)\]', row[0])
    if not m:
        return
    actual_dims = int(m.group(1))
    if actual_dims == expected_dims:
        return

    logger.warning(
        "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed",
        actual_dims, expected_dims, vec_db_path,
    )
    try:
        os.remove(vec_db_path)
    except OSError as exc:
        logger.error(
            "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc
        )
        return

    # Also remove any journal/WAL companions of the deleted file. A stale one
    # left behind would be associated with the database recreated under the
    # same name. Normally none exist, so this is a no-op.
    for suffix in ("-wal", "-shm", "-journal"):
        sidecar = vec_db_path + suffix
        try:
            os.remove(sidecar)
        except FileNotFoundError:
            pass
        except OSError as exc:
            logger.warning("Could not delete stale vec DB companion file %s: %s", sidecar, exc)

    try:
        conn = sqlite3.connect(db_path)
        try:
            docs = conn.execute(
                "SELECT id, file_path FROM documents WHERE status='ready'"
            ).fetchall()
        finally:
            conn.close()
    except Exception as exc:
        logger.warning("Could not query documents for re-embed: %s", exc)
        return

    if not docs:
        return

    logger.info("Queuing re-embed for %d document(s) in background", len(docs))
    # Daemon thread: re-embedding runs in the background and does not keep
    # the process alive on shutdown.
    threading.Thread(
        target=reembed_docs,
        args=(docs, db_path, vec_db_path),
        daemon=True,
        name="pagepiper-reembed",
    ).start()