Implements Option B (fscrypt) from the issue design: OS-level filesystem encryption for per-user data directories on the cloud host.

- app/startup.py: warn_if_unencrypted() checks for fscrypt at startup in cloud mode and logs a SECURITY warning if the users/ directory is not encrypted — catches misconfigured deployments before any data is stored
- app/main.py: call warn_if_unencrypted() during lifespan in cloud mode
- scripts/setup_cloud_fscrypt.sh: operator script to encrypt a user's data directory with fscrypt (run as root on the host before container start); supports --list and --status subcommands

Key management note: the current implementation uses the pam_passphrase protector. For unattended server boot, integrate a raw_key protector from a secrets manager (Vault, AWS Secrets Manager, etc.) — see script comments.

SQLCipher (Option A) deferred: sqlite-vec virtual table compatibility with SQLCipher's encrypted VFS needs investigation before committing to that path.
137 lines
4.4 KiB
Python
137 lines
4.4 KiB
Python
# app/startup.py
|
|
"""Encryption-at-rest warning, DB migration and vec schema check utilities — called at startup and on first user request."""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import subprocess
|
|
import threading
|
|
|
|
logger = logging.getLogger("pagepiper")
|
|
|
|
|
|
def warn_if_unencrypted(data_dir: str) -> None:
    """Log a warning if cloud mode is running without fscrypt encryption.

    Checks whether the users/ subdirectory of data_dir is fscrypt-encrypted.
    The users/ directory is created first if it does not exist yet.
    Non-fatal: warns but does not block startup. A failure to run the
    fscrypt check is itself logged as a warning, because an unverifiable
    security check should be visible at default log levels.

    Args:
        data_dir: Root data directory; the ``users`` subdirectory beneath it
            is the directory that gets checked.
    """
    users_dir = os.path.join(data_dir, "users")
    os.makedirs(users_dir, exist_ok=True)

    if not _fscrypt_available():
        logger.warning(
            "SECURITY: fscrypt not found on this system. Cloud user data at %s is stored "
            "unencrypted. Install fscrypt and run scripts/setup_cloud_fscrypt.sh to enable "
            "encryption at rest.",
            users_dir,
        )
        return

    try:
        result = subprocess.run(
            ["fscrypt", "status", users_dir],
            capture_output=True, text=True, timeout=5,
        )
    except Exception as exc:
        # Best-effort check: never let a broken fscrypt install block startup.
        logger.warning("fscrypt status check failed (non-fatal): %s", exc)
        return

    # Per the fscrypt README, `fscrypt status <dir>` prints
    # '"<dir>" is encrypted with fscrypt.' on stdout and exits 0 for an
    # encrypted directory; for an unencrypted one it exits non-zero with an
    # error message. The match is case-insensitive so that either
    # capitalisation of "encrypted" is accepted.
    is_encrypted = result.returncode == 0 and "encrypted" in result.stdout.lower()
    if not is_encrypted:
        logger.warning(
            "SECURITY: user data directory %s is not fscrypt-encrypted. "
            "Run: sudo scripts/setup_cloud_fscrypt.sh <user_id>",
            users_dir,
        )
|
|
|
|
|
|
def _fscrypt_available() -> bool:
    """Return True if the ``fscrypt`` binary can be launched on this host.

    Runs ``fscrypt --version`` with a 2-second timeout. The exit status is
    not inspected; only the ability to start the process matters.

    Returns:
        False if the process cannot be started (binary missing, not
        executable, or any other OS-level launch error) or if it times out;
        True otherwise.
    """
    try:
        subprocess.run(["fscrypt", "--version"], capture_output=True, timeout=2)
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers FileNotFoundError as well as PermissionError and
        # similar launch failures, so this probe can never crash startup.
        return False
    return True
|
|
|
|
|
|
def apply_migrations(db_path: str) -> None:
    """Run the project's migration routine against the database at db_path.

    The migration module is imported lazily, at call time, rather than at
    module import.
    """
    from scripts.db_migrate import migrate as run_migrations

    run_migrations(db_path)
|
|
|
|
|
|
def reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None:
    """Re-run ingestion for each (doc_id, file_path) pair in docs.

    The ingest module is picked from the file extension: ``.epub`` and
    ``.docx`` have dedicated modules, everything else goes to the PDF one.
    A failure for one document is logged and does not stop the others.
    """
    for doc_id, file_path in docs:
        _, extension = os.path.splitext(file_path)
        extension = extension.lower()
        try:
            # The imports live inside the try so that a failing ingest module
            # import is logged per document like any other failure.
            if extension == ".epub":
                from scripts.ingest_epub import run as ingest
            elif extension == ".docx":
                from scripts.ingest_docx import run as ingest
            else:
                from scripts.ingest_pdf import run as ingest
            logger.info("Auto re-embed: starting %s", os.path.basename(file_path))
            ingest(
                doc_id=doc_id,
                file_path=file_path,
                db_path=db_path,
                vec_db_path=vec_db_path,
            )
        except Exception as err:
            logger.error("Auto re-embed failed for doc %s: %s", doc_id[:8], err)
|
|
|
|
|
|
def check_and_rebuild_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None:
    """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed.

    sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing
    models requires dropping and recreating the whole file. Catches the mismatch at
    startup rather than surfacing it as an obscure OperationalError mid-request.

    Args:
        vec_db_path: Path to the vector SQLite file; nothing happens if it
            does not exist.
        expected_dims: Embedding dimension the current configuration expects.
        db_path: Path to the main SQLite file whose ``documents`` table lists
            the documents to re-embed.

    Every failure (unreadable DB, undeletable file, failed query) is logged
    and ends the check early instead of raising.
    """
    if not os.path.exists(vec_db_path):
        return

    try:
        conn = sqlite3.connect(vec_db_path)
        try:
            row = conn.execute(
                "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'"
            ).fetchone()
        finally:
            # Close even when the query raises, so the handle is not leaked.
            conn.close()
    except Exception as exc:
        logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc)
        return

    # No such table, or a schema row without DDL text: nothing to compare.
    if not row or not row[0]:
        return

    # The dimension appears in the stored DDL as ``float[<n>]``.
    m = re.search(r'float\[(\d+)\]', row[0])
    if not m:
        return
    actual_dims = int(m.group(1))
    if actual_dims == expected_dims:
        return

    logger.warning(
        "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed",
        actual_dims, expected_dims, vec_db_path,
    )
    try:
        os.remove(vec_db_path)
    except OSError as exc:
        logger.error(
            "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc
        )
        return

    try:
        conn = sqlite3.connect(db_path)
        try:
            docs = conn.execute(
                "SELECT id, file_path FROM documents WHERE status='ready'"
            ).fetchall()
        finally:
            conn.close()
    except Exception as exc:
        logger.warning("Could not query documents for re-embed: %s", exc)
        return

    if not docs:
        return

    logger.info("Queuing re-embed for %d document(s) in background", len(docs))
    # Daemon thread: re-embedding runs in the background and does not keep
    # the process alive on shutdown.
    threading.Thread(
        target=reembed_docs,
        args=(docs, db_path, vec_db_path),
        daemon=True,
        name="pagepiper-reembed",
    ).start()
|