Implements Option A from the issue design: each cloud user gets their own
data directory (DATA_DIR/users/{user_id}/) with separate pagepiper.db,
pagepiper_vecs.db, uploads/, and books/. Local mode is unchanged.
Key changes:
- app/startup.py: extract apply_migrations, reembed_docs,
check_and_rebuild_vec_schema out of main.py (no circular imports)
- app/config.py: add LOCAL_USER_ID constant and user_data_dir() helper
- app/cloud_session.py: extract resolve_authenticated_user(); require_paid_tier
now returns user_id (str) instead of None
- app/deps.py: add UserCtx dataclass (db_path, vec_db_path, data_dir,
watch_dir, bm25) + get_user_ctx dependency; per-user startup guard runs
migrations + vec schema check once per process per user
- app/main.py: _bm25 singleton -> _bm25_map dict keyed by user_id;
add _get_bm25_for(); lifespan only runs startup checks in local mode
- app/api/library.py, search.py, chat.py: thread UserCtx through all
endpoints; remove module-level _mark_bm25_dirty injection pattern
- tests/conftest.py: override get_user_ctx in addition to get_db so all
endpoints get a consistent test UserCtx
277 lines
8.7 KiB
Python
277 lines
8.7 KiB
Python
# app/api/library.py
|
|
"""
|
|
Document library management API.
|
|
|
|
All endpoints in this module are MIT — no tier gate.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import sqlite3
|
|
import uuid
|
|
from pathlib import Path
|
|
from typing import Callable
|
|
|
|
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, UploadFile
|
|
|
|
from app.config import VEC_DIMENSIONS
|
|
from app.deps import UserCtx, get_db, get_user_ctx
|
|
|
|
# Upper bound on a single upload; upload_document answers 413 above this size.
_MAX_UPLOAD_BYTES = 200 * 1024 * 1024 # 200 MB

logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/library", tags=["library"])


# Lower-cased file suffix -> task name handed to dispatch_task(caller=...).
# Its keys also serve as the list of accepted upload formats.
_INGEST_TASKS = {
    ".pdf": "pagepiper/ingest_pdf",
    ".epub": "pagepiper/ingest_epub",
    ".docx": "pagepiper/ingest_docx",
}

# Lower-cased file suffix -> module imported for the in-process fallback;
# the module's run() callable is what gets scheduled as a background task.
_INGEST_RUNNERS = {
    ".pdf": "scripts.ingest_pdf",
    ".epub": "scripts.ingest_epub",
    ".docx": "scripts.ingest_docx",
}
|
|
|
|
|
|
def _dispatch_ingest(
    doc_id: str,
    file_path: str,
    background_tasks: BackgroundTasks,
    data_dir: Path,
    mark_dirty_fn: Callable[[], None],
) -> str:
    """Queue ingestion of one document and return the id of the queued task.

    cf-orch (``circuitforge_core.tasks.dispatch_task``) is tried first. If
    importing or calling it raises, the runner module matching the file
    suffix is imported and its ``run`` is scheduled on ``background_tasks``
    under a locally generated UUID. A suffix with no table entry is treated
    as a PDF.

    ``mark_dirty_fn`` is only forwarded on the fallback path; nothing in this
    function invokes it when cf-orch accepts the task.
    """
    import importlib

    ext = Path(file_path).suffix.lower()
    orch_task = _INGEST_TASKS.get(ext, "pagepiper/ingest_pdf")
    fallback_module = _INGEST_RUNNERS.get(ext, "scripts.ingest_pdf")

    # Both databases live directly under the caller-supplied data directory.
    payload = {
        "doc_id": doc_id,
        "file_path": file_path,
        "db_path": str(data_dir / "pagepiper.db"),
        "vec_db_path": str(data_dir / "pagepiper_vecs.db"),
    }

    # Provisional id; replaced by whatever cf-orch returns when it is reachable.
    ticket = str(uuid.uuid4())

    try:
        from circuitforge_core.tasks import dispatch_task  # type: ignore[import]

        ticket = dispatch_task(caller=orch_task, args=payload)
        logger.info("Dispatched cf-orch ingest task %s for doc %s", ticket, doc_id)
    except Exception:
        runner = importlib.import_module(fallback_module)
        background_tasks.add_task(
            _run_ingest_background, runner.run, payload, ticket, mark_dirty_fn
        )
        logger.info(
            "cf-orch unavailable — running ingest in background thread (task %s)", ticket
        )

    return ticket
|
|
|
|
|
|
def _run_ingest_background(
    run_fn: Callable[..., None],
    args: dict,
    task_id: str,
    mark_dirty_fn: Callable[[], None] | None = None,
) -> None:
    """Run an ingest callable in-process and mirror its state in the task registry.

    The registry entry for ``task_id`` goes ``running`` -> ``complete`` on
    success, or ``running`` -> ``error`` (with the exception text) on failure.
    ``mark_dirty_fn``, when given, is called after a successful run; because
    it sits inside the same ``try``, an exception raised by it is also
    recorded as an ``error`` entry. Exceptions are logged, never re-raised.
    """
    from app.api.ingest import _task_registry

    def _record(**state) -> None:
        # Each update replaces the whole entry rather than merging into it.
        _task_registry[task_id] = state

    _record(status="running", progress=0)
    try:
        run_fn(**args)
        _record(status="complete", progress=100)
        if mark_dirty_fn:
            mark_dirty_fn()
    except Exception as failure:
        logger.exception("Ingest task %s failed", task_id)
        _record(status="error", error=str(failure))
|
|
|
|
|
|
@router.get("")
def list_library(db: sqlite3.Connection = Depends(get_db)) -> list[dict]:
    """Return all rows of ``documents`` as dicts, ordered by ``created_at`` descending."""
    query = (
        "SELECT id, title, file_path, status, task_id, page_count, created_at"
        " FROM documents ORDER BY created_at DESC"
    )
    records = db.execute(query).fetchall()
    return [dict(record) for record in records]
|
|
|
|
|
|
@router.post("/scan", status_code=202)
def scan_library(
    background_tasks: BackgroundTasks,
    db: sqlite3.Connection = Depends(get_db),
    ctx: UserCtx = Depends(get_user_ctx),
) -> dict:
    """Scan the watch directory and queue ingest for documents not yet indexed.

    Every file below ``ctx.watch_dir`` whose lower-cased suffix is a key of
    ``_INGEST_TASKS`` (PDF, EPUB, DOCX) is considered. Files whose
    ``documents`` row already has status ``ready`` are skipped; files without
    a row get one inserted as ``pending``; every non-skipped file is
    dispatched and its row set to ``processing``.

    Returns:
        ``{"discovered": <files found>, "queued": <files dispatched>,
        "tasks": [{"doc_id": ..., "task_id": ...}, ...]}``

    Raises:
        HTTPException: 404 when the watch directory does not exist.
    """
    watch = ctx.watch_dir
    if not watch.exists():
        raise HTTPException(status_code=404, detail=f"Watch directory not found: {watch}")

    # Filter on the lower-cased suffix instead of globbing "*.pdf" etc.:
    # glob patterns are case-sensitive on POSIX, so "Book.PDF" was skipped
    # here although upload_document and _dispatch_ingest both accept it.
    # Driving the filter from _INGEST_TASKS keeps the scan in step with the
    # supported formats, and is_file() drops directories that merely carry a
    # matching suffix.
    candidates = [
        path
        for path in watch.rglob("*")
        if path.suffix.lower() in _INGEST_TASKS and path.is_file()
    ]
    queued = []

    for doc_path in candidates:
        path_str = str(doc_path.resolve())
        existing = db.execute(
            "SELECT id, status FROM documents WHERE file_path = ?", [path_str]
        ).fetchone()

        if existing and existing["status"] == "ready":
            continue

        # NOTE(review): rows in any non-"ready" state, including "processing",
        # are dispatched again on every scan. Confirm this retry behaviour is
        # intended and cannot start a duplicate ingest for an in-flight task.
        if existing:
            doc_id = existing["id"]
        else:
            title = doc_path.stem.replace("_", " ").replace("-", " ").title()
            doc_id = db.execute(
                "INSERT INTO documents(title, file_path, status) VALUES (?,?,?) RETURNING id",
                [title, path_str, "pending"],
            ).fetchone()[0]
            # Commit before dispatching so the ingest task, which opens its
            # own connection via db_path, can see the new row.
            db.commit()

        task_id = _dispatch_ingest(
            doc_id, path_str, background_tasks, ctx.data_dir, ctx.bm25.mark_dirty
        )
        db.execute(
            "UPDATE documents SET status='processing', task_id=? WHERE id=?",
            [task_id, doc_id],
        )
        db.commit()
        queued.append({"doc_id": doc_id, "task_id": task_id})

    return {"discovered": len(candidates), "queued": len(queued), "tasks": queued}
|
|
|
|
|
|
@router.post("/{doc_id}/reingest", status_code=202)
def reingest_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    db: sqlite3.Connection = Depends(get_db),
    ctx: UserCtx = Depends(get_user_ctx),
) -> dict:
    """Dispatch a fresh ingest for an existing document.

    Looks up the stored ``file_path``, dispatches ingest for it, then marks
    the row ``processing`` with the new task id and clears ``error_msg``.

    Raises:
        HTTPException: 404 when no document has the given id.
    """
    found = db.execute("SELECT file_path FROM documents WHERE id=?", [doc_id]).fetchone()
    if not found:
        raise HTTPException(status_code=404, detail="Document not found")

    source_path = found["file_path"]
    new_task = _dispatch_ingest(
        doc_id, source_path, background_tasks, ctx.data_dir, ctx.bm25.mark_dirty
    )

    db.execute(
        "UPDATE documents SET status='processing', task_id=?, error_msg=NULL WHERE id=?",
        [new_task, doc_id],
    )
    db.commit()

    return {"doc_id": doc_id, "task_id": new_task}
|
|
|
|
|
|
@router.delete("/{doc_id}", status_code=204)
def delete_document(
    doc_id: str,
    db: sqlite3.Connection = Depends(get_db),
    ctx: UserCtx = Depends(get_user_ctx),
) -> None:
    """Delete a document row, its vectors (best-effort), and flag BM25 as stale.

    Raises:
        HTTPException: 404 when no document has the given id.
    """
    if not db.execute("SELECT id FROM documents WHERE id=?", [doc_id]).fetchone():
        raise HTTPException(status_code=404, detail="Document not found")

    db.execute("DELETE FROM documents WHERE id=?", [doc_id])
    db.commit()

    # The row is already gone at this point; a failure to import or use the
    # vector store is only logged so the request still succeeds.
    try:
        from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore  # type: ignore[import]

        vec_store = LocalSQLiteVecStore(
            db_path=ctx.vec_db_path, table="page_vecs", dimensions=VEC_DIMENSIONS
        )
        vec_store.delete_where({"doc_id": doc_id})
    except Exception as err:
        logger.warning("Could not remove vectors for doc %s: %s", doc_id, err)

    ctx.bm25.mark_dirty()
|
|
|
|
|
|
def _get_vec_count(doc_id: str, vec_db_path: str) -> int:
    """Count rows in ``page_vecs_meta`` whose JSON metadata has this ``doc_id``.

    Best-effort helper for status reporting: any failure (database cannot be
    opened, table missing, query error) yields ``0`` instead of raising.

    Args:
        doc_id: Value compared against ``$.doc_id`` in the ``metadata`` column.
        vec_db_path: Filesystem path of the SQLite vector database.

    Returns:
        The number of matching rows, or ``0`` on any error.
    """
    try:
        conn = sqlite3.connect(vec_db_path)
        try:
            row = conn.execute(
                "SELECT COUNT(*) FROM page_vecs_meta WHERE json_extract(metadata, '$.doc_id') = ?",
                [doc_id],
            ).fetchone()
        finally:
            # Close on every path: previously a failing query (e.g. the table
            # not existing yet) skipped close() and leaked the connection.
            conn.close()
        return int(row[0])
    except Exception:
        return 0
|
|
|
|
|
|
@router.get("/{doc_id}/status")
def document_status(
    doc_id: str,
    db: sqlite3.Connection = Depends(get_db),
    ctx: UserCtx = Depends(get_user_ctx),
) -> dict:
    """Return a document's ingest status plus its stored vector count.

    The response carries ``id``, ``status``, ``task_id``, ``page_count`` and
    ``error_msg`` from the ``documents`` row, followed by ``vec_count`` as
    reported by ``_get_vec_count`` for the caller's vector database.

    Raises:
        HTTPException: 404 when no document has the given id.
    """
    record = db.execute(
        "SELECT id, status, task_id, page_count, error_msg FROM documents WHERE id=?",
        [doc_id],
    ).fetchone()
    if not record:
        raise HTTPException(status_code=404, detail="Document not found")

    return {**dict(record), "vec_count": _get_vec_count(doc_id, ctx.vec_db_path)}
|
|
|
|
|
|
@router.post("/upload", status_code=202)
def upload_document(
    file: UploadFile,
    background_tasks: BackgroundTasks,
    db: sqlite3.Connection = Depends(get_db),
    ctx: UserCtx = Depends(get_user_ctx),
) -> dict:
    """Accept a PDF/EPUB/DOCX upload, save it under ``<data_dir>/uploads/``, and queue indexing.

    Only the final path component of the client-supplied filename is used, so
    the file always lands directly inside the uploads directory.

    Returns:
        ``{"doc_id", "task_id", "filename", "status"}`` where ``status`` is
        ``"queued"``, or ``"already_indexed"`` (with ``task_id`` ``None``)
        when a ``ready`` row already exists for the destination path.

    Raises:
        HTTPException: 400 for an unsupported suffix; 413 when the body
            exceeds ``_MAX_UPLOAD_BYTES``.
    """
    name = Path(file.filename or "").name
    suffix = Path(name).suffix.lower()
    if suffix not in _INGEST_TASKS:
        raise HTTPException(status_code=400, detail="Supported formats: PDF, EPUB, DOCX")

    # Read at most one byte past the cap. That is enough to detect an
    # oversized upload, and it keeps an arbitrarily large body from being
    # pulled into memory before the size check runs.
    content = file.file.read(_MAX_UPLOAD_BYTES + 1)
    if len(content) > _MAX_UPLOAD_BYTES:
        raise HTTPException(status_code=413, detail="File exceeds 200 MB limit")

    upload_dir = ctx.data_dir / "uploads"
    upload_dir.mkdir(parents=True, exist_ok=True)
    dest = upload_dir / name
    # NOTE(review): this overwrites any earlier upload of the same name before
    # the "already indexed" check below, so a "ready" document can end up with
    # file contents that differ from what was indexed. Confirm this is intended.
    dest.write_bytes(content)

    path_str = str(dest.resolve())
    existing = db.execute(
        "SELECT id, status FROM documents WHERE file_path = ?", [path_str]
    ).fetchone()

    if existing and existing["status"] == "ready":
        return {"doc_id": existing["id"], "task_id": None, "filename": name, "status": "already_indexed"}

    if existing:
        doc_id = existing["id"]
    else:
        title = dest.stem.replace("_", " ").replace("-", " ").title()
        doc_id = db.execute(
            "INSERT INTO documents(title, file_path, status) VALUES (?,?,?) RETURNING id",
            [title, path_str, "pending"],
        ).fetchone()[0]
        # Commit before dispatching so the ingest task, which opens its own
        # connection via db_path, can see the new row.
        db.commit()

    task_id = _dispatch_ingest(
        doc_id, path_str, background_tasks, ctx.data_dir, ctx.bm25.mark_dirty
    )
    db.execute(
        "UPDATE documents SET status='processing', task_id=? WHERE id=?",
        [task_id, doc_id],
    )
    db.commit()
    return {"doc_id": doc_id, "task_id": task_id, "filename": name, "status": "queued"}
|