pagepiper/app/api/library.py
pyr0ball e52bdb5128 feat: RAG retrieval quality, artifact cleaning, and ingestion progress UI
Retrieval:
- Add _fetch_adjacent() to retriever: fetches page ± 1 chunks from DB
  after ranking so mid-sentence EPUB chunk boundaries don't lose context
- Fix vec DB doc-filter: oversample to top_k*20 before Python filter
  instead of post-filtering an already-small global pool (fixes wrong-book
  results when searching within a single document)
- top_k default 5 → 10; context per chunk 500 → 1500 chars; citation
  snippet 200 → 400 chars

Artifact cleaning:
- Add scripts/text_clean.py: strips ABC Amber LIT Converter watermarks,
  processtext.com URLs, bare page numbers, piracy stamps from extracted text
- Wire clean_paragraph() into ingest_pdf.py and new ingest_epub.py

Startup validation:
- _check_vec_schema() at boot: detects embedding dimension mismatch,
  deletes stale vec DB, and queues sequential re-embed in background thread
- Sequential _reembed_docs() prevents SQLite lock races on startup re-embed

cf-orch integration:
- Wire CF_ORCH_URL / CF_LICENSE_KEY into LLMRouter backend config so
  allocate() fires and keeps the Ollama model warm between requests

Ingestion progress UI:
- GET /api/library/{doc_id}/status now returns vec_count from page_vecs_meta
- DocumentCard.vue polls status every 3 s while processing and shows
  two-phase progress: indeterminate animation during extraction,
  determinate "Embedding N/M pages" bar once vectors start landing

Other:
- Chat feedback endpoint + thumbs up/down UI (FeedbackButton.vue)
- EPUB ingest script (ingest_epub.py) with heading-based chunking
- migration 002: chat_feedback table
- README.md with setup and feature overview
2026-05-06 08:25:58 -07:00

262 lines
8.5 KiB
Python

# app/api/library.py
"""
Document library management API.
All endpoints in this module are MIT — no tier gate.
"""
from __future__ import annotations
import logging
import sqlite3
import uuid
from pathlib import Path
from typing import Callable
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, UploadFile
from app.config import WATCH_DIR, DB_PATH, VEC_DB_PATH, DATA_DIR
from app.deps import get_db
# Upload size ceiling: upload_document answers HTTP 413 for anything larger.
_MAX_UPLOAD_BYTES = 200 * 1024 * 1024 # 200 MB
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# All routes declared in this module are served under the /api/library prefix.
router = APIRouter(prefix="/api/library", tags=["library"])
# Injected by main.py after _bm25 is created.
# Optional zero-argument hook: while it is still None the callers in this
# module skip it. It is invoked after a background ingest finishes and after
# a document is deleted (presumably to flag the BM25 index for a rebuild —
# confirm against main.py).
_mark_bm25_dirty: Callable[[], None] | None = None
# Lower-cased file suffix -> task name handed to cf-orch as ``caller``.
# The key set also doubles as the list of upload formats accepted here.
_INGEST_TASKS: dict[str, str] = {
    ".pdf": "pagepiper/ingest_pdf",
    ".epub": "pagepiper/ingest_epub",
}

# Lower-cased file suffix -> importable module whose ``run`` callable performs
# the ingest in-process when cf-orch cannot be used.
_INGEST_RUNNERS: dict[str, str] = {
    ".pdf": "scripts.ingest_pdf",
    ".epub": "scripts.ingest_epub",
}
def _dispatch_ingest(
    doc_id: str,
    file_path: str,
    background_tasks: BackgroundTasks,
) -> str:
    """Queue ingestion of one document and return the id of the queued task.

    The cf-orch dispatcher is tried first. If importing or calling it raises
    anything, the suffix-specific runner module is imported instead and its
    ``run`` callable is scheduled on ``background_tasks`` under a locally
    generated UUID.

    Args:
        doc_id: Id of the ``documents`` row being ingested.
        file_path: Path of the source file; its lower-cased suffix selects
            the task name and runner module. Unknown suffixes use the PDF
            pipeline.
        background_tasks: FastAPI task queue used for the in-process fallback.

    Returns:
        The task id returned by cf-orch, or the local UUID string when the
        fallback path was taken.
    """
    import importlib
    import os

    extension = Path(file_path).suffix.lower()
    orch_task = _INGEST_TASKS.get(extension, "pagepiper/ingest_pdf")
    fallback_module = _INGEST_RUNNERS.get(extension, "scripts.ingest_pdf")

    # Read lazily so test fixtures (monkeypatch.setenv) take effect
    data_root = Path(os.environ.get("PAGEPIPER_DATA_DIR", "data"))
    ingest_args = {
        "doc_id": doc_id,
        "file_path": file_path,
        "db_path": str(data_root / "pagepiper.db"),
        "vec_db_path": str(data_root / "pagepiper_vecs.db"),
    }

    # Pre-generated id; only kept if the cf-orch dispatch below does not succeed.
    task_id = str(uuid.uuid4())
    try:
        from circuitforge_core.tasks import dispatch_task  # type: ignore[import]

        task_id = dispatch_task(caller=orch_task, args=ingest_args)
        logger.info("Dispatched cf-orch ingest task %s for doc %s", task_id, doc_id)
    except Exception:
        # Any failure (missing package, dispatch error) means: run it ourselves.
        runner = importlib.import_module(fallback_module)
        background_tasks.add_task(
            _run_ingest_background, runner.run, ingest_args, task_id
        )
        logger.info(
            "cf-orch unavailable — running ingest in background thread (task %s)", task_id
        )
    return task_id
def _run_ingest_background(run_fn: Callable[..., None], args: dict, task_id: str) -> None:
    """Run an ingest callable in-process and mirror its state into the task registry.

    The registry entry for ``task_id`` moves from ``running`` to ``complete``
    or, if anything inside the run raises, to ``error`` with the exception
    text. The exception is logged and not re-raised.

    Args:
        run_fn: Ingest entry point; called as ``run_fn(**args)``.
        args: Keyword arguments forwarded to ``run_fn``.
        task_id: Key under which progress is recorded.
    """
    from app.api.ingest import _task_registry

    def _publish(**state: object) -> None:
        # Each update replaces the whole registry entry for this task.
        _task_registry[task_id] = state

    _publish(status="running", progress=0)
    try:
        run_fn(**args)
        _publish(status="complete", progress=100)
        on_index_changed = _mark_bm25_dirty
        if on_index_changed:
            on_index_changed()
    except Exception as exc:
        logger.exception("Ingest task %s failed", task_id)
        _publish(status="error", error=str(exc))
@router.get("")
def list_library(db: sqlite3.Connection = Depends(get_db)) -> list[dict]:
    """Return every row of ``documents`` as a dict, newest ``created_at`` first."""
    query = (
        "SELECT id, title, file_path, status, task_id, page_count, created_at"
        " FROM documents ORDER BY created_at DESC"
    )
    records = db.execute(query).fetchall()
    return list(map(dict, records))
@router.post("/scan", status_code=202)
def scan_library(
    background_tasks: BackgroundTasks,
    db: sqlite3.Connection = Depends(get_db),
) -> dict:
    """Scan the watched directory and queue ingest for supported files not yet indexed.

    Every regular file below ``WATCH_DIR`` whose suffix (compared
    case-insensitively) is a key of ``_INGEST_TASKS`` is considered. Files
    whose ``documents`` row is already ``ready`` are skipped; files without a
    row get one inserted with status ``pending``. Each remaining file is
    dispatched and its row is set to ``processing``.

    Args:
        background_tasks: FastAPI task queue, forwarded to ``_dispatch_ingest``.
        db: Connection to the documents database.

    Returns:
        ``{"discovered": <files found>, "queued": <files dispatched>,
        "tasks": [{"doc_id": ..., "task_id": ...}, ...]}``.

    Raises:
        HTTPException: 404 when the watch directory does not exist.
    """
    watch = WATCH_DIR
    if not watch.exists():
        raise HTTPException(status_code=404, detail=f"Watch directory not found: {watch}")

    # Match on the lower-cased suffix so "Book.PDF" is found too, and take the
    # supported set from _INGEST_TASKS so it cannot drift from the dispatcher.
    # is_file() keeps directories that merely look like "name.pdf" out.
    candidates = [
        path
        for path in watch.rglob("*")
        if path.suffix.lower() in _INGEST_TASKS and path.is_file()
    ]

    queued = []
    for doc_path in candidates:
        path_str = str(doc_path.resolve())
        existing = db.execute(
            "SELECT id, status FROM documents WHERE file_path = ?", [path_str]
        ).fetchone()
        if existing and existing["status"] == "ready":
            continue  # already indexed

        # NOTE(review): rows in any non-ready state (including 'processing')
        # are dispatched again on every scan — confirm this retry is intended.
        if existing:
            doc_id = existing["id"]
        else:
            title = doc_path.stem.replace("_", " ").replace("-", " ").title()
            doc_id = db.execute(
                "INSERT INTO documents(title, file_path, status) VALUES (?,?,?) RETURNING id",
                [title, path_str, "pending"],
            ).fetchone()[0]
            db.commit()

        task_id = _dispatch_ingest(doc_id, path_str, background_tasks)
        db.execute(
            "UPDATE documents SET status='processing', task_id=? WHERE id=?",
            [task_id, doc_id],
        )
        db.commit()
        queued.append({"doc_id": doc_id, "task_id": task_id})

    return {"discovered": len(candidates), "queued": len(queued), "tasks": queued}
@router.post("/{doc_id}/reingest", status_code=202)
def reingest_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    db: sqlite3.Connection = Depends(get_db),
) -> dict:
    """Queue a fresh ingest of an existing document.

    The row is put back into ``processing`` with the new task id and its
    ``error_msg`` is cleared.

    Returns:
        ``{"doc_id": ..., "task_id": ...}``.

    Raises:
        HTTPException: 404 when no document has this id.
    """
    lookup_sql = "SELECT file_path FROM documents WHERE id=?"
    record = db.execute(lookup_sql, [doc_id]).fetchone()
    if not record:
        raise HTTPException(status_code=404, detail="Document not found")

    source_path = record["file_path"]
    task_id = _dispatch_ingest(doc_id, source_path, background_tasks)

    reset_sql = (
        "UPDATE documents SET status='processing', task_id=?, error_msg=NULL WHERE id=?"
    )
    db.execute(reset_sql, [task_id, doc_id])
    db.commit()
    return {"doc_id": doc_id, "task_id": task_id}
@router.delete("/{doc_id}", status_code=204)
def delete_document(
    doc_id: str,
    db: sqlite3.Connection = Depends(get_db),
) -> None:
    """Delete a document row and, best-effort, its stored vectors.

    Failure to clean up the vector store is logged as a warning and does not
    fail the request. The BM25 hook, when set, is called at the end.

    Raises:
        HTTPException: 404 when no document has this id.
    """
    found = db.execute("SELECT id FROM documents WHERE id=?", [doc_id]).fetchone()
    if not found:
        raise HTTPException(status_code=404, detail="Document not found")

    db.execute("DELETE FROM documents WHERE id=?", [doc_id])
    db.commit()

    # Remove embeddings from vector store. The imports live inside the try so
    # a missing package is treated like any other cleanup failure.
    try:
        from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore  # type: ignore[import]
        from app.config import VEC_DIMENSIONS

        vec_store = LocalSQLiteVecStore(
            db_path=VEC_DB_PATH,
            table="page_vecs",
            dimensions=VEC_DIMENSIONS,
        )
        vec_store.delete_where({"doc_id": doc_id})
    except Exception as exc:
        logger.warning("Could not remove vectors for doc %s: %s", doc_id, exc)

    on_index_changed = _mark_bm25_dirty
    if on_index_changed:
        on_index_changed()
def _get_vec_count(doc_id: str) -> int:
    """Return how many vectors have been stored for this doc.

    Counts rows of ``page_vecs_meta`` in the database at ``VEC_DB_PATH`` whose
    JSON ``metadata`` has a ``doc_id`` equal to ``doc_id``.

    Args:
        doc_id: Document id to count vectors for.

    Returns:
        The number of matching rows, or 0 on any error (unreachable database,
        missing table, ...). This is deliberate: it lets callers treat the
        value as progress that simply has not started yet.
    """
    from contextlib import closing

    try:
        # closing() guarantees the connection is released even when the query
        # raises (e.g. the table does not exist yet); sqlite3's own context
        # manager only manages transactions, it does not close.
        with closing(sqlite3.connect(VEC_DB_PATH)) as conn:
            # NOTE(review): doc_id is bound as text; if ingest writes a numeric
            # doc_id into the metadata JSON this comparison will not match —
            # confirm against the ingest scripts.
            row = conn.execute(
                "SELECT COUNT(*) FROM page_vecs_meta WHERE json_extract(metadata, '$.doc_id') = ?",
                [doc_id],
            ).fetchone()
        return int(row[0])
    except Exception:
        return 0
@router.get("/{doc_id}/status")
def document_status(
    doc_id: str,
    db: sqlite3.Connection = Depends(get_db),
) -> dict:
    """Return a document's ingest status plus its stored-vector count.

    Returns:
        The row's ``id``, ``status``, ``task_id``, ``page_count`` and
        ``error_msg`` together with ``vec_count`` from ``_get_vec_count``.

    Raises:
        HTTPException: 404 when no document has this id.
    """
    status_sql = (
        "SELECT id, status, task_id, page_count, error_msg FROM documents WHERE id=?"
    )
    record = db.execute(status_sql, [doc_id]).fetchone()
    if not record:
        raise HTTPException(status_code=404, detail="Document not found")
    return {**dict(record), "vec_count": _get_vec_count(doc_id)}
@router.post("/upload", status_code=202)
def upload_document(
    file: UploadFile,
    background_tasks: BackgroundTasks,
    db: sqlite3.Connection = Depends(get_db),
) -> dict:
    """Accept a PDF/EPUB upload, save to data/uploads/, and queue for indexing.

    Only the final path component of the client-supplied filename is used.
    If a document with the same destination path is already ``ready``, nothing
    is written and the existing id is returned with status
    ``already_indexed``. Otherwise the file is saved, a ``documents`` row is
    created if needed, and an ingest task is dispatched.

    Args:
        file: The uploaded file.
        background_tasks: FastAPI task queue, forwarded to ``_dispatch_ingest``.
        db: Connection to the documents database.

    Returns:
        ``{"doc_id", "task_id", "filename", "status"}`` where ``status`` is
        ``"queued"`` or ``"already_indexed"`` (``task_id`` is None for the
        latter).

    Raises:
        HTTPException: 400 for an unsupported suffix, 413 when the upload is
            larger than ``_MAX_UPLOAD_BYTES``.
    """
    # Keep only the basename so the client cannot steer the write elsewhere.
    name = Path(file.filename or "").name
    suffix = Path(name).suffix.lower()
    if suffix not in _INGEST_TASKS:
        raise HTTPException(status_code=400, detail="Supported formats: PDF, EPUB")

    # Read at most one byte past the limit: enough to detect an oversized
    # upload without pulling an arbitrarily large body into memory.
    content = file.file.read(_MAX_UPLOAD_BYTES + 1)
    if len(content) > _MAX_UPLOAD_BYTES:
        raise HTTPException(status_code=413, detail="File exceeds 200 MB limit")

    upload_dir = DATA_DIR / "uploads"
    upload_dir.mkdir(parents=True, exist_ok=True)
    dest = upload_dir / name
    path_str = str(dest.resolve())

    existing = db.execute(
        "SELECT id, status FROM documents WHERE file_path = ?", [path_str]
    ).fetchone()
    if existing and existing["status"] == "ready":
        # Checked before writing so the file backing an indexed document is
        # not silently replaced by content that was never indexed.
        return {"doc_id": existing["id"], "task_id": None, "filename": name, "status": "already_indexed"}

    dest.write_bytes(content)

    if existing:
        doc_id = existing["id"]
    else:
        title = dest.stem.replace("_", " ").replace("-", " ").title()
        doc_id = db.execute(
            "INSERT INTO documents(title, file_path, status) VALUES (?,?,?) RETURNING id",
            [title, path_str, "pending"],
        ).fetchone()[0]
        db.commit()

    task_id = _dispatch_ingest(doc_id, path_str, background_tasks)
    db.execute(
        "UPDATE documents SET status='processing', task_id=? WHERE id=?",
        [task_id, doc_id],
    )
    db.commit()
    return {"doc_id": doc_id, "task_id": task_id, "filename": name, "status": "queued"}