pagepiper/app/main.py
pyr0ball e52bdb5128 feat: RAG retrieval quality, artifact cleaning, and ingestion progress UI
Retrieval:
- Add _fetch_adjacent() to retriever: fetches page ± 1 chunks from DB
  after ranking so mid-sentence EPUB chunk boundaries don't lose context
- Fix vec DB doc-filter: oversample to top_k*20 before Python filter
  instead of post-filtering an already-small global pool (fixes wrong-book
  results when searching within a single document)
- top_k default 5 → 10; context per chunk 500 → 1500 chars; citation
  snippet 200 → 400 chars

Artifact cleaning:
- Add scripts/text_clean.py: strips ABC Amber LIT Converter watermarks,
  processtext.com URLs, bare page numbers, piracy stamps from extracted text
- Wire clean_paragraph() into ingest_pdf.py and new ingest_epub.py

Startup validation:
- _check_vec_schema() at boot: detects embedding dimension mismatch,
  deletes stale vec DB, and queues sequential re-embed in background thread
- Sequential _reembed_docs() prevents SQLite lock races on startup re-embed

cf-orch integration:
- Wire CF_ORCH_URL / CF_LICENSE_KEY into LLMRouter backend config so
  allocate() fires and keeps the Ollama model warm between requests

Ingestion progress UI:
- GET /api/library/{doc_id}/status now returns vec_count from page_vecs_meta
- DocumentCard.vue polls status every 3 s while processing and shows
  two-phase progress: indeterminate animation during extraction,
  determinate "Embedding N/M pages" bar once vectors start landing

Other:
- Chat feedback endpoint + thumbs up/down UI (FeedbackButton.vue)
- EPUB ingest script (ingest_epub.py) with heading-based chunking
- migration 002: chat_feedback table
- README.md with setup and feature overview
2026-05-06 08:25:58 -07:00

136 lines
4.6 KiB
Python

# app/main.py
"""FastAPI application factory for pagepiper."""
from __future__ import annotations
import logging
import os
import re
import sqlite3
import threading
from contextlib import asynccontextmanager
from fastapi import FastAPI
from app.config import DB_PATH, VEC_DB_PATH, VEC_DIMENSIONS
from app.services.bm25_index import BM25Index
# Logger named "pagepiper"; the functions in this module log through it.
logger = logging.getLogger("pagepiper")
# Module-level BM25 singleton — shared across all requests.
# It is created once at import time. The startup hook in this module calls
# its mark_dirty(), and the same bound method is assigned onto the library
# API module further down in this file.
_bm25 = BM25Index()
def _apply_migrations() -> None:
    """Run the ``scripts.db_migrate`` migration routine against ``DB_PATH``."""
    # Imported at call time rather than at module import time.
    import scripts.db_migrate as db_migrate

    db_migrate.migrate(DB_PATH)
def _reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None:
    """Re-run full ingest for a list of (doc_id, file_path) sequentially.

    Documents are processed one after another in the calling thread. A failure
    on one document is logged (with traceback) and does not prevent the
    remaining documents from being processed.

    Args:
        docs: ``(doc_id, file_path)`` pairs to re-ingest.
        db_path: Path of the main database, forwarded to the ingest ``run``.
        vec_db_path: Path of the vector database, forwarded to the ingest ``run``.
    """
    for doc_id, file_path in docs:
        try:
            # Kept inside the try on purpose: a row whose file_path is NULL or
            # not a string makes splitext() raise, and that must only skip this
            # document rather than abort the whole batch.
            suffix = os.path.splitext(file_path)[1].lower()
            if suffix == ".epub":
                from scripts.ingest_epub import run
            else:
                # Everything that is not an EPUB goes through the PDF ingester.
                from scripts.ingest_pdf import run
            logger.info("Auto re-embed: starting %s", os.path.basename(file_path))
            run(doc_id=doc_id, file_path=file_path, db_path=db_path, vec_db_path=vec_db_path)
        except Exception as exc:
            # str() so that a non-string id cannot raise inside the handler.
            logger.error(
                "Auto re-embed failed for doc %s: %s",
                str(doc_id)[:8],
                exc,
                exc_info=True,
            )
def _check_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None:
    """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed.

    sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing
    models requires dropping and recreating the whole file. Catches the mismatch at
    startup rather than surfacing it as an obscure OperationalError mid-request.

    Failures while reading the vec DB, deleting it, or querying the documents
    table are logged and end the check early; they are not re-raised.

    Args:
        vec_db_path: Path of the vector database file to inspect.
        expected_dims: Embedding dimension the application is configured for.
        db_path: Path of the main database holding the ``documents`` table.
    """
    if not os.path.exists(vec_db_path):
        return

    try:
        conn = sqlite3.connect(vec_db_path)
        try:
            row = conn.execute(
                "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'"
            ).fetchone()
        finally:
            # Close even when the query raises, so the handle is not leaked.
            conn.close()
    except Exception as exc:
        logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc)
        return

    if not row or not row[0]:
        return  # table not yet created — first embed will build it with the right dims

    # The dimension appears in the stored DDL as ``float[N]``.
    match = re.search(r"float\[(\d+)\]", row[0])
    if not match:
        return

    actual_dims = int(match.group(1))
    if actual_dims == expected_dims:
        return

    logger.warning(
        "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed",
        actual_dims, expected_dims, vec_db_path,
    )
    try:
        os.remove(vec_db_path)
    except OSError as exc:
        logger.error(
            "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc
        )
        return

    # Collect all ready docs so we can rebuild their embeddings in the background.
    try:
        conn = sqlite3.connect(db_path)
        try:
            docs = conn.execute(
                "SELECT id, file_path FROM documents WHERE status='ready'"
            ).fetchall()
        finally:
            conn.close()
    except Exception as exc:
        logger.warning("Could not query documents for re-embed: %s", exc)
        return

    if not docs:
        return

    logger.info("Queuing re-embed for %d document(s) in background", len(docs))
    # One daemon thread handles the whole list, so documents are re-embedded
    # one at a time and the thread does not block interpreter exit.
    threading.Thread(
        target=_reembed_docs,
        args=(docs, db_path, vec_db_path),
        daemon=True,
        name="pagepiper-reembed",
    ).start()
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup work for the application, then hand control to FastAPI.

    In order: apply migrations, log the embedding configuration, check the
    vec DB schema against the configured dimension, and mark the BM25 index
    dirty. No code runs after the ``yield``.
    """
    _apply_migrations()

    model_name = os.environ.get("PAGEPIPER_EMBED_MODEL", "nomic-embed-text")
    logger.info(
        "Pagepiper starting — embed model: %s, dims: %d",
        model_name,
        VEC_DIMENSIONS,
    )

    _check_vec_schema(VEC_DB_PATH, VEC_DIMENSIONS, DB_PATH)

    # will rebuild on first search
    _bm25.mark_dirty()

    yield
# The FastAPI application object; `lifespan` is registered as its lifespan handler.
app = FastAPI(title="Pagepiper", lifespan=lifespan)
# Wire BM25 dirty callback into library router
# The library module's `_mark_bm25_dirty` attribute is replaced with the shared
# index's bound `mark_dirty` method. This happens before the remaining router
# modules are imported below, so keep the statement order as is.
# NOTE(review): presumably the library endpoints call this after the document
# set changes — confirm in app/api/library.py.
from app.api import library as _lib_module # noqa: E402
_lib_module._mark_bm25_dirty = _bm25.mark_dirty
# Register routers
# These imports come after `app` is created (hence the E402 suppressions).
from app.api.library import router as library_router # noqa: E402
from app.api.ingest import router as ingest_router # noqa: E402
from app.api.search import router as search_router # noqa: E402
from app.api.chat import router as chat_router # noqa: E402
from app.api.feedback import router as feedback_router # noqa: E402
from app.api.feedback_attach import router as feedback_attach_router # noqa: E402
# The first four routers are included without a prefix here.
app.include_router(library_router)
app.include_router(ingest_router)
app.include_router(search_router)
app.include_router(chat_router)
# Both feedback routers are mounted under the same "/api/v1/feedback" prefix.
app.include_router(feedback_router, prefix="/api/v1/feedback")
app.include_router(feedback_attach_router, prefix="/api/v1/feedback")