Retrieval:
- Add _fetch_adjacent() to retriever: fetches page ± 1 chunks from DB
after ranking so mid-sentence EPUB chunk boundaries don't lose context
- Fix vec DB doc-filter: oversample to top_k*20 before Python filter
instead of post-filtering an already-small global pool (fixes wrong-book
results when searching within a single document)
- top_k default 5 → 10; context per chunk 500 → 1500 chars; citation
snippet 200 → 400 chars
Artifact cleaning:
- Add scripts/text_clean.py: strips ABC Amber LIT Converter watermarks,
processtext.com URLs, bare page numbers, piracy stamps from extracted text
- Wire clean_paragraph() into ingest_pdf.py and new ingest_epub.py
Startup validation:
- _check_vec_schema() at boot: detects embedding dimension mismatch,
deletes stale vec DB, and queues sequential re-embed in background thread
- Sequential _reembed_docs() prevents SQLite lock races on startup re-embed
cf-orch integration:
- Wire CF_ORCH_URL / CF_LICENSE_KEY into LLMRouter backend config so
allocate() fires and keeps the Ollama model warm between requests
Ingestion progress UI:
- GET /api/library/{doc_id}/status now returns vec_count from page_vecs_meta
- DocumentCard.vue polls status every 3 s while processing and shows
two-phase progress: indeterminate animation during extraction,
determinate "Embedding N/M pages" bar once vectors start landing
Other:
- Chat feedback endpoint + thumbs up/down UI (FeedbackButton.vue)
- EPUB ingest script (ingest_epub.py) with heading-based chunking
- migration 002: chat_feedback table
- README.md with setup and feature overview
136 lines
4.6 KiB
Python
136 lines
4.6 KiB
Python
# app/main.py
|
|
"""FastAPI application factory for pagepiper."""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import sqlite3
|
|
import threading
|
|
from contextlib import asynccontextmanager
|
|
|
|
from fastapi import FastAPI
|
|
|
|
from app.config import DB_PATH, VEC_DB_PATH, VEC_DIMENSIONS
|
|
from app.services.bm25_index import BM25Index
|
|
|
|
# Application logger, registered under the fixed name "pagepiper".
logger: logging.Logger = logging.getLogger("pagepiper")

# One BM25 index instance, created at import time and shared by every request
# handled by this process.
_bm25: BM25Index = BM25Index()
|
|
|
|
|
|
def _apply_migrations() -> None:
    """Run the project's migration routine against the database at ``DB_PATH``.

    The migration module is imported at call time, not when this module is
    first loaded.
    """
    from scripts.db_migrate import migrate as run_migrations

    run_migrations(DB_PATH)
|
|
|
|
|
|
def _reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None:
    """Re-run full ingest for a list of (doc_id, file_path) sequentially.

    Documents are processed one at a time, in the order given. A failure on
    one document is logged with its traceback and never prevents the remaining
    documents from being attempted.

    Args:
        docs: ``(doc_id, file_path)`` pairs to ingest again.
        db_path: Path of the main database, forwarded to the ingest script.
        vec_db_path: Path of the vector database, forwarded to the ingest script.
    """
    for doc_id, file_path in docs:
        try:
            # Kept inside the try: a malformed row (e.g. a NULL file_path) must
            # only skip that document, not raise out of the loop and abandon
            # every document still waiting behind it.
            suffix = os.path.splitext(file_path)[1].lower()

            # ".epub" goes to the EPUB ingester; every other extension is
            # handed to the PDF ingester.
            if suffix == ".epub":
                from scripts.ingest_epub import run
            else:
                from scripts.ingest_pdf import run

            logger.info("Auto re-embed: starting %s", os.path.basename(file_path))
            run(doc_id=doc_id, file_path=file_path, db_path=db_path, vec_db_path=vec_db_path)
        except Exception as exc:
            # exc_info keeps the traceback; this may run on a background thread
            # where the log record is the only trace of the failure.
            # str() guards the slice so the handler itself cannot raise on a
            # non-string id.
            logger.error(
                "Auto re-embed failed for doc %s: %s", str(doc_id)[:8], exc, exc_info=True
            )
|
|
|
|
|
|
def _check_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None:
    """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed.

    sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing
    models requires dropping and recreating the whole file. Catches the mismatch at
    startup rather than surfacing it as an obscure OperationalError mid-request.

    Every failure along the way is logged and treated as non-fatal: the function
    returns without raising so application startup can continue.

    Args:
        vec_db_path: Path of the vector database file to inspect.
        expected_dims: Embedding dimension the application is configured for.
        db_path: Path of the main database; queried for documents whose status
            is ``'ready'`` when a re-embed has to be queued.
    """
    if not os.path.exists(vec_db_path):
        return

    try:
        conn = sqlite3.connect(vec_db_path)
        try:
            row = conn.execute(
                "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'"
            ).fetchone()
        finally:
            # Always release the handle, even when the query fails; a lingering
            # handle on this file could also get in the way of the os.remove()
            # below on platforms that refuse to delete open files.
            conn.close()
    except Exception as exc:
        logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc)
        return

    # No row: table not yet created — first embed will build it with the right dims.
    # A row whose DDL text is NULL/empty carries nothing to parse either.
    if not row or not row[0]:
        return

    m = re.search(r'float\[(\d+)\]', row[0])
    if not m:
        return
    actual_dims = int(m.group(1))
    if actual_dims == expected_dims:
        return

    logger.warning(
        "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed",
        actual_dims, expected_dims, vec_db_path,
    )
    try:
        os.remove(vec_db_path)
    except OSError as exc:
        logger.error(
            "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc
        )
        return

    # Collect all ready docs so we can rebuild their embeddings in the background.
    try:
        conn = sqlite3.connect(db_path)
        try:
            docs = conn.execute(
                "SELECT id, file_path FROM documents WHERE status='ready'"
            ).fetchall()
        finally:
            conn.close()
    except Exception as exc:
        logger.warning("Could not query documents for re-embed: %s", exc)
        return

    if not docs:
        return

    logger.info("Queuing re-embed for %d document(s) in background", len(docs))
    # Daemon thread: an in-flight re-embed does not keep the process alive at exit.
    threading.Thread(
        target=_reembed_docs,
        args=(docs, db_path, vec_db_path),
        daemon=True,
        name="pagepiper-reembed",
    ).start()
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup work for the application, then hand control to the server.

    Before yielding, in order: apply migrations, log the configured embedding
    model and dimension, validate the vector DB schema against the configured
    dimension, and mark the shared BM25 index dirty. Nothing runs after the
    yield.
    """
    _apply_migrations()
    logger.info(
        "Pagepiper starting — embed model: %s, dims: %d",
        os.environ.get("PAGEPIPER_EMBED_MODEL", "nomic-embed-text"),
        VEC_DIMENSIONS,
    )
    _check_vec_schema(
        vec_db_path=VEC_DB_PATH,
        expected_dims=VEC_DIMENSIONS,
        db_path=DB_PATH,
    )
    # Flag the index as stale so it is rebuilt on the first search.
    _bm25.mark_dirty()
    yield
|
|
|
|
|
|
app = FastAPI(title="Pagepiper", lifespan=lifespan)

# Give the library router module a way to invalidate the shared BM25 index:
# its module-level `_mark_bm25_dirty` attribute is pointed at the singleton's
# mark_dirty method.
from app.api import library as _lib_module  # noqa: E402

_lib_module._mark_bm25_dirty = _bm25.mark_dirty

# Router imports stay below the app construction (hence the E402 waivers).
from app.api.library import router as library_router  # noqa: E402
from app.api.ingest import router as ingest_router  # noqa: E402
from app.api.search import router as search_router  # noqa: E402
from app.api.chat import router as chat_router  # noqa: E402
from app.api.feedback import router as feedback_router  # noqa: E402
from app.api.feedback_attach import router as feedback_attach_router  # noqa: E402

# Registration order is preserved: library, ingest, search, chat, then the two
# feedback routers, which are both mounted under the same prefix.
for _router in (library_router, ingest_router, search_router, chat_router):
    app.include_router(_router)

for _router in (feedback_router, feedback_attach_router):
    app.include_router(_router, prefix="/api/v1/feedback")

# Do not leave the loop variable behind as a module attribute.
del _router
|