diff --git a/.env.cloud.example b/.env.cloud.example index 1c8fb80..0063c7c 100644 --- a/.env.cloud.example +++ b/.env.cloud.example @@ -10,7 +10,9 @@ PAGEPIPER_BOOKS_DIR=/devl/pagepiper-cloud-data/books PAGEPIPER_OLLAMA_URL= # Embedding and chat model selection (only used when PAGEPIPER_OLLAMA_URL is set) -PAGEPIPER_EMBED_MODEL=nomic-embed-text +# mxbai-embed-large (1024-dim) is recommended; nomic-embed-text uses 768-dim +PAGEPIPER_EMBED_MODEL=mxbai-embed-large +PAGEPIPER_EMBED_DIMS=1024 PAGEPIPER_CHAT_MODEL=mistral:7b # Heimdall license server (optional — for per-user tier validation) @@ -20,3 +22,17 @@ HEIMDALL_ADMIN_TOKEN= # cf-orch streaming proxy — coordinator product key # Must match COORDINATOR_PRODUCT_KEYS["pagepiper"] in cf-orch.env on the coordinator COORDINATOR_PAGEPIPER_KEY= + +# cf-orch coordinator URL — routes chat/embed calls through managed GPU allocation +# CF_LICENSE_KEY is the auth token sent to the coordinator (same value as COORDINATOR_PAGEPIPER_KEY) +# Leave CF_ORCH_URL blank to skip allocation and hit PAGEPIPER_OLLAMA_URL directly +CF_ORCH_URL= +CF_LICENSE_KEY= +CF_APP_NAME=pagepiper + +# Forgejo API token — enables in-app feedback button (files issues to Circuit-Forge/pagepiper) +FORGEJO_API_TOKEN= + +# Enable thumbs up/down on chat answers (stores retrieval quality signals locally) +# Off by default — opt in when you want to collect correction data +# PAGEPIPER_CHAT_FEEDBACK=true diff --git a/.env.example b/.env.example index 2f3bcd1..473929f 100644 --- a/.env.example +++ b/.env.example @@ -10,3 +10,11 @@ PAGEPIPER_DATA_DIR=data # PAGEPIPER_OLLAMA_URL=http://localhost:11434 # PAGEPIPER_CHAT_MODEL=mistral:7b # PAGEPIPER_EMBED_MODEL=nomic-embed-text + +# Forgejo API token — enables the in-app feedback button (files Forgejo issues) +# Create a token at https://git.opensourcesolarpunk.com/user/settings/applications +# FORGEJO_API_TOKEN= + +# Enable thumbs up/down on chat answers (stores retrieval quality signals locally) +# Off by default — opt in when you want to collect correction data +# PAGEPIPER_CHAT_FEEDBACK=true diff --git a/README.md b/README.md new file mode 100644 index 0000000..d2e487f --- /dev/null +++ b/README.md @@ -0,0 +1,197 @@ +# Pagepiper + +**v0.1.0** | Self-hosted PDF and EPUB search for your personal library + +Pagepiper lets you drop PDFs and EPUBs into a library, index them, and search across the full text. With [Ollama](https://ollama.com) configured, you also get hybrid vector search and an LLM (large language model) chat interface that cites specific page numbers when it answers. + +Built for TTRPG (tabletop roleplaying game) players tired of ctrl-F'ing through Pathfinder core rulebooks. Works equally well for fan fiction EPUB collections, AO3 exports, and any personal document library. + +Try it: [pagepiper.circuitforge.tech](https://pagepiper.circuitforge.tech) + +--- + +## Features + +| Feature | Free tier | Paid (BYOK) | +|---------|-----------|-------------| +| PDF and EPUB upload via browser drag-and-drop | Yes | Yes | +| Directory scan for existing files | Yes | Yes | +| BM25 full-text search (no LLM required) | Yes | Yes | +| Unlimited local ingestion | Yes | Yes | +| Hybrid BM25 + k-NN vector search | No | Yes (local Ollama) | +| LLM chat with page-level citations | No | Yes (local Ollama) | +| Thumbs up / down feedback on answers | No | Yes | + +BYOK (bring your own key) means you supply your own Ollama instance. No cloud API keys, no usage billing. + +**BM25** (Best Match 25) is a keyword ranking algorithm. It works without any LLM and runs entirely inside the Docker container. **k-NN** (k-nearest neighbor) vector search uses embeddings to find passages that are semantically similar to your question, even when the exact words don't match. + +--- + +## Tech Stack + +- **Backend:** FastAPI + SQLite (BM25 via custom BM25Index, vectors via sqlite-vec) +- **Frontend:** Vue 3 SPA served by nginx +- **Embedding model:** `nomic-embed-text` via Ollama (1024-dim, optional) +- **Chat LLM:** `mistral:7b` via Ollama (optional, any Ollama model works) +- **Deployment:** Docker Compose + +--- + +## Quick Start (Self-Hosting) + +### Prerequisites + +- [Docker](https://docs.docker.com/get-docker/) and Docker Compose +- PDFs or EPUBs you want to search +- Optional: [Ollama](https://ollama.com) for semantic search and RAG (retrieval-augmented generation) chat + +### 1. Clone the repo + +```bash +git clone https://git.opensourcesolarpunk.com/Circuit-Forge/pagepiper +cd pagepiper +``` + +### 2. Configure + +```bash +cp .env.example .env +``` + +Open `.env` and set your paths: + +```dotenv +# Directory to scan for PDFs/EPUBs (used by the "Scan" button in the UI) +PAGEPIPER_BOOKS_DIR=/path/to/your/pdfs + +# Where Pagepiper stores its SQLite index and uploaded files +PAGEPIPER_DATA_DIR=data +``` + +To unlock hybrid search and LLM chat, uncomment and set the Ollama block: + +```dotenv +PAGEPIPER_OLLAMA_URL=http://localhost:11434 +PAGEPIPER_CHAT_MODEL=mistral:7b +PAGEPIPER_EMBED_MODEL=nomic-embed-text +``` + +### 3. Start + +```bash +./manage.sh start +``` + +Open [http://localhost:8521](http://localhost:8521). + +### 4. Add documents + +Two ways to add files: + +**Upload via browser** (easiest for small collections): Click **Upload** in the Library view and select a PDF or EPUB. The file saves to `data/uploads/` and begins indexing automatically. + +**Scan a directory** (best for large collections): Set `PAGEPIPER_BOOKS_DIR` in your `.env` to a folder of PDFs/EPUBs, then click **Scan** in the Library view. Pagepiper finds all files recursively and queues them for indexing. + +### 5. Search and chat + +Switch to the **Chat** tab and ask questions. On the free tier, BM25 keyword search returns matching passages. With Ollama configured, you get semantic search and an LLM-generated answer with page-number citations. + +--- + +## Ollama Setup (optional) + +Install Ollama from [ollama.com](https://ollama.com), then pull the models: + +```bash +ollama pull mistral:7b +ollama pull nomic-embed-text +``` + +On a headless Linux server, make Ollama listen on all interfaces so the Docker container can reach it: + +```bash +OLLAMA_HOST=0.0.0.0 ollama serve +``` + +On Docker Desktop (Linux or Mac), `host.docker.internal` resolves automatically. No extra network config needed. + +--- + +## Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `PAGEPIPER_BOOKS_DIR` | `./books` | Host directory to scan for PDFs and EPUBs | +| `PAGEPIPER_DATA_DIR` | `./data` | SQLite index and uploaded files live here | +| `PAGEPIPER_OLLAMA_URL` | *(unset)* | Ollama base URL; leave blank for BM25-only mode | +| `PAGEPIPER_EMBED_MODEL` | `nomic-embed-text` | Ollama embedding model (1024-dim default) | +| `PAGEPIPER_EMBED_DIMS` | `1024` | Must match the embedding model's output dimensions | +| `PAGEPIPER_CHAT_MODEL` | `mistral:7b` | Ollama chat model; any Ollama model name works | +| `PAGEPIPER_CHAT_FEEDBACK` | *(unset)* | Set to `true` to enable thumbs up/down on chat answers | + +--- + +## Management + +```bash +./manage.sh start # Build and start (dev) +./manage.sh stop # Stop +./manage.sh restart # Restart +./manage.sh status # Show container status +./manage.sh logs [svc] # Tail logs (default: all services; pass 'api' or 'web' to filter) +./manage.sh open # Open the UI in your browser +./manage.sh build # Rebuild images without cache + +./manage.sh cloud:start # Start the cloud managed instance (port 8533) +./manage.sh cloud:stop +./manage.sh cloud:restart +./manage.sh cloud:status +./manage.sh cloud:logs [svc] +./manage.sh cloud:build +``` + +--- + +## Cloud Managed Instance + +The cloud deployment runs at [pagepiper.circuitforge.tech](https://pagepiper.circuitforge.tech) and at `menagerie.circuitforge.tech/pagepiper`. It uses `compose.cloud.yml` with LLM inference routed through the cf-orch coordinator. + +To run your own cloud-style deployment: + +```bash +cp .env.cloud.example .env +# Edit .env: set PAGEPIPER_OLLAMA_URL and data paths +./manage.sh cloud:start +``` + +Cloud instance listens on port 8533. The API is internal-only; nginx proxies `/api/` to the backend. + +--- + +## Data and Backups + +The `data/` directory contains the SQLite index database and all uploaded files. Back it up to preserve your index. Pagepiper indexes documents at ingest time. If you modify or replace a source file, use the re-index button on the document card to rebuild its entry. + +Large PDFs (hundreds of pages) can take a few minutes to index. The status badge on the document card updates as indexing progresses. + +--- + +## Licensing + +Pagepiper uses a split license: + +- **MIT:** BM25 full-text search, document library management, ingest pipeline, EPUB support +- **BSL 1.1:** Hybrid vector search (embedding + k-NN), RAG chat, LLM integration + +BSL 1.1 is free for personal non-commercial self-hosting. SaaS re-hosting or commercial redistribution requires a license from CircuitForge. BSL 1.1 converts to MIT after four years. + +License keys: [circuitforge.tech](https://circuitforge.tech) + +--- + +## Contributing + +Issues and PRs welcome at [git.opensourcesolarpunk.com/Circuit-Forge/pagepiper](https://git.opensourcesolarpunk.com/Circuit-Forge/pagepiper). + +The ingest pipeline and BM25 index are MIT-licensed. If you build a better PDF parser or add support for additional formats (CBZ, MOBI, etc.), the community benefits directly. diff --git a/app/api/chat.py b/app/api/chat.py index c973ffe..dcfc35a 100644 --- a/app/api/chat.py +++ b/app/api/chat.py @@ -29,7 +29,7 @@ class ChatRequest(BaseModel): message: str history: list[ChatTurn] = [] doc_ids: list[str] | None = None - top_k: int = 5 + top_k: int = 10 class ChatResponse(BaseModel): @@ -37,6 +37,13 @@ class ChatResponse(BaseModel): citations: list[dict] +class ChatFeedbackRequest(BaseModel): + rating: int # 1 = thumbs up, -1 = thumbs down + question: str = "" + answer: str = "" + doc_ids: list[str] = [] + + def _get_llm_router(): """Return LLMRouter if Ollama configured, else None.""" from app.config import get_llm_config @@ -125,3 +132,31 @@ def chat(req: ChatRequest) -> ChatResponse: for c in result.citations ], ) + + +@router.get("/feedback/status") +def chat_feedback_status() -> dict: + enabled = os.environ.get("PAGEPIPER_CHAT_FEEDBACK", "").lower() in ("1", "true", "yes") + return {"enabled": enabled} + + +@router.post("/feedback") +def submit_chat_feedback(req: ChatFeedbackRequest) -> dict: + import json + import sqlite3 + + if req.rating not in (1, -1): + from fastapi import HTTPException + raise HTTPException(status_code=422, detail="rating must be 1 or -1") + + db_path = _get_db_path() + con = sqlite3.connect(db_path) + try: + con.execute( + "INSERT INTO chat_feedback (rating, question, answer, doc_ids) VALUES (?, ?, ?, ?)", + (req.rating, req.question[:2000], req.answer[:4000], json.dumps(req.doc_ids)), + ) + con.commit() + finally: + con.close() + return {"ok": True} diff --git a/app/api/feedback.py b/app/api/feedback.py new file mode 100644 index 0000000..57ac86e --- /dev/null +++ b/app/api/feedback.py @@ -0,0 +1,7 @@ +"""Feedback router — provided by circuitforge-core.""" +from circuitforge_core.api import make_feedback_router + +router = make_feedback_router( + repo="Circuit-Forge/pagepiper", + product="pagepiper", +) diff --git a/app/api/feedback_attach.py b/app/api/feedback_attach.py new file mode 100644 index 0000000..40c7285 --- /dev/null +++ b/app/api/feedback_attach.py @@ -0,0 +1,88 @@ +"""Screenshot attachment endpoint for in-app feedback. + +After the cf-core feedback router creates a Forgejo issue, the frontend +can call POST /feedback/attach to upload a screenshot as a comment on that issue. +""" +from __future__ import annotations + +import base64 +import os + +import requests +from fastapi import APIRouter, HTTPException +from pydantic import BaseModel, Field + +router = APIRouter() + +_FORGEJO_BASE = os.environ.get( + "FORGEJO_API_URL", "https://git.opensourcesolarpunk.com/api/v1" +) +_REPO = "Circuit-Forge/pagepiper" +_MAX_BYTES = 5 * 1024 * 1024 + + +class AttachRequest(BaseModel): + issue_number: int + filename: str = Field(default="screenshot.png", max_length=80) + image_b64: str # data URI or raw base64 + + +class AttachResponse(BaseModel): + comment_url: str + + +def _forgejo_headers() -> dict[str, str]: + token = os.environ.get("FORGEJO_API_TOKEN", "") + return {"Authorization": f"token {token}"} + + +def _decode_image(image_b64: str) -> tuple[bytes, str]: + if image_b64.startswith("data:"): + header, _, data = image_b64.partition(",") + mime = header.split(";")[0].split(":")[1] if ":" in header else "image/png" + else: + data = image_b64 + mime = "image/png" + return base64.b64decode(data), mime + + +@router.post("/attach", response_model=AttachResponse) +def attach_screenshot(payload: AttachRequest) -> AttachResponse: + token = os.environ.get("FORGEJO_API_TOKEN", "") + if not token: + raise HTTPException(status_code=503, detail="Feedback not configured.") + + raw_bytes, mime = _decode_image(payload.image_b64) + if len(raw_bytes) > _MAX_BYTES: + raise HTTPException( + status_code=413, + detail=f"Screenshot exceeds 5 MB limit ({len(raw_bytes) // 1024} KB received).", + ) + + asset_resp = requests.post( + f"{_FORGEJO_BASE}/repos/{_REPO}/issues/{payload.issue_number}/assets", + headers=_forgejo_headers(), + files={"attachment": (payload.filename, raw_bytes, mime)}, + timeout=20, + ) + if not asset_resp.ok: + raise HTTPException( + status_code=502, + detail=f"Forgejo asset upload failed: {asset_resp.text[:200]}", + ) + + asset_url = asset_resp.json().get("browser_download_url", "") + comment_body = f"**Screenshot attached by reporter:**\n\n![screenshot]({asset_url})" + comment_resp = requests.post( + f"{_FORGEJO_BASE}/repos/{_REPO}/issues/{payload.issue_number}/comments", + headers={**_forgejo_headers(), "Content-Type": "application/json"}, + json={"body": comment_body}, + timeout=15, + ) + if not comment_resp.ok: + raise HTTPException( + status_code=502, + detail=f"Forgejo comment failed: {comment_resp.text[:200]}", + ) + + return AttachResponse(comment_url=comment_resp.json().get("html_url", "")) diff --git a/app/api/library.py b/app/api/library.py index 40fa525..ad96a8e 100644 --- a/app/api/library.py +++ b/app/api/library.py @@ -12,11 +12,13 @@ import uuid from pathlib import Path from typing import Callable -from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, UploadFile -from app.config import WATCH_DIR, DB_PATH, VEC_DB_PATH +from app.config import WATCH_DIR, DB_PATH, VEC_DB_PATH, DATA_DIR from app.deps import get_db +_MAX_UPLOAD_BYTES = 200 * 1024 * 1024 # 200 MB + logger = logging.getLogger(__name__) router = APIRouter(prefix="/api/library", tags=["library"]) @@ -24,15 +26,31 @@ router = APIRouter(prefix="/api/library", tags=["library"]) _mark_bm25_dirty: Callable[[], None] | None = None +_INGEST_TASKS = { + ".pdf": "pagepiper/ingest_pdf", + ".epub": "pagepiper/ingest_epub", +} + +_INGEST_RUNNERS = { + ".pdf": "scripts.ingest_pdf", + ".epub": "scripts.ingest_epub", +} + + def _dispatch_ingest( doc_id: str, file_path: str, background_tasks: BackgroundTasks, ) -> str: """Dispatch an ingest task. Tries cf-orch; falls back to BackgroundTasks.""" + import importlib import os as _os from pathlib import Path as _Path + suffix = _Path(file_path).suffix.lower() + task_name = _INGEST_TASKS.get(suffix, "pagepiper/ingest_pdf") + runner_module = _INGEST_RUNNERS.get(suffix, "scripts.ingest_pdf") + # Read lazily so test fixtures (monkeypatch.setenv) take effect _data_dir = _Path(_os.environ.get("PAGEPIPER_DATA_DIR", "data")) task_id = str(uuid.uuid4()) @@ -45,11 +63,11 @@ def _dispatch_ingest( try: from circuitforge_core.tasks import dispatch_task # type: ignore[import] - task_id = dispatch_task(caller="pagepiper/ingest_pdf", args=args) + task_id = dispatch_task(caller=task_name, args=args) logger.info("Dispatched cf-orch ingest task %s for doc %s", task_id, doc_id) except Exception: - from scripts.ingest_pdf import run as run_ingest - background_tasks.add_task(_run_ingest_background, run_ingest, args, task_id) + mod = importlib.import_module(runner_module) + background_tasks.add_task(_run_ingest_background, mod.run, args, task_id) logger.info( "cf-orch unavailable — running ingest in background thread (task %s)", task_id ) @@ -89,7 +107,7 @@ def scan_library( if not watch.exists(): raise HTTPException(status_code=404, detail=f"Watch directory not found: {watch}") - pdfs = list(watch.glob("**/*.pdf")) + pdfs = list(watch.glob("**/*.pdf")) + list(watch.glob("**/*.epub")) queued = [] for pdf_path in pdfs: @@ -156,7 +174,8 @@ def delete_document( # Remove embeddings from vector store try: from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore # type: ignore[import] - store = LocalSQLiteVecStore(db_path=VEC_DB_PATH, table="page_vecs", dimensions=768) + from app.config import VEC_DIMENSIONS + store = LocalSQLiteVecStore(db_path=VEC_DB_PATH, table="page_vecs", dimensions=VEC_DIMENSIONS) store.delete_where({"doc_id": doc_id}) except Exception as exc: logger.warning("Could not remove vectors for doc %s: %s", doc_id, exc) @@ -165,6 +184,20 @@ def delete_document( _mark_bm25_dirty() +def _get_vec_count(doc_id: str) -> int: + """Return how many vectors have been stored for this doc. Returns 0 on any error.""" + try: + conn = sqlite3.connect(VEC_DB_PATH) + count = conn.execute( + "SELECT COUNT(*) FROM page_vecs_meta WHERE json_extract(metadata, '$.doc_id') = ?", + [doc_id], + ).fetchone()[0] + conn.close() + return int(count) + except Exception: + return 0 + + @router.get("/{doc_id}/status") def document_status( doc_id: str, @@ -176,4 +209,54 @@ def document_status( ).fetchone() if not row: raise HTTPException(status_code=404, detail="Document not found") - return dict(row) + result = dict(row) + result["vec_count"] = _get_vec_count(doc_id) + return result + + +@router.post("/upload", status_code=202) +def upload_document( + file: UploadFile, + background_tasks: BackgroundTasks, + db: sqlite3.Connection = Depends(get_db), +) -> dict: + """Accept a PDF/EPUB upload, save to data/uploads/, and queue for indexing.""" + name = Path(file.filename or "").name + suffix = Path(name).suffix.lower() + if suffix not in _INGEST_TASKS: + raise HTTPException(status_code=400, detail="Supported formats: PDF, EPUB") + + content = file.file.read() + if len(content) > _MAX_UPLOAD_BYTES: + raise HTTPException(status_code=413, detail="File exceeds 200 MB limit") + + upload_dir = DATA_DIR / "uploads" + upload_dir.mkdir(parents=True, exist_ok=True) + dest = upload_dir / name + dest.write_bytes(content) + + path_str = str(dest.resolve()) + existing = db.execute( + "SELECT id, status FROM documents WHERE file_path = ?", [path_str] + ).fetchone() + + if existing and existing["status"] == "ready": + return {"doc_id": existing["id"], "task_id": None, "filename": name, "status": "already_indexed"} + + if existing: + doc_id = existing["id"] + else: + title = dest.stem.replace("_", " ").replace("-", " ").title() + doc_id = db.execute( + "INSERT INTO documents(title, file_path, status) VALUES (?,?,?) RETURNING id", + [title, path_str, "pending"], + ).fetchone()[0] + db.commit() + + task_id = _dispatch_ingest(doc_id, path_str, background_tasks) + db.execute( + "UPDATE documents SET status='processing', task_id=? WHERE id=?", + [task_id, doc_id], + ) + db.commit() + return {"doc_id": doc_id, "task_id": task_id, "filename": name, "status": "queued"} diff --git a/app/config.py b/app/config.py index 9459f42..b18568f 100644 --- a/app/config.py +++ b/app/config.py @@ -10,6 +10,7 @@ DATA_DIR.mkdir(parents=True, exist_ok=True) DB_PATH = str(DATA_DIR / "pagepiper.db") VEC_DB_PATH = str(DATA_DIR / "pagepiper_vecs.db") WATCH_DIR = Path(os.environ.get("PAGEPIPER_WATCH_DIR", "books")) +VEC_DIMENSIONS = int(os.environ.get("PAGEPIPER_EMBED_DIMS", "1024")) def get_llm_config() -> dict | None: @@ -19,17 +20,27 @@ def get_llm_config() -> dict | None: return None _clean = url.rstrip("/") _base_url = _clean if _clean.endswith("/v1") else _clean + "/v1" + chat_model = os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b") + + backend: dict = { + "type": "openai_compat", + "base_url": _base_url, + "model": chat_model, + "embedding_model": os.environ.get("PAGEPIPER_EMBED_MODEL", "nomic-embed-text"), + "supports_images": False, + } + + # Wire cf-orch allocation when coordinator is configured so the model stays warm + # and cold-start latency doesn't cause chat timeouts. + orch_url = os.environ.get("CF_ORCH_URL", "").strip() + if orch_url: + backend["cf_orch"] = { + "service": "ollama", + "model_candidates": [chat_model], + "ttl_s": 3600, + } + return { "fallback_order": ["ollama"], - "backends": { - "ollama": { - "type": "openai_compat", - "base_url": _base_url, - "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"), - "embedding_model": os.environ.get( - "PAGEPIPER_EMBED_MODEL", "nomic-embed-text" - ), - "supports_images": False, - } - }, + "backends": {"ollama": backend}, } diff --git a/app/deps.py b/app/deps.py index e795afb..78cf87c 100644 --- a/app/deps.py +++ b/app/deps.py @@ -9,7 +9,7 @@ from app.config import DB_PATH def get_db() -> Generator[sqlite3.Connection, None, None]: - conn = sqlite3.connect(DB_PATH) + conn = sqlite3.connect(DB_PATH, check_same_thread=False) conn.execute("PRAGMA foreign_keys = ON") conn.execute("PRAGMA journal_mode = WAL") conn.row_factory = sqlite3.Row diff --git a/app/main.py b/app/main.py index 7469d3d..e52f09a 100644 --- a/app/main.py +++ b/app/main.py @@ -3,11 +3,15 @@ from __future__ import annotations import logging +import os +import re +import sqlite3 +import threading from contextlib import asynccontextmanager from fastapi import FastAPI -from app.config import DB_PATH +from app.config import DB_PATH, VEC_DB_PATH, VEC_DIMENSIONS from app.services.bm25_index import BM25Index logger = logging.getLogger("pagepiper") @@ -21,9 +25,91 @@ def _apply_migrations() -> None: migrate(DB_PATH) +def _reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None: + """Re-run full ingest for a list of (doc_id, file_path) sequentially.""" + for doc_id, file_path in docs: + suffix = os.path.splitext(file_path)[1].lower() + try: + if suffix == ".epub": + from scripts.ingest_epub import run + else: + from scripts.ingest_pdf import run + logger.info("Auto re-embed: starting %s", os.path.basename(file_path)) + run(doc_id=doc_id, file_path=file_path, db_path=db_path, vec_db_path=vec_db_path) + except Exception as exc: + logger.error("Auto re-embed failed for doc %s: %s", doc_id[:8], exc) + + +def _check_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None: + """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed. + + sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing + models requires dropping and recreating the whole file. Catches the mismatch at + startup rather than surfacing it as an obscure OperationalError mid-request. + """ + if not os.path.exists(vec_db_path): + return + try: + conn = sqlite3.connect(vec_db_path) + row = conn.execute( + "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'" + ).fetchone() + conn.close() + except Exception as exc: + logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc) + return + + if not row: + return # table not yet created — first embed will build it with the right dims + + m = re.search(r'float\[(\d+)\]', row[0]) + if not m: + return + actual_dims = int(m.group(1)) + if actual_dims == expected_dims: + return + + logger.warning( + "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed", + actual_dims, expected_dims, vec_db_path, + ) + try: + os.remove(vec_db_path) + except OSError as exc: + logger.error( + "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc + ) + return + + # Collect all ready docs so we can rebuild their embeddings in the background. + try: + conn = sqlite3.connect(db_path) + docs = conn.execute( + "SELECT id, file_path FROM documents WHERE status='ready'" + ).fetchall() + conn.close() + except Exception as exc: + logger.warning("Could not query documents for re-embed: %s", exc) + return + + if not docs: + return + + logger.info("Queuing re-embed for %d document(s) in background", len(docs)) + threading.Thread( + target=_reembed_docs, + args=(docs, db_path, vec_db_path), + daemon=True, + name="pagepiper-reembed", + ).start() + + @asynccontextmanager async def lifespan(app: FastAPI): _apply_migrations() + embed_model = os.environ.get("PAGEPIPER_EMBED_MODEL", "nomic-embed-text") + logger.info("Pagepiper starting — embed model: %s, dims: %d", embed_model, VEC_DIMENSIONS) + _check_vec_schema(VEC_DB_PATH, VEC_DIMENSIONS, DB_PATH) _bm25.mark_dirty() # will rebuild on first search yield @@ -39,8 +125,12 @@ from app.api.library import router as library_router # noqa: E402 from app.api.ingest import router as ingest_router # noqa: E402 from app.api.search import router as search_router # noqa: E402 from app.api.chat import router as chat_router # noqa: E402 +from app.api.feedback import router as feedback_router # noqa: E402 +from app.api.feedback_attach import router as feedback_attach_router # noqa: E402 app.include_router(library_router) app.include_router(ingest_router) app.include_router(search_router) app.include_router(chat_router) +app.include_router(feedback_router, prefix="/api/v1/feedback") +app.include_router(feedback_attach_router, prefix="/api/v1/feedback") diff --git a/app/services/retriever.py b/app/services/retriever.py index b335171..35ee597 100644 --- a/app/services/retriever.py +++ b/app/services/retriever.py @@ -8,6 +8,7 @@ BM25-only path is MIT and has no gate. from __future__ import annotations import logging +import sqlite3 from dataclasses import dataclass from app.services.bm25_index import BM25Index @@ -15,6 +16,62 @@ from app.services.bm25_index import BM25Index logger = logging.getLogger(__name__) +def _fetch_adjacent( + hits: list["RetrievedChunk"], + db_path: str, + window: int = 1, +) -> list["RetrievedChunk"]: + """Return chunks immediately before/after each hit that aren't already in the hit set. + + Definitional passages often start mid-sentence because the EPUB/PDF chunk + boundary fell mid-paragraph. Fetching the preceding chunk restores the subject + so the LLM can understand 'them' / 'they' references correctly. + """ + if not hits: + return [] + + existing_keys = {(c.doc_id, c.page_number) for c in hits} + needed: dict[str, set[int]] = {} + for c in hits: + for delta in range(-window, window + 1): + if delta == 0: + continue + adj_page = c.page_number + delta + if adj_page > 0 and (c.doc_id, adj_page) not in existing_keys: + needed.setdefault(c.doc_id, set()).add(adj_page) + + if not needed: + return [] + + extra: list[RetrievedChunk] = [] + try: + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + for doc_id, pages in needed.items(): + placeholders = ",".join("?" * len(pages)) + rows = conn.execute( + f"SELECT id, doc_id, page_number, text FROM page_chunks " + f"WHERE doc_id=? AND page_number IN ({placeholders})", + [doc_id] + sorted(pages), + ).fetchall() + for row in rows: + extra.append( + RetrievedChunk( + chunk_id=row["id"], + doc_id=row["doc_id"], + page_number=row["page_number"], + text=row["text"], + bm25_score=0.0, + vector_score=None, + ) + ) + conn.close() + except Exception as exc: + logger.warning("Context expansion query failed (non-fatal): %s", exc) + + return extra + + @dataclass(frozen=True) class RetrievedChunk: """A chunk returned by the retriever, with source scores.""" @@ -55,13 +112,23 @@ class Retriever: for r in self._bm25.query(query, top_k=top_k * 2, doc_ids=doc_ids) } - vec = llm.embed([query])[0] - store = LocalSQLiteVecStore(db_path=vec_db_path, table="page_vecs", dimensions=768) - filter_meta = {"doc_id": doc_ids[0]} if doc_ids and len(doc_ids) == 1 else None - vec_hits = store.query(vec, top_k=top_k * 2, filter_metadata=filter_meta) + try: + vec = llm.embed([query])[0] + except Exception as exc: + logger.warning("Embed failed, falling back to BM25-only: %s", exc) + return self._bm25_only(query, top_k, doc_ids, db_path) + from app.config import VEC_DIMENSIONS + store = LocalSQLiteVecStore(db_path=vec_db_path, table="page_vecs", dimensions=VEC_DIMENSIONS) - if doc_ids and len(doc_ids) > 1: - vec_hits = [h for h in vec_hits if h.metadata.get("doc_id") in doc_ids] + # sqlite-vec applies filter_metadata as a Python post-filter after fetching k + # nearest globally. When the corpus spans many documents and only a subset is + # selected, most of those k candidates are from non-target docs and get dropped, + # leaving too few vector hits. Oversample heavily and filter in Python instead. + if doc_ids: + vec_candidates = store.query(vec, top_k=top_k * 20) + vec_hits = [h for h in vec_candidates if h.metadata.get("doc_id") in doc_ids] + else: + vec_hits = store.query(vec, top_k=top_k * 2) # Merge: BM25 hits take priority; vector hits fill in additional results merged: dict[str, RetrievedChunk] = {} @@ -76,10 +143,10 @@ class Retriever: ) for vh in vec_hits: # _chunks is the loaded list of dicts from BM25Index; no public accessor exists - text = next((c["text"] for c in self._bm25._chunks if c["id"] == vh.id), "") - if vh.id in merged: - existing = merged[vh.id] - merged[vh.id] = RetrievedChunk( + text = next((c["text"] for c in self._bm25._chunks if c["id"] == vh.entry_id), "") + if vh.entry_id in merged: + existing = merged[vh.entry_id] + merged[vh.entry_id] = RetrievedChunk( chunk_id=existing.chunk_id, doc_id=existing.doc_id, page_number=existing.page_number, @@ -88,8 +155,8 @@ class Retriever: vector_score=vh.score, ) else: - merged[vh.id] = RetrievedChunk( - chunk_id=vh.id, + merged[vh.entry_id] = RetrievedChunk( + chunk_id=vh.entry_id, doc_id=vh.metadata.get("doc_id", ""), page_number=int(vh.metadata.get("page_number", 0)), text=text, @@ -103,14 +170,15 @@ class Retriever: vec = (1.0 / (1.0 + r.vector_score)) if r.vector_score is not None else 0.0 return bm25 * 0.5 + vec * 0.5 - ranked = sorted(merged.values(), key=_combined, reverse=True) - return ranked[:top_k] + ranked = sorted(merged.values(), key=_combined, reverse=True)[:top_k] + adjacent = _fetch_adjacent(ranked, db_path) + return ranked + adjacent def _bm25_only( self, query: str, top_k: int, doc_ids: list[str] | None, db_path: str ) -> list[RetrievedChunk]: self._bm25.ensure_fresh(db_path) - return [ + hits = [ RetrievedChunk( chunk_id=r.chunk_id, doc_id=r.doc_id, @@ -121,3 +189,5 @@ class Retriever: ) for r in self._bm25.query(query, top_k=top_k, doc_ids=doc_ids) ] + adjacent = _fetch_adjacent(hits, db_path) + return hits + adjacent diff --git a/app/services/synthesizer.py b/app/services/synthesizer.py index 634b523..b9efdf2 100644 --- a/app/services/synthesizer.py +++ b/app/services/synthesizer.py @@ -42,7 +42,9 @@ class Synthesizer: history: list[dict], chunks: list[RetrievedChunk], ) -> SynthesisResult: - context_parts = [f"[p.{c.page_number}]\n{c.text[:500]}" for c in chunks] + # 1500 chars (~300 words) per chunk: enough to capture definitions that + # appear mid-paragraph without blowing past a 32k-context model's limit. + context_parts = [f"[p.{c.page_number}]\n{c.text[:1500]}" for c in chunks] context = "\n\n---\n\n".join(context_parts) prompt = f"Document excerpts:\n\n{context}\n\nQuestion: {message}" @@ -52,7 +54,7 @@ class Synthesizer: Citation( doc_id=c.doc_id, page_number=c.page_number, - snippet=c.text[:200], + snippet=c.text[:400], bm25_score=c.bm25_score, ) for c in chunks diff --git a/compose.cloud.yml b/compose.cloud.yml index 2bebdd7..2708b71 100644 --- a/compose.cloud.yml +++ b/compose.cloud.yml @@ -20,6 +20,8 @@ services: # cf-orch: route LLM inference through coordinator for managed GPU access CF_ORCH_URL: http://host.docker.internal:7700 CF_APP_NAME: pagepiper + # CF_LICENSE_KEY is the auth token CFOrchClient sends to the coordinator + CF_LICENSE_KEY: ${COORDINATOR_PAGEPIPER_KEY:-} COORDINATOR_URL: http://10.1.10.71:7700 COORDINATOR_PAGEPIPER_KEY: ${COORDINATOR_PAGEPIPER_KEY:-} extra_hosts: diff --git a/docs/index.md b/docs/index.md index f674c55..42f0fc9 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1,6 +1,6 @@ # Pagepiper -Self-hosted document search with BM25 full-text indexing and (with local Ollama) hybrid vector search and LLM-powered chat. +Self-hosted document search with BM25 full-text indexing and (with local Ollama) hybrid vector search and LLM-powered chat. Supports PDF and EPUB files. ## Demo @@ -12,7 +12,7 @@ Try it: [pagepiper.circuitforge.tech](https://pagepiper.circuitforge.tech) ![Library view](screenshots/01-library.png) -Scan your PDF directory to index documents. Each document shows page count and ingest status. +Scan your PDF directory to index documents, or upload individual PDFs directly. Each document shows page count and ingest status. ### Chat @@ -20,25 +20,123 @@ Scan your PDF directory to index documents. Each document shows page count and i Ask questions across your indexed documents. Results cite the source document and page number. -## Quick Start (Docker) - -```bash -git clone https://git.opensourcesolarpunk.com/Circuit-Forge/pagepiper -cd pagepiper -cp .env.example .env # set PAGEPIPER_DATA_DIR and PAGEPIPER_BOOKS_DIR -docker compose up -d --build -# open http://localhost:8521 -``` - -Place PDFs in your `PAGEPIPER_BOOKS_DIR` directory, then click "Scan for PDFs" in the Library view. - ## Tiers | Feature | Free | Paid (BYOK) | |---------|------|-------------| | BM25 full-text search | Yes | Yes | +| PDF and EPUB upload via browser | Yes | Yes | | Unlimited local ingestion | Yes | Yes | | Hybrid vector search | No | Yes (local Ollama) | | LLM chat over documents | No | Yes (local Ollama) | -Set `PAGEPIPER_OLLAMA_URL` in your `.env` to unlock the Paid tier with your own Ollama instance. +BYOK (Bring Your Own Key) means you supply your own Ollama instance. No cloud API keys required. + +--- + +## Self-Hosting Guide + +### Prerequisites + +- [Docker](https://docs.docker.com/get-docker/) and Docker Compose +- PDFs you want to search +- Optional: [Ollama](https://ollama.com) running locally for semantic search and LLM chat + +### Step 1: Get the code + +```bash +git clone https://git.opensourcesolarpunk.com/Circuit-Forge/pagepiper +cd pagepiper +``` + +### Step 2: Configure + +```bash +cp .env.example .env +``` + +Open `.env` and set your directories: + +```dotenv +# Where pagepiper stores its index database +PAGEPIPER_DATA_DIR=./data + +# Directory to scan for PDFs (used by the "Scan for PDFs" button) +# You can also upload individual PDFs via the web UI without setting this +PAGEPIPER_BOOKS_DIR=/path/to/your/pdfs +``` + +To unlock hybrid vector search and LLM chat, add your Ollama endpoint: + +```dotenv +PAGEPIPER_OLLAMA_URL=http://localhost:11434 +PAGEPIPER_CHAT_MODEL=mistral:7b +PAGEPIPER_EMBED_MODEL=nomic-embed-text +``` + +### Step 3: Start + +```bash +docker compose up -d --build +``` + +Open [http://localhost:8521](http://localhost:8521) in your browser. + +### Step 4: Add your PDFs + +Two ways to add documents: + +**Option A — Upload via browser** (easiest for small collections): + +Click the **Upload PDF** button in the Library view and select a file. It saves to `data/uploads/` and begins indexing automatically. + +**Option B — Mount a directory** (best for large collections): + +Set `PAGEPIPER_BOOKS_DIR` in your `.env` to point at a folder of PDFs, then click **Scan for PDFs**. Pagepiper finds all `.pdf` files recursively and queues them for indexing. + +### Step 5: Search + +Switch to the **Chat** tab and ask questions about your documents. The Free tier uses BM25 keyword matching. With Ollama configured, you get semantic (vector) search and LLM-generated answers with page-level citations. + +--- + +## Ollama Setup (optional) + +Install Ollama from [ollama.com](https://ollama.com), then pull the models: + +```bash +ollama pull mistral:7b +ollama pull nomic-embed-text +``` + +Pagepiper's Docker container reaches Ollama at `host.docker.internal` — no extra network config needed on Linux/Mac with Docker Desktop. On a headless Linux server, make sure Ollama binds to `0.0.0.0`: + +```bash +OLLAMA_HOST=0.0.0.0 ollama serve +``` + +--- + +## Managing the instance + +```bash +# Check status +docker compose ps + +# View API logs +docker compose logs -f api + +# Stop +docker compose down + +# Rebuild after updates +docker compose up -d --build +``` + +--- + +## Notes + +- Pagepiper indexes PDFs at ingest time. Changes to the source file require a re-index (use the re-index button on the document card). +- The `data/` directory contains the SQLite index database and any uploaded files. Back it up to preserve your index. +- Large PDFs (hundreds of pages) can take a few minutes to index. Watch the status badge on the document card. diff --git a/environment.yml b/environment.yml index f2a94a2..3e3bcf8 100644 --- a/environment.yml +++ b/environment.yml @@ -14,6 +14,8 @@ dependencies: - pdfplumber>=0.11 - pytesseract>=0.3 - Pillow>=10.0 + - ebooklib>=0.18 + - beautifulsoup4>=4.12 - sqlite-vec>=0.1 - pytest>=8.0 - pytest-asyncio>=0.23 diff --git a/migrations/002_chat_feedback.sql b/migrations/002_chat_feedback.sql new file mode 100644 index 0000000..31dcba6 --- /dev/null +++ b/migrations/002_chat_feedback.sql @@ -0,0 +1,9 @@ +-- chat answer thumbs up/down signals (local SQLite, always available) +CREATE TABLE IF NOT EXISTS chat_feedback ( + id TEXT PRIMARY KEY DEFAULT (lower(hex(randomblob(16)))), + rating INTEGER NOT NULL CHECK (rating IN (1, -1)), + question TEXT NOT NULL DEFAULT '', + answer TEXT NOT NULL DEFAULT '', + doc_ids TEXT NOT NULL DEFAULT '[]', + created_at TEXT NOT NULL DEFAULT (datetime('now')) +); diff --git a/scripts/ingest_epub.py b/scripts/ingest_epub.py new file mode 100644 index 0000000..8984726 --- /dev/null +++ b/scripts/ingest_epub.py @@ -0,0 +1,239 @@ +# scripts/ingest_epub.py +""" +cf-orch task: pagepiper/ingest_epub + +Extracts text from an EPUB file, stores chapter chunks in SQLite, and (if Ollama is +configured) generates embeddings and stores them in the sqlite-vec store. + +Each EPUB chapter becomes one chunk (equivalent to a PDF page). + +Entry point: + python scripts/ingest_epub.py --doc-id X --file-path Y --db-path Z --vec-db-path W +""" +from __future__ import annotations + +import logging +import os +import sqlite3 +from dataclasses import dataclass +from pathlib import Path + +logger = logging.getLogger("pagepiper.ingest_epub") + +EMBED_BATCH_SIZE = 64 +_WORDS_PER_CHUNK = 500 # target chunk size for word-count fallback + + +@dataclass +class _Chunk: + page_number: int + text: str + source: str + word_count: int + + +def _paragraphs_from_soup(soup) -> list[str]: + """Extract non-trivial, artifact-free text lines from parsed HTML.""" + from scripts.text_clean import filter_paragraphs + raw = soup.get_text(separator="\n", strip=True) + return filter_paragraphs(raw.splitlines()) + + +def _chunks_from_paragraphs(paragraphs: list[str], start_num: int) -> list[_Chunk]: + """Accumulate paragraphs into ~_WORDS_PER_CHUNK-word chunks.""" + chunks: list[_Chunk] = [] + current: list[str] = [] + current_count = 0 + chunk_num = start_num + + for para in paragraphs: + words = para.split() + if current_count + len(words) > _WORDS_PER_CHUNK and current: + text = "\n".join(current) + chunks.append(_Chunk(chunk_num, text, "text", current_count)) + chunk_num += 1 + current, current_count = [], 0 + current.append(para) + current_count += len(words) + + if current: + text = "\n".join(current) + chunks.append(_Chunk(chunk_num, text, "text", current_count)) + + return chunks + + +def _extract_chunks(file_path: str) -> list[_Chunk]: + import ebooklib + from ebooklib import epub + from bs4 import BeautifulSoup + from scripts.text_clean import clean_line, is_artifact_line + + book = epub.read_epub(file_path, options={"ignore_ncx": True}) + all_chunks: list[_Chunk] = [] + + for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT): + soup = BeautifulSoup(item.get_content(), "html.parser") + headings = soup.find_all(["h1", "h2", "h3", "h4"]) + + if len(headings) >= 2: + # Heading-based split: one chunk per section + current_parts: list[str] = [] + for elem in soup.find_all(["h1", "h2", "h3", "h4", "p", "li", "blockquote"]): + if elem.name in ("h1", "h2", "h3", "h4"): + if current_parts: + text = "\n".join(current_parts).strip() + if text: + n = len(all_chunks) + 1 + all_chunks.append(_Chunk(n, text, "text", len(text.split()))) + current_parts = [elem.get_text(" ", strip=True)] + else: + t = clean_line(elem.get_text(" ", strip=True)) + if t and not is_artifact_line(t): + current_parts.append(t) + if current_parts: + text = "\n".join(current_parts).strip() + if text: + n = len(all_chunks) + 1 + all_chunks.append(_Chunk(n, text, "text", len(text.split()))) + else: + # Word-count fallback: accumulate paragraphs into ~500-word chunks + paragraphs = _paragraphs_from_soup(soup) + if paragraphs: + all_chunks.extend(_chunks_from_paragraphs(paragraphs, len(all_chunks) + 1)) + + return all_chunks + + +def _update_status( + conn: sqlite3.Connection, + doc_id: str, + status: str, + page_count: int | None = None, + error_msg: str | None = None, +) -> None: + if page_count is not None: + conn.execute( + "UPDATE documents SET status=?, page_count=?, updated_at=datetime('now') WHERE id=?", + [status, page_count, doc_id], + ) + elif error_msg is not None: + conn.execute( + "UPDATE documents SET status=?, error_msg=?, updated_at=datetime('now') WHERE id=?", + [status, error_msg, doc_id], + ) + else: + conn.execute( + "UPDATE documents SET status=?, updated_at=datetime('now') WHERE id=?", + [status, doc_id], + ) + conn.commit() + + +def run(doc_id: str, file_path: str, db_path: str, vec_db_path: str) -> None: + """Run the full ingest pipeline for one EPUB. Called by cf-orch or BackgroundTasks.""" + conn: sqlite3.Connection | None = None + try: + conn = sqlite3.connect(db_path, timeout=30) + conn.execute("PRAGMA journal_mode = WAL") + conn.execute("PRAGMA foreign_keys = ON") + _update_status(conn, doc_id, "processing") + + logger.info("Extracting chapters from %s", file_path) + chunks = _extract_chunks(file_path) + logger.info("Extracted %d chapters", len(chunks)) + + conn.execute("DELETE FROM page_chunks WHERE doc_id=?", [doc_id]) + chunk_rows: list[tuple[str, int, str]] = [] + for chunk in chunks: + row = conn.execute( + """INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) + VALUES (?,?,?,?,?) RETURNING id""", + [doc_id, chunk.page_number, chunk.text, chunk.source, chunk.word_count], + ).fetchone() + chunk_rows.append((row[0], chunk.page_number, chunk.text)) + conn.commit() + + # Embedding failure is non-fatal: document remains BM25-searchable. + ollama_url = os.environ.get("PAGEPIPER_OLLAMA_URL", "").strip() + if ollama_url and chunks: + try: + logger.info("Embedding %d chapters via Ollama at %s", len(chunks), ollama_url) + from circuitforge_core.llm import LLMRouter + from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore + + _clean = ollama_url.rstrip("/") + base_url = _clean if _clean.endswith("/v1") else _clean + "/v1" + router = LLMRouter({ + "fallback_order": ["ollama"], + "backends": { + "ollama": { + "type": "openai_compat", + "base_url": base_url, + "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"), + "embedding_model": os.environ.get( + "PAGEPIPER_EMBED_MODEL", "nomic-embed-text" + ), + "supports_images": False, + } + }, + }) + embed_dims = int(os.environ.get("PAGEPIPER_EMBED_DIMS", "1024")) + vec_store = LocalSQLiteVecStore( + db_path=vec_db_path, table="page_vecs", dimensions=embed_dims + ) + vec_store.delete_where({"doc_id": doc_id}) + + texts = [text for _, _, text in chunk_rows] + vectors: list[list[float]] = [] + for i in range(0, len(texts), EMBED_BATCH_SIZE): + vectors.extend(router.embed(texts[i : i + EMBED_BATCH_SIZE])) + + for (chunk_id, page_number, _), vector in zip(chunk_rows, vectors): + vec_store.upsert( + entry_id=chunk_id, + vector=vector, + metadata={"doc_id": doc_id, "page_number": page_number}, + ) + logger.info("Stored %d embeddings", len(vectors)) + except Exception as embed_exc: + logger.warning( + "Embedding skipped for doc %s — BM25 only (reason: %s)", + doc_id, embed_exc, + ) + + _update_status(conn, doc_id, "ready", page_count=len(chunks)) + logger.info("Ingest complete for doc %s (%d chapters)", doc_id, len(chunks)) + + except Exception as exc: + logger.error("Ingest failed for doc %s: %s", doc_id, exc, exc_info=True) + if conn is not None: + try: + _update_status(conn, doc_id, "error", error_msg=str(exc)) + except Exception: + logger.warning("Could not write error status for doc %s", doc_id) + raise + finally: + if conn is not None: + conn.close() + + +if __name__ == "__main__": + import argparse + + logging.basicConfig(level=logging.INFO) + + parser = argparse.ArgumentParser( + description="Ingest an EPUB (cf-orch task entry point)" + ) + parser.add_argument("--doc-id", required=True) + parser.add_argument("--file-path", required=True) + parser.add_argument("--db-path", required=True) + parser.add_argument("--vec-db-path", required=True) + a = parser.parse_args() + run( + doc_id=a.doc_id, + file_path=a.file_path, + db_path=a.db_path, + vec_db_path=a.vec_db_path, + ) diff --git a/scripts/ingest_pdf.py b/scripts/ingest_pdf.py index a6bf9af..65671de 100644 --- a/scripts/ingest_pdf.py +++ b/scripts/ingest_pdf.py @@ -52,7 +52,8 @@ def run(doc_id: str, file_path: str, db_path: str, vec_db_path: str) -> None: conn: sqlite3.Connection | None = None try: - conn = sqlite3.connect(db_path) + conn = sqlite3.connect(db_path, timeout=30) + conn.execute("PRAGMA journal_mode = WAL") conn.execute("PRAGMA foreign_keys = ON") _update_status(conn, doc_id, "processing") @@ -63,59 +64,71 @@ def run(doc_id: str, file_path: str, db_path: str, vec_db_path: str) -> None: logger.info("Extracted %d pages", len(chunks)) # Step 2: Store chunks (replace any existing for this doc) + from scripts.text_clean import clean_paragraph conn.execute("DELETE FROM page_chunks WHERE doc_id=?", [doc_id]) chunk_rows: list[tuple[str, int, str]] = [] for chunk in chunks: + cleaned_text = clean_paragraph(chunk.text) + if not cleaned_text: + continue row = conn.execute( """INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) VALUES (?,?,?,?,?) RETURNING id""", - [doc_id, chunk.page_number, chunk.text, chunk.source, chunk.word_count], + [doc_id, chunk.page_number, cleaned_text, chunk.source, len(cleaned_text.split())], ).fetchone() - chunk_rows.append((row[0], chunk.page_number, chunk.text)) + chunk_rows.append((row[0], chunk.page_number, cleaned_text)) conn.commit() # Step 3: Embed and store vectors if Ollama is configured (BYOK gate) + # Embedding failure is non-fatal: document remains BM25-searchable. ollama_url = os.environ.get("PAGEPIPER_OLLAMA_URL", "").strip() if ollama_url and chunks: - logger.info("Embedding %d pages via Ollama at %s", len(chunks), ollama_url) - from circuitforge_core.llm import LLMRouter - from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore + try: + logger.info("Embedding %d pages via Ollama at %s", len(chunks), ollama_url) + from circuitforge_core.llm import LLMRouter + from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore - _clean = ollama_url.rstrip("/") - base_url = _clean if _clean.endswith("/v1") else _clean + "/v1" - router = LLMRouter({ - "fallback_order": ["ollama"], - "backends": { - "ollama": { - "type": "openai_compat", - "base_url": base_url, - "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"), - "embedding_model": os.environ.get( - "PAGEPIPER_EMBED_MODEL", "nomic-embed-text" - ), - "supports_images": False, - } - }, - }) - vec_store = LocalSQLiteVecStore( - db_path=vec_db_path, table="page_vecs", dimensions=768 - ) - # Remove old vectors before re-inserting. If embedding fails mid-way, - # old vectors are gone but new ones are partial — re-ingest recovers. - vec_store.delete_where({"doc_id": doc_id}) - - texts = [text for _, _, text in chunk_rows] - vectors: list[list[float]] = [] - for i in range(0, len(texts), EMBED_BATCH_SIZE): - vectors.extend(router.embed(texts[i : i + EMBED_BATCH_SIZE])) - - for (chunk_id, page_number, _), vector in zip(chunk_rows, vectors): - vec_store.upsert( - id=chunk_id, - vector=vector, - metadata={"doc_id": doc_id, "page_number": page_number}, + _clean = ollama_url.rstrip("/") + base_url = _clean if _clean.endswith("/v1") else _clean + "/v1" + router = LLMRouter({ + "fallback_order": ["ollama"], + "backends": { + "ollama": { + "type": "openai_compat", + "base_url": base_url, + "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"), + "embedding_model": os.environ.get( + "PAGEPIPER_EMBED_MODEL", "nomic-embed-text" + ), + "supports_images": False, + } + }, + }) + embed_dims = int(os.environ.get("PAGEPIPER_EMBED_DIMS", "1024")) + vec_store = LocalSQLiteVecStore( + db_path=vec_db_path, table="page_vecs", dimensions=embed_dims + ) + # Remove old vectors before re-inserting. If embedding fails mid-way, + # old vectors are gone but new ones are partial — re-ingest recovers. + vec_store.delete_where({"doc_id": doc_id}) + + texts = [text for _, _, text in chunk_rows] + vectors: list[list[float]] = [] + for i in range(0, len(texts), EMBED_BATCH_SIZE): + vectors.extend(router.embed(texts[i : i + EMBED_BATCH_SIZE])) + + for (chunk_id, page_number, _), vector in zip(chunk_rows, vectors): + vec_store.upsert( + entry_id=chunk_id, + vector=vector, + metadata={"doc_id": doc_id, "page_number": page_number}, + ) + logger.info("Stored %d embeddings", len(vectors)) + except Exception as embed_exc: + logger.warning( + "Embedding skipped for doc %s — BM25 only (reason: %s)", + doc_id, embed_exc, ) - logger.info("Stored %d embeddings", len(vectors)) _update_status(conn, doc_id, "ready", page_count=len(chunks)) logger.info("Ingest complete for doc %s (%d pages)", doc_id, len(chunks)) diff --git a/scripts/text_clean.py b/scripts/text_clean.py new file mode 100644 index 0000000..26f1f48 --- /dev/null +++ b/scripts/text_clean.py @@ -0,0 +1,72 @@ +# scripts/text_clean.py +""" +Shared text-cleaning utilities for ingest pipelines. + +Removes boilerplate lines injected by ebook converters, piracy watermarks, +and other non-content artifacts before chunks are stored or embedded. +""" +from __future__ import annotations + +import re + +# Lines that match any of these patterns are dropped entirely. +# Each pattern is matched against the stripped line (case-insensitive). +_LINE_DROP_PATTERNS: list[re.Pattern] = [ + # ABC Amber converter family + re.compile(r'generated by abc amber', re.IGNORECASE), + re.compile(r'processtext\.com', re.IGNORECASE), + # Calibre / sigil metadata lines + re.compile(r'calibre \d+\.\d+', re.IGNORECASE), + # Standalone URLs (line is just a URL, no surrounding prose) + re.compile(r'^https?://\S+$'), + # Common piracy / file-sharing watermarks + re.compile(r'www\.\w+\.(com|net|org)/\S*book', re.IGNORECASE), + re.compile(r'downloaded from', re.IGNORECASE), + re.compile(r'scanned by', re.IGNORECASE), + re.compile(r'provided by', re.IGNORECASE), + # Page-number-only lines from PDF extraction (e.g. "- 42 -" or "42") + re.compile(r'^\s*-?\s*\d{1,4}\s*-?\s*$'), +] + +# Inline substrings to strip from within a line before further processing. +_INLINE_STRIP_PATTERNS: list[re.Pattern] = [ + re.compile(r'generated by abc amber \w+ converter,?\s*https?://\S*', re.IGNORECASE), + re.compile(r'https?://www\.processtext\.com/\S*', re.IGNORECASE), +] + + +def is_artifact_line(line: str) -> bool: + """Return True if the line is a known conversion artifact and should be dropped.""" + stripped = line.strip() + return any(p.search(stripped) for p in _LINE_DROP_PATTERNS) + + +def clean_line(line: str) -> str: + """Strip inline converter artifacts from a line, returning the cleaned version.""" + for p in _INLINE_STRIP_PATTERNS: + line = p.sub("", line) + return line.strip() + + +def clean_paragraph(text: str) -> str: + """Clean a multi-line paragraph: drop artifact lines, strip inline artifacts.""" + lines = [] + for line in text.splitlines(): + if is_artifact_line(line): + continue + cleaned = clean_line(line) + if cleaned: + lines.append(cleaned) + return "\n".join(lines) + + +def filter_paragraphs(paragraphs: list[str]) -> list[str]: + """Remove artifact lines from a list of paragraph strings.""" + result = [] + for para in paragraphs: + if is_artifact_line(para): + continue + cleaned = clean_line(para) + if cleaned and len(cleaned.split()) >= 4: + result.append(cleaned) + return result diff --git a/tests/conftest.py b/tests/conftest.py index 3acd092..91e1f12 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -30,8 +30,10 @@ def client(test_db, tmp_path, monkeypatch): from app.main import app, _bm25 from app.deps import get_db - # Suppress migrations during tests — test_db fixture already applies the schema + # Suppress startup side effects — test_db fixture already applies the schema, + # and vec schema validation is tested separately in test_startup.py monkeypatch.setattr(_main_module, "_apply_migrations", lambda: None) + monkeypatch.setattr(_main_module, "_check_vec_schema", lambda *a, **kw: None) def override_db(): conn = sqlite3.connect(test_db) diff --git a/tests/test_startup.py b/tests/test_startup.py new file mode 100644 index 0000000..7eca538 --- /dev/null +++ b/tests/test_startup.py @@ -0,0 +1,170 @@ +# tests/test_startup.py +"""Tests for startup vec DB schema validation (_check_vec_schema).""" +from __future__ import annotations + +import os +import sqlite3 +import threading +from unittest.mock import MagicMock, patch + +import pytest + +from app.main import _check_vec_schema, _reembed_docs + + +def _make_vec_db(path: str, dims: int) -> None: + """Create a minimal sqlite-vec-style DB with the given dimension.""" + conn = sqlite3.connect(path) + conn.execute("PRAGMA journal_mode=WAL") + # Replicate the virtual table name used by LocalSQLiteVecStore + conn.execute(f"CREATE TABLE page_vecs_vecs (embedding float[{dims}])") + conn.execute( + "INSERT INTO sqlite_master(type, name, tbl_name, sql) VALUES (?,?,?,?)" + if False else "" + ) + # Write a real sqlite_master entry via a virtual table workaround: + # Easiest is to put the dimension marker directly in a metadata table. + # But _check_vec_schema reads sqlite_master, so we need the real DDL there. + conn.close() + # sqlite_master is read-only — recreate using the real CREATE VIRTUAL TABLE path + # by faking it via a regular table with the matching name pattern. + conn2 = sqlite3.connect(path) + conn2.execute("DROP TABLE IF EXISTS page_vecs_vecs") + # Write a row that _check_vec_schema will parse via its regex + conn2.execute( + "CREATE TABLE _schema_hint (sql TEXT)" + ) + conn2.execute( + "INSERT INTO _schema_hint VALUES (?)", + [f"CREATE VIRTUAL TABLE page_vecs_vecs USING vec0(embedding float[{dims}])"], + ) + conn2.commit() + conn2.close() + + +def _make_real_vec_db(path: str, dims: int) -> None: + """Create a vec DB whose sqlite_master actually contains the dimension DDL.""" + import sqlite3 as _sq + # We can't load sqlite-vec in tests, so simulate by writing sqlite_master directly + # via a shadow table that _check_vec_schema reads. + conn = _sq.connect(path) + conn.execute( + f"""CREATE TABLE page_vecs_vecs ( + embedding float[{dims}] + )""" + ) + conn.commit() + conn.close() + + +class TestCheckVecSchema: + def test_no_file_is_noop(self, tmp_path): + """Missing vec DB should not raise.""" + _check_vec_schema(str(tmp_path / "missing.db"), 1024, str(tmp_path / "main.db")) + + def test_matching_dims_keeps_file(self, tmp_path): + """Correct dimensions: vec DB must not be deleted.""" + vec_path = str(tmp_path / "vecs.db") + conn = sqlite3.connect(vec_path) + conn.execute("CREATE TABLE page_vecs_vecs (embedding float[1024])") + conn.commit() + conn.close() + + _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db")) + + assert os.path.exists(vec_path), "Vec DB should not be deleted when dims match" + + def test_mismatched_dims_deletes_file(self, tmp_path): + """Dimension mismatch: vec DB must be deleted.""" + vec_path = str(tmp_path / "vecs.db") + conn = sqlite3.connect(vec_path) + conn.execute("CREATE TABLE page_vecs_vecs (embedding float[768])") + conn.commit() + conn.close() + + db_path = str(tmp_path / "main.db") + _check_vec_schema(vec_path, 1024, db_path) + + assert not os.path.exists(vec_path), "Vec DB should be deleted on dimension mismatch" + + def test_mismatched_dims_queues_reembed(self, tmp_path): + """Dimension mismatch: re-embed thread must be started for ready docs.""" + vec_path = str(tmp_path / "vecs.db") + conn = sqlite3.connect(vec_path) + conn.execute("CREATE TABLE page_vecs_vecs (embedding float[768])") + conn.commit() + conn.close() + + db_path = str(tmp_path / "main.db") + schema = ( + "CREATE TABLE documents (" + "id TEXT PRIMARY KEY, title TEXT, file_path TEXT, " + "status TEXT, task_id TEXT, page_count INTEGER, " + "error_msg TEXT, created_at TEXT, updated_at TEXT)" + ) + main_conn = sqlite3.connect(db_path) + main_conn.execute(schema) + main_conn.execute( + "INSERT INTO documents VALUES ('abc123', 'Book', '/tmp/book.pdf', 'ready', NULL, 10, NULL, '2026-01-01', '2026-01-01')" + ) + main_conn.commit() + main_conn.close() + + started = [] + real_thread_start = threading.Thread.start + + def _capture_start(self): + started.append(self) + # Don't actually run the re-embed to keep tests fast + self.run = lambda: None + real_thread_start(self) + + with patch.object(threading.Thread, "start", _capture_start): + _check_vec_schema(vec_path, 1024, db_path) + + assert len(started) == 1, "Exactly one re-embed thread should be started" + assert started[0].name == "pagepiper-reembed" + + def test_no_ready_docs_skips_thread(self, tmp_path): + """Mismatch with no ready docs: no thread should be started.""" + vec_path = str(tmp_path / "vecs.db") + conn = sqlite3.connect(vec_path) + conn.execute("CREATE TABLE page_vecs_vecs (embedding float[768])") + conn.commit() + conn.close() + + db_path = str(tmp_path / "main.db") + schema = ( + "CREATE TABLE documents (" + "id TEXT PRIMARY KEY, title TEXT, file_path TEXT, " + "status TEXT, task_id TEXT, page_count INTEGER, " + "error_msg TEXT, created_at TEXT, updated_at TEXT)" + ) + main_conn = sqlite3.connect(db_path) + main_conn.execute(schema) + main_conn.commit() + main_conn.close() + + started = [] + with patch.object(threading.Thread, "start", lambda self: started.append(self)): + _check_vec_schema(vec_path, 1024, db_path) + + assert len(started) == 0 + + def test_empty_db_no_table_is_noop(self, tmp_path): + """Vec DB exists but has no page_vecs_vecs table yet: no deletion.""" + vec_path = str(tmp_path / "vecs.db") + sqlite3.connect(vec_path).close() # create empty file + + _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db")) + + assert os.path.exists(vec_path) + + def test_corrupt_db_does_not_raise(self, tmp_path): + """Corrupt or unreadable vec DB must not propagate exceptions.""" + vec_path = str(tmp_path / "vecs.db") + with open(vec_path, "w") as f: + f.write("not a sqlite database") + + _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db")) + # No assertion needed — just must not raise diff --git a/tests/test_text_clean.py b/tests/test_text_clean.py new file mode 100644 index 0000000..6c954e3 --- /dev/null +++ b/tests/test_text_clean.py @@ -0,0 +1,108 @@ +# tests/test_text_clean.py +"""Tests for ebook artifact filtering in scripts/text_clean.py.""" +from __future__ import annotations + +import pytest + +from scripts.text_clean import ( + clean_line, + clean_paragraph, + filter_paragraphs, + is_artifact_line, +) + + +class TestIsArtifactLine: + def test_abc_amber_lit(self): + assert is_artifact_line( + "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html" + ) + + def test_abc_amber_rtf(self): + assert is_artifact_line("Generated by ABC Amber RTF Converter") + + def test_processtext_url_only(self): + assert is_artifact_line("http://www.processtext.com/abclit.html") + + def test_standalone_url(self): + assert is_artifact_line("https://www.example.com/book") + + def test_page_number_only(self): + assert is_artifact_line("42") + assert is_artifact_line("- 42 -") + assert is_artifact_line(" 7 ") + + def test_downloaded_from(self): + assert is_artifact_line("Downloaded from www.fictionsite.net") + + def test_scanned_by(self): + assert is_artifact_line("Scanned by SomeUser") + + def test_normal_prose_not_artifact(self): + assert not is_artifact_line( + '"And what if food isn\'t the only reason Jagang is going to Anderith?"' + ) + + def test_url_embedded_in_prose_not_dropped(self): + # A URL inside a sentence is not a standalone-URL artifact line + assert not is_artifact_line( + "You can read more about this at https://example.com and continue." + ) + + def test_short_page_header_not_dropped(self): + # "Chapter 1" is not an artifact — 4-digit number check only drops bare numbers + assert not is_artifact_line("Chapter 1") + + +class TestCleanLine: + def test_strips_inline_abc_amber(self): + line = "Some prose. Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html" + result = clean_line(line) + assert "ABC Amber" not in result + assert "processtext" not in result + assert "Some prose." in result + + def test_passes_clean_line_unchanged(self): + line = "He cocked an eyebrow and smiled." + assert clean_line(line) == line + + +class TestCleanParagraph: + def test_drops_artifact_lines_from_paragraph(self): + text = ( + "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html\n" + '"And what if food isn\'t the only reason Jagang is going to Anderith?"\n' + "He cocked an eyebrow." + ) + result = clean_paragraph(text) + assert "ABC Amber" not in result + assert "Jagang" in result + assert "eyebrow" in result + + def test_all_artifact_paragraph_returns_empty(self): + text = "Generated by ABC Amber LIT Converter\nhttp://www.processtext.com/abclit.html" + assert clean_paragraph(text) == "" + + def test_clean_paragraph_unchanged(self): + text = "Richard raised his sword.\nThe magic surged through him." + assert clean_paragraph(text) == text + + +class TestFilterParagraphs: + def test_drops_artifact_paragraphs(self): + paras = [ + "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html", + '"And what if food isn\'t the only reason Jagang is going to Anderith?"', + "He cocked an eyebrow at the question.", + ] + result = filter_paragraphs(paras) + assert len(result) == 2 + assert all("ABC Amber" not in p for p in result) + + def test_drops_short_lines_under_4_words(self): + paras = ["Hi", "OK sure", "Valid sentence with enough words here."] + result = filter_paragraphs(paras) + assert result == ["Valid sentence with enough words here."] + + def test_empty_input(self): + assert filter_paragraphs([]) == [] diff --git a/web/src/App.vue b/web/src/App.vue index e699732..16af787 100644 --- a/web/src/App.vue +++ b/web/src/App.vue @@ -6,11 +6,13 @@ Chat + diff --git a/web/src/components/FeedbackButton.vue b/web/src/components/FeedbackButton.vue new file mode 100644 index 0000000..6d75a27 --- /dev/null +++ b/web/src/components/FeedbackButton.vue @@ -0,0 +1,631 @@ +