diff --git a/.env.cloud.example b/.env.cloud.example
index 1c8fb80..0063c7c 100644
--- a/.env.cloud.example
+++ b/.env.cloud.example
@@ -10,7 +10,9 @@ PAGEPIPER_BOOKS_DIR=/devl/pagepiper-cloud-data/books
PAGEPIPER_OLLAMA_URL=
# Embedding and chat model selection (only used when PAGEPIPER_OLLAMA_URL is set)
-PAGEPIPER_EMBED_MODEL=nomic-embed-text
+# mxbai-embed-large (1024-dim) is recommended; nomic-embed-text uses 768-dim
+PAGEPIPER_EMBED_MODEL=mxbai-embed-large
+PAGEPIPER_EMBED_DIMS=1024
PAGEPIPER_CHAT_MODEL=mistral:7b
# Heimdall license server (optional — for per-user tier validation)
@@ -20,3 +22,17 @@ HEIMDALL_ADMIN_TOKEN=
# cf-orch streaming proxy — coordinator product key
# Must match COORDINATOR_PRODUCT_KEYS["pagepiper"] in cf-orch.env on the coordinator
COORDINATOR_PAGEPIPER_KEY=
+
+# cf-orch coordinator URL — routes chat/embed calls through managed GPU allocation
+# CF_LICENSE_KEY is the auth token sent to the coordinator (same value as COORDINATOR_PAGEPIPER_KEY)
+# Leave CF_ORCH_URL blank to skip allocation and hit PAGEPIPER_OLLAMA_URL directly
+CF_ORCH_URL=
+CF_LICENSE_KEY=
+CF_APP_NAME=pagepiper
+
+# Forgejo API token — enables in-app feedback button (files issues to Circuit-Forge/pagepiper)
+FORGEJO_API_TOKEN=
+
+# Enable thumbs up/down on chat answers (stores retrieval quality signals locally)
+# Off by default — opt in when you want to collect correction data
+# PAGEPIPER_CHAT_FEEDBACK=true
diff --git a/.env.example b/.env.example
index 2f3bcd1..473929f 100644
--- a/.env.example
+++ b/.env.example
@@ -10,3 +10,11 @@ PAGEPIPER_DATA_DIR=data
# PAGEPIPER_OLLAMA_URL=http://localhost:11434
# PAGEPIPER_CHAT_MODEL=mistral:7b
# PAGEPIPER_EMBED_MODEL=nomic-embed-text
+
+# Forgejo API token — enables the in-app feedback button (files Forgejo issues)
+# Create a token at https://git.opensourcesolarpunk.com/user/settings/applications
+# FORGEJO_API_TOKEN=
+
+# Enable thumbs up/down on chat answers (stores retrieval quality signals locally)
+# Off by default — opt in when you want to collect correction data
+# PAGEPIPER_CHAT_FEEDBACK=true
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..d2e487f
--- /dev/null
+++ b/README.md
@@ -0,0 +1,197 @@
+# Pagepiper
+
+**v0.1.0** | Self-hosted PDF and EPUB search for your personal library
+
+Pagepiper lets you drop PDFs and EPUBs into a library, index them, and search across the full text. With [Ollama](https://ollama.com) configured, you also get hybrid vector search and an LLM (large language model) chat interface that cites specific page numbers when it answers.
+
+Built for TTRPG (tabletop roleplaying game) players tired of ctrl-F'ing through Pathfinder core rulebooks. Works equally well for fan fiction EPUB collections, AO3 exports, and any personal document library.
+
+Try it: [pagepiper.circuitforge.tech](https://pagepiper.circuitforge.tech)
+
+---
+
+## Features
+
+| Feature | Free tier | Paid (BYOK) |
+|---------|-----------|-------------|
+| PDF and EPUB upload via browser drag-and-drop | Yes | Yes |
+| Directory scan for existing files | Yes | Yes |
+| BM25 full-text search (no LLM required) | Yes | Yes |
+| Unlimited local ingestion | Yes | Yes |
+| Hybrid BM25 + k-NN vector search | No | Yes (local Ollama) |
+| LLM chat with page-level citations | No | Yes (local Ollama) |
+| Thumbs up / down feedback on answers | No | Yes |
+
+BYOK (bring your own key) means you supply your own Ollama instance. No cloud API keys, no usage billing.
+
+**BM25** (Best Match 25) is a keyword ranking algorithm. It works without any LLM and runs entirely inside the Docker container. **k-NN** (k-nearest neighbor) vector search uses embeddings to find passages that are semantically similar to your question, even when the exact words don't match.
+
+---
+
+## Tech Stack
+
+- **Backend:** FastAPI + SQLite (BM25 via custom BM25Index, vectors via sqlite-vec)
+- **Frontend:** Vue 3 SPA served by nginx
+- **Embedding model:** `nomic-embed-text` via Ollama (1024-dim, optional)
+- **Chat LLM:** `mistral:7b` via Ollama (optional, any Ollama model works)
+- **Deployment:** Docker Compose
+
+---
+
+## Quick Start (Self-Hosting)
+
+### Prerequisites
+
+- [Docker](https://docs.docker.com/get-docker/) and Docker Compose
+- PDFs or EPUBs you want to search
+- Optional: [Ollama](https://ollama.com) for semantic search and RAG (retrieval-augmented generation) chat
+
+### 1. Clone the repo
+
+```bash
+git clone https://git.opensourcesolarpunk.com/Circuit-Forge/pagepiper
+cd pagepiper
+```
+
+### 2. Configure
+
+```bash
+cp .env.example .env
+```
+
+Open `.env` and set your paths:
+
+```dotenv
+# Directory to scan for PDFs/EPUBs (used by the "Scan" button in the UI)
+PAGEPIPER_BOOKS_DIR=/path/to/your/pdfs
+
+# Where Pagepiper stores its SQLite index and uploaded files
+PAGEPIPER_DATA_DIR=data
+```
+
+To unlock hybrid search and LLM chat, uncomment and set the Ollama block:
+
+```dotenv
+PAGEPIPER_OLLAMA_URL=http://localhost:11434
+PAGEPIPER_CHAT_MODEL=mistral:7b
+PAGEPIPER_EMBED_MODEL=nomic-embed-text
+```
+
+### 3. Start
+
+```bash
+./manage.sh start
+```
+
+Open [http://localhost:8521](http://localhost:8521).
+
+### 4. Add documents
+
+Two ways to add files:
+
+**Upload via browser** (easiest for small collections): Click **Upload** in the Library view and select a PDF or EPUB. The file saves to `data/uploads/` and begins indexing automatically.
+
+**Scan a directory** (best for large collections): Set `PAGEPIPER_BOOKS_DIR` in your `.env` to a folder of PDFs/EPUBs, then click **Scan** in the Library view. Pagepiper finds all files recursively and queues them for indexing.
+
+### 5. Search and chat
+
+Switch to the **Chat** tab and ask questions. On the free tier, BM25 keyword search returns matching passages. With Ollama configured, you get semantic search and an LLM-generated answer with page-number citations.
+
+---
+
+## Ollama Setup (optional)
+
+Install Ollama from [ollama.com](https://ollama.com), then pull the models:
+
+```bash
+ollama pull mistral:7b
+ollama pull nomic-embed-text
+```
+
+On a headless Linux server, make Ollama listen on all interfaces so the Docker container can reach it:
+
+```bash
+OLLAMA_HOST=0.0.0.0 ollama serve
+```
+
+On Docker Desktop (Linux or Mac), `host.docker.internal` resolves automatically. No extra network config needed.
+
+---
+
+## Environment Variables
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `PAGEPIPER_BOOKS_DIR` | `./books` | Host directory to scan for PDFs and EPUBs |
+| `PAGEPIPER_DATA_DIR` | `./data` | SQLite index and uploaded files live here |
+| `PAGEPIPER_OLLAMA_URL` | *(unset)* | Ollama base URL; leave blank for BM25-only mode |
+| `PAGEPIPER_EMBED_MODEL` | `nomic-embed-text` | Ollama embedding model (1024-dim default) |
+| `PAGEPIPER_EMBED_DIMS` | `1024` | Must match the embedding model's output dimensions |
+| `PAGEPIPER_CHAT_MODEL` | `mistral:7b` | Ollama chat model; any Ollama model name works |
+| `PAGEPIPER_CHAT_FEEDBACK` | *(unset)* | Set to `true` to enable thumbs up/down on chat answers |
+
+---
+
+## Management
+
+```bash
+./manage.sh start # Build and start (dev)
+./manage.sh stop # Stop
+./manage.sh restart # Restart
+./manage.sh status # Show container status
+./manage.sh logs [svc] # Tail logs (default: all services; pass 'api' or 'web' to filter)
+./manage.sh open # Open the UI in your browser
+./manage.sh build # Rebuild images without cache
+
+./manage.sh cloud:start # Start the cloud managed instance (port 8533)
+./manage.sh cloud:stop
+./manage.sh cloud:restart
+./manage.sh cloud:status
+./manage.sh cloud:logs [svc]
+./manage.sh cloud:build
+```
+
+---
+
+## Cloud Managed Instance
+
+The cloud deployment runs at [pagepiper.circuitforge.tech](https://pagepiper.circuitforge.tech) and at `menagerie.circuitforge.tech/pagepiper`. It uses `compose.cloud.yml` with LLM inference routed through the cf-orch coordinator.
+
+To run your own cloud-style deployment:
+
+```bash
+cp .env.cloud.example .env
+# Edit .env: set PAGEPIPER_OLLAMA_URL and data paths
+./manage.sh cloud:start
+```
+
+Cloud instance listens on port 8533. The API is internal-only; nginx proxies `/api/` to the backend.
+
+---
+
+## Data and Backups
+
+The `data/` directory contains the SQLite index database and all uploaded files. Back it up to preserve your index. Pagepiper indexes documents at ingest time. If you modify or replace a source file, use the re-index button on the document card to rebuild its entry.
+
+Large PDFs (hundreds of pages) can take a few minutes to index. The status badge on the document card updates as indexing progresses.
+
+---
+
+## Licensing
+
+Pagepiper uses a split license:
+
+- **MIT:** BM25 full-text search, document library management, ingest pipeline, EPUB support
+- **BSL 1.1:** Hybrid vector search (embedding + k-NN), RAG chat, LLM integration
+
+BSL 1.1 is free for personal non-commercial self-hosting. SaaS re-hosting or commercial redistribution requires a license from CircuitForge. BSL 1.1 converts to MIT after four years.
+
+License keys: [circuitforge.tech](https://circuitforge.tech)
+
+---
+
+## Contributing
+
+Issues and PRs welcome at [git.opensourcesolarpunk.com/Circuit-Forge/pagepiper](https://git.opensourcesolarpunk.com/Circuit-Forge/pagepiper).
+
+The ingest pipeline and BM25 index are MIT-licensed. If you build a better PDF parser or add support for additional formats (CBZ, MOBI, etc.), the community benefits directly.
diff --git a/app/api/chat.py b/app/api/chat.py
index c973ffe..dcfc35a 100644
--- a/app/api/chat.py
+++ b/app/api/chat.py
@@ -29,7 +29,7 @@ class ChatRequest(BaseModel):
message: str
history: list[ChatTurn] = []
doc_ids: list[str] | None = None
- top_k: int = 5
+ top_k: int = 10
class ChatResponse(BaseModel):
@@ -37,6 +37,13 @@ class ChatResponse(BaseModel):
citations: list[dict]
+class ChatFeedbackRequest(BaseModel):
+ rating: int # 1 = thumbs up, -1 = thumbs down
+ question: str = ""
+ answer: str = ""
+ doc_ids: list[str] = []
+
+
def _get_llm_router():
"""Return LLMRouter if Ollama configured, else None."""
from app.config import get_llm_config
@@ -125,3 +132,31 @@ def chat(req: ChatRequest) -> ChatResponse:
for c in result.citations
],
)
+
+
+@router.get("/feedback/status")
+def chat_feedback_status() -> dict:
+ enabled = os.environ.get("PAGEPIPER_CHAT_FEEDBACK", "").lower() in ("1", "true", "yes")
+ return {"enabled": enabled}
+
+
+@router.post("/feedback")
+def submit_chat_feedback(req: ChatFeedbackRequest) -> dict:
+ import json
+ import sqlite3
+
+ if req.rating not in (1, -1):
+ from fastapi import HTTPException
+ raise HTTPException(status_code=422, detail="rating must be 1 or -1")
+
+ db_path = _get_db_path()
+ con = sqlite3.connect(db_path)
+ try:
+ con.execute(
+ "INSERT INTO chat_feedback (rating, question, answer, doc_ids) VALUES (?, ?, ?, ?)",
+ (req.rating, req.question[:2000], req.answer[:4000], json.dumps(req.doc_ids)),
+ )
+ con.commit()
+ finally:
+ con.close()
+ return {"ok": True}
diff --git a/app/api/feedback.py b/app/api/feedback.py
new file mode 100644
index 0000000..57ac86e
--- /dev/null
+++ b/app/api/feedback.py
@@ -0,0 +1,7 @@
+"""Feedback router — provided by circuitforge-core."""
+from circuitforge_core.api import make_feedback_router
+
+router = make_feedback_router(
+ repo="Circuit-Forge/pagepiper",
+ product="pagepiper",
+)
diff --git a/app/api/feedback_attach.py b/app/api/feedback_attach.py
new file mode 100644
index 0000000..40c7285
--- /dev/null
+++ b/app/api/feedback_attach.py
@@ -0,0 +1,88 @@
+"""Screenshot attachment endpoint for in-app feedback.
+
+After the cf-core feedback router creates a Forgejo issue, the frontend
+can call POST /feedback/attach to upload a screenshot as a comment on that issue.
+"""
+from __future__ import annotations
+
+import base64
+import os
+
+import requests
+from fastapi import APIRouter, HTTPException
+from pydantic import BaseModel, Field
+
+router = APIRouter()
+
+_FORGEJO_BASE = os.environ.get(
+ "FORGEJO_API_URL", "https://git.opensourcesolarpunk.com/api/v1"
+)
+_REPO = "Circuit-Forge/pagepiper"
+_MAX_BYTES = 5 * 1024 * 1024
+
+
+class AttachRequest(BaseModel):
+ issue_number: int
+ filename: str = Field(default="screenshot.png", max_length=80)
+ image_b64: str # data URI or raw base64
+
+
+class AttachResponse(BaseModel):
+ comment_url: str
+
+
+def _forgejo_headers() -> dict[str, str]:
+ token = os.environ.get("FORGEJO_API_TOKEN", "")
+ return {"Authorization": f"token {token}"}
+
+
+def _decode_image(image_b64: str) -> tuple[bytes, str]:
+ if image_b64.startswith("data:"):
+ header, _, data = image_b64.partition(",")
+ mime = header.split(";")[0].split(":")[1] if ":" in header else "image/png"
+ else:
+ data = image_b64
+ mime = "image/png"
+ return base64.b64decode(data), mime
+
+
+@router.post("/attach", response_model=AttachResponse)
+def attach_screenshot(payload: AttachRequest) -> AttachResponse:
+ token = os.environ.get("FORGEJO_API_TOKEN", "")
+ if not token:
+ raise HTTPException(status_code=503, detail="Feedback not configured.")
+
+ raw_bytes, mime = _decode_image(payload.image_b64)
+ if len(raw_bytes) > _MAX_BYTES:
+ raise HTTPException(
+ status_code=413,
+ detail=f"Screenshot exceeds 5 MB limit ({len(raw_bytes) // 1024} KB received).",
+ )
+
+ asset_resp = requests.post(
+ f"{_FORGEJO_BASE}/repos/{_REPO}/issues/{payload.issue_number}/assets",
+ headers=_forgejo_headers(),
+ files={"attachment": (payload.filename, raw_bytes, mime)},
+ timeout=20,
+ )
+ if not asset_resp.ok:
+ raise HTTPException(
+ status_code=502,
+ detail=f"Forgejo asset upload failed: {asset_resp.text[:200]}",
+ )
+
+ asset_url = asset_resp.json().get("browser_download_url", "")
+ comment_body = f"**Screenshot attached by reporter:**\n\n"
+ comment_resp = requests.post(
+ f"{_FORGEJO_BASE}/repos/{_REPO}/issues/{payload.issue_number}/comments",
+ headers={**_forgejo_headers(), "Content-Type": "application/json"},
+ json={"body": comment_body},
+ timeout=15,
+ )
+ if not comment_resp.ok:
+ raise HTTPException(
+ status_code=502,
+ detail=f"Forgejo comment failed: {comment_resp.text[:200]}",
+ )
+
+ return AttachResponse(comment_url=comment_resp.json().get("html_url", ""))
diff --git a/app/api/library.py b/app/api/library.py
index 40fa525..ad96a8e 100644
--- a/app/api/library.py
+++ b/app/api/library.py
@@ -12,11 +12,13 @@ import uuid
from pathlib import Path
from typing import Callable
-from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
+from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, UploadFile
-from app.config import WATCH_DIR, DB_PATH, VEC_DB_PATH
+from app.config import WATCH_DIR, DB_PATH, VEC_DB_PATH, DATA_DIR
from app.deps import get_db
+_MAX_UPLOAD_BYTES = 200 * 1024 * 1024 # 200 MB
+
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/library", tags=["library"])
@@ -24,15 +26,31 @@ router = APIRouter(prefix="/api/library", tags=["library"])
_mark_bm25_dirty: Callable[[], None] | None = None
+_INGEST_TASKS = {
+ ".pdf": "pagepiper/ingest_pdf",
+ ".epub": "pagepiper/ingest_epub",
+}
+
+_INGEST_RUNNERS = {
+ ".pdf": "scripts.ingest_pdf",
+ ".epub": "scripts.ingest_epub",
+}
+
+
def _dispatch_ingest(
doc_id: str,
file_path: str,
background_tasks: BackgroundTasks,
) -> str:
"""Dispatch an ingest task. Tries cf-orch; falls back to BackgroundTasks."""
+ import importlib
import os as _os
from pathlib import Path as _Path
+ suffix = _Path(file_path).suffix.lower()
+ task_name = _INGEST_TASKS.get(suffix, "pagepiper/ingest_pdf")
+ runner_module = _INGEST_RUNNERS.get(suffix, "scripts.ingest_pdf")
+
# Read lazily so test fixtures (monkeypatch.setenv) take effect
_data_dir = _Path(_os.environ.get("PAGEPIPER_DATA_DIR", "data"))
task_id = str(uuid.uuid4())
@@ -45,11 +63,11 @@ def _dispatch_ingest(
try:
from circuitforge_core.tasks import dispatch_task # type: ignore[import]
- task_id = dispatch_task(caller="pagepiper/ingest_pdf", args=args)
+ task_id = dispatch_task(caller=task_name, args=args)
logger.info("Dispatched cf-orch ingest task %s for doc %s", task_id, doc_id)
except Exception:
- from scripts.ingest_pdf import run as run_ingest
- background_tasks.add_task(_run_ingest_background, run_ingest, args, task_id)
+ mod = importlib.import_module(runner_module)
+ background_tasks.add_task(_run_ingest_background, mod.run, args, task_id)
logger.info(
"cf-orch unavailable — running ingest in background thread (task %s)", task_id
)
@@ -89,7 +107,7 @@ def scan_library(
if not watch.exists():
raise HTTPException(status_code=404, detail=f"Watch directory not found: {watch}")
- pdfs = list(watch.glob("**/*.pdf"))
+ pdfs = list(watch.glob("**/*.pdf")) + list(watch.glob("**/*.epub"))
queued = []
for pdf_path in pdfs:
@@ -156,7 +174,8 @@ def delete_document(
# Remove embeddings from vector store
try:
from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore # type: ignore[import]
- store = LocalSQLiteVecStore(db_path=VEC_DB_PATH, table="page_vecs", dimensions=768)
+ from app.config import VEC_DIMENSIONS
+ store = LocalSQLiteVecStore(db_path=VEC_DB_PATH, table="page_vecs", dimensions=VEC_DIMENSIONS)
store.delete_where({"doc_id": doc_id})
except Exception as exc:
logger.warning("Could not remove vectors for doc %s: %s", doc_id, exc)
@@ -165,6 +184,20 @@ def delete_document(
_mark_bm25_dirty()
+def _get_vec_count(doc_id: str) -> int:
+ """Return how many vectors have been stored for this doc. Returns 0 on any error."""
+ try:
+ conn = sqlite3.connect(VEC_DB_PATH)
+ count = conn.execute(
+ "SELECT COUNT(*) FROM page_vecs_meta WHERE json_extract(metadata, '$.doc_id') = ?",
+ [doc_id],
+ ).fetchone()[0]
+ conn.close()
+ return int(count)
+ except Exception:
+ return 0
+
+
@router.get("/{doc_id}/status")
def document_status(
doc_id: str,
@@ -176,4 +209,54 @@ def document_status(
).fetchone()
if not row:
raise HTTPException(status_code=404, detail="Document not found")
- return dict(row)
+ result = dict(row)
+ result["vec_count"] = _get_vec_count(doc_id)
+ return result
+
+
+@router.post("/upload", status_code=202)
+def upload_document(
+ file: UploadFile,
+ background_tasks: BackgroundTasks,
+ db: sqlite3.Connection = Depends(get_db),
+) -> dict:
+ """Accept a PDF/EPUB upload, save to data/uploads/, and queue for indexing."""
+ name = Path(file.filename or "").name
+ suffix = Path(name).suffix.lower()
+ if suffix not in _INGEST_TASKS:
+ raise HTTPException(status_code=400, detail="Supported formats: PDF, EPUB")
+
+ content = file.file.read()
+ if len(content) > _MAX_UPLOAD_BYTES:
+ raise HTTPException(status_code=413, detail="File exceeds 200 MB limit")
+
+ upload_dir = DATA_DIR / "uploads"
+ upload_dir.mkdir(parents=True, exist_ok=True)
+ dest = upload_dir / name
+ dest.write_bytes(content)
+
+ path_str = str(dest.resolve())
+ existing = db.execute(
+ "SELECT id, status FROM documents WHERE file_path = ?", [path_str]
+ ).fetchone()
+
+ if existing and existing["status"] == "ready":
+ return {"doc_id": existing["id"], "task_id": None, "filename": name, "status": "already_indexed"}
+
+ if existing:
+ doc_id = existing["id"]
+ else:
+ title = dest.stem.replace("_", " ").replace("-", " ").title()
+ doc_id = db.execute(
+ "INSERT INTO documents(title, file_path, status) VALUES (?,?,?) RETURNING id",
+ [title, path_str, "pending"],
+ ).fetchone()[0]
+ db.commit()
+
+ task_id = _dispatch_ingest(doc_id, path_str, background_tasks)
+ db.execute(
+ "UPDATE documents SET status='processing', task_id=? WHERE id=?",
+ [task_id, doc_id],
+ )
+ db.commit()
+ return {"doc_id": doc_id, "task_id": task_id, "filename": name, "status": "queued"}
diff --git a/app/config.py b/app/config.py
index 9459f42..b18568f 100644
--- a/app/config.py
+++ b/app/config.py
@@ -10,6 +10,7 @@ DATA_DIR.mkdir(parents=True, exist_ok=True)
DB_PATH = str(DATA_DIR / "pagepiper.db")
VEC_DB_PATH = str(DATA_DIR / "pagepiper_vecs.db")
WATCH_DIR = Path(os.environ.get("PAGEPIPER_WATCH_DIR", "books"))
+VEC_DIMENSIONS = int(os.environ.get("PAGEPIPER_EMBED_DIMS", "1024"))
def get_llm_config() -> dict | None:
@@ -19,17 +20,27 @@ def get_llm_config() -> dict | None:
return None
_clean = url.rstrip("/")
_base_url = _clean if _clean.endswith("/v1") else _clean + "/v1"
+ chat_model = os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b")
+
+ backend: dict = {
+ "type": "openai_compat",
+ "base_url": _base_url,
+ "model": chat_model,
+ "embedding_model": os.environ.get("PAGEPIPER_EMBED_MODEL", "nomic-embed-text"),
+ "supports_images": False,
+ }
+
+ # Wire cf-orch allocation when coordinator is configured so the model stays warm
+ # and cold-start latency doesn't cause chat timeouts.
+ orch_url = os.environ.get("CF_ORCH_URL", "").strip()
+ if orch_url:
+ backend["cf_orch"] = {
+ "service": "ollama",
+ "model_candidates": [chat_model],
+ "ttl_s": 3600,
+ }
+
return {
"fallback_order": ["ollama"],
- "backends": {
- "ollama": {
- "type": "openai_compat",
- "base_url": _base_url,
- "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"),
- "embedding_model": os.environ.get(
- "PAGEPIPER_EMBED_MODEL", "nomic-embed-text"
- ),
- "supports_images": False,
- }
- },
+ "backends": {"ollama": backend},
}
diff --git a/app/deps.py b/app/deps.py
index e795afb..78cf87c 100644
--- a/app/deps.py
+++ b/app/deps.py
@@ -9,7 +9,7 @@ from app.config import DB_PATH
def get_db() -> Generator[sqlite3.Connection, None, None]:
- conn = sqlite3.connect(DB_PATH)
+ conn = sqlite3.connect(DB_PATH, check_same_thread=False)
conn.execute("PRAGMA foreign_keys = ON")
conn.execute("PRAGMA journal_mode = WAL")
conn.row_factory = sqlite3.Row
diff --git a/app/main.py b/app/main.py
index 7469d3d..e52f09a 100644
--- a/app/main.py
+++ b/app/main.py
@@ -3,11 +3,15 @@
from __future__ import annotations
import logging
+import os
+import re
+import sqlite3
+import threading
from contextlib import asynccontextmanager
from fastapi import FastAPI
-from app.config import DB_PATH
+from app.config import DB_PATH, VEC_DB_PATH, VEC_DIMENSIONS
from app.services.bm25_index import BM25Index
logger = logging.getLogger("pagepiper")
@@ -21,9 +25,91 @@ def _apply_migrations() -> None:
migrate(DB_PATH)
+def _reembed_docs(docs: list[tuple[str, str]], db_path: str, vec_db_path: str) -> None:
+ """Re-run full ingest for a list of (doc_id, file_path) sequentially."""
+ for doc_id, file_path in docs:
+ suffix = os.path.splitext(file_path)[1].lower()
+ try:
+ if suffix == ".epub":
+ from scripts.ingest_epub import run
+ else:
+ from scripts.ingest_pdf import run
+ logger.info("Auto re-embed: starting %s", os.path.basename(file_path))
+ run(doc_id=doc_id, file_path=file_path, db_path=db_path, vec_db_path=vec_db_path)
+ except Exception as exc:
+ logger.error("Auto re-embed failed for doc %s: %s", doc_id[:8], exc)
+
+
+def _check_vec_schema(vec_db_path: str, expected_dims: int, db_path: str) -> None:
+ """Drop the vec DB if its stored dimension doesn't match config, then queue re-embed.
+
+ sqlite-vec bakes the embedding dimension into the virtual table DDL, so changing
+ models requires dropping and recreating the whole file. Catches the mismatch at
+ startup rather than surfacing it as an obscure OperationalError mid-request.
+ """
+ if not os.path.exists(vec_db_path):
+ return
+ try:
+ conn = sqlite3.connect(vec_db_path)
+ row = conn.execute(
+ "SELECT sql FROM sqlite_master WHERE name='page_vecs_vecs'"
+ ).fetchone()
+ conn.close()
+ except Exception as exc:
+ logger.warning("Vec schema check could not read %s (non-fatal): %s", vec_db_path, exc)
+ return
+
+ if not row:
+ return # table not yet created — first embed will build it with the right dims
+
+ m = re.search(r'float\[(\d+)\]', row[0])
+ if not m:
+ return
+ actual_dims = int(m.group(1))
+ if actual_dims == expected_dims:
+ return
+
+ logger.warning(
+ "Vec DB dimension mismatch: stored=%d, configured=%d — dropping %s and queuing re-embed",
+ actual_dims, expected_dims, vec_db_path,
+ )
+ try:
+ os.remove(vec_db_path)
+ except OSError as exc:
+ logger.error(
+ "Could not delete stale vec DB %s: %s — fix permissions and restart", vec_db_path, exc
+ )
+ return
+
+ # Collect all ready docs so we can rebuild their embeddings in the background.
+ try:
+ conn = sqlite3.connect(db_path)
+ docs = conn.execute(
+ "SELECT id, file_path FROM documents WHERE status='ready'"
+ ).fetchall()
+ conn.close()
+ except Exception as exc:
+ logger.warning("Could not query documents for re-embed: %s", exc)
+ return
+
+ if not docs:
+ return
+
+ logger.info("Queuing re-embed for %d document(s) in background", len(docs))
+ threading.Thread(
+ target=_reembed_docs,
+ args=(docs, db_path, vec_db_path),
+ daemon=True,
+ name="pagepiper-reembed",
+ ).start()
+
+
@asynccontextmanager
async def lifespan(app: FastAPI):
_apply_migrations()
+ embed_model = os.environ.get("PAGEPIPER_EMBED_MODEL", "nomic-embed-text")
+ logger.info("Pagepiper starting — embed model: %s, dims: %d", embed_model, VEC_DIMENSIONS)
+ _check_vec_schema(VEC_DB_PATH, VEC_DIMENSIONS, DB_PATH)
_bm25.mark_dirty() # will rebuild on first search
yield
@@ -39,8 +125,12 @@ from app.api.library import router as library_router # noqa: E402
from app.api.ingest import router as ingest_router # noqa: E402
from app.api.search import router as search_router # noqa: E402
from app.api.chat import router as chat_router # noqa: E402
+from app.api.feedback import router as feedback_router # noqa: E402
+from app.api.feedback_attach import router as feedback_attach_router # noqa: E402
app.include_router(library_router)
app.include_router(ingest_router)
app.include_router(search_router)
app.include_router(chat_router)
+app.include_router(feedback_router, prefix="/api/v1/feedback")
+app.include_router(feedback_attach_router, prefix="/api/v1/feedback")
diff --git a/app/services/retriever.py b/app/services/retriever.py
index b335171..35ee597 100644
--- a/app/services/retriever.py
+++ b/app/services/retriever.py
@@ -8,6 +8,7 @@ BM25-only path is MIT and has no gate.
from __future__ import annotations
import logging
+import sqlite3
from dataclasses import dataclass
from app.services.bm25_index import BM25Index
@@ -15,6 +16,62 @@ from app.services.bm25_index import BM25Index
logger = logging.getLogger(__name__)
+def _fetch_adjacent(
+ hits: list["RetrievedChunk"],
+ db_path: str,
+ window: int = 1,
+) -> list["RetrievedChunk"]:
+ """Return chunks immediately before/after each hit that aren't already in the hit set.
+
+ Definitional passages often start mid-sentence because the EPUB/PDF chunk
+ boundary fell mid-paragraph. Fetching the preceding chunk restores the subject
+ so the LLM can understand 'them' / 'they' references correctly.
+ """
+ if not hits:
+ return []
+
+ existing_keys = {(c.doc_id, c.page_number) for c in hits}
+ needed: dict[str, set[int]] = {}
+ for c in hits:
+ for delta in range(-window, window + 1):
+ if delta == 0:
+ continue
+ adj_page = c.page_number + delta
+ if adj_page > 0 and (c.doc_id, adj_page) not in existing_keys:
+ needed.setdefault(c.doc_id, set()).add(adj_page)
+
+ if not needed:
+ return []
+
+ extra: list[RetrievedChunk] = []
+ try:
+ conn = sqlite3.connect(db_path)
+ conn.row_factory = sqlite3.Row
+ for doc_id, pages in needed.items():
+ placeholders = ",".join("?" * len(pages))
+ rows = conn.execute(
+ f"SELECT id, doc_id, page_number, text FROM page_chunks "
+ f"WHERE doc_id=? AND page_number IN ({placeholders})",
+ [doc_id] + sorted(pages),
+ ).fetchall()
+ for row in rows:
+ extra.append(
+ RetrievedChunk(
+ chunk_id=row["id"],
+ doc_id=row["doc_id"],
+ page_number=row["page_number"],
+ text=row["text"],
+ bm25_score=0.0,
+ vector_score=None,
+ )
+ )
+ conn.close()
+ except Exception as exc:
+ logger.warning("Context expansion query failed (non-fatal): %s", exc)
+
+ return extra
+
+
@dataclass(frozen=True)
class RetrievedChunk:
"""A chunk returned by the retriever, with source scores."""
@@ -55,13 +112,23 @@ class Retriever:
for r in self._bm25.query(query, top_k=top_k * 2, doc_ids=doc_ids)
}
- vec = llm.embed([query])[0]
- store = LocalSQLiteVecStore(db_path=vec_db_path, table="page_vecs", dimensions=768)
- filter_meta = {"doc_id": doc_ids[0]} if doc_ids and len(doc_ids) == 1 else None
- vec_hits = store.query(vec, top_k=top_k * 2, filter_metadata=filter_meta)
+ try:
+ vec = llm.embed([query])[0]
+ except Exception as exc:
+ logger.warning("Embed failed, falling back to BM25-only: %s", exc)
+ return self._bm25_only(query, top_k, doc_ids, db_path)
+ from app.config import VEC_DIMENSIONS
+ store = LocalSQLiteVecStore(db_path=vec_db_path, table="page_vecs", dimensions=VEC_DIMENSIONS)
- if doc_ids and len(doc_ids) > 1:
- vec_hits = [h for h in vec_hits if h.metadata.get("doc_id") in doc_ids]
+ # sqlite-vec applies filter_metadata as a Python post-filter after fetching k
+ # nearest globally. When the corpus spans many documents and only a subset is
+ # selected, most of those k candidates are from non-target docs and get dropped,
+ # leaving too few vector hits. Oversample heavily and filter in Python instead.
+ if doc_ids:
+ vec_candidates = store.query(vec, top_k=top_k * 20)
+ vec_hits = [h for h in vec_candidates if h.metadata.get("doc_id") in doc_ids]
+ else:
+ vec_hits = store.query(vec, top_k=top_k * 2)
# Merge: BM25 hits take priority; vector hits fill in additional results
merged: dict[str, RetrievedChunk] = {}
@@ -76,10 +143,10 @@ class Retriever:
)
for vh in vec_hits:
# _chunks is the loaded list of dicts from BM25Index; no public accessor exists
- text = next((c["text"] for c in self._bm25._chunks if c["id"] == vh.id), "")
- if vh.id in merged:
- existing = merged[vh.id]
- merged[vh.id] = RetrievedChunk(
+ text = next((c["text"] for c in self._bm25._chunks if c["id"] == vh.entry_id), "")
+ if vh.entry_id in merged:
+ existing = merged[vh.entry_id]
+ merged[vh.entry_id] = RetrievedChunk(
chunk_id=existing.chunk_id,
doc_id=existing.doc_id,
page_number=existing.page_number,
@@ -88,8 +155,8 @@ class Retriever:
vector_score=vh.score,
)
else:
- merged[vh.id] = RetrievedChunk(
- chunk_id=vh.id,
+ merged[vh.entry_id] = RetrievedChunk(
+ chunk_id=vh.entry_id,
doc_id=vh.metadata.get("doc_id", ""),
page_number=int(vh.metadata.get("page_number", 0)),
text=text,
@@ -103,14 +170,15 @@ class Retriever:
vec = (1.0 / (1.0 + r.vector_score)) if r.vector_score is not None else 0.0
return bm25 * 0.5 + vec * 0.5
- ranked = sorted(merged.values(), key=_combined, reverse=True)
- return ranked[:top_k]
+ ranked = sorted(merged.values(), key=_combined, reverse=True)[:top_k]
+ adjacent = _fetch_adjacent(ranked, db_path)
+ return ranked + adjacent
def _bm25_only(
self, query: str, top_k: int, doc_ids: list[str] | None, db_path: str
) -> list[RetrievedChunk]:
self._bm25.ensure_fresh(db_path)
- return [
+ hits = [
RetrievedChunk(
chunk_id=r.chunk_id,
doc_id=r.doc_id,
@@ -121,3 +189,5 @@ class Retriever:
)
for r in self._bm25.query(query, top_k=top_k, doc_ids=doc_ids)
]
+ adjacent = _fetch_adjacent(hits, db_path)
+ return hits + adjacent
diff --git a/app/services/synthesizer.py b/app/services/synthesizer.py
index 634b523..b9efdf2 100644
--- a/app/services/synthesizer.py
+++ b/app/services/synthesizer.py
@@ -42,7 +42,9 @@ class Synthesizer:
history: list[dict],
chunks: list[RetrievedChunk],
) -> SynthesisResult:
- context_parts = [f"[p.{c.page_number}]\n{c.text[:500]}" for c in chunks]
+ # 1500 chars (~300 words) per chunk: enough to capture definitions that
+ # appear mid-paragraph without blowing past a 32k-context model's limit.
+ context_parts = [f"[p.{c.page_number}]\n{c.text[:1500]}" for c in chunks]
context = "\n\n---\n\n".join(context_parts)
prompt = f"Document excerpts:\n\n{context}\n\nQuestion: {message}"
@@ -52,7 +54,7 @@ class Synthesizer:
Citation(
doc_id=c.doc_id,
page_number=c.page_number,
- snippet=c.text[:200],
+ snippet=c.text[:400],
bm25_score=c.bm25_score,
)
for c in chunks
diff --git a/compose.cloud.yml b/compose.cloud.yml
index 2bebdd7..2708b71 100644
--- a/compose.cloud.yml
+++ b/compose.cloud.yml
@@ -20,6 +20,8 @@ services:
# cf-orch: route LLM inference through coordinator for managed GPU access
CF_ORCH_URL: http://host.docker.internal:7700
CF_APP_NAME: pagepiper
+ # CF_LICENSE_KEY is the auth token CFOrchClient sends to the coordinator
+ CF_LICENSE_KEY: ${COORDINATOR_PAGEPIPER_KEY:-}
COORDINATOR_URL: http://10.1.10.71:7700
COORDINATOR_PAGEPIPER_KEY: ${COORDINATOR_PAGEPIPER_KEY:-}
extra_hosts:
diff --git a/docs/index.md b/docs/index.md
index f674c55..42f0fc9 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -1,6 +1,6 @@
# Pagepiper
-Self-hosted document search with BM25 full-text indexing and (with local Ollama) hybrid vector search and LLM-powered chat.
+Self-hosted document search with BM25 full-text indexing and (with local Ollama) hybrid vector search and LLM-powered chat. Supports PDF and EPUB files.
## Demo
@@ -12,7 +12,7 @@ Try it: [pagepiper.circuitforge.tech](https://pagepiper.circuitforge.tech)

-Scan your PDF directory to index documents. Each document shows page count and ingest status.
+Scan your PDF directory to index documents, or upload individual PDFs directly. Each document shows page count and ingest status.
### Chat
@@ -20,25 +20,123 @@ Scan your PDF directory to index documents. Each document shows page count and i
Ask questions across your indexed documents. Results cite the source document and page number.
-## Quick Start (Docker)
-
-```bash
-git clone https://git.opensourcesolarpunk.com/Circuit-Forge/pagepiper
-cd pagepiper
-cp .env.example .env # set PAGEPIPER_DATA_DIR and PAGEPIPER_BOOKS_DIR
-docker compose up -d --build
-# open http://localhost:8521
-```
-
-Place PDFs in your `PAGEPIPER_BOOKS_DIR` directory, then click "Scan for PDFs" in the Library view.
-
## Tiers
| Feature | Free | Paid (BYOK) |
|---------|------|-------------|
| BM25 full-text search | Yes | Yes |
+| PDF and EPUB upload via browser | Yes | Yes |
| Unlimited local ingestion | Yes | Yes |
| Hybrid vector search | No | Yes (local Ollama) |
| LLM chat over documents | No | Yes (local Ollama) |
-Set `PAGEPIPER_OLLAMA_URL` in your `.env` to unlock the Paid tier with your own Ollama instance.
+BYOK (Bring Your Own Key) means you supply your own Ollama instance. No cloud API keys required.
+
+---
+
+## Self-Hosting Guide
+
+### Prerequisites
+
+- [Docker](https://docs.docker.com/get-docker/) and Docker Compose
+- PDFs you want to search
+- Optional: [Ollama](https://ollama.com) running locally for semantic search and LLM chat
+
+### Step 1: Get the code
+
+```bash
+git clone https://git.opensourcesolarpunk.com/Circuit-Forge/pagepiper
+cd pagepiper
+```
+
+### Step 2: Configure
+
+```bash
+cp .env.example .env
+```
+
+Open `.env` and set your directories:
+
+```dotenv
+# Where pagepiper stores its index database
+PAGEPIPER_DATA_DIR=./data
+
+# Directory to scan for PDFs (used by the "Scan for PDFs" button)
+# You can also upload individual PDFs via the web UI without setting this
+PAGEPIPER_BOOKS_DIR=/path/to/your/pdfs
+```
+
+To unlock hybrid vector search and LLM chat, add your Ollama endpoint:
+
+```dotenv
+PAGEPIPER_OLLAMA_URL=http://localhost:11434
+PAGEPIPER_CHAT_MODEL=mistral:7b
+PAGEPIPER_EMBED_MODEL=nomic-embed-text
+```
+
+### Step 3: Start
+
+```bash
+docker compose up -d --build
+```
+
+Open [http://localhost:8521](http://localhost:8521) in your browser.
+
+### Step 4: Add your PDFs
+
+Two ways to add documents:
+
+**Option A — Upload via browser** (easiest for small collections):
+
+Click the **Upload PDF** button in the Library view and select a file. It saves to `data/uploads/` and begins indexing automatically.
+
+**Option B — Mount a directory** (best for large collections):
+
+Set `PAGEPIPER_BOOKS_DIR` in your `.env` to point at a folder of PDFs, then click **Scan for PDFs**. Pagepiper finds all `.pdf` files recursively and queues them for indexing.
+
+### Step 5: Search
+
+Switch to the **Chat** tab and ask questions about your documents. The Free tier uses BM25 keyword matching. With Ollama configured, you get semantic (vector) search and LLM-generated answers with page-level citations.
+
+---
+
+## Ollama Setup (optional)
+
+Install Ollama from [ollama.com](https://ollama.com), then pull the models:
+
+```bash
+ollama pull mistral:7b
+ollama pull nomic-embed-text
+```
+
+Pagepiper's Docker container reaches Ollama at `host.docker.internal` — no extra network config needed on Linux/Mac with Docker Desktop. On a headless Linux server, make sure Ollama binds to `0.0.0.0`:
+
+```bash
+OLLAMA_HOST=0.0.0.0 ollama serve
+```
+
+---
+
+## Managing the instance
+
+```bash
+# Check status
+docker compose ps
+
+# View API logs
+docker compose logs -f api
+
+# Stop
+docker compose down
+
+# Rebuild after updates
+docker compose up -d --build
+```
+
+---
+
+## Notes
+
+- Pagepiper indexes PDFs at ingest time. Changes to the source file require a re-index (use the re-index button on the document card).
+- The `data/` directory contains the SQLite index database and any uploaded files. Back it up to preserve your index.
+- Large PDFs (hundreds of pages) can take a few minutes to index. Watch the status badge on the document card.
diff --git a/environment.yml b/environment.yml
index f2a94a2..3e3bcf8 100644
--- a/environment.yml
+++ b/environment.yml
@@ -14,6 +14,8 @@ dependencies:
- pdfplumber>=0.11
- pytesseract>=0.3
- Pillow>=10.0
+ - ebooklib>=0.18
+ - beautifulsoup4>=4.12
- sqlite-vec>=0.1
- pytest>=8.0
- pytest-asyncio>=0.23
diff --git a/migrations/002_chat_feedback.sql b/migrations/002_chat_feedback.sql
new file mode 100644
index 0000000..31dcba6
--- /dev/null
+++ b/migrations/002_chat_feedback.sql
@@ -0,0 +1,9 @@
+-- chat answer thumbs up/down signals (local SQLite, always available)
+CREATE TABLE IF NOT EXISTS chat_feedback (
+ id TEXT PRIMARY KEY DEFAULT (lower(hex(randomblob(16)))),
+ rating INTEGER NOT NULL CHECK (rating IN (1, -1)),
+ question TEXT NOT NULL DEFAULT '',
+ answer TEXT NOT NULL DEFAULT '',
+ doc_ids TEXT NOT NULL DEFAULT '[]',
+ created_at TEXT NOT NULL DEFAULT (datetime('now'))
+);
diff --git a/scripts/ingest_epub.py b/scripts/ingest_epub.py
new file mode 100644
index 0000000..8984726
--- /dev/null
+++ b/scripts/ingest_epub.py
@@ -0,0 +1,239 @@
+# scripts/ingest_epub.py
+"""
+cf-orch task: pagepiper/ingest_epub
+
+Extracts text from an EPUB file, stores chapter chunks in SQLite, and (if Ollama is
+configured) generates embeddings and stores them in the sqlite-vec store.
+
+Each EPUB chapter becomes one chunk (equivalent to a PDF page).
+
+Entry point:
+ python scripts/ingest_epub.py --doc-id X --file-path Y --db-path Z --vec-db-path W
+"""
+from __future__ import annotations
+
+import logging
+import os
+import sqlite3
+from dataclasses import dataclass
+from pathlib import Path
+
+logger = logging.getLogger("pagepiper.ingest_epub")
+
+EMBED_BATCH_SIZE = 64
+_WORDS_PER_CHUNK = 500 # target chunk size for word-count fallback
+
+
+@dataclass
+class _Chunk:
+ page_number: int
+ text: str
+ source: str
+ word_count: int
+
+
+def _paragraphs_from_soup(soup) -> list[str]:
+ """Extract non-trivial, artifact-free text lines from parsed HTML."""
+ from scripts.text_clean import filter_paragraphs
+ raw = soup.get_text(separator="\n", strip=True)
+ return filter_paragraphs(raw.splitlines())
+
+
+def _chunks_from_paragraphs(paragraphs: list[str], start_num: int) -> list[_Chunk]:
+ """Accumulate paragraphs into ~_WORDS_PER_CHUNK-word chunks."""
+ chunks: list[_Chunk] = []
+ current: list[str] = []
+ current_count = 0
+ chunk_num = start_num
+
+ for para in paragraphs:
+ words = para.split()
+ if current_count + len(words) > _WORDS_PER_CHUNK and current:
+ text = "\n".join(current)
+ chunks.append(_Chunk(chunk_num, text, "text", current_count))
+ chunk_num += 1
+ current, current_count = [], 0
+ current.append(para)
+ current_count += len(words)
+
+ if current:
+ text = "\n".join(current)
+ chunks.append(_Chunk(chunk_num, text, "text", current_count))
+
+ return chunks
+
+
+def _extract_chunks(file_path: str) -> list[_Chunk]:
+ import ebooklib
+ from ebooklib import epub
+ from bs4 import BeautifulSoup
+ from scripts.text_clean import clean_line, is_artifact_line
+
+ book = epub.read_epub(file_path, options={"ignore_ncx": True})
+ all_chunks: list[_Chunk] = []
+
+ for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
+ soup = BeautifulSoup(item.get_content(), "html.parser")
+ headings = soup.find_all(["h1", "h2", "h3", "h4"])
+
+ if len(headings) >= 2:
+ # Heading-based split: one chunk per section
+ current_parts: list[str] = []
+ for elem in soup.find_all(["h1", "h2", "h3", "h4", "p", "li", "blockquote"]):
+ if elem.name in ("h1", "h2", "h3", "h4"):
+ if current_parts:
+ text = "\n".join(current_parts).strip()
+ if text:
+ n = len(all_chunks) + 1
+ all_chunks.append(_Chunk(n, text, "text", len(text.split())))
+ current_parts = [elem.get_text(" ", strip=True)]
+ else:
+ t = clean_line(elem.get_text(" ", strip=True))
+ if t and not is_artifact_line(t):
+ current_parts.append(t)
+ if current_parts:
+ text = "\n".join(current_parts).strip()
+ if text:
+ n = len(all_chunks) + 1
+ all_chunks.append(_Chunk(n, text, "text", len(text.split())))
+ else:
+ # Word-count fallback: accumulate paragraphs into ~500-word chunks
+ paragraphs = _paragraphs_from_soup(soup)
+ if paragraphs:
+ all_chunks.extend(_chunks_from_paragraphs(paragraphs, len(all_chunks) + 1))
+
+ return all_chunks
+
+
+def _update_status(
+ conn: sqlite3.Connection,
+ doc_id: str,
+ status: str,
+ page_count: int | None = None,
+ error_msg: str | None = None,
+) -> None:
+ if page_count is not None:
+ conn.execute(
+ "UPDATE documents SET status=?, page_count=?, updated_at=datetime('now') WHERE id=?",
+ [status, page_count, doc_id],
+ )
+ elif error_msg is not None:
+ conn.execute(
+ "UPDATE documents SET status=?, error_msg=?, updated_at=datetime('now') WHERE id=?",
+ [status, error_msg, doc_id],
+ )
+ else:
+ conn.execute(
+ "UPDATE documents SET status=?, updated_at=datetime('now') WHERE id=?",
+ [status, doc_id],
+ )
+ conn.commit()
+
+
+def run(doc_id: str, file_path: str, db_path: str, vec_db_path: str) -> None:
+ """Run the full ingest pipeline for one EPUB. Called by cf-orch or BackgroundTasks."""
+ conn: sqlite3.Connection | None = None
+ try:
+ conn = sqlite3.connect(db_path, timeout=30)
+ conn.execute("PRAGMA journal_mode = WAL")
+ conn.execute("PRAGMA foreign_keys = ON")
+ _update_status(conn, doc_id, "processing")
+
+ logger.info("Extracting chapters from %s", file_path)
+ chunks = _extract_chunks(file_path)
+ logger.info("Extracted %d chapters", len(chunks))
+
+ conn.execute("DELETE FROM page_chunks WHERE doc_id=?", [doc_id])
+ chunk_rows: list[tuple[str, int, str]] = []
+ for chunk in chunks:
+ row = conn.execute(
+ """INSERT INTO page_chunks(doc_id, page_number, text, source, word_count)
+ VALUES (?,?,?,?,?) RETURNING id""",
+ [doc_id, chunk.page_number, chunk.text, chunk.source, chunk.word_count],
+ ).fetchone()
+ chunk_rows.append((row[0], chunk.page_number, chunk.text))
+ conn.commit()
+
+ # Embedding failure is non-fatal: document remains BM25-searchable.
+ ollama_url = os.environ.get("PAGEPIPER_OLLAMA_URL", "").strip()
+ if ollama_url and chunks:
+ try:
+ logger.info("Embedding %d chapters via Ollama at %s", len(chunks), ollama_url)
+ from circuitforge_core.llm import LLMRouter
+ from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore
+
+ _clean = ollama_url.rstrip("/")
+ base_url = _clean if _clean.endswith("/v1") else _clean + "/v1"
+ router = LLMRouter({
+ "fallback_order": ["ollama"],
+ "backends": {
+ "ollama": {
+ "type": "openai_compat",
+ "base_url": base_url,
+ "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"),
+ "embedding_model": os.environ.get(
+ "PAGEPIPER_EMBED_MODEL", "nomic-embed-text"
+ ),
+ "supports_images": False,
+ }
+ },
+ })
+ embed_dims = int(os.environ.get("PAGEPIPER_EMBED_DIMS", "1024"))
+ vec_store = LocalSQLiteVecStore(
+ db_path=vec_db_path, table="page_vecs", dimensions=embed_dims
+ )
+ vec_store.delete_where({"doc_id": doc_id})
+
+ texts = [text for _, _, text in chunk_rows]
+ vectors: list[list[float]] = []
+ for i in range(0, len(texts), EMBED_BATCH_SIZE):
+ vectors.extend(router.embed(texts[i : i + EMBED_BATCH_SIZE]))
+
+ for (chunk_id, page_number, _), vector in zip(chunk_rows, vectors):
+ vec_store.upsert(
+ entry_id=chunk_id,
+ vector=vector,
+ metadata={"doc_id": doc_id, "page_number": page_number},
+ )
+ logger.info("Stored %d embeddings", len(vectors))
+ except Exception as embed_exc:
+ logger.warning(
+ "Embedding skipped for doc %s — BM25 only (reason: %s)",
+ doc_id, embed_exc,
+ )
+
+ _update_status(conn, doc_id, "ready", page_count=len(chunks))
+ logger.info("Ingest complete for doc %s (%d chapters)", doc_id, len(chunks))
+
+ except Exception as exc:
+ logger.error("Ingest failed for doc %s: %s", doc_id, exc, exc_info=True)
+ if conn is not None:
+ try:
+ _update_status(conn, doc_id, "error", error_msg=str(exc))
+ except Exception:
+ logger.warning("Could not write error status for doc %s", doc_id)
+ raise
+ finally:
+ if conn is not None:
+ conn.close()
+
+
+if __name__ == "__main__":
+ import argparse
+
+ logging.basicConfig(level=logging.INFO)
+
+ parser = argparse.ArgumentParser(
+ description="Ingest an EPUB (cf-orch task entry point)"
+ )
+ parser.add_argument("--doc-id", required=True)
+ parser.add_argument("--file-path", required=True)
+ parser.add_argument("--db-path", required=True)
+ parser.add_argument("--vec-db-path", required=True)
+ a = parser.parse_args()
+ run(
+ doc_id=a.doc_id,
+ file_path=a.file_path,
+ db_path=a.db_path,
+ vec_db_path=a.vec_db_path,
+ )
diff --git a/scripts/ingest_pdf.py b/scripts/ingest_pdf.py
index a6bf9af..65671de 100644
--- a/scripts/ingest_pdf.py
+++ b/scripts/ingest_pdf.py
@@ -52,7 +52,8 @@ def run(doc_id: str, file_path: str, db_path: str, vec_db_path: str) -> None:
conn: sqlite3.Connection | None = None
try:
- conn = sqlite3.connect(db_path)
+ conn = sqlite3.connect(db_path, timeout=30)
+ conn.execute("PRAGMA journal_mode = WAL")
conn.execute("PRAGMA foreign_keys = ON")
_update_status(conn, doc_id, "processing")
@@ -63,59 +64,71 @@ def run(doc_id: str, file_path: str, db_path: str, vec_db_path: str) -> None:
logger.info("Extracted %d pages", len(chunks))
# Step 2: Store chunks (replace any existing for this doc)
+ from scripts.text_clean import clean_paragraph
conn.execute("DELETE FROM page_chunks WHERE doc_id=?", [doc_id])
chunk_rows: list[tuple[str, int, str]] = []
for chunk in chunks:
+ cleaned_text = clean_paragraph(chunk.text)
+ if not cleaned_text:
+ continue
row = conn.execute(
"""INSERT INTO page_chunks(doc_id, page_number, text, source, word_count)
VALUES (?,?,?,?,?) RETURNING id""",
- [doc_id, chunk.page_number, chunk.text, chunk.source, chunk.word_count],
+ [doc_id, chunk.page_number, cleaned_text, chunk.source, len(cleaned_text.split())],
).fetchone()
- chunk_rows.append((row[0], chunk.page_number, chunk.text))
+ chunk_rows.append((row[0], chunk.page_number, cleaned_text))
conn.commit()
# Step 3: Embed and store vectors if Ollama is configured (BYOK gate)
+ # Embedding failure is non-fatal: document remains BM25-searchable.
ollama_url = os.environ.get("PAGEPIPER_OLLAMA_URL", "").strip()
if ollama_url and chunks:
- logger.info("Embedding %d pages via Ollama at %s", len(chunks), ollama_url)
- from circuitforge_core.llm import LLMRouter
- from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore
+ try:
+ logger.info("Embedding %d pages via Ollama at %s", len(chunks), ollama_url)
+ from circuitforge_core.llm import LLMRouter
+ from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore
- _clean = ollama_url.rstrip("/")
- base_url = _clean if _clean.endswith("/v1") else _clean + "/v1"
- router = LLMRouter({
- "fallback_order": ["ollama"],
- "backends": {
- "ollama": {
- "type": "openai_compat",
- "base_url": base_url,
- "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"),
- "embedding_model": os.environ.get(
- "PAGEPIPER_EMBED_MODEL", "nomic-embed-text"
- ),
- "supports_images": False,
- }
- },
- })
- vec_store = LocalSQLiteVecStore(
- db_path=vec_db_path, table="page_vecs", dimensions=768
- )
- # Remove old vectors before re-inserting. If embedding fails mid-way,
- # old vectors are gone but new ones are partial — re-ingest recovers.
- vec_store.delete_where({"doc_id": doc_id})
-
- texts = [text for _, _, text in chunk_rows]
- vectors: list[list[float]] = []
- for i in range(0, len(texts), EMBED_BATCH_SIZE):
- vectors.extend(router.embed(texts[i : i + EMBED_BATCH_SIZE]))
-
- for (chunk_id, page_number, _), vector in zip(chunk_rows, vectors):
- vec_store.upsert(
- id=chunk_id,
- vector=vector,
- metadata={"doc_id": doc_id, "page_number": page_number},
+ _clean = ollama_url.rstrip("/")
+ base_url = _clean if _clean.endswith("/v1") else _clean + "/v1"
+ router = LLMRouter({
+ "fallback_order": ["ollama"],
+ "backends": {
+ "ollama": {
+ "type": "openai_compat",
+ "base_url": base_url,
+ "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"),
+ "embedding_model": os.environ.get(
+ "PAGEPIPER_EMBED_MODEL", "nomic-embed-text"
+ ),
+ "supports_images": False,
+ }
+ },
+ })
+ embed_dims = int(os.environ.get("PAGEPIPER_EMBED_DIMS", "1024"))
+ vec_store = LocalSQLiteVecStore(
+ db_path=vec_db_path, table="page_vecs", dimensions=embed_dims
+ )
+ # Remove old vectors before re-inserting. If embedding fails mid-way,
+ # old vectors are gone but new ones are partial — re-ingest recovers.
+ vec_store.delete_where({"doc_id": doc_id})
+
+ texts = [text for _, _, text in chunk_rows]
+ vectors: list[list[float]] = []
+ for i in range(0, len(texts), EMBED_BATCH_SIZE):
+ vectors.extend(router.embed(texts[i : i + EMBED_BATCH_SIZE]))
+
+ for (chunk_id, page_number, _), vector in zip(chunk_rows, vectors):
+ vec_store.upsert(
+ entry_id=chunk_id,
+ vector=vector,
+ metadata={"doc_id": doc_id, "page_number": page_number},
+ )
+ logger.info("Stored %d embeddings", len(vectors))
+ except Exception as embed_exc:
+ logger.warning(
+ "Embedding skipped for doc %s — BM25 only (reason: %s)",
+ doc_id, embed_exc,
)
- logger.info("Stored %d embeddings", len(vectors))
_update_status(conn, doc_id, "ready", page_count=len(chunks))
logger.info("Ingest complete for doc %s (%d pages)", doc_id, len(chunks))
diff --git a/scripts/text_clean.py b/scripts/text_clean.py
new file mode 100644
index 0000000..26f1f48
--- /dev/null
+++ b/scripts/text_clean.py
@@ -0,0 +1,72 @@
+# scripts/text_clean.py
+"""
+Shared text-cleaning utilities for ingest pipelines.
+
+Removes boilerplate lines injected by ebook converters, piracy watermarks,
+and other non-content artifacts before chunks are stored or embedded.
+"""
+from __future__ import annotations
+
+import re
+
+# Lines that match any of these patterns are dropped entirely.
+# Each pattern is matched against the stripped line (case-insensitive).
+_LINE_DROP_PATTERNS: list[re.Pattern] = [
+ # ABC Amber converter family
+ re.compile(r'generated by abc amber', re.IGNORECASE),
+ re.compile(r'processtext\.com', re.IGNORECASE),
+ # Calibre / sigil metadata lines
+ re.compile(r'calibre \d+\.\d+', re.IGNORECASE),
+ # Standalone URLs (line is just a URL, no surrounding prose)
+ re.compile(r'^https?://\S+$'),
+ # Common piracy / file-sharing watermarks
+ re.compile(r'www\.\w+\.(com|net|org)/\S*book', re.IGNORECASE),
+ re.compile(r'downloaded from', re.IGNORECASE),
+ re.compile(r'scanned by', re.IGNORECASE),
+ re.compile(r'provided by', re.IGNORECASE),
+ # Page-number-only lines from PDF extraction (e.g. "- 42 -" or "42")
+ re.compile(r'^\s*-?\s*\d{1,4}\s*-?\s*$'),
+]
+
+# Inline substrings to strip from within a line before further processing.
+_INLINE_STRIP_PATTERNS: list[re.Pattern] = [
+ re.compile(r'generated by abc amber \w+ converter,?\s*https?://\S*', re.IGNORECASE),
+ re.compile(r'https?://www\.processtext\.com/\S*', re.IGNORECASE),
+]
+
+
+def is_artifact_line(line: str) -> bool:
+ """Return True if the line is a known conversion artifact and should be dropped."""
+ stripped = line.strip()
+ return any(p.search(stripped) for p in _LINE_DROP_PATTERNS)
+
+
+def clean_line(line: str) -> str:
+ """Strip inline converter artifacts from a line, returning the cleaned version."""
+ for p in _INLINE_STRIP_PATTERNS:
+ line = p.sub("", line)
+ return line.strip()
+
+
+def clean_paragraph(text: str) -> str:
+ """Clean a multi-line paragraph: drop artifact lines, strip inline artifacts."""
+ lines = []
+ for line in text.splitlines():
+ if is_artifact_line(line):
+ continue
+ cleaned = clean_line(line)
+ if cleaned:
+ lines.append(cleaned)
+ return "\n".join(lines)
+
+
+def filter_paragraphs(paragraphs: list[str]) -> list[str]:
+ """Remove artifact lines from a list of paragraph strings."""
+ result = []
+ for para in paragraphs:
+ if is_artifact_line(para):
+ continue
+ cleaned = clean_line(para)
+ if cleaned and len(cleaned.split()) >= 4:
+ result.append(cleaned)
+ return result
diff --git a/tests/conftest.py b/tests/conftest.py
index 3acd092..91e1f12 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -30,8 +30,10 @@ def client(test_db, tmp_path, monkeypatch):
from app.main import app, _bm25
from app.deps import get_db
- # Suppress migrations during tests — test_db fixture already applies the schema
+ # Suppress startup side effects — test_db fixture already applies the schema,
+ # and vec schema validation is tested separately in test_startup.py
monkeypatch.setattr(_main_module, "_apply_migrations", lambda: None)
+ monkeypatch.setattr(_main_module, "_check_vec_schema", lambda *a, **kw: None)
def override_db():
conn = sqlite3.connect(test_db)
diff --git a/tests/test_startup.py b/tests/test_startup.py
new file mode 100644
index 0000000..7eca538
--- /dev/null
+++ b/tests/test_startup.py
@@ -0,0 +1,170 @@
+# tests/test_startup.py
+"""Tests for startup vec DB schema validation (_check_vec_schema)."""
+from __future__ import annotations
+
+import os
+import sqlite3
+import threading
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from app.main import _check_vec_schema, _reembed_docs
+
+
+def _make_vec_db(path: str, dims: int) -> None:
+ """Create a minimal sqlite-vec-style DB with the given dimension."""
+ conn = sqlite3.connect(path)
+ conn.execute("PRAGMA journal_mode=WAL")
+ # Replicate the virtual table name used by LocalSQLiteVecStore
+ conn.execute(f"CREATE TABLE page_vecs_vecs (embedding float[{dims}])")
+ conn.execute(
+ "INSERT INTO sqlite_master(type, name, tbl_name, sql) VALUES (?,?,?,?)"
+ if False else ""
+ )
+ # Write a real sqlite_master entry via a virtual table workaround:
+ # Easiest is to put the dimension marker directly in a metadata table.
+ # But _check_vec_schema reads sqlite_master, so we need the real DDL there.
+ conn.close()
+ # sqlite_master is read-only — recreate using the real CREATE VIRTUAL TABLE path
+ # by faking it via a regular table with the matching name pattern.
+ conn2 = sqlite3.connect(path)
+ conn2.execute("DROP TABLE IF EXISTS page_vecs_vecs")
+ # Write a row that _check_vec_schema will parse via its regex
+ conn2.execute(
+ "CREATE TABLE _schema_hint (sql TEXT)"
+ )
+ conn2.execute(
+ "INSERT INTO _schema_hint VALUES (?)",
+ [f"CREATE VIRTUAL TABLE page_vecs_vecs USING vec0(embedding float[{dims}])"],
+ )
+ conn2.commit()
+ conn2.close()
+
+
+def _make_real_vec_db(path: str, dims: int) -> None:
+ """Create a vec DB whose sqlite_master actually contains the dimension DDL."""
+ import sqlite3 as _sq
+ # We can't load sqlite-vec in tests, so simulate by writing sqlite_master directly
+ # via a shadow table that _check_vec_schema reads.
+ conn = _sq.connect(path)
+ conn.execute(
+ f"""CREATE TABLE page_vecs_vecs (
+ embedding float[{dims}]
+ )"""
+ )
+ conn.commit()
+ conn.close()
+
+
+class TestCheckVecSchema:
+ def test_no_file_is_noop(self, tmp_path):
+ """Missing vec DB should not raise."""
+ _check_vec_schema(str(tmp_path / "missing.db"), 1024, str(tmp_path / "main.db"))
+
+ def test_matching_dims_keeps_file(self, tmp_path):
+ """Correct dimensions: vec DB must not be deleted."""
+ vec_path = str(tmp_path / "vecs.db")
+ conn = sqlite3.connect(vec_path)
+ conn.execute("CREATE TABLE page_vecs_vecs (embedding float[1024])")
+ conn.commit()
+ conn.close()
+
+ _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db"))
+
+ assert os.path.exists(vec_path), "Vec DB should not be deleted when dims match"
+
+ def test_mismatched_dims_deletes_file(self, tmp_path):
+ """Dimension mismatch: vec DB must be deleted."""
+ vec_path = str(tmp_path / "vecs.db")
+ conn = sqlite3.connect(vec_path)
+ conn.execute("CREATE TABLE page_vecs_vecs (embedding float[768])")
+ conn.commit()
+ conn.close()
+
+ db_path = str(tmp_path / "main.db")
+ _check_vec_schema(vec_path, 1024, db_path)
+
+ assert not os.path.exists(vec_path), "Vec DB should be deleted on dimension mismatch"
+
+ def test_mismatched_dims_queues_reembed(self, tmp_path):
+ """Dimension mismatch: re-embed thread must be started for ready docs."""
+ vec_path = str(tmp_path / "vecs.db")
+ conn = sqlite3.connect(vec_path)
+ conn.execute("CREATE TABLE page_vecs_vecs (embedding float[768])")
+ conn.commit()
+ conn.close()
+
+ db_path = str(tmp_path / "main.db")
+ schema = (
+ "CREATE TABLE documents ("
+ "id TEXT PRIMARY KEY, title TEXT, file_path TEXT, "
+ "status TEXT, task_id TEXT, page_count INTEGER, "
+ "error_msg TEXT, created_at TEXT, updated_at TEXT)"
+ )
+ main_conn = sqlite3.connect(db_path)
+ main_conn.execute(schema)
+ main_conn.execute(
+ "INSERT INTO documents VALUES ('abc123', 'Book', '/tmp/book.pdf', 'ready', NULL, 10, NULL, '2026-01-01', '2026-01-01')"
+ )
+ main_conn.commit()
+ main_conn.close()
+
+ started = []
+ real_thread_start = threading.Thread.start
+
+ def _capture_start(self):
+ started.append(self)
+ # Don't actually run the re-embed to keep tests fast
+ self.run = lambda: None
+ real_thread_start(self)
+
+ with patch.object(threading.Thread, "start", _capture_start):
+ _check_vec_schema(vec_path, 1024, db_path)
+
+ assert len(started) == 1, "Exactly one re-embed thread should be started"
+ assert started[0].name == "pagepiper-reembed"
+
+ def test_no_ready_docs_skips_thread(self, tmp_path):
+ """Mismatch with no ready docs: no thread should be started."""
+ vec_path = str(tmp_path / "vecs.db")
+ conn = sqlite3.connect(vec_path)
+ conn.execute("CREATE TABLE page_vecs_vecs (embedding float[768])")
+ conn.commit()
+ conn.close()
+
+ db_path = str(tmp_path / "main.db")
+ schema = (
+ "CREATE TABLE documents ("
+ "id TEXT PRIMARY KEY, title TEXT, file_path TEXT, "
+ "status TEXT, task_id TEXT, page_count INTEGER, "
+ "error_msg TEXT, created_at TEXT, updated_at TEXT)"
+ )
+ main_conn = sqlite3.connect(db_path)
+ main_conn.execute(schema)
+ main_conn.commit()
+ main_conn.close()
+
+ started = []
+ with patch.object(threading.Thread, "start", lambda self: started.append(self)):
+ _check_vec_schema(vec_path, 1024, db_path)
+
+ assert len(started) == 0
+
+ def test_empty_db_no_table_is_noop(self, tmp_path):
+ """Vec DB exists but has no page_vecs_vecs table yet: no deletion."""
+ vec_path = str(tmp_path / "vecs.db")
+ sqlite3.connect(vec_path).close() # create empty file
+
+ _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db"))
+
+ assert os.path.exists(vec_path)
+
+ def test_corrupt_db_does_not_raise(self, tmp_path):
+ """Corrupt or unreadable vec DB must not propagate exceptions."""
+ vec_path = str(tmp_path / "vecs.db")
+ with open(vec_path, "w") as f:
+ f.write("not a sqlite database")
+
+ _check_vec_schema(vec_path, 1024, str(tmp_path / "main.db"))
+ # No assertion needed — just must not raise
diff --git a/tests/test_text_clean.py b/tests/test_text_clean.py
new file mode 100644
index 0000000..6c954e3
--- /dev/null
+++ b/tests/test_text_clean.py
@@ -0,0 +1,108 @@
+# tests/test_text_clean.py
+"""Tests for ebook artifact filtering in scripts/text_clean.py."""
+from __future__ import annotations
+
+import pytest
+
+from scripts.text_clean import (
+ clean_line,
+ clean_paragraph,
+ filter_paragraphs,
+ is_artifact_line,
+)
+
+
+class TestIsArtifactLine:
+ def test_abc_amber_lit(self):
+ assert is_artifact_line(
+ "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html"
+ )
+
+ def test_abc_amber_rtf(self):
+ assert is_artifact_line("Generated by ABC Amber RTF Converter")
+
+ def test_processtext_url_only(self):
+ assert is_artifact_line("http://www.processtext.com/abclit.html")
+
+ def test_standalone_url(self):
+ assert is_artifact_line("https://www.example.com/book")
+
+ def test_page_number_only(self):
+ assert is_artifact_line("42")
+ assert is_artifact_line("- 42 -")
+ assert is_artifact_line(" 7 ")
+
+ def test_downloaded_from(self):
+ assert is_artifact_line("Downloaded from www.fictionsite.net")
+
+ def test_scanned_by(self):
+ assert is_artifact_line("Scanned by SomeUser")
+
+ def test_normal_prose_not_artifact(self):
+ assert not is_artifact_line(
+ '"And what if food isn\'t the only reason Jagang is going to Anderith?"'
+ )
+
+ def test_url_embedded_in_prose_not_dropped(self):
+ # A URL inside a sentence is not a standalone-URL artifact line
+ assert not is_artifact_line(
+ "You can read more about this at https://example.com and continue."
+ )
+
+ def test_short_page_header_not_dropped(self):
+ # "Chapter 1" is not an artifact — 4-digit number check only drops bare numbers
+ assert not is_artifact_line("Chapter 1")
+
+
+class TestCleanLine:
+ def test_strips_inline_abc_amber(self):
+ line = "Some prose. Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html"
+ result = clean_line(line)
+ assert "ABC Amber" not in result
+ assert "processtext" not in result
+ assert "Some prose." in result
+
+ def test_passes_clean_line_unchanged(self):
+ line = "He cocked an eyebrow and smiled."
+ assert clean_line(line) == line
+
+
+class TestCleanParagraph:
+ def test_drops_artifact_lines_from_paragraph(self):
+ text = (
+ "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html\n"
+ '"And what if food isn\'t the only reason Jagang is going to Anderith?"\n'
+ "He cocked an eyebrow."
+ )
+ result = clean_paragraph(text)
+ assert "ABC Amber" not in result
+ assert "Jagang" in result
+ assert "eyebrow" in result
+
+ def test_all_artifact_paragraph_returns_empty(self):
+ text = "Generated by ABC Amber LIT Converter\nhttp://www.processtext.com/abclit.html"
+ assert clean_paragraph(text) == ""
+
+ def test_clean_paragraph_unchanged(self):
+ text = "Richard raised his sword.\nThe magic surged through him."
+ assert clean_paragraph(text) == text
+
+
+class TestFilterParagraphs:
+ def test_drops_artifact_paragraphs(self):
+ paras = [
+ "Generated by ABC Amber LIT Converter, http://www.processtext.com/abclit.html",
+ '"And what if food isn\'t the only reason Jagang is going to Anderith?"',
+ "He cocked an eyebrow at the question.",
+ ]
+ result = filter_paragraphs(paras)
+ assert len(result) == 2
+ assert all("ABC Amber" not in p for p in result)
+
+ def test_drops_short_lines_under_4_words(self):
+ paras = ["Hi", "OK sure", "Valid sentence with enough words here."]
+ result = filter_paragraphs(paras)
+ assert result == ["Valid sentence with enough words here."]
+
+ def test_empty_input(self):
+ assert filter_paragraphs([]) == []
diff --git a/web/src/App.vue b/web/src/App.vue
index e699732..16af787 100644
--- a/web/src/App.vue
+++ b/web/src/App.vue
@@ -6,11 +6,13 @@