From 0795a9286c415a1ee6fea8241e059066a03d392f Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Wed, 6 May 2026 08:33:37 -0700 Subject: [PATCH] docs: add MkDocs site (getting-started, user-guide, reference) --- docs/getting-started/installation.md | 51 ++++++++++++++++++++ docs/getting-started/ollama-setup.md | 49 +++++++++++++++++++ docs/getting-started/quick-start.md | 36 ++++++++++++++ docs/plausible.js | 1 + docs/reference/architecture.md | 60 +++++++++++++++++++++++ docs/reference/environment-variables.md | 42 ++++++++++++++++ docs/reference/tier-system.md | 23 +++++++++ docs/user-guide/chat.md | 39 +++++++++++++++ docs/user-guide/library.md | 48 +++++++++++++++++++ docs/user-guide/search.md | 24 ++++++++++ mkdocs.yml | 64 +++++++++++++++++++++++++ 11 files changed, 437 insertions(+) create mode 100644 docs/getting-started/installation.md create mode 100644 docs/getting-started/ollama-setup.md create mode 100644 docs/getting-started/quick-start.md create mode 100644 docs/plausible.js create mode 100644 docs/reference/architecture.md create mode 100644 docs/reference/environment-variables.md create mode 100644 docs/reference/tier-system.md create mode 100644 docs/user-guide/chat.md create mode 100644 docs/user-guide/library.md create mode 100644 docs/user-guide/search.md create mode 100644 mkdocs.yml diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md new file mode 100644 index 0000000..6bfe7c7 --- /dev/null +++ b/docs/getting-started/installation.md @@ -0,0 +1,51 @@ +# Installation + +Pagepiper runs as a Docker Compose stack: a FastAPI backend and a Vue 3 frontend served by nginx. No external services are required for the core BM25 search feature set. 
+ +## Prerequisites + +- Docker and Docker Compose +- 1 GB disk for images, plus space for your document library + +## Quick setup + +```bash +git clone https://git.opensourcesolarpunk.com/Circuit-Forge/pagepiper +cd pagepiper +cp .env.example .env +./manage.sh start +``` + +The web UI opens at `http://localhost:8521`. + +## manage.sh commands + +| Command | Description | +|---------|-------------| +| `./manage.sh start` | Start all services (builds on first run) | +| `./manage.sh stop` | Stop all services | +| `./manage.sh restart` | Rebuild and restart | +| `./manage.sh status` | Show running containers | +| `./manage.sh logs [api\|web]` | Tail logs | +| `./manage.sh build` | Rebuild images without starting | +| `./manage.sh test` | Run the test suite | +| `./manage.sh open` | Open browser to the web UI | + +## Mounting a document directory + +To index an entire folder of PDFs and EPUBs, point `PAGEPIPER_WATCH_DIR` at it in your `.env`. The folder is scanned on demand, not automatically at startup: + +```bash +PAGEPIPER_WATCH_DIR=/home/you/books +``` + +Then use the **Scan for PDFs** button in the library to index everything in that directory. + +## Updating + +```bash +git pull +./manage.sh restart +``` + +The SQLite database persists in `data/` across rebuilds. diff --git a/docs/getting-started/ollama-setup.md b/docs/getting-started/ollama-setup.md new file mode 100644 index 0000000..9661f0f --- /dev/null +++ b/docs/getting-started/ollama-setup.md @@ -0,0 +1,49 @@ +# Ollama Setup + +Hybrid vector search and RAG chat are gated behind a local Ollama instance. This is the BYOK (bring your own key) unlock for the Free tier — no paid subscription required. 
+ +## Install Ollama + +```bash +curl -fsSL https://ollama.com/install.sh | sh +``` + +## Pull the required models + +```bash +# Embedding model — converts pages into vectors +ollama pull nomic-embed-text + +# Chat model — answers questions using retrieved page excerpts +ollama pull mistral:7b +``` + +`nomic-embed-text` produces 768-dimensional vectors (set `PAGEPIPER_EMBED_DIMS=768` to match) and runs comfortably on 8 GB of VRAM. +`mistral:7b` requires roughly 5 GB of VRAM. Substitute any compatible model. + +## Configure Pagepiper + +In your `.env`: + +```bash +PAGEPIPER_OLLAMA_URL=http://localhost:11434 +PAGEPIPER_EMBED_MODEL=nomic-embed-text +PAGEPIPER_CHAT_MODEL=mistral:7b +``` + +Restart Pagepiper: + +```bash +./manage.sh restart +``` + +## Verify + +Upload or re-index a document. The document card should show **Embedding N / M pages** during ingest. Once complete, the Chat tab becomes active. + +## Changing embedding models + +If you switch `PAGEPIPER_EMBED_MODEL`, Pagepiper detects the dimension mismatch at startup, deletes the old vector database, and automatically re-embeds all indexed documents in the background. BM25 search remains available throughout. + +!!! note + Re-embedding a large library can take 30-60 minutes depending on hardware. diff --git a/docs/getting-started/quick-start.md b/docs/getting-started/quick-start.md new file mode 100644 index 0000000..e6fc786 --- /dev/null +++ b/docs/getting-started/quick-start.md @@ -0,0 +1,36 @@ +# Quick Start + +This guide gets you from zero to searching your first document in under five minutes. + +## 1. Start Pagepiper + +```bash +./manage.sh start +``` + +Open `http://localhost:8521` in your browser. + +## 2. Add a document + +You have two options: + +**Upload directly** — click **Upload PDF / EPUB** in the library header and pick a file from your computer. + +**Scan a directory** — set `PAGEPIPER_WATCH_DIR` in your `.env` to a folder of PDFs or EPUBs, then click **Scan for PDFs**. Pagepiper indexes every file it finds. + +## 3. 
Wait for indexing + +The document card shows progress while text is being extracted and embedded: + +- **Extracting text...** (animated bar) — PDF/EPUB is being parsed into page chunks +- **Embedding N / M pages (X%)** (filling bar) — vectors are being written to the vector store (only when Ollama is configured) + +Once the badge shows **READY**, the document is searchable. + +## 4. Search + +Click **Search** in the navigation. Type any phrase and see ranked page excerpts with scores. Results are instant using BM25 full-text search — no Ollama required. + +## 5. Chat (optional, requires Ollama) + +See the [Ollama Setup](ollama-setup.md) guide to enable hybrid vector search and LLM-powered chat. Once configured, the **Chat** tab lets you ask natural-language questions and get answers with page citations. diff --git a/docs/plausible.js b/docs/plausible.js new file mode 100644 index 0000000..769bf63 --- /dev/null +++ b/docs/plausible.js @@ -0,0 +1 @@ +(function(){var s=document.createElement("script");s.defer=true;s.dataset.domain="docs.circuitforge.tech,circuitforge.tech";s.dataset.api="https://analytics.circuitforge.tech/api/event";s.src="https://analytics.circuitforge.tech/js/script.js";document.head.appendChild(s);})(); diff --git a/docs/reference/architecture.md b/docs/reference/architecture.md new file mode 100644 index 0000000..4069ec1 --- /dev/null +++ b/docs/reference/architecture.md @@ -0,0 +1,60 @@ +# Architecture + +## Overview + +``` +Browser (Vue 3 SPA) + | + nginx (static + /api proxy) + | + FastAPI backend + ├── BM25Index (in-process, rank-bm25) + ├── Retriever (BM25 + optional vector) + ├── Synthesizer (LLMRouter → Ollama) + └── SQLite (page_chunks + metadata) + + + sqlite-vec (vectors) +``` + +## Ingest pipeline + +``` +PDF / EPUB file + │ + ├─ PDFExtractor (pdfminer + OCR fallback) ← circuitforge_core + │ or + └─ EPUBExtractor (BeautifulSoup + heading chunking) + │ + text_clean.py (strip artifacts) + │ + INSERT INTO page_chunks + │ + Ollama embed 
(batches of 64) ← BYOK gate + │ + sqlite-vec upsert +``` + +## Retrieval + +Hybrid search merges BM25 and semantic results with a 50/50 score blend: + +1. BM25 queries the in-process index (no round-trip to DB) +2. Semantic query embeds the user query via Ollama, fetches `top_k * 20` nearest vectors, filters by `doc_id` in Python +3. Hits are merged: BM25 scores and vector scores combined; BM25 hits take priority +4. Top `k` results are ranked, then adjacent pages (page ± 1) are fetched to restore context for mid-sentence chunk boundaries + +## Storage + +| File | Format | Contents | +|------|--------|---------| +| `pagepiper.db` | SQLite | `documents`, `page_chunks`, `chat_feedback` | +| `pagepiper_vecs.db` | sqlite-vec | `page_vecs` virtual table + `page_vecs_meta` | + +The vector database stores one row per page chunk. If the embedding model changes, Pagepiper detects the dimension mismatch at startup (reads `CREATE VIRTUAL TABLE` DDL from `sqlite_master`), deletes the vec DB, and queues a background re-embed. + +## Licensing boundary + +| Component | License | +|-----------|---------| +| BM25 search, ingest pipeline, library API | MIT | +| Hybrid vector search, RAG chat, embedding | BSL 1.1 (BYOK unlocked on Free tier) | diff --git a/docs/reference/environment-variables.md b/docs/reference/environment-variables.md new file mode 100644 index 0000000..dab24ad --- /dev/null +++ b/docs/reference/environment-variables.md @@ -0,0 +1,42 @@ +# Environment Variables + +Copy `.env.example` to `.env` and configure as needed. 
+ +## Core + +| Variable | Default | Description | +|----------|---------|-------------| +| `PAGEPIPER_DATA_DIR` | `data` | Directory for SQLite databases and uploads | +| `PAGEPIPER_WATCH_DIR` | _(unset)_ | Directory scanned for PDFs/EPUBs on demand | +| `SECRET_KEY` | _(required)_ | Random secret for internal signing | + +## Ollama / BYOK + +| Variable | Default | Description | +|----------|---------|-------------| +| `PAGEPIPER_OLLAMA_URL` | _(unset)_ | Ollama base URL, e.g. `http://localhost:11434`. Enables hybrid search and chat. | +| `PAGEPIPER_EMBED_MODEL` | `nomic-embed-text` | Ollama embedding model | +| `PAGEPIPER_EMBED_DIMS` | `1024` | Embedding dimensions (must match the model) | +| `PAGEPIPER_CHAT_MODEL` | `mistral:7b` | Ollama chat/completion model | + +## cf-orch (managed deployments) + +| Variable | Default | Description | +|----------|---------|-------------| +| `CF_ORCH_URL` | _(unset)_ | cf-orch coordinator URL for GPU allocation | +| `CF_LICENSE_KEY` | _(unset)_ | License key for cf-orch authentication | +| `CF_APP_NAME` | `pagepiper` | Application identifier sent to cf-orch | + +## License (cloud tier) + +| Variable | Default | Description | +|----------|---------|-------------| +| `PAGEPIPER_HEIMDALL_URL` | _(unset)_ | Heimdall license server URL | +| `PAGEPIPER_HEIMDALL_TOKEN` | _(unset)_ | Admin token for license validation | + +## Feature flags + +| Variable | Default | Description | +|----------|---------|-------------| +| `PAGEPIPER_CHAT_FEEDBACK` | `false` | Enable thumbs up/down feedback UI on chat answers | +| `CLOUD_MODE` | `false` | Enable cloud-specific middleware (rate limiting, license checks) | diff --git a/docs/reference/tier-system.md b/docs/reference/tier-system.md new file mode 100644 index 0000000..2a36347 --- /dev/null +++ b/docs/reference/tier-system.md @@ -0,0 +1,23 @@ +# Tier System + +| Feature | Free | Paid (BYOK) | +|---------|------|-------------| +| BM25 full-text search | Yes | Yes | +| PDF and EPUB upload | Yes | 
Yes | +| Unlimited local ingestion | Yes | Yes | +| Directory scan | Yes | Yes | +| Hybrid vector search | No | Yes (local Ollama) | +| RAG chat with page citations | No | Yes (local Ollama) | +| Embedding model choice | No | Yes | + +## BYOK unlock + +Setting `PAGEPIPER_OLLAMA_URL` in your `.env` unlocks all Paid-tier features at no cost. You supply your own compute; Pagepiper supplies the pipeline. + +```bash +PAGEPIPER_OLLAMA_URL=http://localhost:11434 +``` + +## Cloud managed tier + +The hosted instance at [pagepiper.circuitforge.tech](https://pagepiper.circuitforge.tech) runs on Circuit Forge infrastructure and requires a Paid tier license key. A free trial is available without a key. diff --git a/docs/user-guide/chat.md b/docs/user-guide/chat.md new file mode 100644 index 0000000..0c93043 --- /dev/null +++ b/docs/user-guide/chat.md @@ -0,0 +1,39 @@ +# Chat + +RAG (retrieval-augmented generation) chat lets you ask natural-language questions and get answers grounded in your document library. Requires Ollama — see [Ollama Setup](../getting-started/ollama-setup.md). + +## Asking a question + +1. Click **Chat** in the navigation bar +2. Optionally select one or more documents to restrict the search scope +3. Type your question and press Enter or click Send + +Pagepiper retrieves the most relevant page excerpts using hybrid BM25 + vector search, then passes them to the local LLM with instructions to answer using only the provided text and cite every claim with a page number. + +## Citations + +Each answer includes a citation panel showing the source pages used. Citations include: + +- Document title +- Page number +- A short text excerpt from that page + +If the answer says `[p.42]`, you can cross-reference the citation panel to see exactly what text the model read. + +## Multi-document chat + +Leave the document selector empty to search across your entire library. When you have many books indexed, scoping to a specific document gives more precise results. 
+ +## Context window + +Pagepiper fetches the top 10 matching pages plus one adjacent page on each side of every hit. This ensures mid-paragraph chunk boundaries don't cut off context that the model needs to understand a passage. + +## Limitations + +- The model answers using only the retrieved excerpts. If the relevant passage was not retrieved, the model will say it cannot find an answer. +- Chat history is kept in the browser session only. Refreshing the page clears the conversation. +- RAG chat is gated behind a local Ollama instance. Cloud LLM backends are not currently supported on the Free tier. + +## Feedback + +The feedback UI is off by default; set `PAGEPIPER_CHAT_FEEDBACK=true` in your `.env` to enable it. Once enabled, use the thumbs up / thumbs down buttons after each answer to flag good and bad responses. Feedback is stored locally in `data/pagepiper.db` for future quality review. diff --git a/docs/user-guide/library.md b/docs/user-guide/library.md new file mode 100644 index 0000000..00957ca --- /dev/null +++ b/docs/user-guide/library.md @@ -0,0 +1,48 @@ +# Library + +The library is the home screen. It shows all indexed documents and lets you add new ones. + +## Adding documents + +**Upload** — click **Upload PDF / EPUB** and select a file. Files up to 200 MB are accepted. The document is saved to `data/uploads/` and queued for indexing immediately. + +**Scan** — set `PAGEPIPER_WATCH_DIR` to a directory in your `.env`, then click **Scan for PDFs**. Any PDF or EPUB not already in the library is queued. Re-scanning is safe; already-indexed documents are skipped. 
+ +## Document states + +| Badge | Meaning | +|-------|---------| +| PROCESSING | Text extraction or embedding in progress | +| READY | Fully indexed and searchable | +| ERROR | Indexing failed — see the error message on the card | + +## Ingestion progress + +While a document is processing, its card shows a live progress bar: + +- Animated sliding bar while text is being extracted (before page count is known) +- "Embedding N / M pages (X%)" once vectors are being written + +The card refreshes automatically, and the library list reloads when indexing completes. + +## Re-indexing + +Click **Re-index** on any document card to re-run the full ingest pipeline. This is useful after: + +- Changing the `PAGEPIPER_EMBED_MODEL` (dimension mismatch auto-detected at startup, but you can also trigger manually) +- A failed ingest you want to retry +- Updating to a new version of Pagepiper with an improved extractor + +## Removing a document + +Click **Remove** to delete the document's metadata, page chunks, and vectors. The source file on disk is not deleted. + +## Storage + +All data lives in the directory set by `PAGEPIPER_DATA_DIR` (default: `data/`): + +| File | Contents | +|------|---------| +| `pagepiper.db` | Document metadata, page chunks, chat feedback | +| `pagepiper_vecs.db` | sqlite-vec vector store | +| `uploads/` | Files added via browser upload | diff --git a/docs/user-guide/search.md b/docs/user-guide/search.md new file mode 100644 index 0000000..e8f945f --- /dev/null +++ b/docs/user-guide/search.md @@ -0,0 +1,24 @@ +# Search + +BM25 full-text search is available on the Free tier with no Ollama required. + +## Using search + +1. Click **Search** in the navigation bar +2. Type a phrase or keyword and submit it — results appear once the query is submitted +3. Results show the source document, page number, a text excerpt, and a BM25 relevance score + +## Filtering by document + +Use the document selector to restrict results to one or more specific books. 
This is useful when your library spans many documents and you know which one contains the answer. + +## BM25 scoring + +BM25 (Best Match 25) ranks pages by term frequency, weighted by how rare each term is across the whole corpus. A page ranks highest when it uses your query terms frequently and those terms are rare across the rest of the corpus. + +!!! tip + For short queries like "chimes" or "protocol", BM25 tends to surface later chapters where the term appears repeatedly in action scenes. If you want the introductory definition, try a longer phrase like "what are the chimes" to give BM25 more signal. + +## Hybrid search (requires Ollama) + +When Ollama is configured, the Chat endpoint uses hybrid search behind the scenes: BM25 results are merged with semantic vector results using a 50/50 score blend. The Search page always uses BM25 only. diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..a9367db --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,64 @@ +site_name: Pagepiper +site_description: Self-hosted PDF and EPUB library with BM25 full-text search, hybrid vector retrieval, and LLM-powered RAG chat. 
+site_author: Circuit Forge LLC +site_url: https://docs.circuitforge.tech/pagepiper +repo_url: https://git.opensourcesolarpunk.com/Circuit-Forge/pagepiper +repo_name: Circuit-Forge/pagepiper + +theme: + name: material + palette: + - scheme: default + primary: deep purple + accent: purple + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - scheme: slate + primary: deep purple + accent: purple + toggle: + icon: material/brightness-4 + name: Switch to light mode + features: + - navigation.tabs + - navigation.sections + - navigation.expand + - navigation.top + - search.suggest + - search.highlight + - content.code.copy + +markdown_extensions: + - admonition + - pymdownx.details + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.tabbed: + alternate_style: true + - tables + - toc: + permalink: true + +nav: + - Home: index.md + - Getting Started: + - Installation: getting-started/installation.md + - Quick Start: getting-started/quick-start.md + - Ollama Setup: getting-started/ollama-setup.md + - User Guide: + - Library: user-guide/library.md + - Search: user-guide/search.md + - Chat: user-guide/chat.md + - Reference: + - Architecture: reference/architecture.md + - Tier System: reference/tier-system.md + - Environment Variables: reference/environment-variables.md + +extra_javascript: + - plausible.js