From 3a0608ff980d3c5c82ebcdd8ccf50a06cb549c67 Mon Sep 17 00:00:00 2001
From: pyr0ball
Date: Mon, 4 May 2026 16:54:08 -0700
Subject: [PATCH] chore: initial pagepiper repo scaffold

Adds pyproject.toml, environment.yml, Dockerfile, docker/web (Vue+nginx),
compose.yml, compose.override.yml.example, manage.sh, .env.example,
.gitignore, and config stubs for the pagepiper self-hosted PDF library tool.
Port 8521. No secrets committed.
---
 .env.example                 | 12 +++++++++
 .gitignore                   | 24 +++++++++++++++++
 Dockerfile                   | 31 ++++++++++++++++++++++
 compose.override.yml.example | 13 ++++++++++
 compose.yml                  | 21 +++++++++++++++
 config/ingest.yaml           |  2 ++
 config/llm.yaml.example      |  7 +++++
 docker/web/Dockerfile        | 19 ++++++++++++++
 docker/web/nginx.conf        | 17 ++++++++++++
 environment.yml              | 19 ++++++++++++++
 manage.sh                    | 50 ++++++++++++++++++++++++++++++++++++
 pyproject.toml               | 27 +++++++++++++++++++
 12 files changed, 242 insertions(+)
 create mode 100644 .env.example
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 compose.override.yml.example
 create mode 100644 compose.yml
 create mode 100644 config/ingest.yaml
 create mode 100644 config/llm.yaml.example
 create mode 100644 docker/web/Dockerfile
 create mode 100644 docker/web/nginx.conf
 create mode 100644 environment.yml
 create mode 100755 manage.sh
 create mode 100644 pyproject.toml

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..2f3bcd1
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,12 @@
+# Copy to .env and fill in your values. .env is gitignored.
+
+# Path to your PDF library on the host machine
+PAGEPIPER_BOOKS_DIR=/path/to/your/pdfs
+
+# Data directory (SQLite + vector DB stored here)
+PAGEPIPER_DATA_DIR=data
+
+# Ollama URL — set this to unlock semantic search and RAG chat (BYOK)
+# PAGEPIPER_OLLAMA_URL=http://localhost:11434
+# PAGEPIPER_CHAT_MODEL=mistral:7b
+# PAGEPIPER_EMBED_MODEL=nomic-embed-text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f3a24d3
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,24 @@
+# Secrets and local config
+.env
+config/llm.yaml
+CLAUDE.md
+
+# Data
+data/
+books/
+
+# Python
+__pycache__/
+*.pyc
+*.pyo
+.pytest_cache/
+*.egg-info/
+dist/
+.eggs/
+
+# Node
+web/node_modules/
+web/dist/
+
+# Docker override (local dev extras)
+compose.override.yml
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..a8d0715
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,31 @@
+FROM continuumio/miniconda3:24.7.1-0
+
+WORKDIR /app
+
+# System deps for pytesseract (OCR) and pdfplumber
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgl1 \
+    tesseract-ocr \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install circuitforge-core from sibling directory (compose sets context: ..)
+COPY circuitforge-core/ ./circuitforge-core/
+RUN conda run -n base pip install --no-cache-dir -e "./circuitforge-core[pdf,vector]"
+
+# Create pagepiper conda env; purge conda caches in the same layer so they never reach the image
+COPY pagepiper/environment.yml .
+RUN conda env create -f environment.yml && conda clean -afy
+
+COPY pagepiper/ ./pagepiper/
+
+# Drops secrets from the final filesystem only; the COPY layer above still holds them, so also exclude them via .dockerignore in the build-context root
+RUN rm -f /app/pagepiper/.env /app/pagepiper/config/llm.yaml
+
+# Install cf-core into pagepiper env + the app itself
+RUN conda run -n pagepiper pip install --no-cache-dir -e "/app/circuitforge-core[pdf,vector]"
+WORKDIR /app/pagepiper
+RUN conda run -n pagepiper pip install --no-cache-dir -e .
+
+EXPOSE 8522
+CMD ["conda", "run", "--no-capture-output", "-n", "pagepiper", \
+     "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8522"]
diff --git a/compose.override.yml.example b/compose.override.yml.example
new file mode 100644
index 0000000..94553eb
--- /dev/null
+++ b/compose.override.yml.example
@@ -0,0 +1,13 @@
+# Copy to compose.override.yml and fill in your values.
+# compose.override.yml is gitignored — never commit secrets.
+
+services:
+  api:
+    environment:
+      # Point to your local Ollama instance to unlock semantic search and RAG chat
+      PAGEPIPER_OLLAMA_URL: "http://localhost:11434"
+      PAGEPIPER_CHAT_MODEL: "mistral:7b"
+      PAGEPIPER_EMBED_MODEL: "nomic-embed-text"
+    volumes:
+      # Override books directory if your PDFs are elsewhere
+      - /path/to/your/pdfs:/books:ro
diff --git a/compose.yml b/compose.yml
new file mode 100644
index 0000000..7c7f0e2
--- /dev/null
+++ b/compose.yml
@@ -0,0 +1,21 @@
+services:
+  api:
+    build:
+      context: ..
+      dockerfile: pagepiper/Dockerfile
+    network_mode: host  # API binds host port 8522; 8521 is reserved for the published web UI
+    env_file: .env
+    volumes:
+      - ./data:/app/pagepiper/data
+      - ${PAGEPIPER_BOOKS_DIR:-./books}:/books:ro
+    restart: unless-stopped
+
+  web:
+    build:
+      context: .
+      dockerfile: docker/web/Dockerfile
+    ports:
+      - "8521:80"
+    extra_hosts: ["host.docker.internal:host-gateway"]  # lets nginx reach the host-networked API
+    restart: unless-stopped
+    depends_on: [api]
diff --git a/config/ingest.yaml b/config/ingest.yaml
new file mode 100644
index 0000000..c14eaeb
--- /dev/null
+++ b/config/ingest.yaml
@@ -0,0 +1,2 @@
+ocr_min_words: 10   # Pages with fewer words from text layer → OCR fallback
+batch_size: 32      # Pages embedded per Ollama call (tune to your GPU VRAM)
diff --git a/config/llm.yaml.example b/config/llm.yaml.example
new file mode 100644
index 0000000..2b28014
--- /dev/null
+++ b/config/llm.yaml.example
@@ -0,0 +1,7 @@
+# Copy to config/llm.yaml (gitignored) — or use .env / compose.override.yml instead.
+
+provider: ollama
+base_url: "${PAGEPIPER_OLLAMA_URL}"
+chat_model: mistral:7b
+embedding_model: nomic-embed-text  # ollama pull nomic-embed-text
+vector_store: sqlite_vec
diff --git a/docker/web/Dockerfile b/docker/web/Dockerfile
new file mode 100644
index 0000000..e164057
--- /dev/null
+++ b/docker/web/Dockerfile
@@ -0,0 +1,19 @@
+# Stage 1: build Vue SPA
+FROM node:20-alpine AS build
+WORKDIR /app
+COPY web/package*.json ./
+RUN npm ci --prefer-offline
+COPY web/ ./
+
+ARG VITE_BASE_URL=/
+ARG VITE_API_BASE=
+ENV VITE_BASE_URL=$VITE_BASE_URL
+ENV VITE_API_BASE=$VITE_API_BASE
+
+RUN npm run build
+
+# Stage 2: serve via nginx
+FROM nginx:1.27-alpine
+COPY docker/web/nginx.conf /etc/nginx/conf.d/default.conf
+COPY --from=build /app/dist /usr/share/nginx/html
+EXPOSE 80
diff --git a/docker/web/nginx.conf b/docker/web/nginx.conf
new file mode 100644
index 0000000..e3d0524
--- /dev/null
+++ b/docker/web/nginx.conf
@@ -0,0 +1,17 @@
+server {
+    listen 80;
+    root /usr/share/nginx/html;
+    index index.html;
+
+    # SPA routing — all non-asset paths → index.html
+    location / {
+        try_files $uri $uri/ /index.html;
+    }
+
+    # Proxy API requests to FastAPI on the Docker host (API uses host networking, port 8522)
+    location /api/ {
+        proxy_pass http://host.docker.internal:8522;
+        proxy_set_header Host $host;
+        proxy_set_header X-Real-IP $remote_addr;
+    }
+}
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..f2a94a2
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,19 @@
+name: pagepiper
+channels:
+  - conda-forge
+  - defaults
+dependencies:
+  - python=3.11
+  - pip
+  - pip:
+    - fastapi>=0.110
+    - uvicorn[standard]>=0.29
+    - rank-bm25>=0.2
+    - PyYAML>=6.0
+    - httpx>=0.27
+    - pdfplumber>=0.11
+    - pytesseract>=0.3
+    - Pillow>=10.0
+    - sqlite-vec>=0.1
+    - pytest>=8.0
+    - pytest-asyncio>=0.23
diff --git a/manage.sh b/manage.sh
new file mode 100755
index 0000000..c94edd9
--- /dev/null
+++ b/manage.sh
@@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SERVICE=pagepiper
+WEB_PORT=8521
+COMPOSE_FILE="compose.yml"
+
+compose=(docker compose -f "$COMPOSE_FILE")  # base command; array keeps each argument intact
+[[ -f "compose.override.yml" ]] && compose+=(-f compose.override.yml)  # layer local overrides when present
+
+usage() {
+  echo "Usage: $0 {start|stop|restart|status|logs [svc]|open|build|test}"
+  exit 1
+}
+
+cmd="${1:-help}"
+shift || true
+
+case "$cmd" in
+  start)
+    "${compose[@]}" up -d --build
+    echo "Pagepiper running → http://localhost:${WEB_PORT}"
+    ;;
+  stop)
+    "${compose[@]}" down
+    ;;
+  restart)
+    "${compose[@]}" down
+    "${compose[@]}" up -d --build
+    echo "Pagepiper running → http://localhost:${WEB_PORT}"
+    ;;
+  status)
+    "${compose[@]}" ps
+    ;;
+  logs)
+    "${compose[@]}" logs -f "$@"  # no service given → follow all (never pass an empty-string name)
+    ;;
+  open)
+    xdg-open "http://localhost:${WEB_PORT}" 2>/dev/null || open "http://localhost:${WEB_PORT}"
+    ;;
+  build)
+    "${compose[@]}" build --no-cache
+    ;;
+  test)
+    conda run -n cf pytest tests/ -v
+    ;;
+  *)
+    usage
+    ;;
+esac
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..884a2da
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,27 @@
+[build-system]
+requires = ["setuptools>=68", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "pagepiper"
+version = "0.1.0"
+description = "Self-hosted PDF library manager with RAG chat and page-level citations"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "fastapi>=0.110",
+    "uvicorn[standard]>=0.29",
+    "python-multipart>=0.0.9",
+    "rank-bm25>=0.2",
+    "PyYAML>=6.0",
+    "httpx>=0.27",
+    "circuitforge-core[pdf,vector]>=0.19.0",
+]
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["app*"]
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+asyncio_mode = "auto"