Compare commits
16 commits
abeb6089e5
...
2e24808d91
| Author | SHA1 | Date | |
|---|---|---|---|
| 2e24808d91 | |||
| c24bd33478 | |||
| 6fc8e7faa6 | |||
| 6bda1143cc | |||
| e401cb5f48 | |||
| b4837163d5 | |||
| 17cdb552a3 | |||
| 0e493ab560 | |||
| eb5c7383ed | |||
| 6869f32392 | |||
| c6fa9baf2c | |||
| f4574dd05e | |||
| 751faf1679 | |||
| 4c2370f1de | |||
| 47914cebeb | |||
| 2253cd7da3 |
44 changed files with 5457 additions and 2 deletions
22
.env.cloud.example
Normal file
22
.env.cloud.example
Normal file
|
|
@ -0,0 +1,22 @@
|
||||||
|
# pagepiper cloud environment — copy to .env and fill in secrets
|
||||||
|
# Used by: docker compose -f compose.cloud.yml -p pagepiper-cloud ...
|
||||||
|
|
||||||
|
# Data directories (host paths, bind-mounted into the api container)
|
||||||
|
PAGEPIPER_DATA_DIR=/devl/pagepiper-cloud-data
|
||||||
|
PAGEPIPER_BOOKS_DIR=/devl/pagepiper-cloud-data/books
|
||||||
|
|
||||||
|
# BYOK gate — set to enable hybrid search and RAG chat (BSL feature)
|
||||||
|
# Leave blank to run BM25-only mode (MIT, no Ollama required)
|
||||||
|
PAGEPIPER_OLLAMA_URL=
|
||||||
|
|
||||||
|
# Embedding and chat model selection (only used when PAGEPIPER_OLLAMA_URL is set)
|
||||||
|
PAGEPIPER_EMBED_MODEL=nomic-embed-text
|
||||||
|
PAGEPIPER_CHAT_MODEL=mistral:7b
|
||||||
|
|
||||||
|
# Heimdall license server (optional — for per-user tier validation)
|
||||||
|
HEIMDALL_URL=https://license.circuitforge.tech
|
||||||
|
HEIMDALL_ADMIN_TOKEN=
|
||||||
|
|
||||||
|
# cf-orch streaming proxy — coordinator product key
|
||||||
|
# Must match COORDINATOR_PRODUCT_KEYS["pagepiper"] in cf-orch.env on the coordinator
|
||||||
|
COORDINATOR_PAGEPIPER_KEY=
|
||||||
127
app/api/chat.py
Normal file
127
app/api/chat.py
Normal file
|
|
@ -0,0 +1,127 @@
|
||||||
|
# app/api/chat.py
|
||||||
|
"""
|
||||||
|
RAG chat endpoint — retrieves relevant page chunks and synthesizes an answer.
|
||||||
|
|
||||||
|
BSL 1.1 — BYOK gate: requires PAGEPIPER_OLLAMA_URL or a Paid tier license.
|
||||||
|
Returns 402 with clear upgrade message if neither is configured.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from app.services.retriever import Retriever
|
||||||
|
from app.services.synthesizer import Synthesizer
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(prefix="/api/chat", tags=["chat"])
|
||||||
|
|
||||||
|
|
||||||
|
# One prior message of the conversation, as supplied by the client in
# ChatRequest.history.
class ChatTurn(BaseModel):
    role: str  # "user" | "assistant"
    content: str  # text of this turn
|
||||||
|
|
||||||
|
|
||||||
|
# Request body for POST /api/chat.
class ChatRequest(BaseModel):
    message: str  # the question; also used as the retrieval query
    history: list[ChatTurn] = []  # earlier turns, forwarded to the synthesizer
    doc_ids: list[str] | None = None  # forwarded to the retriever as its doc_ids filter
    top_k: int = 5  # number of chunks requested from the retriever
|
||||||
|
|
||||||
|
|
||||||
|
# Response body for POST /api/chat.
class ChatResponse(BaseModel):
    answer: str  # synthesized answer, or a fixed "nothing found" message
    # Each entry carries doc_id, page_number, snippet and bm25_score
    # (see the dicts built in chat()).
    citations: list[dict]
|
||||||
|
|
||||||
|
|
||||||
|
def _get_llm_router():
    """Build an LLMRouter from the app's LLM config, or return None.

    ``None`` is returned when ``get_llm_config()`` yields no configuration.
    Both imports are deferred to call time, so ``circuitforge_core`` is only
    imported once a configuration is actually present.
    """
    from app.config import get_llm_config

    llm_config = get_llm_config()
    if llm_config is not None:
        from circuitforge_core.llm import LLMRouter

        return LLMRouter(llm_config)
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_db_path() -> str:
    """Return the path of ``pagepiper.db`` inside the data directory.

    PAGEPIPER_DATA_DIR is read on every call (default ``"data"``) so that
    environment changes made by test fixtures take effect.
    """
    from pathlib import Path

    base_dir = os.environ.get("PAGEPIPER_DATA_DIR", "data")
    return str(Path(base_dir, "pagepiper.db"))
|
||||||
|
|
||||||
|
|
||||||
|
def _get_vec_db_path() -> str:
    """Return the path of ``pagepiper_vecs.db`` inside the data directory.

    PAGEPIPER_DATA_DIR is read on every call (default ``"data"``).
    """
    from pathlib import Path

    base_dir = os.environ.get("PAGEPIPER_DATA_DIR", "data")
    return str(Path(base_dir, "pagepiper_vecs.db"))
|
||||||
|
|
||||||
|
|
||||||
|
def _require_llm():
    """Return the configured LLMRouter.

    Raises:
        HTTPException: 402 with an ``ollama_required`` error payload when
            ``_get_llm_router()`` returns None.
    """
    router_instance = _get_llm_router()
    if router_instance is not None:
        return router_instance

    setup_hint = (
        "RAG chat requires Ollama. Set PAGEPIPER_OLLAMA_URL in your .env file, "
        "then restart. Run: ollama pull nomic-embed-text && ollama pull mistral:7b"
    )
    raise HTTPException(
        status_code=402,
        detail={"error": "ollama_required", "message": setup_hint},
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("")
def chat(req: ChatRequest) -> ChatResponse:
    """Answer a question from retrieved page chunks.

    Flow: require an LLM (402 otherwise), validate ``top_k``, run the hybrid
    retriever, then let the synthesizer produce an answer with citations.
    When retrieval returns nothing, a fixed "nothing found" answer is
    returned without calling the synthesizer.

    Raises:
        HTTPException: 402 when no LLM is configured; 422 when ``top_k`` is
            not a positive integer.
    """
    llm = _require_llm()

    # The retriever multiplies and slices by top_k; zero or negative values
    # would produce nonsensical result sets, so reject them up front.
    if req.top_k < 1:
        raise HTTPException(status_code=422, detail="top_k must be a positive integer")

    # Imported at call time rather than module import time.
    from app.main import _bm25

    retriever = Retriever(_bm25)
    chunks = retriever.hybrid_search(
        query=req.message,
        top_k=req.top_k,
        doc_ids=req.doc_ids,
        db_path=_get_db_path(),
        vec_db_path=_get_vec_db_path(),
        llm=llm,
    )

    if not chunks:
        return ChatResponse(
            answer=(
                "I couldn't find any relevant passages. "
                "Try a different query or check which documents are indexed."
            ),
            citations=[],
        )

    synth = Synthesizer(llm)
    result = synth.synthesize(
        message=req.message,
        history=[t.model_dump() for t in req.history],
        chunks=chunks,
    )

    return ChatResponse(
        answer=result.answer,
        citations=[
            {
                "doc_id": c.doc_id,
                "page_number": c.page_number,
                "snippet": c.snippet,
                "bm25_score": c.bm25_score,
            }
            for c in result.citations
        ],
    )
|
||||||
24
app/api/ingest.py
Normal file
24
app/api/ingest.py
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
# app/api/ingest.py
|
||||||
|
"""Ingest job status polling (proxies cf-orch or checks in-memory registry)."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/api/ingest", tags=["ingest"])
|
||||||
|
|
||||||
|
# Populated by _run_ingest_background when cf-orch is unavailable
|
||||||
|
_task_registry: dict[str, dict] = {}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{task_id}")
def get_task_status(task_id: str) -> dict:
    """Return the status dict for an ingest task.

    The in-memory registry (filled by the BackgroundTasks fallback) is
    consulted first; otherwise the lookup is delegated to cf-orch.

    Raises:
        HTTPException: 404 when the task is in neither place, when cf-orch is
            not installed, or when the cf-orch lookup fails. Lookup failures
            are logged so they are not indistinguishable from a missing task.
    """
    import logging

    # Check in-memory registry first (BackgroundTasks fallback)
    if task_id in _task_registry:
        return _task_registry[task_id]

    # Try cf-orch
    try:
        from circuitforge_core.tasks import get_task_status as orch_status  # type: ignore[import]
    except ImportError as exc:
        # cf-orch is optional; without it the registry is the only source.
        raise HTTPException(status_code=404, detail="Task not found") from exc

    try:
        return orch_status(task_id)
    except Exception as exc:
        # Still reported to the client as 404, but leave a trace for operators.
        logging.getLogger(__name__).warning(
            "cf-orch status lookup failed for task %s: %s", task_id, exc
        )
        raise HTTPException(status_code=404, detail="Task not found") from exc
|
||||||
179
app/api/library.py
Normal file
179
app/api/library.py
Normal file
|
|
@ -0,0 +1,179 @@
|
||||||
|
# app/api/library.py
|
||||||
|
"""
|
||||||
|
Document library management API.
|
||||||
|
|
||||||
|
All endpoints in this module are MIT — no tier gate.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sqlite3
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable
|
||||||
|
|
||||||
|
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException
|
||||||
|
|
||||||
|
from app.config import WATCH_DIR, DB_PATH, VEC_DB_PATH
|
||||||
|
from app.deps import get_db
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(prefix="/api/library", tags=["library"])
|
||||||
|
|
||||||
|
# Injected by main.py after _bm25 is created
|
||||||
|
_mark_bm25_dirty: Callable[[], None] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _dispatch_ingest(
    doc_id: str,
    file_path: str,
    background_tasks: BackgroundTasks,
) -> str:
    """Dispatch an ingest task. Tries cf-orch; falls back to BackgroundTasks.

    Args:
        doc_id: id of the ``documents`` row being ingested.
        file_path: path of the PDF to ingest.
        background_tasks: FastAPI task queue used when cf-orch dispatch fails.

    Returns:
        The task id: the one returned by cf-orch, or a freshly generated
        UUID string for the in-process fallback.
    """
    import os as _os
    from pathlib import Path as _Path

    # Read lazily so test fixtures (monkeypatch.setenv) take effect
    _data_dir = _Path(_os.environ.get("PAGEPIPER_DATA_DIR", "data"))
    args = {
        "doc_id": doc_id,
        "file_path": file_path,
        "db_path": str(_data_dir / "pagepiper.db"),
        "vec_db_path": str(_data_dir / "pagepiper_vecs.db"),
    }

    try:
        from circuitforge_core.tasks import dispatch_task  # type: ignore[import]

        task_id = dispatch_task(caller="pagepiper/ingest_pdf", args=args)
    except Exception:
        # Deliberate best-effort fallback, but keep the reason discoverable.
        logger.debug("cf-orch dispatch failed for doc %s", doc_id, exc_info=True)

        from scripts.ingest_pdf import run as run_ingest

        task_id = str(uuid.uuid4())
        background_tasks.add_task(_run_ingest_background, run_ingest, args, task_id)
        logger.info(
            "cf-orch unavailable — running ingest in background thread (task %s)", task_id
        )
    else:
        logger.info("Dispatched cf-orch ingest task %s for doc %s", task_id, doc_id)

    return task_id
|
||||||
|
|
||||||
|
|
||||||
|
def _run_ingest_background(run_fn: Callable[..., None], args: dict, task_id: str) -> None:
    """Run an ingest function in-process and track it in the task registry.

    The registry entry for ``task_id`` goes ``running`` -> ``complete`` (and
    the BM25 dirty callback fires, if one was injected) or ``error`` with the
    exception text. Exceptions are logged and never propagate.
    """
    from app.api.ingest import _task_registry as registry

    registry[task_id] = {"status": "running", "progress": 0}
    try:
        run_fn(**args)
        registry[task_id] = {"status": "complete", "progress": 100}
        notify_dirty = _mark_bm25_dirty
        if notify_dirty:
            notify_dirty()
    except Exception as failure:
        logger.exception("Ingest task %s failed", task_id)
        registry[task_id] = {"status": "error", "error": str(failure)}
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("")
def list_library(db: sqlite3.Connection = Depends(get_db)) -> list[dict]:
    """Return every row of ``documents`` as a dict, newest first."""
    listing_sql = (
        "SELECT id, title, file_path, status, task_id, page_count, created_at"
        " FROM documents ORDER BY created_at DESC"
    )
    cursor = db.execute(listing_sql)
    return list(map(dict, cursor.fetchall()))
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/scan", status_code=202)
def scan_library(
    background_tasks: BackgroundTasks,
    db: sqlite3.Connection = Depends(get_db),
) -> dict:
    """Scan the watched directory and queue ingest for any new PDFs.

    A PDF is skipped when its document row is already ``ready`` or is still
    ``processing``; the latter avoids dispatching a second ingest for a
    document that is already in flight when scans are triggered repeatedly.
    A document stuck in ``processing`` can still be retried through the
    reingest endpoint.

    Returns:
        ``{"discovered": <pdfs found>, "queued": <tasks started>, "tasks": [...]}``
        where each task entry holds ``doc_id`` and ``task_id``.

    Raises:
        HTTPException: 404 when the watch directory does not exist.
    """
    watch = WATCH_DIR
    if not watch.exists():
        raise HTTPException(status_code=404, detail=f"Watch directory not found: {watch}")

    pdfs = list(watch.glob("**/*.pdf"))
    queued = []

    for pdf_path in pdfs:
        path_str = str(pdf_path.resolve())
        existing = db.execute(
            "SELECT id, status FROM documents WHERE file_path = ?", [path_str]
        ).fetchone()

        if existing and existing["status"] in ("ready", "processing"):
            continue  # already indexed, or an ingest is already running

        if existing:
            doc_id = existing["id"]
        else:
            title = pdf_path.stem.replace("_", " ").replace("-", " ").title()
            doc_id = db.execute(
                "INSERT INTO documents(title, file_path, status) VALUES (?,?,?) RETURNING id",
                [title, path_str, "pending"],
            ).fetchone()[0]
            db.commit()

        task_id = _dispatch_ingest(doc_id, path_str, background_tasks)
        db.execute(
            "UPDATE documents SET status='processing', task_id=? WHERE id=?",
            [task_id, doc_id],
        )
        db.commit()
        queued.append({"doc_id": doc_id, "task_id": task_id})

    return {"discovered": len(pdfs), "queued": len(queued), "tasks": queued}
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("/{doc_id}/reingest", status_code=202)
def reingest_document(
    doc_id: str,
    background_tasks: BackgroundTasks,
    db: sqlite3.Connection = Depends(get_db),
) -> dict:
    """Queue a fresh ingest for an existing document.

    Dispatches the ingest, then marks the row ``processing`` with the new
    task id and clears ``error_msg``.

    Raises:
        HTTPException: 404 when no document has this id.
    """
    document = db.execute("SELECT file_path FROM documents WHERE id=?", [doc_id]).fetchone()
    if document is None:
        raise HTTPException(status_code=404, detail="Document not found")

    new_task_id = _dispatch_ingest(doc_id, document["file_path"], background_tasks)
    update_sql = "UPDATE documents SET status='processing', task_id=?, error_msg=NULL WHERE id=?"
    db.execute(update_sql, [new_task_id, doc_id])
    db.commit()
    return {"doc_id": doc_id, "task_id": new_task_id}
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete("/{doc_id}", status_code=204)
def delete_document(
    doc_id: str,
    db: sqlite3.Connection = Depends(get_db),
) -> None:
    """Delete a document row, its vectors, and invalidate the BM25 index.

    Vector removal is best-effort: any failure there is logged as a warning
    and does not fail the request.

    Raises:
        HTTPException: 404 when no document has this id.
    """
    found = db.execute("SELECT id FROM documents WHERE id=?", [doc_id]).fetchone()
    if found is None:
        raise HTTPException(status_code=404, detail="Document not found")

    db.execute("DELETE FROM documents WHERE id=?", [doc_id])
    db.commit()

    # Remove embeddings from vector store
    try:
        from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore  # type: ignore[import]

        vec_store = LocalSQLiteVecStore(db_path=VEC_DB_PATH, table="page_vecs", dimensions=768)
        vec_store.delete_where({"doc_id": doc_id})
    except Exception as failure:
        logger.warning("Could not remove vectors for doc %s: %s", doc_id, failure)

    notify_dirty = _mark_bm25_dirty
    if notify_dirty:
        notify_dirty()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/{doc_id}/status")
def document_status(
    doc_id: str,
    db: sqlite3.Connection = Depends(get_db),
) -> dict:
    """Return id, status, task_id, page_count and error_msg for one document.

    Raises:
        HTTPException: 404 when no document has this id.
    """
    status_sql = "SELECT id, status, task_id, page_count, error_msg FROM documents WHERE id=?"
    record = db.execute(status_sql, [doc_id]).fetchone()
    if record is None:
        raise HTTPException(status_code=404, detail="Document not found")
    return dict(record)
|
||||||
67
app/api/search.py
Normal file
67
app/api/search.py
Normal file
|
|
@ -0,0 +1,67 @@
|
||||||
|
# app/api/search.py
|
||||||
|
"""
|
||||||
|
BM25 keyword search across the document library.
|
||||||
|
|
||||||
|
MIT — no tier gate. No Ollama required.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Annotated
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from app.services.bm25_index import BM25Index
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
router = APIRouter(prefix="/api/search", tags=["search"])
|
||||||
|
|
||||||
|
|
||||||
|
# Request body for POST /api/search.
class SearchRequest(BaseModel):
    query: str  # free-text query handed to the BM25 index
    top_k: int = Field(default=10, ge=1, le=50)  # result cap, validated to 1..50
    doc_ids: list[str] | None = None  # forwarded to BM25Index.query as its doc_ids filter
|
||||||
|
|
||||||
|
|
||||||
|
# One hit returned by POST /api/search.
class SearchResult(BaseModel):
    chunk_id: str
    doc_id: str
    page_number: int
    text_snippet: str  # first 300 chars of the page text
    bm25_score: float  # score reported by the BM25 index for this hit
|
||||||
|
|
||||||
|
|
||||||
|
def _get_bm25() -> BM25Index:
    """Dependency provider: return the ``_bm25`` object defined in ``app.main``.

    Raises:
        RuntimeError: when ``app.main`` has no ``_bm25`` attribute or it is None.
    """
    import app.main as main_module

    try:
        index = main_module._bm25
    except AttributeError:
        index = None

    if index is None:
        raise RuntimeError("BM25 index not initialised — app.main not loaded")
    return index
|
||||||
|
|
||||||
|
|
||||||
|
def _get_db_path() -> str:
    """Return the path of ``pagepiper.db`` inside the data directory.

    PAGEPIPER_DATA_DIR is read on every call (default ``"data"``) so that
    environment changes made by test fixtures take effect.
    """
    from pathlib import Path

    base_dir = os.environ.get("PAGEPIPER_DATA_DIR", "data")
    return str(Path(base_dir, "pagepiper.db"))
|
||||||
|
|
||||||
|
|
||||||
|
@router.post("")
def search(
    req: SearchRequest,
    bm25: Annotated[BM25Index, Depends(_get_bm25)],
) -> list[SearchResult]:
    """Run a BM25 keyword search and return the hits as SearchResult models.

    The index is refreshed from the database first if it is marked dirty.
    Each snippet is the first 300 characters of the hit's text (empty when
    the text is missing).
    """
    bm25.ensure_fresh(_get_db_path())

    results: list[SearchResult] = []
    for hit in bm25.query(req.query, top_k=req.top_k, doc_ids=req.doc_ids):
        snippet = (hit.text or "")[:300]
        results.append(
            SearchResult(
                chunk_id=hit.chunk_id,
                doc_id=hit.doc_id,
                page_number=hit.page_number,
                text_snippet=snippet,
                bm25_score=hit.score,
            )
        )
    return results
|
||||||
19
app/deps.py
Normal file
19
app/deps.py
Normal file
|
|
@ -0,0 +1,19 @@
|
||||||
|
# app/deps.py
|
||||||
|
"""FastAPI dependency providers."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
from typing import Generator
|
||||||
|
|
||||||
|
from app.config import DB_PATH
|
||||||
|
|
||||||
|
|
||||||
|
def get_db() -> Generator[sqlite3.Connection, None, None]:
    """Yield a SQLite connection to DB_PATH for one request, then close it.

    The connection has foreign keys enabled, WAL journaling requested, and
    ``sqlite3.Row`` as row factory so columns can be read by name.

    Setup runs inside the ``try`` block so the connection is closed even if
    one of the PRAGMA statements raises.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        conn.execute("PRAGMA foreign_keys = ON")
        conn.execute("PRAGMA journal_mode = WAL")
        conn.row_factory = sqlite3.Row
        yield conn
    finally:
        conn.close()
|
||||||
46
app/main.py
Normal file
46
app/main.py
Normal file
|
|
@ -0,0 +1,46 @@
|
||||||
|
# app/main.py
|
||||||
|
"""FastAPI application factory for pagepiper."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
|
from fastapi import FastAPI
|
||||||
|
|
||||||
|
from app.config import DB_PATH
|
||||||
|
from app.services.bm25_index import BM25Index
|
||||||
|
|
||||||
|
logger = logging.getLogger("pagepiper")
|
||||||
|
|
||||||
|
# Module-level BM25 singleton — shared across all requests
|
||||||
|
_bm25 = BM25Index()
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_migrations() -> None:
    """Run ``scripts.db_migrate.migrate`` against DB_PATH.

    The import is deferred to call time.
    """
    import scripts.db_migrate as db_migrate

    db_migrate.migrate(DB_PATH)
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: startup work runs before the ``yield``.

    Nothing follows the ``yield``, so there is no shutdown work.
    """
    # Migrations run first, before the index is flagged for a rebuild.
    _apply_migrations()
    _bm25.mark_dirty()  # will rebuild on first search
    yield
|
||||||
|
|
||||||
|
|
||||||
|
app = FastAPI(title="Pagepiper", lifespan=lifespan)
|
||||||
|
|
||||||
|
# Wire BM25 dirty callback into library router
|
||||||
|
from app.api import library as _lib_module # noqa: E402
|
||||||
|
_lib_module._mark_bm25_dirty = _bm25.mark_dirty
|
||||||
|
|
||||||
|
# Register routers
|
||||||
|
from app.api.library import router as library_router # noqa: E402
|
||||||
|
from app.api.ingest import router as ingest_router # noqa: E402
|
||||||
|
from app.api.search import router as search_router # noqa: E402
|
||||||
|
from app.api.chat import router as chat_router # noqa: E402
|
||||||
|
|
||||||
|
app.include_router(library_router)
|
||||||
|
app.include_router(ingest_router)
|
||||||
|
app.include_router(search_router)
|
||||||
|
app.include_router(chat_router)
|
||||||
102
app/services/bm25_index.py
Normal file
102
app/services/bm25_index.py
Normal file
|
|
@ -0,0 +1,102 @@
|
||||||
|
"""
|
||||||
|
BM25 keyword search over the page_chunks corpus.
|
||||||
|
|
||||||
|
MIT — no tier gate. Available to all users with no Ollama required.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sqlite3
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from rank_bm25 import BM25Okapi
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class BM25Result:
    """A single BM25 search result."""

    chunk_id: str
    doc_id: str
    page_number: int
    text: str
    score: float


class BM25Index:
    """
    In-memory BM25 index over page_chunks. Rebuilt lazily on demand.

    Thread-safety note: rebuilt synchronously in the request thread. For
    single-user local deployments this is acceptable.
    """

    def __init__(self) -> None:
        self._index: BM25Okapi | None = None
        self._chunks: list[dict] = []
        self._dirty: bool = True

    def mark_dirty(self) -> None:
        """Signal that the index needs rebuilding (call after any ingest completes)."""
        self._dirty = True

    def ensure_fresh(self, db_path: str) -> None:
        """Rebuild from SQLite if dirty.

        On a SQLite error the failure is logged and the index stays dirty, so
        the next call retries the rebuild.
        """
        if not self._dirty:
            return
        try:
            conn = sqlite3.connect(db_path)
            conn.row_factory = sqlite3.Row
            try:
                rows = conn.execute(
                    "SELECT id, doc_id, page_number, text FROM page_chunks ORDER BY doc_id, page_number"
                ).fetchall()
            finally:
                conn.close()
        except sqlite3.Error as exc:
            logger.error("BM25 index rebuild failed: %s", exc)
            return
        self._load_chunks([dict(r) for r in rows])
        self._dirty = False
        logger.info("BM25 index rebuilt: %d chunks", len(self._chunks))

    def _load_chunks(self, chunks: list[dict]) -> None:
        """Replace the corpus with ``chunks`` and rebuild the BM25 model.

        Tokenization is lowercase whitespace splitting. A chunk whose text is
        NULL/None is indexed as an empty document instead of raising.
        """
        self._chunks = chunks
        tokenized = [(c["text"] or "").lower().split() for c in chunks]
        self._index = BM25Okapi(tokenized) if tokenized else None

    def query(
        self,
        query_text: str,
        top_k: int = 10,
        doc_ids: list[str] | None = None,
    ) -> list[BM25Result]:
        """Search the corpus. Returns results sorted by descending BM25 score.

        Args:
            query_text: free-text query; tokenized like the corpus.
            top_k: maximum number of results; values <= 0 yield no results.
            doc_ids: when not None, only chunks whose doc_id is listed are
                returned (an empty list therefore matches nothing).

        Chunks with a score <= 0 are never returned.
        """
        if self._index is None or not self._chunks or top_k <= 0:
            return []

        # Hash-based membership instead of a list scan per chunk.
        allowed = set(doc_ids) if doc_ids is not None else None

        scores = self._index.get_scores(query_text.lower().split())
        ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)

        results: list[BM25Result] = []
        for i, score in ranked:
            if score <= 0:
                continue
            c = self._chunks[i]
            if allowed is not None and c["doc_id"] not in allowed:
                continue
            results.append(
                BM25Result(
                    chunk_id=c["id"],
                    doc_id=c["doc_id"],
                    page_number=c["page_number"],
                    # Keep the declared ``str`` type even for NULL page text.
                    text=c["text"] or "",
                    score=float(score),
                )
            )
            if len(results) >= top_k:
                break
        return results
|
||||||
123
app/services/retriever.py
Normal file
123
app/services/retriever.py
Normal file
|
|
@ -0,0 +1,123 @@
|
||||||
|
# app/services/retriever.py
|
||||||
|
"""
|
||||||
|
Hybrid BM25 + semantic retriever.
|
||||||
|
|
||||||
|
BSL 1.1 — semantic path requires PAGEPIPER_OLLAMA_URL (BYOK gate).
|
||||||
|
BM25-only path is MIT and has no gate.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from app.services.bm25_index import BM25Index
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class RetrievedChunk:
    """A chunk returned by the retriever, with source scores."""

    chunk_id: str
    doc_id: str
    page_number: int
    text: str
    bm25_score: float
    vector_score: float | None  # None when the chunk was not a vector hit


class Retriever:
    """Combines BM25 keyword hits with vector-store hits for a query."""

    def __init__(self, bm25: BM25Index) -> None:
        self._bm25 = bm25

    def hybrid_search(
        self,
        query: str,
        top_k: int,
        doc_ids: list[str] | None,
        db_path: str,
        vec_db_path: str,
        llm,  # LLMRouter | None — caller must pass
    ) -> list[RetrievedChunk]:
        """
        Merge BM25 and semantic results.

        Falls back to BM25-only if llm is None.

        Args:
            query: free-text query.
            top_k: maximum number of chunks returned; values <= 0 return [].
            doc_ids: optional document filter. None means "all documents";
                an empty list matches nothing on both the BM25 and the vector
                side.
            db_path: SQLite file used to refresh the BM25 index.
            vec_db_path: SQLite file holding the ``page_vecs`` vector table.
            llm: object providing ``embed([text]) -> [vector]``, or None.

        Returns:
            Up to ``top_k`` chunks ordered by the combined score, highest first.
        """
        if top_k <= 0:
            return []
        if doc_ids is not None and not doc_ids:
            # An explicit empty filter matches nothing; skip the embedding call.
            return []

        if llm is None:
            return self._bm25_only(query, top_k, doc_ids, db_path)

        from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore

        self._bm25.ensure_fresh(db_path)
        bm25_hits = {
            r.chunk_id: r
            for r in self._bm25.query(query, top_k=top_k * 2, doc_ids=doc_ids)
        }

        vec = llm.embed([query])[0]
        store = LocalSQLiteVecStore(db_path=vec_db_path, table="page_vecs", dimensions=768)
        # A single doc id is passed to the store as a metadata filter; several
        # ids are filtered here after the query.
        filter_meta = {"doc_id": doc_ids[0]} if doc_ids and len(doc_ids) == 1 else None
        vec_hits = store.query(vec, top_k=top_k * 2, filter_metadata=filter_meta)

        if doc_ids and len(doc_ids) > 1:
            allowed = set(doc_ids)
            vec_hits = [h for h in vec_hits if h.metadata.get("doc_id") in allowed]

        # Merge: BM25 hits take priority; vector hits fill in additional results
        merged: dict[str, RetrievedChunk] = {
            cid: RetrievedChunk(
                chunk_id=cid,
                doc_id=r.doc_id,
                page_number=r.page_number,
                text=r.text,
                bm25_score=r.score,
                vector_score=None,
            )
            for cid, r in bm25_hits.items()
        }

        # Built lazily, once, and only if a vector-only hit needs its text.
        text_by_id: dict | None = None
        for vh in vec_hits:
            existing = merged.get(vh.id)
            if existing is not None:
                merged[vh.id] = RetrievedChunk(
                    chunk_id=existing.chunk_id,
                    doc_id=existing.doc_id,
                    page_number=existing.page_number,
                    text=existing.text,
                    bm25_score=existing.bm25_score,
                    vector_score=vh.score,
                )
                continue

            if text_by_id is None:
                text_by_id = {}
                # _chunks is the loaded list of dicts from BM25Index; no public accessor exists
                for c in self._bm25._chunks:
                    text_by_id.setdefault(c["id"], c["text"] or "")
            merged[vh.id] = RetrievedChunk(
                chunk_id=vh.id,
                doc_id=vh.metadata.get("doc_id", ""),
                page_number=int(vh.metadata.get("page_number", 0)),
                text=text_by_id.get(vh.id, ""),
                bm25_score=0.0,
                vector_score=vh.score,
            )

        def _combined(r: RetrievedChunk) -> float:
            bm25 = r.bm25_score
            # sqlite-vec returns L2 distance (lower=better); invert to [0,1] higher-is-better
            vec = (1.0 / (1.0 + r.vector_score)) if r.vector_score is not None else 0.0
            return bm25 * 0.5 + vec * 0.5

        ranked = sorted(merged.values(), key=_combined, reverse=True)
        return ranked[:top_k]

    def _bm25_only(
        self, query: str, top_k: int, doc_ids: list[str] | None, db_path: str
    ) -> list[RetrievedChunk]:
        """Return BM25 hits only, wrapped as RetrievedChunk with no vector score."""
        self._bm25.ensure_fresh(db_path)
        return [
            RetrievedChunk(
                chunk_id=r.chunk_id,
                doc_id=r.doc_id,
                page_number=r.page_number,
                text=r.text,
                bm25_score=r.score,
                vector_score=None,
            )
            for r in self._bm25.query(query, top_k=top_k, doc_ids=doc_ids)
        ]
|
||||||
60
app/services/synthesizer.py
Normal file
60
app/services/synthesizer.py
Normal file
|
|
@ -0,0 +1,60 @@
|
||||||
|
# app/services/synthesizer.py
|
||||||
|
"""
|
||||||
|
LLM answer synthesis over retrieved chunks.
|
||||||
|
|
||||||
|
BSL 1.1 — requires LLMRouter (Ollama BYOK or cloud tier).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
from app.services.retriever import RetrievedChunk
|
||||||
|
|
||||||
|
_SYSTEM_PROMPT = (
    "You are a helpful document assistant. "
    "Answer the user's question using ONLY the provided document excerpts. "
    "For each claim, cite the source page as [p.N]. "
    "If the excerpts are insufficient, say so. Do not invent information."
)


@dataclass(frozen=True)
class Citation:
    """Source reference for one chunk that was shown to the LLM."""

    doc_id: str
    page_number: int
    snippet: str  # first 200 characters of the chunk text
    bm25_score: float


@dataclass(frozen=True)
class SynthesisResult:
    """LLM answer plus one Citation per chunk supplied to the prompt."""

    answer: str
    citations: tuple[Citation, ...]


class Synthesizer:
    """Builds a prompt from retrieved chunks and asks the LLM for an answer."""

    # Only the most recent turns are replayed so the prompt stays bounded.
    MAX_HISTORY_TURNS = 10

    def __init__(self, llm) -> None:  # LLMRouter
        self._llm = llm

    @staticmethod
    def _format_history(history: list[dict], limit: int) -> str:
        """Render the last ``limit`` turns as ``role: content`` lines."""
        lines = []
        for turn in (history or [])[-limit:]:
            content = (turn.get("content") or "").strip()
            if content:
                lines.append(f"{turn.get('role', 'user')}: {content}")
        return "\n".join(lines)

    def synthesize(
        self,
        message: str,
        history: list[dict],
        chunks: list[RetrievedChunk],
    ) -> SynthesisResult:
        """Ask the LLM to answer ``message`` from the given chunks.

        Args:
            message: the user's current question.
            history: earlier turns as dicts with ``role`` and ``content``;
                the most recent MAX_HISTORY_TURNS are included in the prompt
                so follow-up questions keep their context. With an empty
                history the prompt contains only excerpts and question.
            chunks: retrieved chunks; each contributes up to 500 characters
                of text to the prompt, labelled ``[p.<page_number>]``.

        Returns:
            The LLM's answer and one citation per chunk, in chunk order.
        """
        # Missing chunk text is treated as empty instead of raising on the slice.
        context = "\n\n---\n\n".join(
            f"[p.{c.page_number}]\n{(c.text or '')[:500]}" for c in chunks
        )

        sections = [f"Document excerpts:\n\n{context}"]
        transcript = self._format_history(history, self.MAX_HISTORY_TURNS)
        if transcript:
            sections.append(f"Conversation so far:\n{transcript}")
        sections.append(f"Question: {message}")
        prompt = "\n\n".join(sections)

        answer = self._llm.complete(prompt, system=_SYSTEM_PROMPT)

        citations = tuple(
            Citation(
                doc_id=c.doc_id,
                page_number=c.page_number,
                snippet=(c.text or "")[:200],
                bm25_score=c.bm25_score,
            )
            for c in chunks
        )
        return SynthesisResult(answer=answer, citations=citations)
|
||||||
51
compose.cloud.yml
Normal file
51
compose.cloud.yml
Normal file
|
|
@ -0,0 +1,51 @@
|
||||||
|
# Pagepiper — cloud managed instance
|
||||||
|
# Project: pagepiper-cloud (docker compose -f compose.cloud.yml -p pagepiper-cloud ...)
|
||||||
|
# Web: http://127.0.0.1:8533 → pagepiper.circuitforge.tech (primary)
|
||||||
|
# → menagerie.circuitforge.tech/pagepiper (secondary)
|
||||||
|
# API: internal only on pagepiper-cloud-net (nginx proxies /api/ → api:8522)
|
||||||
|
|
||||||
|
services:
|
||||||
|
api:
|
||||||
|
build:
|
||||||
|
context: ..
|
||||||
|
dockerfile: pagepiper/Dockerfile
|
||||||
|
restart: unless-stopped
|
||||||
|
env_file: .env
|
||||||
|
environment:
|
||||||
|
CLOUD_MODE: "true"
|
||||||
|
PAGEPIPER_DATA_DIR: /devl/pagepiper-cloud-data
|
||||||
|
PAGEPIPER_BOOKS_DIR: /devl/pagepiper-cloud-data/books
|
||||||
|
# PAGEPIPER_OLLAMA_URL — set in .env (BYOK gate for hybrid search + RAG)
|
||||||
|
# HEIMDALL_URL, HEIMDALL_ADMIN_TOKEN — set in .env for license validation
|
||||||
|
# cf-orch: route LLM inference through coordinator for managed GPU access
|
||||||
|
CF_ORCH_URL: http://host.docker.internal:7700
|
||||||
|
CF_APP_NAME: pagepiper
|
||||||
|
COORDINATOR_URL: http://10.1.10.71:7700
|
||||||
|
COORDINATOR_PAGEPIPER_KEY: ${COORDINATOR_PAGEPIPER_KEY:-}
|
||||||
|
extra_hosts:
|
||||||
|
- "host.docker.internal:host-gateway"
|
||||||
|
volumes:
|
||||||
|
- /devl/pagepiper-cloud-data:/devl/pagepiper-cloud-data
|
||||||
|
- ${HOME}/.config/circuitforge:/root/.config/circuitforge:ro
|
||||||
|
networks:
|
||||||
|
- pagepiper-cloud-net
|
||||||
|
|
||||||
|
web:
|
||||||
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: docker/web/Dockerfile
|
||||||
|
args:
|
||||||
|
VITE_BASE_URL: /pagepiper
|
||||||
|
VITE_API_BASE: /pagepiper
|
||||||
|
NGINX_CONF: docker/web/nginx.cloud.conf
|
||||||
|
restart: unless-stopped
|
||||||
|
ports:
|
||||||
|
- "8533:80"
|
||||||
|
networks:
|
||||||
|
- pagepiper-cloud-net
|
||||||
|
depends_on:
|
||||||
|
- api
|
||||||
|
|
||||||
|
networks:
|
||||||
|
pagepiper-cloud-net:
|
||||||
|
driver: bridge
|
||||||
|
|
@ -14,6 +14,7 @@ RUN npm run build
|
||||||
|
|
||||||
# Stage 2: serve via nginx
|
# Stage 2: serve via nginx
|
||||||
FROM nginx:alpine
|
FROM nginx:alpine
|
||||||
COPY docker/web/nginx.conf /etc/nginx/conf.d/default.conf
|
ARG NGINX_CONF=docker/web/nginx.conf
|
||||||
|
COPY ${NGINX_CONF} /etc/nginx/conf.d/default.conf
|
||||||
COPY --from=build /app/dist /usr/share/nginx/html
|
COPY --from=build /app/dist /usr/share/nginx/html
|
||||||
EXPOSE 80
|
EXPOSE 80
|
||||||
|
|
|
||||||
59
docker/web/nginx.cloud.conf
Normal file
59
docker/web/nginx.cloud.conf
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
server {
|
||||||
|
listen 80;
|
||||||
|
server_name _;
|
||||||
|
|
||||||
|
root /usr/share/nginx/html;
|
||||||
|
index index.html;
|
||||||
|
|
||||||
|
# API requests when accessed via Caddy (prefix already stripped by handle_path)
|
||||||
|
location /api/ {
|
||||||
|
proxy_pass http://api:8522;
|
||||||
|
proxy_set_header Host $http_host;
|
||||||
|
proxy_set_header X-Real-IP $http_x_real_ip;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $http_x_forwarded_proto;
|
||||||
|
proxy_set_header X-CF-Session $http_x_cf_session;
|
||||||
|
client_max_body_size 50m;
|
||||||
|
# PDF uploads and LLM inference can be slow
|
||||||
|
proxy_read_timeout 300s;
|
||||||
|
proxy_send_timeout 300s;
|
||||||
|
}
|
||||||
|
|
||||||
|
# API requests when accessed directly via pagepiper.circuitforge.tech
|
||||||
|
# VITE_API_BASE=/pagepiper means frontend builds calls as /pagepiper/api/...
|
||||||
|
# Caddy passes these unchanged; nginx strips /pagepiper prefix here.
|
||||||
|
location /pagepiper/api/ {
|
||||||
|
rewrite ^/pagepiper(/api/.*)$ $1 break;
|
||||||
|
proxy_pass http://api:8522;
|
||||||
|
proxy_set_header Host $http_host;
|
||||||
|
proxy_set_header X-Real-IP $http_x_real_ip;
|
||||||
|
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||||
|
proxy_set_header X-Forwarded-Proto $http_x_forwarded_proto;
|
||||||
|
proxy_set_header X-CF-Session $http_x_cf_session;
|
||||||
|
client_max_body_size 50m;
|
||||||
|
proxy_read_timeout 300s;
|
||||||
|
proxy_send_timeout 300s;
|
||||||
|
}
|
||||||
|
|
||||||
|
# Static assets at the /pagepiper/ base — used when Caddy does NOT strip the prefix
|
||||||
|
# (i.e., pagepiper.circuitforge.tech routes, where /pagepiper is the Vite base URL).
|
||||||
|
# ^~ prevents regex asset location from matching first.
|
||||||
|
location ^~ /pagepiper/ {
|
||||||
|
alias /usr/share/nginx/html/;
|
||||||
|
try_files $uri $uri/ /index.html;
|
||||||
|
}
|
||||||
|
|
||||||
|
location = /index.html {
|
||||||
|
add_header Cache-Control "no-cache, no-store, must-revalidate";
|
||||||
|
try_files $uri /index.html;
|
||||||
|
}
|
||||||
|
|
||||||
|
location / {
|
||||||
|
try_files $uri $uri/ /index.html;
|
||||||
|
}
|
||||||
|
|
||||||
|
location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff2?)$ {
|
||||||
|
expires 1y;
|
||||||
|
add_header Cache-Control "public, immutable";
|
||||||
|
}
|
||||||
|
}
|
||||||
27
manage.sh
27
manage.sh
|
|
@ -3,13 +3,17 @@ set -euo pipefail
|
||||||
|
|
||||||
SERVICE=pagepiper
|
SERVICE=pagepiper
|
||||||
WEB_PORT=8521
|
WEB_PORT=8521
|
||||||
|
CLOUD_WEB_PORT=8533
|
||||||
COMPOSE_FILE="compose.yml"
|
COMPOSE_FILE="compose.yml"
|
||||||
|
COMPOSE_CLOUD_FILE="compose.cloud.yml"
|
||||||
|
CLOUD_PROJECT="pagepiper-cloud"
|
||||||
|
|
||||||
OVERRIDE_ARGS=()
|
OVERRIDE_ARGS=()
|
||||||
[[ -f "compose.override.yml" ]] && OVERRIDE_ARGS=(-f compose.override.yml)
|
[[ -f "compose.override.yml" ]] && OVERRIDE_ARGS=(-f compose.override.yml)
|
||||||
|
|
||||||
usage() {
|
usage() {
|
||||||
echo "Usage: $0 {start|stop|restart|status|logs [svc]|open|build|test}"
|
echo "Usage: $0 {start|stop|restart|status|logs [svc]|open|build|test"
|
||||||
|
echo " |cloud:start|cloud:stop|cloud:restart|cloud:status|cloud:logs [svc]|cloud:build}"
|
||||||
exit 1
|
exit 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -44,6 +48,27 @@ case "$cmd" in
|
||||||
test)
|
test)
|
||||||
conda run -n cf pytest tests/ -v
|
conda run -n cf pytest tests/ -v
|
||||||
;;
|
;;
|
||||||
|
cloud:start)
|
||||||
|
docker compose -f "$COMPOSE_CLOUD_FILE" -p "$CLOUD_PROJECT" up -d --build
|
||||||
|
echo "Pagepiper cloud running → http://localhost:${CLOUD_WEB_PORT}"
|
||||||
|
;;
|
||||||
|
cloud:stop)
|
||||||
|
docker compose -f "$COMPOSE_CLOUD_FILE" -p "$CLOUD_PROJECT" down
|
||||||
|
;;
|
||||||
|
cloud:restart)
|
||||||
|
docker compose -f "$COMPOSE_CLOUD_FILE" -p "$CLOUD_PROJECT" down
|
||||||
|
docker compose -f "$COMPOSE_CLOUD_FILE" -p "$CLOUD_PROJECT" up -d --build
|
||||||
|
echo "Pagepiper cloud running → http://localhost:${CLOUD_WEB_PORT}"
|
||||||
|
;;
|
||||||
|
cloud:status)
|
||||||
|
docker compose -f "$COMPOSE_CLOUD_FILE" -p "$CLOUD_PROJECT" ps
|
||||||
|
;;
|
||||||
|
cloud:logs)
|
||||||
|
docker compose -f "$COMPOSE_CLOUD_FILE" -p "$CLOUD_PROJECT" logs -f "${1:-}"
|
||||||
|
;;
|
||||||
|
cloud:build)
|
||||||
|
docker compose -f "$COMPOSE_CLOUD_FILE" -p "$CLOUD_PROJECT" build --no-cache
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
usage
|
usage
|
||||||
;;
|
;;
|
||||||
|
|
|
||||||
154
scripts/ingest_pdf.py
Normal file
154
scripts/ingest_pdf.py
Normal file
|
|
@ -0,0 +1,154 @@
|
||||||
|
# scripts/ingest_pdf.py
|
||||||
|
"""
|
||||||
|
cf-orch task: pagepiper/ingest_pdf
|
||||||
|
|
||||||
|
Extracts text from a PDF, stores page chunks in SQLite, and (if Ollama is
|
||||||
|
configured) generates embeddings and stores them in the sqlite-vec store.
|
||||||
|
|
||||||
|
Entry point:
|
||||||
|
python scripts/ingest_pdf.py --doc-id X --file-path Y --db-path Z --vec-db-path W
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
logger = logging.getLogger("pagepiper.ingest")
|
||||||
|
|
||||||
|
# Pages to embed per Ollama API call — avoids hitting request size limits on large PDFs
|
||||||
|
EMBED_BATCH_SIZE = 64
|
||||||
|
|
||||||
|
|
||||||
|
def _update_status(
|
||||||
|
conn: sqlite3.Connection,
|
||||||
|
doc_id: str,
|
||||||
|
status: str,
|
||||||
|
page_count: int | None = None,
|
||||||
|
error_msg: str | None = None,
|
||||||
|
) -> None:
|
||||||
|
if page_count is not None:
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE documents SET status=?, page_count=?, updated_at=datetime('now') WHERE id=?",
|
||||||
|
[status, page_count, doc_id],
|
||||||
|
)
|
||||||
|
elif error_msg is not None:
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE documents SET status=?, error_msg=?, updated_at=datetime('now') WHERE id=?",
|
||||||
|
[status, error_msg, doc_id],
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE documents SET status=?, updated_at=datetime('now') WHERE id=?",
|
||||||
|
[status, doc_id],
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
|
def run(doc_id: str, file_path: str, db_path: str, vec_db_path: str) -> None:
    """Run the full ingest pipeline for one PDF. Called by cf-orch or BackgroundTasks.

    Marks the document ``processing``, extracts page chunks, replaces the
    document's rows in ``page_chunks``, optionally embeds the page texts and
    stores the vectors, then marks the document ``ready``.

    Args:
        doc_id: Id of the ``documents`` row being ingested.
        file_path: Path of the PDF handed to the extractor.
        db_path: SQLite database holding ``documents`` and ``page_chunks``.
        vec_db_path: SQLite database used by the vector store; only used when
            ``PAGEPIPER_OLLAMA_URL`` is set and the PDF produced chunks.

    Raises:
        Exception: Any failure is logged, recorded on the document as status
            ``error`` (best effort) and re-raised.
    """
    from circuitforge_core.documents.pdf import PDFExtractor

    conn: sqlite3.Connection | None = None
    try:
        conn = sqlite3.connect(db_path)
        conn.execute("PRAGMA foreign_keys = ON")
        _update_status(conn, doc_id, "processing")

        # Step 1: Extract page chunks
        logger.info("Extracting text from %s", file_path)
        extractor = PDFExtractor(ocr_min_words=10)
        chunks = extractor.chunk_pages(file_path)
        logger.info("Extracted %d pages", len(chunks))

        # Step 2: Store chunks (replace any existing for this doc).
        # The DELETE and all INSERTs are committed together below.
        conn.execute("DELETE FROM page_chunks WHERE doc_id=?", [doc_id])
        chunk_rows: list[tuple[str, int, str]] = []
        for chunk in chunks:
            row = conn.execute(
                """INSERT INTO page_chunks(doc_id, page_number, text, source, word_count)
                   VALUES (?,?,?,?,?) RETURNING id""",
                [doc_id, chunk.page_number, chunk.text, chunk.source, chunk.word_count],
            ).fetchone()
            chunk_rows.append((row[0], chunk.page_number, chunk.text))
        conn.commit()

        # Step 3: Embed and store vectors if Ollama is configured (BYOK gate)
        ollama_url = os.environ.get("PAGEPIPER_OLLAMA_URL", "").strip()
        if ollama_url and chunks:
            logger.info("Embedding %d pages via Ollama at %s", len(chunks), ollama_url)
            from circuitforge_core.llm import LLMRouter
            from circuitforge_core.vector.sqlite_vec import LocalSQLiteVecStore

            # Accept the URL with or without a trailing "/v1".
            _clean = ollama_url.rstrip("/")
            base_url = _clean if _clean.endswith("/v1") else _clean + "/v1"
            router = LLMRouter({
                "fallback_order": ["ollama"],
                "backends": {
                    "ollama": {
                        "type": "openai_compat",
                        "base_url": base_url,
                        "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"),
                        "embedding_model": os.environ.get(
                            "PAGEPIPER_EMBED_MODEL", "nomic-embed-text"
                        ),
                        "supports_images": False,
                    }
                },
            })
            vec_store = LocalSQLiteVecStore(
                db_path=vec_db_path, table="page_vecs", dimensions=768
            )
            # Remove old vectors before re-inserting. If embedding fails mid-way,
            # old vectors are gone but new ones are partial — re-ingest recovers.
            vec_store.delete_where({"doc_id": doc_id})

            texts = [text for _, _, text in chunk_rows]
            vectors: list[list[float]] = []
            for i in range(0, len(texts), EMBED_BATCH_SIZE):
                vectors.extend(router.embed(texts[i : i + EMBED_BATCH_SIZE]))

            # zip() below would silently drop pages if the backend returned
            # too few vectors, leaving the document "ready" but unsearchable.
            if len(vectors) != len(chunk_rows):
                raise RuntimeError(
                    f"Embedding backend returned {len(vectors)} vectors "
                    f"for {len(chunk_rows)} pages"
                )

            for (chunk_id, page_number, _), vector in zip(chunk_rows, vectors):
                vec_store.upsert(
                    id=chunk_id,
                    vector=vector,
                    metadata={"doc_id": doc_id, "page_number": page_number},
                )
            logger.info("Stored %d embeddings", len(vectors))

        _update_status(conn, doc_id, "ready", page_count=len(chunks))
        logger.info("Ingest complete for doc %s (%d pages)", doc_id, len(chunks))

    except Exception as exc:
        logger.error("Ingest failed for doc %s: %s", doc_id, exc, exc_info=True)
        if conn is not None:
            try:
                # Discard uncommitted chunk changes first: _update_status
                # commits, which would otherwise persist a half-replaced set
                # of page_chunks together with the error status.
                conn.rollback()
                _update_status(conn, doc_id, "error", error_msg=str(exc))
            except Exception:
                logger.warning("Could not write error status for doc %s", doc_id)
        raise
    finally:
        if conn is not None:
            conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import argparse

    logging.basicConfig(level=logging.INFO)

    cli = argparse.ArgumentParser(
        description="Ingest a PDF (cf-orch task entry point)"
    )
    # All four options are mandatory and map one-to-one onto run()'s parameters.
    for flag in ("--doc-id", "--file-path", "--db-path", "--vec-db-path"):
        cli.add_argument(flag, required=True)
    opts = cli.parse_args()

    run(
        doc_id=opts.doc_id,
        file_path=opts.file_path,
        db_path=opts.db_path,
        vec_db_path=opts.vec_db_path,
    )
|
||||||
48
tests/conftest.py
Normal file
48
tests/conftest.py
Normal file
|
|
@ -0,0 +1,48 @@
|
||||||
|
# tests/conftest.py
|
||||||
|
"""Shared fixtures for pagepiper test suite."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def test_db(tmp_path) -> str:
    """Create a temporary SQLite database with the initial schema and return its path."""
    schema_sql = Path("migrations/001_initial_schema.sql").read_text()
    db_path = str(tmp_path / "test.db")

    connection = sqlite3.connect(db_path)
    connection.executescript(schema_sql)
    connection.commit()
    connection.close()
    return db_path
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def client(test_db, tmp_path, monkeypatch):
    """Yield a TestClient whose ``get_db`` dependency points at the temporary test database."""
    books_dir = tmp_path / "books"
    monkeypatch.setenv("PAGEPIPER_DATA_DIR", str(tmp_path))
    monkeypatch.setenv("PAGEPIPER_WATCH_DIR", str(books_dir))
    books_dir.mkdir(exist_ok=True)

    # Imported only after the environment variables above have been set.
    import app.main as _main_module
    from app.deps import get_db

    fastapi_app = _main_module.app
    bm25_index = _main_module._bm25

    # Suppress migrations during tests — test_db fixture already applies the schema
    monkeypatch.setattr(_main_module, "_apply_migrations", lambda: None)

    def override_db():
        # One fresh connection per dependency resolution, always closed afterwards.
        db = sqlite3.connect(test_db)
        db.execute("PRAGMA foreign_keys = ON")
        db.row_factory = sqlite3.Row
        try:
            yield db
        finally:
            db.close()

    fastapi_app.dependency_overrides[get_db] = override_db
    bm25_index.mark_dirty()  # clear any state from previous tests
    yield TestClient(fastapi_app)
    fastapi_app.dependency_overrides.clear()
|
||||||
96
tests/test_bm25_index.py
Normal file
96
tests/test_bm25_index.py
Normal file
|
|
@ -0,0 +1,96 @@
|
||||||
|
"""Tests for app.services.bm25_index."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.services.bm25_index import BM25Index, BM25Result
|
||||||
|
|
||||||
|
|
||||||
|
def _seeded_index() -> BM25Index:
    """Return a BM25Index loaded with three chunks spread over two books."""
    seed_chunks = [
        {
            "id": "c1",
            "doc_id": "book-a",
            "page_number": 1,
            "text": "Fireball deals 8d6 fire damage on a failed Dexterity saving throw.",
        },
        {
            "id": "c2",
            "doc_id": "book-a",
            "page_number": 2,
            "text": "A wizard can cast one spell per turn unless they have Action Surge.",
        },
        {
            "id": "c3",
            "doc_id": "book-b",
            "page_number": 5,
            "text": "Grapple rules apply when the attacker uses the Attack action to grab a target.",
        },
    ]
    index = BM25Index()
    index._load_chunks(seed_chunks)
    return index
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_returns_relevant_result():
    """The Fireball chunk ranks first, with a positive score, for a fire-damage query."""
    hits = _seeded_index().query("fireball fire damage")
    assert len(hits) >= 1
    best = hits[0]
    assert best.chunk_id == "c1"
    assert best.score > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_respects_top_k():
    """``top_k`` caps the number of results returned."""
    # "action" matches all three chunks; top_k=2 must hard-cap the result list
    hits = _seeded_index().query("action", top_k=2)
    assert len(hits) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_filters_by_doc_id():
    """Results are restricted to the documents listed in ``doc_ids``."""
    hits = _seeded_index().query("rules", doc_ids=["book-b"])
    for hit in hits:
        assert hit.doc_id == "book-b"
|
||||||
|
|
||||||
|
|
||||||
|
def test_query_empty_corpus_returns_empty():
    """Querying an index loaded with no chunks yields an empty list."""
    empty_index = BM25Index()
    empty_index._load_chunks([])
    assert empty_index.query("anything") == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_mark_dirty_triggers_rebuild(tmp_path):
    """After ``mark_dirty``, ``ensure_fresh`` loads the chunks stored in the database."""
    import sqlite3

    db_path = str(tmp_path / "test.db")
    setup_statements = (
        "CREATE TABLE page_chunks(id TEXT, doc_id TEXT, page_number INT, text TEXT)",
        "INSERT INTO page_chunks VALUES ('x1','doc-1',1,'Ranger favored enemy favored terrain terrain bonuses bonuses action attack')",
        "INSERT INTO page_chunks VALUES ('x2','doc-1',2,'Wizard can cast spells and perform actions')",
        "INSERT INTO page_chunks VALUES ('x3','doc-1',3,'Fighter attacks and deals damage with weapon')",
    )
    db = sqlite3.connect(db_path)
    for statement in setup_statements:
        db.execute(statement)
    db.commit()
    db.close()

    index = BM25Index()
    index.mark_dirty()
    index.ensure_fresh(db_path)

    hits = index.query("ranger terrain")
    assert len(hits) >= 1
    assert hits[0].chunk_id == "x1"
|
||||||
|
|
||||||
|
|
||||||
|
def test_bm25_result_is_frozen():
    """Assigning to a field of a BM25Result raises."""
    result = BM25Result(
        chunk_id="x", doc_id="d", page_number=1, text="hello", score=0.5
    )
    with pytest.raises(Exception):
        result.score = 1.0  # type: ignore[misc]
|
||||||
59
tests/test_chat_api.py
Normal file
59
tests/test_chat_api.py
Normal file
|
|
@ -0,0 +1,59 @@
|
||||||
|
# tests/test_chat_api.py
|
||||||
|
"""Tests for POST /api/chat — RAG chat (BSL, BYOK gate)."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
from app.services.retriever import RetrievedChunk
|
||||||
|
|
||||||
|
|
||||||
|
def test_chat_returns_402_without_ollama(client, monkeypatch):
    """Without PAGEPIPER_OLLAMA_URL the chat endpoint answers 402 and mentions Ollama."""
    monkeypatch.delenv("PAGEPIPER_OLLAMA_URL", raising=False)
    payload = {"message": "How does Fireball work?", "history": []}
    response = client.post("/api/chat", json=payload)
    assert response.status_code == 402
    data = response.json()
    assert "detail" in data
    assert "Ollama" in data["detail"]["message"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_chat_returns_answer_with_mocked_ollama(client, test_db, monkeypatch):
    """With retrieval and the LLM router patched, /api/chat returns an answer plus citations."""
    monkeypatch.setenv("PAGEPIPER_OLLAMA_URL", "http://localhost:11434")

    # Seed one ready document with a single page chunk.
    db = sqlite3.connect(test_db)
    db.execute(
        "INSERT OR IGNORE INTO documents(id, title, file_path, status) VALUES ('b1','PHB','phb.pdf','ready')"
    )
    db.execute(
        "INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) "
        "VALUES ('b1',15,'Fireball deals 8d6 fire damage.','text_layer',6)"
    )
    db.commit()
    db.close()

    fake_llm = MagicMock()
    fake_llm.complete.return_value = "Fireball deals 8d6 fire damage [p.15]."

    retrieved = [
        RetrievedChunk(
            chunk_id="c1",
            doc_id="b1",
            page_number=15,
            text="Fireball deals 8d6 fire damage.",
            bm25_score=1.0,
            vector_score=None,
        )
    ]
    payload = {"message": "How does Fireball work?", "history": [], "doc_ids": ["b1"]}

    with patch("app.api.chat.Retriever.hybrid_search", return_value=retrieved), \
            patch("app.api.chat._get_llm_router", return_value=fake_llm):
        response = client.post("/api/chat", json=payload)

    assert response.status_code == 200
    data = response.json()
    assert "answer" in data
    assert "citations" in data
    assert "Fireball" in data["answer"]
|
||||||
138
tests/test_ingest.py
Normal file
138
tests/test_ingest.py
Normal file
|
|
@ -0,0 +1,138 @@
|
||||||
|
# tests/test_ingest.py
|
||||||
|
"""Unit tests for scripts/ingest_pdf.py."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from scripts.ingest_pdf import run
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def ingest_db(tmp_path) -> tuple[str, str]:
    """Return ``(db_path, vec_db_path)``: a schema-initialised DB holding pending doc 'd1', and a vec DB path that is not created here."""
    schema_sql = Path("migrations/001_initial_schema.sql").read_text()
    db_path = str(tmp_path / "test.db")

    db = sqlite3.connect(db_path)
    db.executescript(schema_sql)
    db.execute(
        "INSERT INTO documents(id, title, file_path, status) VALUES ('d1','Test','test.pdf','pending')"
    )
    db.commit()
    db.close()

    return db_path, str(tmp_path / "vecs.db")
|
||||||
|
|
||||||
|
|
||||||
|
def _make_mock_chunk(page_number: int = 1, text: str = "Some page text about rules.") -> MagicMock:
    """Build a stand-in page chunk exposing the four attributes the ingest code reads."""
    return MagicMock(
        page_number=page_number,
        text=text,
        source="text_layer",
        word_count=len(text.split()),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_sets_status_ready_on_success(ingest_db):
    """A successful single-page ingest leaves the document 'ready' with page_count 1."""
    db_path, vec_db_path = ingest_db

    extractor = MagicMock()
    extractor.chunk_pages.return_value = [_make_mock_chunk()]

    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path)

    db = sqlite3.connect(db_path)
    status, page_count = db.execute(
        "SELECT status, page_count FROM documents WHERE id='d1'"
    ).fetchone()
    db.close()
    assert status == "ready"
    assert page_count == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_stores_page_chunks(ingest_db):
    """Every extracted page ends up as a row in page_chunks."""
    db_path, vec_db_path = ingest_db

    pages = [_make_mock_chunk(page_number=i + 1, text=f"Page {i+1} text content.") for i in range(3)]
    extractor = MagicMock()
    extractor.chunk_pages.return_value = pages

    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path)

    db = sqlite3.connect(db_path)
    stored = db.execute(
        "SELECT page_number, text FROM page_chunks WHERE doc_id='d1' ORDER BY page_number"
    ).fetchall()
    db.close()

    assert len(stored) == 3
    first_page_number, first_text = stored[0]
    assert first_page_number == 1
    assert "Page 1" in first_text
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_sets_error_status_on_failure(ingest_db):
    """A failing extractor makes run() re-raise and record status 'error' with the message."""
    db_path, vec_db_path = ingest_db

    # ``run`` is already imported at module level; it looks PDFExtractor up at
    # call time, so patching the class is enough to inject the failure.
    with patch("circuitforge_core.documents.pdf.PDFExtractor", side_effect=RuntimeError("PDF corrupt")):
        with pytest.raises(RuntimeError):
            run(doc_id="d1", file_path="bad.pdf", db_path=db_path, vec_db_path=vec_db_path)

    conn = sqlite3.connect(db_path)
    row = conn.execute("SELECT status, error_msg FROM documents WHERE id='d1'").fetchone()
    conn.close()
    assert row[0] == "error"
    assert "PDF corrupt" in row[1]
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_skips_embeddings_without_ollama_url(ingest_db, monkeypatch):
    """When PAGEPIPER_OLLAMA_URL is unset, no vec DB file should be created."""
    db_path, vec_db_path = ingest_db
    monkeypatch.delenv("PAGEPIPER_OLLAMA_URL", raising=False)

    extractor = MagicMock()
    extractor.chunk_pages.return_value = [_make_mock_chunk()]

    with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
        run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path)

    # No embeddings were requested, so the vec DB should not have been created
    assert not Path(vec_db_path).exists(), "vec DB should not be created without OLLAMA_URL"

    # Document should still be ready with chunks stored
    db = sqlite3.connect(db_path)
    status_row = db.execute("SELECT status FROM documents WHERE id='d1'").fetchone()
    count_row = db.execute(
        "SELECT COUNT(*) FROM page_chunks WHERE doc_id='d1'"
    ).fetchone()
    db.close()
    assert status_row[0] == "ready"
    assert count_row[0] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_ingest_replaces_existing_chunks_on_reingest(ingest_db):
    """Re-running ingest for the same doc_id replaces old page_chunks."""
    db_path, vec_db_path = ingest_db

    extractor = MagicMock()
    ingest_rounds = [
        # First ingest: 3 pages
        [_make_mock_chunk(page_number=i + 1, text=f"Original page {i+1}.") for i in range(3)],
        # Second ingest: 1 page (simulating a re-ingest after file change)
        [_make_mock_chunk(text="Updated single page.")],
    ]
    for pages in ingest_rounds:
        extractor.chunk_pages.return_value = pages
        with patch("circuitforge_core.documents.pdf.PDFExtractor", return_value=extractor):
            run(doc_id="d1", file_path="test.pdf", db_path=db_path, vec_db_path=vec_db_path)

    db = sqlite3.connect(db_path)
    remaining = db.execute(
        "SELECT text FROM page_chunks WHERE doc_id='d1'"
    ).fetchall()
    db.close()
    assert len(remaining) == 1
    assert "Updated" in remaining[0][0]
|
||||||
68
tests/test_library_api.py
Normal file
68
tests/test_library_api.py
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
# tests/test_library_api.py
|
||||||
|
"""Tests for GET/POST /api/library endpoints."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
|
||||||
|
def _add_doc(db_path: str, title: str, path: str, status: str = "ready") -> str:
    """Insert one row into ``documents`` and return the id reported by the database."""
    db = sqlite3.connect(db_path)
    inserted = db.execute(
        "INSERT INTO documents(title, file_path, status) VALUES (?,?,?) RETURNING id",
        [title, path, status],
    ).fetchone()
    new_id = inserted[0]
    db.commit()
    db.close()
    return new_id
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_library_empty(client):
    """An empty database lists no documents."""
    response = client.get("/api/library")
    assert response.status_code == 200
    assert response.json() == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_library_returns_documents(client, test_db):
    """A stored document appears in the listing with its title and a status field."""
    _add_doc(test_db, "Player's Handbook", "/books/phb.pdf")
    response = client.get("/api/library")
    assert response.status_code == 200
    listing = response.json()
    assert len(listing) == 1
    entry = listing[0]
    assert entry["title"] == "Player's Handbook"
    assert "status" in entry
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_document_removes_record(client, test_db):
    """Deleting a document returns 204 and removes it from the listing."""
    doc_id = _add_doc(test_db, "Monster Manual", "/books/mm.pdf")
    delete_response = client.delete(f"/api/library/{doc_id}")
    assert delete_response.status_code == 204
    listing_response = client.get("/api/library")
    assert listing_response.json() == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_delete_nonexistent_returns_404(client):
    """Deleting an unknown document id yields 404."""
    response = client.delete("/api/library/does-not-exist")
    assert response.status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_reingest_returns_task_id(client, test_db, tmp_path):
    """Requesting a re-ingest answers 202 with a task_id in the body."""
    pdf_path = str(tmp_path / "books" / "test.pdf")
    # Write the fixture through a context manager so the handle is closed
    # deterministically instead of being left to garbage collection.
    with open(pdf_path, "wb") as fh:
        fh.write(b"%PDF-1.4")
    doc_id = _add_doc(test_db, "Test Book", pdf_path)
    resp = client.post(f"/api/library/{doc_id}/reingest")
    assert resp.status_code == 202
    assert "task_id" in resp.json()
|
||||||
|
|
||||||
|
|
||||||
|
def test_reingest_updates_status_to_processing(client, test_db, tmp_path):
    """After a re-ingest request the document reports one of the known statuses."""
    from pathlib import Path

    fixture_pdf = Path(str(tmp_path / "books" / "dm_guide.pdf"))
    fixture_pdf.write_bytes(b"%PDF-1.4 empty fixture")
    doc_id = _add_doc(test_db, "DM Guide", str(fixture_pdf))

    reingest_response = client.post(f"/api/library/{doc_id}/reingest")
    assert reingest_response.status_code == 202

    # Document should be in processing state (or beyond if stub ingest ran instantly)
    status_response = client.get(f"/api/library/{doc_id}/status")
    current_status = status_response.json()["status"]
    assert current_status in ("processing", "error", "ready")
|
||||||
69
tests/test_search_api.py
Normal file
69
tests/test_search_api.py
Normal file
|
|
@ -0,0 +1,69 @@
|
||||||
|
# tests/test_search_api.py
|
||||||
|
"""Tests for POST /api/search — BM25 keyword search (MIT, no tier gate)."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
|
||||||
|
def _add_chunks(db_path: str, doc_id: str, chunks: list[dict]) -> None:
    """Ensure a 'ready' document row exists for ``doc_id`` and insert one page_chunks row per chunk."""
    chunk_params = [
        [doc_id, entry["page_number"], entry["text"], "text_layer", len(entry["text"].split())]
        for entry in chunks
    ]
    db = sqlite3.connect(db_path)
    db.execute(
        "INSERT OR IGNORE INTO documents(id, title, file_path, status) VALUES (?,'Book','p.pdf','ready')",
        [doc_id],
    )
    db.executemany(
        "INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) VALUES (?,?,?,?,?)",
        chunk_params,
    )
    db.commit()
    db.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_search_returns_results(client, test_db, monkeypatch):
    """A keyword query returns the matching page first, with a positive BM25 score and a snippet."""
    import app.api.search as _search_mod

    monkeypatch.setattr(_search_mod, "_get_db_path", lambda: test_db)
    # BM25Okapi IDF is 0 when df == N/2 (e.g. 2 docs, 1 match → log(1.0) = 0).
    # Add a 3rd unrelated chunk so relevant terms score above zero.
    pages = [
        {"page_number": 1, "text": "Fireball deals 8d6 fire damage on a failed saving throw."},
        {"page_number": 2, "text": "Cure Wounds restores hit points to a living creature."},
        {"page_number": 3, "text": "Shield grants plus five to armor class until next turn."},
    ]
    _add_chunks(test_db, "book-a", pages)

    response = client.post("/api/search", json={"query": "fireball fire damage"})
    assert response.status_code == 200
    hits = response.json()
    assert len(hits) >= 1
    top_hit = hits[0]
    assert top_hit["page_number"] == 1
    assert top_hit["bm25_score"] > 0
    assert "text_snippet" in top_hit
|
||||||
|
|
||||||
|
|
||||||
|
def test_search_empty_index_returns_empty(client):
    """Searching with nothing indexed returns 200 and an empty list."""
    response = client.post("/api/search", json={"query": "anything"})
    assert response.status_code == 200
    assert response.json() == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_search_filters_by_doc_ids(client, test_db, monkeypatch):
    """Passing ``doc_ids`` limits results to those documents."""
    import app.api.search as _search_mod

    monkeypatch.setattr(_search_mod, "_get_db_path", lambda: test_db)
    # Three chunks so BM25Okapi IDF is non-zero for terms appearing in one doc.
    book_a_pages = [
        {"page_number": 1, "text": "Grapple rules for melee attacks."},
        {"page_number": 2, "text": "Shield spell protects from incoming blows."},
    ]
    book_b_pages = [{"page_number": 3, "text": "Grapple also applies to ranged attacks."}]
    _add_chunks(test_db, "book-a", book_a_pages)
    _add_chunks(test_db, "book-b", book_b_pages)

    response = client.post("/api/search", json={"query": "grapple", "doc_ids": ["book-a"]})
    assert response.status_code == 200
    hits = response.json()
    assert len(hits) >= 1, "expected at least one grapple result from book-a"
    assert all(hit["doc_id"] == "book-a" for hit in hits)
|
||||||
|
|
||||||
|
|
||||||
|
def test_search_has_no_tier_gate(client):
    """Keyword search is not gated: the endpoint answers 200 rather than 402."""
    # Search endpoint must return 200 with no PAGEPIPER_OLLAMA_URL set
    response = client.post("/api/search", json={"query": "anything"})
    assert response.status_code == 200  # Not 402
|
||||||
53
tests/test_synthesizer.py
Normal file
53
tests/test_synthesizer.py
Normal file
|
|
@ -0,0 +1,53 @@
|
||||||
|
# tests/test_synthesizer.py
|
||||||
|
"""Tests for Synthesizer — mocked LLM, citation assembly."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
from app.services.retriever import RetrievedChunk
|
||||||
|
from app.services.synthesizer import Synthesizer, SynthesisResult
|
||||||
|
|
||||||
|
|
||||||
|
def _chunk(doc_id: str = "book-a", page: int = 5, text: str = "Fireball rules") -> RetrievedChunk:
    """Build a ``RetrievedChunk`` test value with a fixed id and no vector score."""
    fields = {
        "chunk_id": "c1",
        "doc_id": doc_id,
        "page_number": page,
        "text": text,
        "bm25_score": 1.0,
        "vector_score": None,
    }
    return RetrievedChunk(**fields)
|
||||||
|
|
||||||
|
|
||||||
|
def test_synthesizer_returns_answer_and_citations():
    """One supplied chunk yields a ``SynthesisResult`` with the answer and one citation."""
    llm = MagicMock()
    llm.complete.return_value = "Fireball deals 8d6 damage [p.5]."

    result = Synthesizer(llm).synthesize(
        message="How does Fireball work?",
        history=[],
        chunks=[_chunk()],
    )

    assert isinstance(result, SynthesisResult)
    assert "Fireball" in result.answer

    # The single citation must point back at the chunk's page and document.
    assert len(result.citations) == 1
    citation = result.citations[0]
    assert citation.page_number == 5
    assert citation.doc_id == "book-a"
|
||||||
|
|
||||||
|
|
||||||
|
def test_synthesizer_builds_context_from_chunks():
    """Chunk text must appear in the first positional argument given to the LLM."""
    llm = MagicMock()
    llm.complete.return_value = "Answer."
    rule_text = "Detailed rule text here."

    Synthesizer(llm).synthesize("Q?", [], [_chunk(text=rule_text)])

    prompt = llm.complete.call_args.args[0]
    assert rule_text in prompt
|
||||||
|
|
||||||
|
|
||||||
|
def test_synthesizer_uses_system_prompt():
    """The LLM must be called with a truthy ``system`` keyword argument."""
    llm = MagicMock()
    llm.complete.return_value = "Answer."

    Synthesizer(llm).synthesize("Q?", [], [_chunk()])

    # call_args.kwargs and call_args[1] are the same mapping, so one lookup suffices.
    system_prompt = llm.complete.call_args.kwargs.get("system")
    assert system_prompt
|
||||||
39
web/.gitignore
vendored
Normal file
39
web/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,39 @@
|
||||||
|
# Logs
|
||||||
|
logs
|
||||||
|
*.log
|
||||||
|
npm-debug.log*
|
||||||
|
yarn-debug.log*
|
||||||
|
yarn-error.log*
|
||||||
|
pnpm-debug.log*
|
||||||
|
lerna-debug.log*
|
||||||
|
|
||||||
|
node_modules
|
||||||
|
.DS_Store
|
||||||
|
dist
|
||||||
|
dist-ssr
|
||||||
|
coverage
|
||||||
|
*.local
|
||||||
|
|
||||||
|
# Editor directories and files
|
||||||
|
.vscode/*
|
||||||
|
!.vscode/extensions.json
|
||||||
|
.idea
|
||||||
|
*.suo
|
||||||
|
*.ntvs*
|
||||||
|
*.njsproj
|
||||||
|
*.sln
|
||||||
|
*.sw?
|
||||||
|
|
||||||
|
*.tsbuildinfo
|
||||||
|
|
||||||
|
.eslintcache
|
||||||
|
|
||||||
|
# Cypress
|
||||||
|
/cypress/videos/
|
||||||
|
/cypress/screenshots/
|
||||||
|
|
||||||
|
# Vitest
|
||||||
|
__screenshots__/
|
||||||
|
|
||||||
|
# Vite
|
||||||
|
*.timestamp-*-*.mjs
|
||||||
3
web/.vscode/extensions.json
vendored
Normal file
3
web/.vscode/extensions.json
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
{
|
||||||
|
"recommendations": ["Vue.volar"]
|
||||||
|
}
|
||||||
42
web/README.md
Normal file
42
web/README.md
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
# web
|
||||||
|
|
||||||
|
This template should help get you started developing with Vue 3 in Vite.
|
||||||
|
|
||||||
|
## Recommended IDE Setup
|
||||||
|
|
||||||
|
[VS Code](https://code.visualstudio.com/) + [Vue (Official)](https://marketplace.visualstudio.com/items?itemName=Vue.volar) (and disable Vetur).
|
||||||
|
|
||||||
|
## Recommended Browser Setup
|
||||||
|
|
||||||
|
- Chromium-based browsers (Chrome, Edge, Brave, etc.):
|
||||||
|
- [Vue.js devtools](https://chromewebstore.google.com/detail/vuejs-devtools/nhdogjmejiglipccpnnnanhbledajbpd)
|
||||||
|
- [Turn on Custom Object Formatter in Chrome DevTools](http://bit.ly/object-formatters)
|
||||||
|
- Firefox:
|
||||||
|
- [Vue.js devtools](https://addons.mozilla.org/en-US/firefox/addon/vue-js-devtools/)
|
||||||
|
- [Turn on Custom Object Formatter in Firefox DevTools](https://fxdx.dev/firefox-devtools-custom-object-formatters/)
|
||||||
|
|
||||||
|
## Type Support for `.vue` Imports in TS
|
||||||
|
|
||||||
|
TypeScript cannot handle type information for `.vue` imports by default, so we replace the `tsc` CLI with `vue-tsc` for type checking. In editors, we need [Volar](https://marketplace.visualstudio.com/items?itemName=Vue.volar) to make the TypeScript language service aware of `.vue` types.
|
||||||
|
|
||||||
|
## Customize configuration
|
||||||
|
|
||||||
|
See [Vite Configuration Reference](https://vite.dev/config/).
|
||||||
|
|
||||||
|
## Project Setup
|
||||||
|
|
||||||
|
```sh
|
||||||
|
npm install
|
||||||
|
```
|
||||||
|
|
||||||
|
### Compile and Hot-Reload for Development
|
||||||
|
|
||||||
|
```sh
|
||||||
|
npm run dev
|
||||||
|
```
|
||||||
|
|
||||||
|
### Type-Check, Compile and Minify for Production
|
||||||
|
|
||||||
|
```sh
|
||||||
|
npm run build
|
||||||
|
```
|
||||||
1
web/env.d.ts
vendored
Normal file
1
web/env.d.ts
vendored
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
/// <reference types="vite/client" />
|
||||||
13
web/index.html
Normal file
13
web/index.html
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<meta charset="UTF-8">
|
||||||
|
<link rel="icon" href="/favicon.ico">
|
||||||
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
|
<title>Pagepiper</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<div id="app"></div>
|
||||||
|
<script type="module" src="/src/main.ts"></script>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
2898
web/package-lock.json
generated
Normal file
2898
web/package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load diff
31
web/package.json
Normal file
31
web/package.json
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
{
|
||||||
|
"name": "web",
|
||||||
|
"version": "0.0.0",
|
||||||
|
"private": true,
|
||||||
|
"type": "module",
|
||||||
|
"scripts": {
|
||||||
|
"dev": "vite",
|
||||||
|
"build": "run-p type-check \"build-only {@}\" --",
|
||||||
|
"preview": "vite preview",
|
||||||
|
"build-only": "vite build",
|
||||||
|
"type-check": "vue-tsc --build"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"vue": "^3.5.32",
|
||||||
|
"vue-router": "^5.0.4"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@tsconfig/node24": "^24.0.4",
|
||||||
|
"@types/node": "^24.12.2",
|
||||||
|
"@vitejs/plugin-vue": "^6.0.6",
|
||||||
|
"@vue/tsconfig": "^0.9.1",
|
||||||
|
"npm-run-all2": "^8.0.4",
|
||||||
|
"typescript": "~6.0.0",
|
||||||
|
"vite": "^8.0.8",
|
||||||
|
"vite-plugin-vue-devtools": "^8.1.1",
|
||||||
|
"vue-tsc": "^3.2.6"
|
||||||
|
},
|
||||||
|
"engines": {
|
||||||
|
"node": "^20.19.0 || >=22.12.0"
|
||||||
|
}
|
||||||
|
}
|
||||||
BIN
web/public/favicon.ico
Normal file
BIN
web/public/favicon.ico
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 4.2 KiB |
33
web/src/App.vue
Normal file
33
web/src/App.vue
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
<template>
|
||||||
|
<div id="app">
|
||||||
|
<nav class="nav">
|
||||||
|
<span class="nav-brand">Pagepiper</span>
|
||||||
|
<RouterLink to="/" class="nav-link">Library</RouterLink>
|
||||||
|
<RouterLink to="/chat" class="nav-link">Chat</RouterLink>
|
||||||
|
</nav>
|
||||||
|
<RouterView />
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<script setup lang="ts">
|
||||||
|
import { RouterLink, RouterView } from "vue-router"
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<style>
|
||||||
|
@import "@/theme.css";
|
||||||
|
|
||||||
|
.nav {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 1.5rem;
|
||||||
|
padding: 0.75rem 1.5rem;
|
||||||
|
background: var(--color-surface);
|
||||||
|
border-bottom: 1px solid var(--color-border);
|
||||||
|
position: sticky;
|
||||||
|
top: 0;
|
||||||
|
z-index: 100;
|
||||||
|
}
|
||||||
|
.nav-brand { font-weight: 700; color: var(--color-accent); }
|
||||||
|
.nav-link { color: var(--color-text-muted); text-decoration: none; }
|
||||||
|
.nav-link:hover, .nav-link.router-link-active { color: var(--color-text); }
|
||||||
|
</style>
|
||||||
101
web/src/api.ts
Normal file
101
web/src/api.ts
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
// web/src/api.ts
|
||||||
|
const BASE = import.meta.env.VITE_API_BASE ?? ""
|
||||||
|
|
||||||
|
/**
 * A library entry; the element type of the array resolved by `api.getLibrary()`.
 *
 * NOTE(review): inside this module the name shadows the global DOM `Document`
 * type; importers that need both must alias one of them.
 */
export interface Document {
  id: string
  title: string
  file_path: string
  // Closed set of lifecycle states.
  status: "pending" | "processing" | "ready" | "error"
  // Id accepted by `api.getTaskStatus()`; presumably null when no ingest task exists — confirm against the API.
  task_id: string | null
  page_count: number | null
  // NOTE(review): timestamp format is not visible from here (likely ISO 8601) — confirm.
  created_at: string
}

/** One hit in the array resolved by `api.search()`. */
export interface SearchResult {
  chunk_id: string
  doc_id: string
  page_number: number
  text_snippet: string
  bm25_score: number
}

/** A source reference attached to a chat answer (see `ChatResponse.citations`). */
export interface Citation {
  doc_id: string
  page_number: number
  snippet: string
  // Nullable, unlike `SearchResult.bm25_score`.
  bm25_score: number | null
}

/** Body resolved by `api.chat()` on success. */
export interface ChatResponse {
  answer: string
  citations: Citation[]
}

/** Body resolved by `api.getTaskStatus()`. */
export interface TaskStatus {
  // Free-form string here; the set of values the server emits is not visible from this file.
  status: string
  // Presumably a percentage in 0–100 — TODO confirm against the ingest endpoint.
  progress?: number
  error?: string
}

/** One prior conversation turn sent to `api.chat()` as part of `history`. */
export interface ChatMessage {
  role: string
  content: string
}
|
||||||
|
|
||||||
|
/**
 * Fetch-based client for the Pagepiper HTTP API.
 *
 * Each method resolves with the parsed JSON body (or nothing for deletes) and
 * rejects with an `Error` when the response status is not OK. Ids placed in
 * the URL path are percent-encoded so reserved characters ("/", "?", "#", …)
 * cannot change which endpoint is requested.
 */
export const api = {
  /** List every document in the library. */
  async getLibrary(): Promise<Document[]> {
    const r = await fetch(`${BASE}/api/library`)
    if (!r.ok) throw new Error(await r.text())
    return r.json()
  },

  /** Trigger a library scan; resolves with the discovery counts and queued tasks. */
  async scanLibrary(): Promise<{ discovered: number; queued: number; tasks: { doc_id: string; task_id: string }[] }> {
    const r = await fetch(`${BASE}/api/library/scan`, { method: "POST" })
    if (!r.ok) throw new Error(await r.text())
    return r.json()
  },

  /** Request re-ingestion of one document; resolves with the new task id. */
  async reingestDocument(docId: string): Promise<{ task_id: string }> {
    const r = await fetch(`${BASE}/api/library/${encodeURIComponent(docId)}/reingest`, { method: "POST" })
    if (!r.ok) throw new Error(await r.text())
    return r.json()
  },

  /** Delete one document from the library. */
  async deleteDocument(docId: string): Promise<void> {
    const r = await fetch(`${BASE}/api/library/${encodeURIComponent(docId)}`, { method: "DELETE" })
    if (!r.ok) throw new Error(await r.text())
  },

  /** Fetch the current status of an ingest task. */
  async getTaskStatus(taskId: string): Promise<TaskStatus> {
    const r = await fetch(`${BASE}/api/ingest/${encodeURIComponent(taskId)}`)
    if (!r.ok) throw new Error(await r.text())
    return r.json()
  },

  /**
   * Run a search.
   *
   * @param query  Search text.
   * @param topK   Sent as `top_k`; defaults to 10.
   * @param docIds Optional document filter; sent as `null` when omitted.
   */
  async search(query: string, topK = 10, docIds?: string[]): Promise<SearchResult[]> {
    const r = await fetch(`${BASE}/api/search`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ query, top_k: topK, doc_ids: docIds ?? null }),
    })
    if (!r.ok) throw new Error(await r.text())
    return r.json()
  },

  /**
   * Send a chat message together with the prior conversation turns.
   *
   * On a non-OK response the rejected `Error` additionally carries `status`
   * (the HTTP status code) and `detail` (the `detail` field of the error body,
   * if any). Its message is taken from `detail.message`, or from `detail`
   * itself when the server sent a plain string, else "Request failed".
   *
   * @param message Current user message.
   * @param history Prior turns, oldest first.
   * @param docIds  Optional document filter; sent as `null` when omitted.
   * @param topK    Sent as `top_k`; defaults to 5.
   */
  async chat(
    message: string,
    history: ChatMessage[],
    docIds?: string[],
    topK = 5,
  ): Promise<ChatResponse> {
    const r = await fetch(`${BASE}/api/chat`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ message, history, doc_ids: docIds ?? null, top_k: topK }),
    })
    if (!r.ok) {
      // The error body may be absent or not JSON; fall back to an empty object.
      const body: unknown = await r.json().catch(() => ({}))
      // Optional chaining guards against a literal JSON `null` body.
      const detail = (body as { detail?: unknown } | null)?.detail
      let text = "Request failed"
      if (typeof detail === "string" && detail) {
        text = detail
      } else if (detail !== null && typeof detail === "object") {
        const nested = (detail as { message?: unknown }).message
        if (typeof nested === "string") text = nested
      }
      const err: Error & { status?: number; detail?: unknown } = new Error(text)
      err.status = r.status
      err.detail = detail
      throw err
    }
    return r.json()
  },
}
|
||||||
69
web/src/components/CitationPanel.vue
Normal file
69
web/src/components/CitationPanel.vue
Normal file
|
|
@ -0,0 +1,69 @@
|
||||||
|
<template>
|
||||||
|
<div class="citation-panel">
|
||||||
|
<button
|
||||||
|
class="citation-toggle"
|
||||||
|
:aria-expanded="open"
|
||||||
|
@click="open = !open"
|
||||||
|
>
|
||||||
|
<span class="citation-badge" :class="{ 'nat20': showNat20 }" aria-hidden="true">
|
||||||
|
{{ showNat20 ? "⚀ Natural 20" : `p.${citation.page_number}` }}
|
||||||
|
</span>
|
||||||
|
<span class="citation-doc">{{ docTitle }}</span>
|
||||||
|
<span class="citation-chevron">{{ open ? "▲" : "▼" }}</span>
|
||||||
|
</button>
|
||||||
|
|
||||||
|
<div class="citation-body" v-show="open" role="region" :aria-label="`Excerpt from page ${citation.page_number}`">
|
||||||
|
<p class="citation-source-label">Source text (not paraphrased):</p>
|
||||||
|
<blockquote class="citation-text">{{ citation.snippet }}</blockquote>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<script setup lang="ts">
|
||||||
|
import { onMounted, ref } from "vue"
|
||||||
|
import type { Citation } from "@/api"
|
||||||
|
|
||||||
|
const props = defineProps<{
  citation: Citation
  docTitle?: string
  bm25Score?: number
}>()

// BM25 score at or above which the badge briefly shows the "Natural 20" flash.
const NAT20_THRESHOLD = 8.0

// Whether the excerpt body is expanded.
const open = ref(false)
// True only while the flash is being shown.
const showNat20 = ref(false)

onMounted(() => {
  // Respect the user's reduced-motion preference: no flash at all.
  const reducedMotion = window.matchMedia("(prefers-reduced-motion: reduce)").matches
  if (reducedMotion) return

  const score = props.bm25Score
  if (!score || score < NAT20_THRESHOLD) return

  showNat20.value = true
  setTimeout(() => { showNat20.value = false }, 300)
})
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<style scoped>
|
||||||
|
.citation-panel { border: 1px solid var(--color-border); border-radius: var(--radius-sm); margin-bottom: 0.5rem; overflow: hidden; }
|
||||||
|
.citation-toggle {
|
||||||
|
width: 100%; display: flex; align-items: center; gap: 0.75rem; padding: 0.6rem 0.75rem;
|
||||||
|
background: var(--color-surface-alt); border: none; cursor: pointer; color: var(--color-text);
|
||||||
|
text-align: left;
|
||||||
|
}
|
||||||
|
.citation-toggle:hover { background: var(--color-border); }
|
||||||
|
.citation-badge {
|
||||||
|
font-size: 0.75rem; font-weight: 700; padding: 2px 8px;
|
||||||
|
background: var(--color-surface); border-radius: var(--radius-sm);
|
||||||
|
border: 1px solid var(--color-border); font-family: var(--font-mono);
|
||||||
|
white-space: nowrap; transition: background var(--transition-fast), color var(--transition-fast);
|
||||||
|
}
|
||||||
|
.citation-badge.nat20 { background: var(--color-accent); color: #fff; border-color: var(--color-accent); }
|
||||||
|
.citation-doc { flex: 1; font-size: 0.85rem; color: var(--color-text-muted); }
|
||||||
|
.citation-chevron { font-size: 0.7rem; color: var(--color-text-muted); }
|
||||||
|
.citation-body { padding: 0.75rem; background: var(--color-surface); }
|
||||||
|
.citation-source-label { font-size: 0.75rem; color: var(--color-text-muted); margin-bottom: 0.4rem; font-style: italic; }
|
||||||
|
.citation-text {
|
||||||
|
border-left: 3px solid var(--color-accent); padding-left: 0.75rem;
|
||||||
|
font-size: 0.9rem; color: var(--color-text); line-height: 1.6;
|
||||||
|
}
|
||||||
|
</style>
|
||||||
68
web/src/components/DocumentCard.vue
Normal file
68
web/src/components/DocumentCard.vue
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
<template>
|
||||||
|
<div class="doc-card" :class="`status-${doc.status}`">
|
||||||
|
<div class="doc-status-badge">{{ doc.status }}</div>
|
||||||
|
<div class="doc-title">{{ doc.title }}</div>
|
||||||
|
<div class="doc-meta" v-if="doc.page_count != null">{{ doc.page_count }} pages</div>
|
||||||
|
<div class="doc-meta path">{{ shortPath }}</div>
|
||||||
|
|
||||||
|
<IngestProgress
|
||||||
|
v-if="doc.status === 'processing' && doc.task_id"
|
||||||
|
:task-id="doc.task_id"
|
||||||
|
@done="emit('refresh')"
|
||||||
|
/>
|
||||||
|
|
||||||
|
<div class="doc-actions">
|
||||||
|
<button class="btn-sm" @click="emit('reingest', doc.id)" :disabled="doc.status === 'processing'">
|
||||||
|
Re-index
|
||||||
|
</button>
|
||||||
|
<button class="btn-sm danger" @click="emit('delete', doc.id)">Remove</button>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<script setup lang="ts">
|
||||||
|
import { computed } from "vue"
|
||||||
|
import type { Document } from "@/api"
|
||||||
|
import IngestProgress from "@/components/IngestProgress.vue"
|
||||||
|
|
||||||
|
const props = defineProps<{ doc: Document }>()
const emit = defineEmits<{ reingest: [id: string]; delete: [id: string]; refresh: [] }>()

// Last two "/"-separated segments of the file path, for a compact display.
const shortPath = computed(() => {
  const segments = props.doc.file_path.split("/")
  const tail = segments.slice(-2)
  return tail.join("/")
})
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<style scoped>
|
||||||
|
.doc-card {
|
||||||
|
background: var(--color-surface);
|
||||||
|
border: 1px solid var(--color-border);
|
||||||
|
border-radius: var(--radius-md);
|
||||||
|
padding: 1rem;
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 0.4rem;
|
||||||
|
box-shadow: var(--shadow-card);
|
||||||
|
position: relative;
|
||||||
|
}
|
||||||
|
.doc-card.status-error { border-color: var(--color-error); }
|
||||||
|
.doc-card.status-ready { border-color: var(--color-success); }
|
||||||
|
.doc-title { font-weight: 600; font-size: 1rem; }
|
||||||
|
.doc-meta { font-size: 0.8rem; color: var(--color-text-muted); }
|
||||||
|
.doc-meta.path { font-family: var(--font-mono); word-break: break-all; }
|
||||||
|
.doc-status-badge {
|
||||||
|
position: absolute; top: 0.5rem; right: 0.75rem;
|
||||||
|
font-size: 0.7rem; font-weight: 700; text-transform: uppercase;
|
||||||
|
padding: 2px 6px; border-radius: var(--radius-sm);
|
||||||
|
background: var(--color-surface-alt);
|
||||||
|
}
|
||||||
|
.doc-actions { display: flex; gap: 0.5rem; margin-top: 0.5rem; }
|
||||||
|
.btn-sm {
|
||||||
|
padding: 4px 10px; border: 1px solid var(--color-border); border-radius: var(--radius-sm);
|
||||||
|
background: var(--color-surface-alt); color: var(--color-text); cursor: pointer; font-size: 0.8rem;
|
||||||
|
}
|
||||||
|
.btn-sm:hover { border-color: var(--color-accent); }
|
||||||
|
.btn-sm.danger:hover { border-color: var(--color-error); color: var(--color-error); }
|
||||||
|
.btn-sm:disabled { opacity: 0.4; cursor: default; }
|
||||||
|
</style>
|
||||||
82
web/src/components/IngestProgress.vue
Normal file
82
web/src/components/IngestProgress.vue
Normal file
|
|
@ -0,0 +1,82 @@
|
||||||
|
<template>
|
||||||
|
<div class="ingest-progress" v-if="visible">
|
||||||
|
<div class="progress-label">
|
||||||
|
<span>{{ statusLabel }}</span>
|
||||||
|
<span class="progress-pct" v-if="status?.progress != null">{{ status.progress }}%</span>
|
||||||
|
</div>
|
||||||
|
<div class="progress-bar">
|
||||||
|
<div class="progress-fill" :style="{ width: barWidth }" />
|
||||||
|
</div>
|
||||||
|
<p class="progress-error" v-if="status?.status === 'error'">{{ status.error }}</p>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<script setup lang="ts">
|
||||||
|
import { computed, onMounted, onUnmounted, ref, watch } from "vue"
|
||||||
|
import { api, type TaskStatus } from "@/api"
|
||||||
|
|
||||||
|
const props = defineProps<{ taskId: string | null }>()
const emit = defineEmits<{ done: [] }>()

// Latest status fetched for the current task; null until the first poll succeeds.
const status = ref<TaskStatus | null>(null)
// Handle of the active polling interval; null whenever polling is stopped.
let timer: ReturnType<typeof setInterval> | null = null

// Hidden when there is no task, or once the task has completed.
const visible = computed(() => props.taskId !== null && status.value?.status !== "complete")

const statusLabel = computed(() => {
  if (!status.value) return "Queued…"
  const map: Record<string, string> = {
    running: "Indexing…",
    complete: "Done",
    error: "Error",
  }
  return map[status.value.status] ?? "Processing…"
})

// Clamp to 0–100: a negative width is invalid CSS and would be discarded.
const barWidth = computed(() => {
  const p = status.value?.progress ?? 0
  return `${Math.max(0, Math.min(p, 100))}%`
})

/**
 * Fetch the task status once and stop polling on a terminal state.
 *
 * A response is ignored when it resolves after polling was stopped or after
 * `taskId` changed, so a slow request cannot overwrite a newer task's state
 * or emit `done` a second time.
 */
async function poll() {
  const taskId = props.taskId
  if (!taskId) return
  try {
    const latest = await api.getTaskStatus(taskId)
    if (timer === null || taskId !== props.taskId) return
    status.value = latest
    if (latest.status === "complete") {
      stopPoll()
      emit("done")
    } else if (latest.status === "error") {
      stopPoll()
    }
  } catch (_e: unknown) { /* task not yet registered */ }
}

/** Cancel the polling interval, if any. */
function stopPoll() {
  if (timer) { clearInterval(timer); timer = null }
}

/** Reset state and, when a task id is set, poll immediately and then every 2 s. */
function startPoll() {
  stopPoll()
  status.value = null
  if (props.taskId) {
    // Arm the interval before the first poll so the stale-response guard in
    // poll() sees polling as active.
    timer = setInterval(poll, 2000)
    poll()
  }
}

watch(() => props.taskId, (newId) => {
  if (newId) startPoll()
  else stopPoll()
})

onMounted(startPoll)
onUnmounted(stopPoll)
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<style scoped>
|
||||||
|
.ingest-progress { margin-top: 0.5rem; }
|
||||||
|
.progress-label { display: flex; justify-content: space-between; font-size: 0.8rem; color: var(--color-text-muted); margin-bottom: 4px; }
|
||||||
|
.progress-bar { height: 4px; background: var(--color-border); border-radius: 2px; overflow: hidden; }
|
||||||
|
.progress-fill { height: 100%; background: var(--color-accent); transition: width 0.3s ease; }
|
||||||
|
.progress-error { color: var(--color-error); font-size: 0.8rem; margin-top: 4px; }
|
||||||
|
</style>
|
||||||
5
web/src/main.ts
Normal file
5
web/src/main.ts
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
import { createApp } from "vue"
|
||||||
|
import App from "./App.vue"
|
||||||
|
import router from "./router"
|
||||||
|
|
||||||
|
createApp(App).use(router).mount("#app")
|
||||||
11
web/src/router/index.ts
Normal file
11
web/src/router/index.ts
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
import { createRouter, createWebHistory } from "vue-router"
|
||||||
|
import LibraryView from "@/views/LibraryView.vue"
|
||||||
|
import ChatView from "@/views/ChatView.vue"
|
||||||
|
|
||||||
|
// Route table: library at the root, chat under /chat.
const routes = [
  { path: "/", name: "library", component: LibraryView },
  { path: "/chat", name: "chat", component: ChatView },
]

// NOTE(review): Vite exposes its configured `base` as `import.meta.env.BASE_URL`;
// `VITE_BASE_URL` is a separate variable that must be defined in the build
// environment, otherwise it is undefined here — confirm it is set where needed.
export default createRouter({
  history: createWebHistory(import.meta.env.VITE_BASE_URL),
  routes,
})
|
||||||
47
web/src/theme.css
Normal file
47
web/src/theme.css
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
/* web/src/theme.css */
|
||||||
|
:root {
|
||||||
|
--color-bg: #1a1a2e;
|
||||||
|
--color-surface: #16213e;
|
||||||
|
--color-surface-alt: #0f3460;
|
||||||
|
--color-accent: #e94560;
|
||||||
|
--color-accent-dim: #a83050;
|
||||||
|
--color-text: #e8e8e8;
|
||||||
|
--color-text-muted: #9e9e9e;
|
||||||
|
--color-success: #4caf50;
|
||||||
|
--color-warning: #ff9800;
|
||||||
|
--color-error: #f44336;
|
||||||
|
--color-border: #2a2a4a;
|
||||||
|
|
||||||
|
--font-base: system-ui, -apple-system, sans-serif;
|
||||||
|
--font-mono: "Fira Code", "Cascadia Code", monospace;
|
||||||
|
|
||||||
|
--radius-sm: 4px;
|
||||||
|
--radius-md: 8px;
|
||||||
|
--radius-lg: 16px;
|
||||||
|
--shadow-card: 0 2px 8px rgba(0,0,0,0.4);
|
||||||
|
--transition-fast: 150ms ease;
|
||||||
|
}
|
||||||
|
|
||||||
|
@media (prefers-color-scheme: light) {
|
||||||
|
:root {
|
||||||
|
--color-bg: #f5f5f5;
|
||||||
|
--color-surface: #ffffff;
|
||||||
|
--color-surface-alt: #e8eaf6;
|
||||||
|
--color-accent: #c62828;
|
||||||
|
--color-accent-dim: #e57373;
|
||||||
|
--color-text: #212121;
|
||||||
|
--color-text-muted: #757575;
|
||||||
|
--color-border: #e0e0e0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
* { box-sizing: border-box; margin: 0; padding: 0; }
|
||||||
|
|
||||||
|
body {
|
||||||
|
background: var(--color-bg);
|
||||||
|
color: var(--color-text);
|
||||||
|
font-family: var(--font-base);
|
||||||
|
font-size: 1rem;
|
||||||
|
line-height: 1.6;
|
||||||
|
min-height: 100vh;
|
||||||
|
}
|
||||||
240
web/src/views/ChatView.vue
Normal file
240
web/src/views/ChatView.vue
Normal file
|
|
@ -0,0 +1,240 @@
|
||||||
|
<template>
|
||||||
|
<div class="chat-layout">
|
||||||
|
<!-- Message pane -->
|
||||||
|
<div class="chat-pane">
|
||||||
|
<div class="chat-messages" ref="messagesEl">
|
||||||
|
<p class="empty-chat" v-if="history.length === 0">
|
||||||
|
Ask a question across your indexed rulebooks.
|
||||||
|
No rulebooks indexed? Go to <RouterLink to="/">Library</RouterLink> first.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<div
|
||||||
|
v-for="(msg, i) in history"
|
||||||
|
:key="i"
|
||||||
|
class="message"
|
||||||
|
:class="msg.role"
|
||||||
|
>
|
||||||
|
<div class="message-body">{{ msg.content }}</div>
|
||||||
|
<div class="message-citations" v-if="msg.citations?.length">
|
||||||
|
<p class="citations-label">Sources:</p>
|
||||||
|
<CitationPanel
|
||||||
|
v-for="(cite, j) in msg.citations"
|
||||||
|
:key="j"
|
||||||
|
:citation="cite"
|
||||||
|
:doc-title="docTitles[cite.doc_id] ?? cite.doc_id"
|
||||||
|
:bm25-score="cite.bm25_score ?? undefined"
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="message assistant loading" v-if="thinking">
|
||||||
|
<div class="loading-dots"><span /><span /><span /></div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<p class="error-banner" v-if="errorMsg" role="alert">
|
||||||
|
{{ errorMsg }}
|
||||||
|
<span v-if="error402"> — <RouterLink to="/">Library</RouterLink> or set PAGEPIPER_OLLAMA_URL.</span>
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<form class="chat-input-row" @submit.prevent="send">
|
||||||
|
<input
|
||||||
|
ref="inputEl"
|
||||||
|
v-model="draft"
|
||||||
|
class="chat-input"
|
||||||
|
placeholder="Ask about your rulebooks…"
|
||||||
|
:disabled="thinking"
|
||||||
|
aria-label="Chat message"
|
||||||
|
autofocus
|
||||||
|
/>
|
||||||
|
<button class="btn-send" type="submit" :disabled="thinking || !draft.trim()">Send</button>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Book filter sidebar -->
|
||||||
|
<aside class="sidebar" role="complementary" aria-label="Filter by book">
|
||||||
|
<h2 class="sidebar-title">Books</h2>
|
||||||
|
<p class="sidebar-hint">Select books to search (all = none selected)</p>
|
||||||
|
<label
|
||||||
|
v-for="doc in readyDocs"
|
||||||
|
:key="doc.id"
|
||||||
|
class="book-filter"
|
||||||
|
>
|
||||||
|
<input type="checkbox" :value="doc.id" v-model="selectedDocs" />
|
||||||
|
{{ doc.title }}
|
||||||
|
</label>
|
||||||
|
</aside>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<script setup lang="ts">
|
||||||
|
import { computed, nextTick, onMounted, ref } from "vue"
|
||||||
|
import { RouterLink } from "vue-router"
|
||||||
|
import { api, type Citation, type Document } from "@/api"
|
||||||
|
import CitationPanel from "@/components/CitationPanel.vue"
|
||||||
|
|
||||||
|
// A rendered conversation turn; assistant turns may carry their citations.
interface ChatMessage {
  role: "user" | "assistant"
  content: string
  citations?: Citation[]
}

// Conversation and input state
const history = ref<ChatMessage[]>([])
const draft = ref("")
const thinking = ref(false)

// Error banner state; error402 toggles the extra hint shown for HTTP 402
const errorMsg = ref("")
const error402 = ref(false)

// Template refs
const messagesEl = ref<HTMLElement | null>(null)
const inputEl = ref<HTMLInputElement | null>(null)

// Sidebar book filter state
const allDocs = ref<Document[]>([])
const selectedDocs = ref<string[]>([])

// Only documents with status "ready" are offered in the filter.
const readyDocs = computed(() => allDocs.value.filter(doc => doc.status === "ready"))

// Lookup from document id to title, used to label citations.
const docTitles = computed(() => {
  const pairs = allDocs.value.map(doc => [doc.id, doc.title])
  return Object.fromEntries(pairs)
})

onMounted(async () => {
  // A failed library load leaves the filter list empty.
  allDocs.value = await api.getLibrary().catch(() => [])
  inputEl.value?.focus()
})

/** Submit the draft, then append the assistant answer or show an error banner. */
async function send() {
  const text = draft.value.trim()
  if (thinking.value || !text) return

  // Clear the input and any previous error before dispatching.
  draft.value = ""
  errorMsg.value = ""
  error402.value = false

  history.value.push({ role: "user", content: text })
  thinking.value = true
  await nextTick()
  scrollBottom()

  try {
    // An empty selection means "search all books".
    const filter = selectedDocs.value.length ? selectedDocs.value : undefined
    // Prior turns only: the turn just pushed is sent separately as the message.
    const priorTurns = history.value
      .slice(0, -1)
      .map(turn => ({ role: turn.role, content: turn.content }))
    const reply = await api.chat(text, priorTurns, filter)
    history.value.push({ role: "assistant", content: reply.answer, citations: reply.citations })
  } catch (err: unknown) {
    const failure = err as Error & { status?: number; detail?: { message?: string } }
    if (failure.status !== 402) {
      errorMsg.value = failure.message ?? "Something went wrong."
    } else {
      error402.value = true
      errorMsg.value = failure.detail?.message ?? "Ollama not configured. Set PAGEPIPER_OLLAMA_URL."
    }
  } finally {
    thinking.value = false
    await nextTick()
    scrollBottom()
    inputEl.value?.focus()
  }
}

/** Scroll the message list to its newest entry. */
function scrollBottom() {
  const pane = messagesEl.value
  if (!pane) return
  pane.scrollTop = pane.scrollHeight
}
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<style scoped>
/* ---- Overall layout: conversation pane next to the book sidebar ---- */
.chat-layout {
  display: flex;
  /* Fallback for browsers without dynamic viewport units.
     NOTE(review): 56px is presumably the height of the app header above this
     view - confirm against the layout shell. */
  height: calc(100vh - 56px);
  /* On mobile browsers 100vh can be taller than the visible area while the
     browser UI is expanded, which pushes the input row out of view. dvh tracks
     the visible area; browsers that do not understand it keep the line above. */
  height: calc(100dvh - 56px);
  overflow: hidden;
}

.chat-pane {
  flex: 1;
  display: flex;
  flex-direction: column;
  overflow: hidden;
}

/* Scrollable message list; the pane itself never scrolls. */
.chat-messages {
  flex: 1;
  overflow-y: auto;
  padding: 1.5rem;
  display: flex;
  flex-direction: column;
  gap: 1rem;
}

.empty-chat {
  color: var(--color-text-muted);
  line-height: 1.8;
}

/* ---- Message bubbles: user on the right, assistant on the left ---- */
.message {
  max-width: 80%;
}
.message.user {
  align-self: flex-end;
}
.message.assistant {
  align-self: flex-start;
}

.message-body {
  background: var(--color-surface);
  border: 1px solid var(--color-border);
  border-radius: var(--radius-md);
  padding: 0.75rem 1rem;
  line-height: 1.6;
  /* Keep the line breaks present in the message text. */
  white-space: pre-wrap;
  /* pre-wrap alone does not break long unbroken tokens (URLs, hashes), which
     would otherwise overflow the bubble and the 80% max-width. */
  overflow-wrap: anywhere;
}
.message.user .message-body {
  background: var(--color-surface-alt);
  border-color: transparent;
}

.message-citations {
  margin-top: 0.75rem;
}
.citations-label {
  font-size: 0.75rem;
  color: var(--color-text-muted);
  margin-bottom: 0.4rem;
  font-style: italic;
}

/* ---- "Assistant is thinking" indicator ---- */
.loading-dots {
  display: flex;
  gap: 6px;
  padding: 0.75rem 1rem;
}
.loading-dots span {
  width: 8px;
  height: 8px;
  border-radius: 50%;
  background: var(--color-text-muted);
  animation: bounce 1.2s ease-in-out infinite;
}
/* Stagger the second and third dot so the three pulse in sequence. */
.loading-dots span:nth-child(2) {
  animation-delay: 0.2s;
}
.loading-dots span:nth-child(3) {
  animation-delay: 0.4s;
}
@keyframes bounce {
  0%, 80%, 100% { transform: scale(0.6); }
  40% { transform: scale(1); }
}
/* Respect the user's reduced-motion preference: static, dimmed dots. */
@media (prefers-reduced-motion: reduce) {
  .loading-dots span {
    animation: none;
    opacity: 0.5;
  }
}

.error-banner {
  padding: 0.75rem 1.5rem;
  /* Tint the surface colour with 15% of the error colour. */
  background: color-mix(in srgb, var(--color-error) 15%, var(--color-surface));
  color: var(--color-error);
  font-size: 0.9rem;
}

/* ---- Input row ---- */
.chat-input-row {
  display: flex;
  gap: 0.5rem;
  padding: 1rem 1.5rem;
  border-top: 1px solid var(--color-border);
  background: var(--color-surface);
}
.chat-input {
  flex: 1;
  padding: 0.6rem 1rem;
  background: var(--color-bg);
  border: 1px solid var(--color-border);
  border-radius: var(--radius-sm);
  color: var(--color-text);
  font-size: 1rem;
}
.chat-input:focus {
  outline: 2px solid var(--color-accent);
  border-color: transparent;
}
.btn-send {
  padding: 0.6rem 1.25rem;
  background: var(--color-accent);
  color: #fff;
  border: none;
  border-radius: var(--radius-sm);
  cursor: pointer;
  font-size: 0.95rem;
}
.btn-send:disabled {
  opacity: 0.4;
  cursor: default;
}

/* ---- Sidebar ---- */
.sidebar {
  width: 240px;
  border-left: 1px solid var(--color-border);
  background: var(--color-surface);
  overflow-y: auto;
  padding: 1rem;
}
.sidebar-title {
  font-size: 1rem;
  font-weight: 600;
  margin-bottom: 0.5rem;
}
.sidebar-hint {
  font-size: 0.75rem;
  color: var(--color-text-muted);
  margin-bottom: 0.75rem;
  line-height: 1.4;
}
.book-filter {
  display: flex;
  align-items: flex-start;
  gap: 0.5rem;
  font-size: 0.85rem;
  margin-bottom: 0.5rem;
  cursor: pointer;
  line-height: 1.4;
}

/* ---- Narrow screens: stack the panes and cap the sidebar height ---- */
@media (max-width: 640px) {
  .chat-layout {
    flex-direction: column-reverse;
  }
  .sidebar {
    width: 100%;
    height: auto;
    max-height: 30vh;
    border-left: none;
    border-top: 1px solid var(--color-border);
  }
  .message {
    max-width: 95%;
  }
}
</style>
|
||||||
108
web/src/views/LibraryView.vue
Normal file
108
web/src/views/LibraryView.vue
Normal file
|
|
@ -0,0 +1,108 @@
|
||||||
|
<template>
  <main class="library">
    <!-- Library screen: scan trigger, error line, document grid and the
         summary of the most recent scan. -->
    <header class="library-header">
      <h1>Library</h1>
      <!-- Disabled while a scan is in flight so it cannot be started twice. -->
      <button class="btn-primary" @click="scan" :disabled="scanning">
        {{ scanning ? "Scanning..." : "Scan for PDFs" }}
      </button>
    </header>

    <!-- role="alert": this paragraph only appears after a failure, so it must
         be announced to assistive technology when it is inserted. -->
    <p class="error-msg" role="alert" v-if="error">{{ error }}</p>

    <p class="empty-state" v-if="!loading && docs.length === 0">
      No books indexed yet. Click "Scan for PDFs" to discover PDFs in your books directory.<br>
      Make sure your PDF directory is mounted at <code>/books</code> inside the container.
    </p>
    <div class="doc-grid" v-else>
      <DocumentCard
        v-for="doc in docs"
        :key="doc.id"
        :doc="doc"
        @reingest="reingest"
        @delete="remove"
        @refresh="load"
      />
    </div>

    <!-- role="status": polite announcement of the scan outcome. -->
    <p class="scan-result" role="status" v-if="scanResult">
      Found {{ scanResult.discovered }} PDFs, queued {{ scanResult.queued }} for indexing.
    </p>
  </main>
</template>
|
||||||
|
|
||||||
|
<script setup lang="ts">
|
||||||
|
import { onMounted, ref } from "vue"
|
||||||
|
import { api, type Document } from "@/api"
|
||||||
|
import DocumentCard from "@/components/DocumentCard.vue"
|
||||||
|
|
||||||
|
// Documents rendered as DocumentCard entries in the grid; replaced wholesale by load().
const docs = ref<Document[]>([])
|
||||||
|
// True while load() is running. Starts true so the empty-state text is not
// shown before the first fetch has finished.
const loading = ref(true)
|
||||||
|
// True while scan() is running; disables the scan button and switches its label.
const scanning = ref(false)
|
||||||
|
// Message of the most recent failure, or null. Reset at the start of every action.
const error = ref<string | null>(null)
|
||||||
|
// Result of the last successful api.scanLibrary() call, shown below the grid.
const scanResult = ref<{ discovered: number; queued: number } | null>(null)
|
||||||
|
|
||||||
|
/**
 * Fetch the library from the API and publish it through `docs`.
 *
 * Clears any previous error first and keeps `loading` true for the duration of
 * the request. Failures are not rethrown: they are reported through `error`.
 */
async function load() {
  error.value = null
  loading.value = true
  try {
    const library = await api.getLibrary()
    docs.value = library
  } catch (err) {
    // Non-Error rejections carry no usable message, so fall back to a fixed one.
    if (err instanceof Error) {
      error.value = err.message
    } else {
      error.value = "Failed to load library"
    }
  } finally {
    loading.value = false
  }
}
|
||||||
|
|
||||||
|
/**
 * Ask the API to scan for PDFs, store the returned counts in `scanResult`,
 * then reload the library list.
 *
 * `scanning` stays true for the whole operation; the template uses it to
 * disable the button. The previous scan summary is cleared up front so that a
 * failed scan never leaves a stale "Found N PDFs" line next to the new error.
 * Failures are reported through `error` rather than rethrown.
 */
async function scan() {
  scanning.value = true
  error.value = null
  scanResult.value = null
  try {
    scanResult.value = await api.scanLibrary()
    await load()
  } catch (e) {
    error.value = e instanceof Error ? e.message : "Scan failed"
  } finally {
    scanning.value = false
  }
}
|
||||||
|
|
||||||
|
/**
 * Request a re-index of one document and refresh the list afterwards.
 *
 * @param id - identifier of the document, as emitted by DocumentCard's
 *             `reingest` event.
 *
 * Failures are surfaced through `error` instead of being rethrown.
 */
async function reingest(id: string) {
  error.value = null
  try {
    await api.reingestDocument(id)
    await load()
  } catch (err) {
    if (err instanceof Error) {
      error.value = err.message
    } else {
      error.value = "Re-index failed"
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Remove one document from the library after the user confirms, then refresh
 * the list.
 *
 * @param id - identifier of the document, as emitted by DocumentCard's
 *             `delete` event.
 *
 * Does nothing when the confirmation dialog is dismissed. Failures are
 * surfaced through `error` instead of being rethrown.
 */
async function remove(id: string) {
  const confirmed = confirm("Remove this book from the library? The PDF file is not deleted.")
  if (!confirmed) {
    return
  }

  error.value = null
  try {
    await api.deleteDocument(id)
    await load()
  } catch (err) {
    if (err instanceof Error) {
      error.value = err.message
    } else {
      error.value = "Remove failed"
    }
  }
}
|
||||||
|
|
||||||
|
// Fetch the library once the component has been mounted.
onMounted(load)
|
||||||
|
</script>
|
||||||
|
|
||||||
|
<style scoped>
/* Page container: centred, capped at 1200px. */
.library {
  padding: 1.5rem;
  max-width: 1200px;
  margin: 0 auto;
}

/* Title on the left, scan button on the right; wraps when space runs out. */
.library-header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  margin-bottom: 1.5rem;
  flex-wrap: wrap;
  gap: 1rem;
}

h1 {
  font-size: 1.5rem;
}

.btn-primary {
  background: var(--color-accent);
  color: #fff;
  border: none;
  padding: 0.6rem 1.2rem;
  border-radius: var(--radius-sm);
  cursor: pointer;
  font-size: 0.95rem;
}

.btn-primary:disabled {
  opacity: 0.5;
  cursor: default;
}

/* As many columns of at least 280px as fit; leftover width is shared equally. */
.doc-grid {
  display: grid;
  grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
  gap: 1rem;
}

.empty-state {
  color: var(--color-text-muted);
  line-height: 1.8;
}

.empty-state code {
  font-family: var(--font-mono);
  background: var(--color-surface-alt);
  padding: 2px 6px;
  border-radius: 3px;
}

.scan-result {
  margin-top: 1rem;
  color: var(--color-text-muted);
  font-size: 0.9rem;
}

.error-msg {
  color: var(--color-error);
  margin-bottom: 1rem;
  font-size: 0.9rem;
}
</style>
|
||||||
18
web/tsconfig.app.json
Normal file
18
web/tsconfig.app.json
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
{
|
||||||
|
"extends": "@vue/tsconfig/tsconfig.dom.json",
|
||||||
|
"include": ["env.d.ts", "src/**/*", "src/**/*.vue"],
|
||||||
|
"exclude": ["src/**/__tests__/*"],
|
||||||
|
"compilerOptions": {
|
||||||
|
// Extra safety for array and object lookups, but may have false positives.
|
||||||
|
"noUncheckedIndexedAccess": true,
|
||||||
|
|
||||||
|
// Path mapping for cleaner imports.
|
||||||
|
"paths": {
|
||||||
|
"@/*": ["./src/*"]
|
||||||
|
},
|
||||||
|
|
||||||
|
// `vue-tsc --build` produces a .tsbuildinfo file for incremental type-checking.
|
||||||
|
// Specified here to keep it out of the root directory.
|
||||||
|
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.app.tsbuildinfo"
|
||||||
|
}
|
||||||
|
}
|
||||||
11
web/tsconfig.json
Normal file
11
web/tsconfig.json
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
{
|
||||||
|
"files": [],
|
||||||
|
"references": [
|
||||||
|
{
|
||||||
|
"path": "./tsconfig.node.json"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"path": "./tsconfig.app.json"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
27
web/tsconfig.node.json
Normal file
27
web/tsconfig.node.json
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
// TSConfig for modules that run in Node.js environment via either transpilation or type-stripping.
|
||||||
|
{
|
||||||
|
"extends": "@tsconfig/node24/tsconfig.json",
|
||||||
|
"include": [
|
||||||
|
"vite.config.*",
|
||||||
|
"vitest.config.*",
|
||||||
|
"cypress.config.*",
|
||||||
|
"playwright.config.*",
|
||||||
|
"eslint.config.*"
|
||||||
|
],
|
||||||
|
"compilerOptions": {
|
||||||
|
// Most tools use transpilation instead of Node.js's native type-stripping.
|
||||||
|
// Bundler mode provides a smoother developer experience.
|
||||||
|
"module": "preserve",
|
||||||
|
"moduleResolution": "bundler",
|
||||||
|
|
||||||
|
// Include Node.js types and avoid accidentally including other `@types/*` packages.
|
||||||
|
"types": ["node"],
|
||||||
|
|
||||||
|
// Disable emitting output during `vue-tsc --build`, which is used for type-checking only.
|
||||||
|
"noEmit": true,
|
||||||
|
|
||||||
|
// `vue-tsc --build` produces a .tsbuildinfo file for incremental type-checking.
|
||||||
|
// Specified here to keep it out of the root directory.
|
||||||
|
"tsBuildInfoFile": "./node_modules/.tmp/tsconfig.node.tsbuildinfo"
|
||||||
|
}
|
||||||
|
}
|
||||||
18
web/vite.config.ts
Normal file
18
web/vite.config.ts
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
import { fileURLToPath, URL } from 'node:url'
|
||||||
|
|
||||||
|
import { defineConfig } from 'vite'
|
||||||
|
import vue from '@vitejs/plugin-vue'
|
||||||
|
import vueDevTools from 'vite-plugin-vue-devtools'
|
||||||
|
|
||||||
|
// Vite configuration for the web front end.
// Reference: https://vite.dev/config/

// Absolute filesystem path of ./src, resolved relative to this config file.
const srcDir = fileURLToPath(new URL('./src', import.meta.url))

export default defineConfig({
  plugins: [vue(), vueDevTools()],
  resolve: {
    // Lets source files import from "@/..." instead of long relative paths.
    alias: { '@': srcDir },
  },
})
|
||||||
Loading…
Reference in a new issue