feat: add database schema and migration runner

This commit is contained in:
pyr0ball 2026-05-04 17:10:38 -07:00
parent 3c9598c443
commit 9797e76931
9 changed files with 161 additions and 0 deletions

0
app/__init__.py Normal file
View file

0
app/api/__init__.py Normal file
View file

32
app/config.py Normal file
View file

@ -0,0 +1,32 @@
"""Configuration from environment variables — no file parsing required for basic use."""
from __future__ import annotations
import os
from pathlib import Path
# Base directory for the database files; taken from PAGEPIPER_DATA_DIR,
# falling back to the relative path "data".
DATA_DIR = Path(os.getenv("PAGEPIPER_DATA_DIR", "data"))
# Database locations are kept as plain strings rather than Path objects.
DB_PATH = str(DATA_DIR.joinpath("pagepiper.db"))
VEC_DB_PATH = str(DATA_DIR.joinpath("pagepiper_vecs.db"))
# Taken from PAGEPIPER_WATCH_DIR, falling back to the relative path "books".
WATCH_DIR = Path(os.getenv("PAGEPIPER_WATCH_DIR", "books"))
def get_llm_config() -> dict | None:
    """Assemble the LLMRouter configuration from environment variables.

    Reads ``PAGEPIPER_OLLAMA_URL`` (required), ``PAGEPIPER_CHAT_MODEL``
    (default ``mistral:7b``) and ``PAGEPIPER_EMBED_MODEL`` (default
    ``nomic-embed-text``).

    Returns:
        A dict with a single ``ollama`` backend of type ``openai_compat``, or
        ``None`` when ``PAGEPIPER_OLLAMA_URL`` is unset or blank.
    """
    env = os.environ
    base = env.get("PAGEPIPER_OLLAMA_URL", "").strip()
    if base == "":
        return None

    # Trailing slashes are dropped before appending the "/v1" path segment.
    ollama_backend = {
        "type": "openai_compat",
        "base_url": f"{base.rstrip('/')}/v1",
        "model": env.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"),
        "embedding_model": env.get("PAGEPIPER_EMBED_MODEL", "nomic-embed-text"),
        "supports_images": False,
    }
    return {
        "fallback_order": ["ollama"],
        "backends": {"ollama": ollama_backend},
    }

0
app/services/__init__.py Normal file
View file

View file

@ -0,0 +1,29 @@
-- pagepiper initial schema
-- Run via: conda run -n cf python scripts/db_migrate.py
CREATE TABLE IF NOT EXISTS documents (
    -- Defaults to 16 random bytes rendered as 32 lowercase hex characters.
    id          TEXT PRIMARY KEY DEFAULT (lower(hex(randomblob(16)))),
    title       TEXT NOT NULL,
    file_path   TEXT NOT NULL,
    -- status: pending | processing | ready | error
    status      TEXT NOT NULL DEFAULT 'pending',
    task_id     TEXT,
    page_count  INTEGER,
    error_msg   TEXT,
    -- Timestamps default to datetime('now') at insert time. Nothing in this
    -- migration refreshes updated_at afterwards; writers must set it.
    created_at  TEXT NOT NULL DEFAULT (datetime('now')),
    updated_at  TEXT NOT NULL DEFAULT (datetime('now')),
    -- A given file path may be registered only once.
    UNIQUE (file_path)
);
CREATE TABLE IF NOT EXISTS page_chunks (
    -- Defaults to 16 random bytes rendered as 32 lowercase hex characters.
    id           TEXT PRIMARY KEY DEFAULT (lower(hex(randomblob(16)))),
    doc_id       TEXT NOT NULL,
    page_number  INTEGER NOT NULL,
    text         TEXT NOT NULL DEFAULT '',
    -- source: text_layer | ocr
    source       TEXT NOT NULL,
    word_count   INTEGER NOT NULL DEFAULT 0,
    created_at   TEXT NOT NULL DEFAULT (datetime('now')),
    -- Rows are removed together with their parent document. SQLite enforces
    -- this only on connections that have run PRAGMA foreign_keys = ON.
    FOREIGN KEY (doc_id) REFERENCES documents(id) ON DELETE CASCADE
);
-- A single composite index is sufficient: doc_id is its leftmost column, so
-- it serves doc_id-only lookups (including the ON DELETE CASCADE child scan)
-- as well as (doc_id, page_number) lookups. A separate single-column index on
-- doc_id would duplicate that prefix and only add write and storage cost.
CREATE INDEX IF NOT EXISTS idx_page_chunks_doc_page ON page_chunks(doc_id, page_number);

0
scripts/__init__.py Normal file
View file

30
scripts/db_migrate.py Normal file
View file

@ -0,0 +1,30 @@
"""Apply every SQL migration in migrations/ to the pagepiper SQLite database."""
from __future__ import annotations
import sqlite3
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from app.config import DB_PATH
def migrate(db_path: str = DB_PATH) -> None:
    """Execute every ``*.sql`` file in ``migrations/`` against the database.

    Files run in lexicographic filename order. No record of applied
    migrations is kept, so every run re-executes every file; migration
    scripts must therefore be idempotent (e.g. ``CREATE TABLE IF NOT EXISTS``).

    Args:
        db_path: Path of the SQLite database file. Its parent directory is
            created when missing.

    Raises:
        sqlite3.Error: If connecting or executing a migration script fails.
        OSError: If the directory cannot be created or a script cannot be read.
    """
    migrations_dir = Path(__file__).parent.parent / "migrations"

    # sqlite3.connect() creates the database file but not its directory, so a
    # first run with the default "data/" location would otherwise fail.
    # "" and ":memory:" are SQLite's temporary / in-memory databases.
    if db_path not in ("", ":memory:"):
        Path(db_path).parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA foreign_keys = ON")
        conn.execute("PRAGMA journal_mode = WAL")
        for sql_file in sorted(migrations_dir.glob("*.sql")):
            print(f"Applying {sql_file.name}...")
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            conn.executescript(sql_file.read_text(encoding="utf-8"))
        conn.commit()
    finally:
        # Release the file handle even when a migration script raises.
        conn.close()
    print(f"Migrations applied to {db_path}")


if __name__ == "__main__":
    migrate()

0
tests/__init__.py Normal file
View file

70
tests/test_db_migrate.py Normal file
View file

@ -0,0 +1,70 @@
"""Tests for migrations/001_initial_schema.sql, applied directly with sqlite3."""
from __future__ import annotations
import sqlite3
from pathlib import Path
import pytest
def test_migration_creates_documents_table(tmp_path):
    """The initial schema creates the ``documents`` and ``page_chunks`` tables."""
    # Resolve the migration relative to this file so the test passes no matter
    # which working directory pytest is launched from.
    schema_path = (
        Path(__file__).resolve().parent.parent
        / "migrations"
        / "001_initial_schema.sql"
    )
    schema = schema_path.read_text(encoding="utf-8")
    conn = sqlite3.connect(str(tmp_path / "test.db"))
    try:
        conn.executescript(schema)
        conn.commit()
        tables = {
            row[0]
            for row in conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            )
        }
    finally:
        # Close explicitly so the temp database file is not left open.
        conn.close()
    assert "documents" in tables
    assert "page_chunks" in tables
def test_documents_table_has_required_columns(tmp_path):
    """``documents`` exposes at least the columns listed in this test."""
    # Resolve the migration relative to this file so the test passes no matter
    # which working directory pytest is launched from.
    schema_path = (
        Path(__file__).resolve().parent.parent
        / "migrations"
        / "001_initial_schema.sql"
    )
    schema = schema_path.read_text(encoding="utf-8")
    conn = sqlite3.connect(str(tmp_path / "test.db"))
    try:
        conn.executescript(schema)
        # PRAGMA table_info rows are (cid, name, type, ...); index 1 is the name.
        cols = {row[1] for row in conn.execute("PRAGMA table_info(documents)")}
    finally:
        # Close explicitly so the temp database file is not left open.
        conn.close()
    assert {
        "id",
        "title",
        "file_path",
        "status",
        "task_id",
        "page_count",
        "created_at",
    } <= cols
def test_page_chunks_foreign_key_cascades(tmp_path):
    """Deleting a document also deletes its ``page_chunks`` rows."""
    # Resolve the migration relative to this file so the test passes no matter
    # which working directory pytest is launched from.
    schema_path = (
        Path(__file__).resolve().parent.parent
        / "migrations"
        / "001_initial_schema.sql"
    )
    schema = schema_path.read_text(encoding="utf-8")
    conn = sqlite3.connect(str(tmp_path / "test.db"))
    try:
        conn.executescript(schema)
        # Foreign-key enforcement is off by default and is set per connection.
        conn.execute("PRAGMA foreign_keys = ON")
        conn.execute(
            "INSERT INTO documents(id, title, file_path, status) VALUES ('d1','Book','path.pdf','ready')"
        )
        conn.execute(
            "INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) VALUES ('d1',1,'text','text_layer',1)"
        )
        conn.commit()
        conn.execute("DELETE FROM documents WHERE id='d1'")
        conn.commit()
        count = conn.execute(
            "SELECT COUNT(*) FROM page_chunks WHERE doc_id='d1'"
        ).fetchone()[0]
    finally:
        # Close explicitly so the temp database file is not left open.
        conn.close()
    assert count == 0