feat: add database schema and migration runner
This commit is contained in:
parent
3c9598c443
commit
9797e76931
9 changed files with 161 additions and 0 deletions
0
app/__init__.py
Normal file
0
app/__init__.py
Normal file
0
app/api/__init__.py
Normal file
0
app/api/__init__.py
Normal file
32
app/config.py
Normal file
32
app/config.py
Normal file
|
|
@ -0,0 +1,32 @@
|
||||||
|
"""Configuration from environment variables — no file parsing required for basic use."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Directory holding pagepiper's database files; PAGEPIPER_DATA_DIR overrides
# the default relative path "data".
DATA_DIR = Path(os.getenv("PAGEPIPER_DATA_DIR", "data"))

# Database file locations, kept as plain strings.
DB_PATH = str(DATA_DIR.joinpath("pagepiper.db"))
VEC_DB_PATH = str(DATA_DIR.joinpath("pagepiper_vecs.db"))

# Watch directory; PAGEPIPER_WATCH_DIR overrides the default relative path "books".
WATCH_DIR = Path(os.getenv("PAGEPIPER_WATCH_DIR", "books"))
|
||||||
|
|
||||||
|
|
||||||
|
def get_llm_config() -> dict | None:
    """Build the LLMRouter config from environment variables.

    Environment variables read:
        PAGEPIPER_OLLAMA_URL: Server URL. Required; surrounding whitespace
            and trailing slashes are ignored, and a trailing ``/v1`` is
            accepted as-is.
        PAGEPIPER_CHAT_MODEL: Chat model name (default ``"mistral:7b"``).
        PAGEPIPER_EMBED_MODEL: Embedding model name
            (default ``"nomic-embed-text"``).

    Returns:
        ``None`` if PAGEPIPER_OLLAMA_URL is unset or blank. Otherwise a dict
        with a ``"fallback_order"`` list and a ``"backends"`` mapping that
        contains a single ``"ollama"`` backend of type ``"openai_compat"``.
    """
    url = os.environ.get("PAGEPIPER_OLLAMA_URL", "").strip()
    if not url:
        return None

    base_url = url.rstrip("/")
    # A URL that already points at the /v1 endpoint must not become ".../v1/v1".
    if not base_url.endswith("/v1"):
        base_url += "/v1"

    # Treat blank model variables like unset ones, mirroring the URL handling,
    # so an empty export never produces an empty model name.
    chat_model = os.environ.get("PAGEPIPER_CHAT_MODEL", "").strip() or "mistral:7b"
    embed_model = (
        os.environ.get("PAGEPIPER_EMBED_MODEL", "").strip() or "nomic-embed-text"
    )

    return {
        "fallback_order": ["ollama"],
        "backends": {
            "ollama": {
                "type": "openai_compat",
                "base_url": base_url,
                "model": chat_model,
                "embedding_model": embed_model,
                "supports_images": False,
            }
        },
    }
|
||||||
0
app/services/__init__.py
Normal file
0
app/services/__init__.py
Normal file
29
migrations/001_initial_schema.sql
Normal file
29
migrations/001_initial_schema.sql
Normal file
|
|
@ -0,0 +1,29 @@
|
||||||
|
-- pagepiper initial schema
|
||||||
|
-- Run via: conda run -n cf python scripts/db_migrate.py
|
||||||
|
|
||||||
|
-- documents: one row per distinct file_path (enforced by the UNIQUE constraint).
CREATE TABLE IF NOT EXISTS documents (
    -- Generated 32-character lowercase hex id when none is supplied.
    id          TEXT    PRIMARY KEY DEFAULT (lower(hex(randomblob(16)))),
    title       TEXT    NOT NULL,
    file_path   TEXT    NOT NULL UNIQUE,
    -- status: pending | processing | ready | error
    -- (documented here only; no CHECK constraint enforces the set)
    status      TEXT    NOT NULL DEFAULT 'pending',
    task_id     TEXT,
    page_count  INTEGER,
    error_msg   TEXT,
    created_at  TEXT    NOT NULL DEFAULT (datetime('now')),
    -- Set at insert time only; this migration defines no trigger that
    -- refreshes it, so writers must update it explicitly.
    updated_at  TEXT    NOT NULL DEFAULT (datetime('now'))
);
|
||||||
|
|
||||||
|
-- page_chunks: text rows belonging to a document, keyed by page number.
CREATE TABLE IF NOT EXISTS page_chunks (
    -- Generated 32-character lowercase hex id when none is supplied.
    id           TEXT    PRIMARY KEY DEFAULT (lower(hex(randomblob(16)))),
    -- Rows are removed together with their parent document. SQLite enforces
    -- this only on connections that have run PRAGMA foreign_keys = ON.
    doc_id       TEXT    NOT NULL REFERENCES documents(id) ON DELETE CASCADE,
    page_number  INTEGER NOT NULL,
    text         TEXT    NOT NULL DEFAULT '',
    -- source: text_layer | ocr
    source       TEXT    NOT NULL,
    word_count   INTEGER NOT NULL DEFAULT 0,
    created_at   TEXT    NOT NULL DEFAULT (datetime('now'))
);
|
||||||
|
|
||||||
|
-- Index on the owning document of each chunk.
-- NOTE(review): doc_id is also the leading column of idx_page_chunks_doc_page
-- below, so this index may be redundant -- confirm before dropping it.
CREATE INDEX IF NOT EXISTS idx_page_chunks_doc_id
    ON page_chunks (doc_id);

-- Index on (document, page number) pairs.
CREATE INDEX IF NOT EXISTS idx_page_chunks_doc_page
    ON page_chunks (doc_id, page_number);
|
||||||
0
scripts/__init__.py
Normal file
0
scripts/__init__.py
Normal file
30
scripts/db_migrate.py
Normal file
30
scripts/db_migrate.py
Normal file
|
|
@ -0,0 +1,30 @@
|
||||||
|
"""Apply all pending migrations to the pagepiper SQLite database."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from app.config import DB_PATH
|
||||||
|
|
||||||
|
|
||||||
|
def migrate(db_path: str = DB_PATH) -> None:
    """Run every ``migrations/*.sql`` script against the SQLite database.

    Scripts are executed in sorted filename order. No record of applied
    migrations is kept, so each call re-runs every script; migrations must
    therefore be idempotent (e.g. ``CREATE TABLE IF NOT EXISTS``).

    Args:
        db_path: Path of the database file. Missing parent directories are
            created so a fresh checkout can be migrated directly.

    Raises:
        sqlite3.Error: If the database cannot be opened or a script fails.
        OSError: If the parent directory cannot be created or a migration
            file cannot be read.
    """
    migrations_dir = Path(__file__).parent.parent / "migrations"

    # sqlite3.connect() does not create directories; without this a default
    # DB_PATH under a not-yet-existing data directory fails to open.
    if db_path != ":memory:":
        Path(db_path).parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA foreign_keys = ON")
        conn.execute("PRAGMA journal_mode = WAL")

        for sql_file in sorted(migrations_dir.glob("*.sql")):
            print(f"Applying {sql_file.name}...")
            # Explicit encoding so the result does not depend on the locale.
            conn.executescript(sql_file.read_text(encoding="utf-8"))

        conn.commit()
    finally:
        # Release the database handle even when a migration script raises.
        conn.close()

    print(f"Migrations applied to {db_path}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
migrate()
|
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
70
tests/test_db_migrate.py
Normal file
70
tests/test_db_migrate.py
Normal file
|
|
@ -0,0 +1,70 @@
|
||||||
|
"""Tests for migrations/001_initial_schema.sql via scripts/db_migrate.py."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def test_migration_creates_documents_table(tmp_path):
    """The initial schema creates the ``documents`` and ``page_chunks`` tables."""
    # Locate the schema relative to this file so the test does not depend on
    # the directory pytest is started from.
    schema_path = (
        Path(__file__).resolve().parent.parent / "migrations" / "001_initial_schema.sql"
    )
    schema = schema_path.read_text(encoding="utf-8")

    conn = sqlite3.connect(str(tmp_path / "test.db"))
    try:
        conn.executescript(schema)
        conn.commit()
        tables = {
            row[0]
            for row in conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            ).fetchall()
        }
    finally:
        # Close the handle so the temporary database file is released.
        conn.close()

    assert "documents" in tables
    assert "page_chunks" in tables
|
||||||
|
|
||||||
|
|
||||||
|
def test_documents_table_has_required_columns(tmp_path):
    """The ``documents`` table exposes every column the schema declares."""
    # Locate the schema relative to this file so the test does not depend on
    # the directory pytest is started from.
    schema_path = (
        Path(__file__).resolve().parent.parent / "migrations" / "001_initial_schema.sql"
    )
    schema = schema_path.read_text(encoding="utf-8")

    conn = sqlite3.connect(str(tmp_path / "test.db"))
    try:
        conn.executescript(schema)
        # Column 1 of each PRAGMA table_info row is the column name.
        cols = {
            row[1] for row in conn.execute("PRAGMA table_info(documents)").fetchall()
        }
    finally:
        # Close the handle so the temporary database file is released.
        conn.close()

    assert {
        "id",
        "title",
        "file_path",
        "status",
        "task_id",
        "page_count",
        "error_msg",
        "created_at",
        "updated_at",
    } <= cols
|
||||||
|
|
||||||
|
|
||||||
|
def test_page_chunks_foreign_key_cascades(tmp_path):
    """Deleting a document removes its ``page_chunks`` rows via ON DELETE CASCADE."""
    # Locate the schema relative to this file so the test does not depend on
    # the directory pytest is started from.
    schema_path = (
        Path(__file__).resolve().parent.parent / "migrations" / "001_initial_schema.sql"
    )
    schema = schema_path.read_text(encoding="utf-8")

    count_sql = "SELECT COUNT(*) FROM page_chunks WHERE doc_id='d1'"

    conn = sqlite3.connect(str(tmp_path / "test.db"))
    try:
        conn.executescript(schema)
        # Foreign keys are off by default in SQLite; the cascade only fires
        # on a connection that enables them.
        conn.execute("PRAGMA foreign_keys = ON")

        conn.execute(
            "INSERT INTO documents(id, title, file_path, status) VALUES ('d1','Book','path.pdf','ready')"
        )
        conn.execute(
            "INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) VALUES ('d1',1,'text','text_layer',1)"
        )
        conn.commit()

        # Make sure there really is a child row for the cascade to remove.
        before = conn.execute(count_sql).fetchone()[0]

        conn.execute("DELETE FROM documents WHERE id='d1'")
        conn.commit()

        count = conn.execute(count_sql).fetchone()[0]
    finally:
        # Close the handle so the temporary database file is released.
        conn.close()

    assert before == 1
    assert count == 0
|
||||||
Loading…
Reference in a new issue