diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..2938887 --- /dev/null +++ b/app/config.py @@ -0,0 +1,32 @@ +"""Configuration from environment variables — no file parsing required for basic use.""" + +from __future__ import annotations + +import os +from pathlib import Path + +DATA_DIR = Path(os.environ.get("PAGEPIPER_DATA_DIR", "data")) +DB_PATH = str(DATA_DIR / "pagepiper.db") +VEC_DB_PATH = str(DATA_DIR / "pagepiper_vecs.db") +WATCH_DIR = Path(os.environ.get("PAGEPIPER_WATCH_DIR", "books")) + + +def get_llm_config() -> dict | None: + """Build LLMRouter config from env vars. Returns None if PAGEPIPER_OLLAMA_URL is unset.""" + url = os.environ.get("PAGEPIPER_OLLAMA_URL", "").strip() + if not url: + return None + return { + "fallback_order": ["ollama"], + "backends": { + "ollama": { + "type": "openai_compat", + "base_url": url.rstrip("/") + "/v1", + "model": os.environ.get("PAGEPIPER_CHAT_MODEL", "mistral:7b"), + "embedding_model": os.environ.get( + "PAGEPIPER_EMBED_MODEL", "nomic-embed-text" + ), + "supports_images": False, + } + }, + } diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/migrations/001_initial_schema.sql b/migrations/001_initial_schema.sql new file mode 100644 index 0000000..e9675f5 --- /dev/null +++ b/migrations/001_initial_schema.sql @@ -0,0 +1,29 @@ +-- pagepiper initial schema +-- Run via: conda run -n cf python scripts/db_migrate.py + +CREATE TABLE IF NOT EXISTS documents ( + id TEXT PRIMARY KEY DEFAULT (lower(hex(randomblob(16)))), + title TEXT NOT NULL, + file_path TEXT NOT NULL UNIQUE, + status TEXT NOT NULL DEFAULT 'pending', + -- status: pending | processing | ready | error + task_id TEXT, + page_count INTEGER, + error_msg TEXT, + created_at TEXT NOT 
def migrate(db_path: str = DB_PATH) -> None:
    """Run every ``migrations/*.sql`` script against the SQLite database.

    Scripts are executed in lexicographic filename order on every call; nothing
    records which scripts already ran, so each script has to be safe to re-run
    (e.g. ``CREATE TABLE IF NOT EXISTS``).

    Args:
        db_path: Filesystem path of the database file. The file and any
            missing parent directories are created.

    Raises:
        sqlite3.Error: If a PRAGMA or a migration script fails.
        OSError: If the parent directory cannot be created or a script cannot
            be read.
    """
    migrations_dir = Path(__file__).parent.parent / "migrations"

    # sqlite3.connect() creates the file but not its directory; without this a
    # fresh checkout (no data dir yet) fails with "unable to open database file".
    Path(db_path).parent.mkdir(parents=True, exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        conn.execute("PRAGMA foreign_keys = ON")
        conn.execute("PRAGMA journal_mode = WAL")

        for sql_file in sorted(migrations_dir.glob("*.sql")):
            print(f"Applying {sql_file.name}...")
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            conn.executescript(sql_file.read_text(encoding="utf-8"))

        conn.commit()
    finally:
        # Release the file handle even when a script raises.
        conn.close()
    print(f"Migrations applied to {db_path}")


if __name__ == "__main__":
    migrate()
# Resolve the schema relative to this file so the tests pass from any working
# directory, not only when pytest is started at the repository root.
_SCHEMA_PATH = (
    Path(__file__).resolve().parent.parent / "migrations" / "001_initial_schema.sql"
)


def _connect_with_schema(tmp_path):
    """Return a connection to a fresh database with the initial schema applied.

    The caller owns the returned connection and must close it.
    """
    conn = sqlite3.connect(str(tmp_path / "test.db"))
    try:
        conn.executescript(_SCHEMA_PATH.read_text(encoding="utf-8"))
    except Exception:
        conn.close()
        raise
    return conn


def test_migration_creates_documents_table(tmp_path):
    """The schema creates both the documents and page_chunks tables."""
    conn = _connect_with_schema(tmp_path)
    try:
        tables = {
            row[0]
            for row in conn.execute(
                "SELECT name FROM sqlite_master WHERE type='table'"
            )
        }
    finally:
        conn.close()

    assert "documents" in tables
    assert "page_chunks" in tables


def test_documents_table_has_required_columns(tmp_path):
    """documents exposes every column declared by the initial schema."""
    conn = _connect_with_schema(tmp_path)
    try:
        # Column 1 of each table_info row is the column name.
        cols = {row[1] for row in conn.execute("PRAGMA table_info(documents)")}
    finally:
        conn.close()

    assert {
        "id",
        "title",
        "file_path",
        "status",
        "task_id",
        "page_count",
        "error_msg",
        "created_at",
        "updated_at",
    } <= cols


def test_page_chunks_foreign_key_cascades(tmp_path):
    """Deleting a document removes its page_chunks rows via ON DELETE CASCADE."""
    conn = _connect_with_schema(tmp_path)
    try:
        # Foreign keys are off by default per connection, and the PRAGMA is
        # ignored inside an open transaction -- verify it actually took effect.
        conn.execute("PRAGMA foreign_keys = ON")
        assert conn.execute("PRAGMA foreign_keys").fetchone()[0] == 1

        conn.execute(
            "INSERT INTO documents(id, title, file_path, status) VALUES ('d1','Book','path.pdf','ready')"
        )
        conn.execute(
            "INSERT INTO page_chunks(doc_id, page_number, text, source, word_count) VALUES ('d1',1,'text','text_layer',1)"
        )
        conn.commit()

        # Guard against a vacuous pass: the child row must exist before the delete.
        before = conn.execute(
            "SELECT COUNT(*) FROM page_chunks WHERE doc_id='d1'"
        ).fetchone()[0]
        assert before == 1

        conn.execute("DELETE FROM documents WHERE id='d1'")
        conn.commit()

        count = conn.execute(
            "SELECT COUNT(*) FROM page_chunks WHERE doc_id='d1'"
        ).fetchone()[0]
    finally:
        conn.close()

    assert count == 0