From 64554dbef1a72feca8ecd9e3ab08d0f06c2c0651 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Sat, 4 Apr 2026 22:17:42 -0700
Subject: [PATCH] feat(#43): numbered SQL migration runner (Rails-style)

- migrations/001_baseline.sql: full schema baseline (all tables/cols)
- scripts/db_migrate.py: apply sorted *.sql files, track in schema_migrations
- Wired into FastAPI startup and Streamlit app.py startup
- Replaces ad-hoc digest_queue CREATE in _startup()
- 6 tests covering apply, idempotency, partial apply, failure rollback
- docs/developer-guide/contributing.md: migration authoring guide
---
 app/app.py                           |   2 +
 dev-api.py                           |  17 +--
 docs/developer-guide/contributing.md |  17 +++
 migrations/001_baseline.sql          |  97 ++++++++++++++++++
 scripts/db_migrate.py                |  73 +++++++++++++
 tests/test_db_migrate.py             | 148 +++++++++++++++++++++++++++
 6 files changed, 340 insertions(+), 14 deletions(-)
 create mode 100644 migrations/001_baseline.sql
 create mode 100644 scripts/db_migrate.py
 create mode 100644 tests/test_db_migrate.py

diff --git a/app/app.py b/app/app.py
index c61fe4c..efa6e51 100644
--- a/app/app.py
+++ b/app/app.py
@@ -26,6 +26,7 @@ IS_DEMO = os.environ.get("DEMO_MODE", "").lower() in ("1", "true", "yes")
 
 import streamlit as st
 from scripts.db import DEFAULT_DB, init_db, get_active_tasks
+from scripts.db_migrate import migrate_db
 from app.feedback import inject_feedback_button
 from app.cloud_session import resolve_session, get_db_path, get_config_dir, get_cloud_tier
 import sqlite3
@@ -41,6 +42,7 @@ st.set_page_config(
 
 resolve_session("peregrine")
 init_db(get_db_path())
+migrate_db(Path(get_db_path()))
 
 # Demo tier — initialize once per session (cookie persistence handled client-side)
 if IS_DEMO and "simulated_tier" not in st.session_state:
diff --git a/dev-api.py b/dev-api.py
index 7fc57e4..02ff9bb 100644
--- a/dev-api.py
+++ b/dev-api.py
@@ -35,6 +35,7 @@ if str(PEREGRINE_ROOT) not in sys.path:
 
 from circuitforge_core.config.settings import load_env as _load_env  # noqa: E402
 from scripts.credential_store import get_credential, set_credential, delete_credential  # noqa: E402
+from scripts.db_migrate import migrate_db  # noqa: E402
 
 DB_PATH = os.environ.get("STAGING_DB", "/devl/job-seeker/staging.db")
 
@@ -132,23 +133,11 @@ def _strip_html(text: str | None) -> str | None:
 
 @app.on_event("startup")
 def _startup():
-    """Load .env then ensure digest_queue table exists."""
+    """Load .env then run pending SQLite migrations."""
     # Load .env before any runtime env reads — safe because startup doesn't run
     # when dev_api is imported by tests (only when uvicorn actually starts).
     _load_env(PEREGRINE_ROOT / ".env")
-    db = _get_db()
-    try:
-        db.execute("""
-            CREATE TABLE IF NOT EXISTS digest_queue (
-              id             INTEGER PRIMARY KEY,
-              job_contact_id INTEGER NOT NULL REFERENCES job_contacts(id),
-              created_at     TEXT DEFAULT (datetime('now')),
-              UNIQUE(job_contact_id)
-            )
-        """)
-        db.commit()
-    finally:
-        db.close()
+    migrate_db(Path(DB_PATH))
 
 
 # ── Link extraction helpers ───────────────────────────────────────────────
diff --git a/docs/developer-guide/contributing.md b/docs/developer-guide/contributing.md
index d160182..e4d6261 100644
--- a/docs/developer-guide/contributing.md
+++ b/docs/developer-guide/contributing.md
@@ -102,6 +102,23 @@ Before opening a pull request:
 
 ---
 
+## Database Migrations
+
+Peregrine uses a numbered SQL migration system (Rails-style). Each migration is a `.sql` file in the `migrations/` directory at the repo root, named `NNN_description.sql` (e.g. `002_add_foo_column.sql`). Applied migrations are tracked in a `schema_migrations` table in each user database.
+
+### Adding a migration
+
+1. Create `migrations/NNN_description.sql` where `NNN` is the next sequential number (zero-padded to 3 digits).
+2. Write standard SQL — `CREATE TABLE IF NOT EXISTS`, `ALTER TABLE ADD COLUMN`, etc. Keep each migration idempotent where possible.
+3. Do **not** modify `scripts/db.py`'s legacy `_MIGRATIONS` lists — those are superseded and will be removed once all active databases have been bootstrapped by the migration runner.
+4. The runner (`scripts/db_migrate.py`) applies pending migrations at startup automatically (both FastAPI and Streamlit paths call `migrate_db(db_path)`).
+
+### Rollbacks
+
+SQLite cannot roll back every statement type (for example, `VACUUM` cannot run inside a transaction at all), and the runner has no down-migrations. Write forward-only migrations. If you need to undo a schema change, add a new migration that reverses it.
+
+---
+
 ## What NOT to Do
 
 - Do not commit `config/user.yaml`, `config/notion.yaml`, `config/email.yaml`, `config/adzuna.yaml`, or any `config/integrations/*.yaml` — all are gitignored
diff --git a/migrations/001_baseline.sql b/migrations/001_baseline.sql
new file mode 100644
index 0000000..36e3526
--- /dev/null
+++ b/migrations/001_baseline.sql
@@ -0,0 +1,97 @@
+-- Migration 001: Baseline schema
+-- Captures the full schema as of v0.8.5 (all columns including those added via ALTER TABLE)
+
+CREATE TABLE IF NOT EXISTS jobs (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    title TEXT,
+    company TEXT,
+    url TEXT UNIQUE,
+    source TEXT,
+    location TEXT,
+    is_remote INTEGER DEFAULT 0,
+    salary TEXT,
+    description TEXT,
+    match_score REAL,
+    keyword_gaps TEXT,
+    date_found TEXT,
+    status TEXT DEFAULT 'pending',
+    notion_page_id TEXT,
+    cover_letter TEXT,
+    applied_at TEXT,
+    interview_date TEXT,
+    rejection_stage TEXT,
+    phone_screen_at TEXT,
+    interviewing_at TEXT,
+    offer_at TEXT,
+    hired_at TEXT,
+    survey_at TEXT,
+    calendar_event_id TEXT,
+    optimized_resume TEXT,
+    ats_gap_report TEXT
+);
+
+CREATE TABLE IF NOT EXISTS job_contacts (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    job_id INTEGER,
+    direction TEXT,
+    subject TEXT,
+    from_addr TEXT,
+    to_addr TEXT,
+    body TEXT,
+    received_at TEXT,
+    is_response_needed INTEGER DEFAULT 0,
+    responded_at TEXT,
+    message_id TEXT,
+    stage_signal TEXT,
+    suggestion_dismissed INTEGER DEFAULT 0
+);
+
+CREATE TABLE IF NOT EXISTS company_research (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    job_id INTEGER UNIQUE,
+    generated_at TEXT,
+    company_brief TEXT,
+    ceo_brief TEXT,
+    talking_points TEXT,
+    raw_output TEXT,
+    tech_brief TEXT,
+    funding_brief TEXT,
+    competitors_brief TEXT,
+    red_flags TEXT,
+    scrape_used INTEGER DEFAULT 0,
+    accessibility_brief TEXT
+);
+
+CREATE TABLE IF NOT EXISTS background_tasks (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    task_type TEXT,
+    job_id INTEGER,
+    params TEXT,
+    status TEXT DEFAULT 'pending',
+    error TEXT,
+    created_at TEXT,
+    started_at TEXT,
+    finished_at TEXT,
+    stage TEXT,
+    updated_at TEXT
+);
+
+CREATE TABLE IF NOT EXISTS survey_responses (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    job_id INTEGER,
+    survey_name TEXT,
+    received_at TEXT,
+    source TEXT,
+    raw_input TEXT,
+    image_path TEXT,
+    mode TEXT,
+    llm_output TEXT,
+    reported_score REAL,
+    created_at TEXT
+);
+
+CREATE TABLE IF NOT EXISTS digest_queue (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    job_contact_id INTEGER NOT NULL UNIQUE REFERENCES job_contacts(id),
+    created_at TEXT DEFAULT (datetime('now'))
+);
diff --git a/scripts/db_migrate.py b/scripts/db_migrate.py
new file mode 100644
index 0000000..bbb407f
--- /dev/null
+++ b/scripts/db_migrate.py
@@ -0,0 +1,73 @@
+"""
+db_migrate.py — Rails-style numbered SQL migration runner for Peregrine user DBs.
+
+Migration files live in migrations/ (sibling to this script's parent directory),
+named NNN_description.sql (e.g. 001_baseline.sql). They are applied in sorted
+order and tracked in the schema_migrations table so each runs exactly once.
+
+Usage:
+    from scripts.db_migrate import migrate_db
+    migrate_db(Path("/path/to/user.db"))
+"""
+
+import logging
+import sqlite3
+from pathlib import Path
+
+log = logging.getLogger(__name__)
+
+# Resolved at import time: peregrine repo root / migrations/
+_MIGRATIONS_DIR = Path(__file__).parent.parent / "migrations"
+
+_CREATE_MIGRATIONS_TABLE = """
+CREATE TABLE IF NOT EXISTS schema_migrations (
+    version TEXT PRIMARY KEY,
+    applied_at TEXT NOT NULL DEFAULT (datetime('now'))
+)
+"""
+
+
+def migrate_db(db_path: Path) -> list[str]:
+    """Apply pending migrations to the SQLite database at db_path.
+
+    Each *.sql file in _MIGRATIONS_DIR not yet listed in schema_migrations is
+    run, in sorted filename order, inside one transaction together with its
+    tracking row, so a failed migration leaves no partial schema changes.
+    Migration files must therefore not contain their own BEGIN/COMMIT.
+
+    Returns the versions applied by this call; raises RuntimeError on failure.
+    """
+    applied: list[str] = []
+
+    con = sqlite3.connect(db_path)
+    try:
+        con.execute(_CREATE_MIGRATIONS_TABLE)
+        con.commit()
+
+        if not _MIGRATIONS_DIR.is_dir():
+            log.warning("migrations/ directory not found at %s — skipping", _MIGRATIONS_DIR)
+            return applied
+
+        done = {row[0] for row in con.execute("SELECT version FROM schema_migrations")}
+        for path in sorted(_MIGRATIONS_DIR.glob("*.sql")):
+            version = path.stem  # e.g. "001_baseline"
+            if version in done:
+                continue
+
+            sql = path.read_text(encoding="utf-8")
+            log.info("Applying migration %s to %s", version, Path(db_path).name)
+            try:
+                # executescript() autocommits, so BEGIN must be inside the script.
+                con.executescript("BEGIN;\n" + sql)
+                con.execute("INSERT INTO schema_migrations (version) VALUES (?)", (version,))
+                con.commit()
+                applied.append(version)
+                log.info("Migration %s applied successfully", version)
+            except Exception as exc:
+                con.rollback()
+                log.error("Migration %s failed: %s", version, exc)
+                raise RuntimeError(f"Migration {version} failed: {exc}") from exc
+    finally:
+        con.close()
+
+    return applied
diff --git a/tests/test_db_migrate.py b/tests/test_db_migrate.py
new file mode 100644
index 0000000..8da4a24
--- /dev/null
+++ b/tests/test_db_migrate.py
@@ -0,0 +1,148 @@
+"""Tests for scripts/db_migrate.py — numbered SQL migration runner."""
+
+import sqlite3
+import textwrap
+from pathlib import Path
+
+import pytest
+
+from scripts.db_migrate import migrate_db
+
+
+# ── helpers ───────────────────────────────────────────────────────────────────
+
+def _applied(db_path: Path) -> list[str]:
+    """Return the versions recorded in schema_migrations, sorted ascending."""
+    con = sqlite3.connect(db_path)
+    try:
+        return [v for (v,) in con.execute("SELECT version FROM schema_migrations ORDER BY version")]
+    finally:
+        con.close()
+
+
+def _tables(db_path: Path) -> set[str]:
+    """Return the names of all user tables (sqlite_* internals excluded)."""
+    query = "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
+    con = sqlite3.connect(db_path)
+    try:
+        names = {name for (name,) in con.execute(query)}
+    finally:
+        con.close()
+    return names
+
+
+# ── tests ──────────────────────────────────────────────────────────────────────
+
+def test_creates_schema_migrations_table(tmp_path):
+    """With no migration files present, the tracking table is still created."""
+    import scripts.db_migrate as runner
+
+    empty_dir = tmp_path / "migrations"
+    empty_dir.mkdir()
+    target = tmp_path / "test.db"
+    saved_dir, runner._MIGRATIONS_DIR = runner._MIGRATIONS_DIR, empty_dir
+    try:
+        migrate_db(target)
+        assert "schema_migrations" in _tables(target)
+    finally:
+        runner._MIGRATIONS_DIR = saved_dir
+
+
+def test_applies_migration_file(tmp_path):
+    """One pending .sql file is executed and its version is recorded."""
+    import scripts.db_migrate as runner
+
+    ddl = "CREATE TABLE IF NOT EXISTS widgets (id INTEGER PRIMARY KEY, name TEXT);"
+    scripts_dir = tmp_path / "migrations"
+    scripts_dir.mkdir()
+    (scripts_dir / "001_test.sql").write_text(ddl)
+    target = tmp_path / "test.db"
+
+    saved_dir = runner._MIGRATIONS_DIR
+    runner._MIGRATIONS_DIR = scripts_dir
+    try:
+        ran = migrate_db(target)
+        assert ran == ["001_test"]
+        assert _applied(target) == ["001_test"]
+        assert "widgets" in _tables(target)
+    finally:
+        runner._MIGRATIONS_DIR = saved_dir
+
+
+def test_idempotent_second_run(tmp_path):
+    """A second migrate_db call finds nothing pending and applies nothing."""
+    import scripts.db_migrate as runner
+
+    ddl = "CREATE TABLE IF NOT EXISTS widgets (id INTEGER PRIMARY KEY, name TEXT);"
+    scripts_dir = tmp_path / "migrations"
+    scripts_dir.mkdir()
+    (scripts_dir / "001_test.sql").write_text(ddl)
+    target = tmp_path / "test.db"
+
+    saved_dir = runner._MIGRATIONS_DIR
+    runner._MIGRATIONS_DIR = scripts_dir
+    try:
+        migrate_db(target)  # first run applies 001_test
+        rerun = migrate_db(target)
+        assert rerun == []
+        assert _applied(target) == ["001_test"]
+    finally:
+        runner._MIGRATIONS_DIR = saved_dir
+
+
+def test_applies_only_new_migrations(tmp_path):
+    """Only files missing from schema_migrations run on a later invocation."""
+    import scripts.db_migrate as runner
+
+    scripts_dir = tmp_path / "migrations"
+    scripts_dir.mkdir()
+    first_sql = scripts_dir / "001_first.sql"
+    second_sql = scripts_dir / "002_second.sql"
+    first_sql.write_text("CREATE TABLE IF NOT EXISTS first_table (id INTEGER PRIMARY KEY);")
+    target = tmp_path / "test.db"
+
+    saved_dir = runner._MIGRATIONS_DIR
+    runner._MIGRATIONS_DIR = scripts_dir
+    try:
+        migrate_db(target)  # records 001_first
+        # A migration added after the first run is the only one still pending.
+        second_sql.write_text(
+            "CREATE TABLE IF NOT EXISTS second_table (id INTEGER PRIMARY KEY);"
+        )
+        ran = migrate_db(target)
+        assert ran == ["002_second"]
+        assert "second_table" in _tables(target)
+        assert set(_applied(target)) == {"001_first", "002_second"}
+    finally:
+        runner._MIGRATIONS_DIR = saved_dir
+
+
+def test_migration_failure_raises(tmp_path):
+    """Invalid SQL surfaces as RuntimeError and leaves no tracking row."""
+    import scripts.db_migrate as runner
+
+    scripts_dir = tmp_path / "migrations"
+    scripts_dir.mkdir()
+    (scripts_dir / "001_bad.sql").write_text("THIS IS NOT VALID SQL !!!")
+    target = tmp_path / "test.db"
+    saved_dir = runner._MIGRATIONS_DIR
+    runner._MIGRATIONS_DIR = scripts_dir
+    try:
+        with pytest.raises(RuntimeError, match="001_bad"):
+            migrate_db(target)
+        assert _applied(target) == []
+    finally:
+        runner._MIGRATIONS_DIR = saved_dir
+
+
+def test_baseline_migration_runs(tmp_path):
+    """The shipped 001_baseline.sql applies to an empty database without error."""
+    target = tmp_path / "test.db"
+    ran = migrate_db(target)
+    assert "001_baseline" in ran
+
+    required = {
+        "jobs", "job_contacts", "company_research", "background_tasks",
+        "survey_responses", "digest_queue", "schema_migrations",
+    }
+    assert required.issubset(_tables(target))