feat(#43): numbered SQL migration runner (Rails-style)

- migrations/001_baseline.sql: full schema baseline (all tables/cols)
- scripts/db_migrate.py: apply sorted *.sql files, track in schema_migrations
- Wired into FastAPI startup and Streamlit app.py startup
- Replaces ad-hoc digest_queue CREATE in _startup()
- 6 tests covering apply, idempotency, partial apply, failure rollback
- docs/developer-guide/contributing.md: migration authoring guide
2026-04-04 22:17:42 -07:00 · 2026-04-04 22:17:42 -07:00 · 64554dbef1
commit 64554dbef1
parent 065c02feb7
6 changed files with 340 additions and 14 deletions
--- a/app/app.py
+++ b/app/app.py
@ -26,6 +26,7 @@ IS_DEMO = os.environ.get("DEMO_MODE", "").lower() in ("1", "true", "yes")
 import streamlit as st
 from scripts.db import DEFAULT_DB, init_db, get_active_tasks
 from scripts.db_migrate import migrate_db
 from app.feedback import inject_feedback_button
 from app.cloud_session import resolve_session, get_db_path, get_config_dir, get_cloud_tier
 import sqlite3
@ -41,6 +42,7 @@ st.set_page_config(
 resolve_session("peregrine")
 init_db(get_db_path())
 migrate_db(Path(get_db_path()))
 # Demo tier — initialize once per session (cookie persistence handled client-side)
 if IS_DEMO and "simulated_tier" not in st.session_state:
--- a/dev-api.py
+++ b/dev-api.py
@ -35,6 +35,7 @@ if str(PEREGRINE_ROOT) not in sys.path:
 from circuitforge_core.config.settings import load_env as _load_env  # noqa: E402
 from scripts.credential_store import get_credential, set_credential, delete_credential  # noqa: E402
 from scripts.db_migrate import migrate_db  # noqa: E402
 DB_PATH = os.environ.get("STAGING_DB", "/devl/job-seeker/staging.db")
@ -132,23 +133,11 @@ def _strip_html(text: str | None) -> str | None:
@app.on_event("startup")
 def _startup():
-    """Load .env then ensure digest_queue table exists."""
+    """Load .env then run pending SQLite migrations."""
    # Load .env before any runtime env reads — safe because startup doesn't run
    # when dev_api is imported by tests (only when uvicorn actually starts).
    _load_env(PEREGRINE_ROOT / ".env")
-    db = _get_db()
+    migrate_db(Path(DB_PATH))
-    try:
-        db.execute("""
-            CREATE TABLE IF NOT EXISTS digest_queue (
-              id             INTEGER PRIMARY KEY,
-              job_contact_id INTEGER NOT NULL REFERENCES job_contacts(id),
-              created_at     TEXT DEFAULT (datetime('now')),
-              UNIQUE(job_contact_id)
-            )
-        """)
-        db.commit()
-    finally:
-        db.close()
 # ── Link extraction helpers ───────────────────────────────────────────────
--- a/docs/developer-guide/contributing.md
+++ b/docs/developer-guide/contributing.md
@ -102,6 +102,23 @@ Before opening a pull request:
 ---
 ## Database Migrations
 Peregrine uses a numbered SQL migration system (Rails-style). Each migration is a `.sql` file in the `migrations/` directory at the repo root, named `NNN_description.sql` (e.g. `002_add_foo_column.sql`). Applied migrations are tracked in a `schema_migrations` table in each user database.
 ### Adding a migration
 1. Create `migrations/NNN_description.sql` where `NNN` is the next sequential number (zero-padded to 3 digits).
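 2. Write standard SQL — `CREATE TABLE IF NOT EXISTS`, `ALTER TABLE ADD COLUMN`, etc. Keep each migration idempotent where possible; note that SQLite's `ALTER TABLE ... ADD COLUMN` has no `IF NOT EXISTS` form, so it fails if the column already exists.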
 3. Do **not** modify `scripts/db.py`'s legacy `_MIGRATIONS` lists — those are superseded and will be removed once all active databases have been bootstrapped by the migration runner.
 4. The runner (`scripts/db_migrate.py`) applies pending migrations at startup automatically (both FastAPI and Streamlit paths call `migrate_db(db_path)`).
 ### Rollbacks
 SQLite does not support transactional DDL for all statement types. Write forward-only migrations. If you need to undo a schema change, add a new migration that reverses it.
 ---
 ## What NOT to Do
 - Do not commit `config/user.yaml`, `config/notion.yaml`, `config/email.yaml`, `config/adzuna.yaml`, or any `config/integrations/*.yaml` — all are gitignored
--- a/migrations/001_baseline.sql
+++ b/migrations/001_baseline.sql
@ -0,0 +1,97 @@
 -- Migration 001: Baseline schema
 -- Captures the full schema as of v0.8.5 (all columns including those added via ALTER TABLE)
 -- jobs: created only if absent, so an existing table is left untouched.
 -- `url` is UNIQUE (a given URL can be stored at most once); `is_remote`
 -- defaults to 0 and `status` to 'pending'. Every other column is nullable
 -- with no default.
 -- NOTE(review): the *_at / date columns are plain TEXT — presumably ISO-8601
 -- strings; confirm against the code that writes them.
 CREATE TABLE IF NOT EXISTS jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT,
    company TEXT,
    url TEXT UNIQUE,
    source TEXT,
    location TEXT,
    is_remote INTEGER DEFAULT 0,
    salary TEXT,
    description TEXT,
    match_score REAL,
    keyword_gaps TEXT,
    date_found TEXT,
    status TEXT DEFAULT 'pending',
    notion_page_id TEXT,
    cover_letter TEXT,
    applied_at TEXT,
    interview_date TEXT,
    rejection_stage TEXT,
    phone_screen_at TEXT,
    interviewing_at TEXT,
    offer_at TEXT,
    hired_at TEXT,
    survey_at TEXT,
    calendar_event_id TEXT,
    optimized_resume TEXT,
    ats_gap_report TEXT
 );
 -- job_contacts: created only if absent.
 -- `is_response_needed` and `suggestion_dismissed` default to 0; all other
 -- columns are nullable with no default.
 -- No foreign-key constraint is declared on `job_id`.
 -- NOTE(review): `job_id` presumably points at jobs.id — confirm against callers.
 CREATE TABLE IF NOT EXISTS job_contacts (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id INTEGER,
    direction TEXT,
    subject TEXT,
    from_addr TEXT,
    to_addr TEXT,
    body TEXT,
    received_at TEXT,
    is_response_needed INTEGER DEFAULT 0,
    responded_at TEXT,
    message_id TEXT,
    stage_signal TEXT,
    suggestion_dismissed INTEGER DEFAULT 0
 );
 -- company_research: created only if absent.
 -- `job_id` is UNIQUE, so at most one row can exist per non-NULL job_id.
 -- `scrape_used` defaults to 0; the remaining columns are nullable TEXT with
 -- no default. No foreign-key constraint is declared on `job_id`.
 CREATE TABLE IF NOT EXISTS company_research (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id INTEGER UNIQUE,
    generated_at TEXT,
    company_brief TEXT,
    ceo_brief TEXT,
    talking_points TEXT,
    raw_output TEXT,
    tech_brief TEXT,
    funding_brief TEXT,
    competitors_brief TEXT,
    red_flags TEXT,
    scrape_used INTEGER DEFAULT 0,
    accessibility_brief TEXT
 );
 -- background_tasks: created only if absent.
 -- `status` defaults to 'pending'; every other column is nullable with no
 -- default (including the timestamp columns, which the writer must supply).
 -- NOTE(review): `params` is TEXT — presumably a serialized payload; confirm
 -- the format against the task runner.
 CREATE TABLE IF NOT EXISTS background_tasks (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    task_type TEXT,
    job_id INTEGER,
    params TEXT,
    status TEXT DEFAULT 'pending',
    error TEXT,
    created_at TEXT,
    started_at TEXT,
    finished_at TEXT,
    stage TEXT,
    updated_at TEXT
 );
 -- survey_responses: created only if absent.
 -- Apart from the auto-incrementing primary key, no column carries a
 -- constraint or default; `created_at` must be supplied by the writer.
 CREATE TABLE IF NOT EXISTS survey_responses (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id INTEGER,
    survey_name TEXT,
    received_at TEXT,
    source TEXT,
    raw_input TEXT,
    image_path TEXT,
    mode TEXT,
    llm_output TEXT,
    reported_score REAL,
    created_at TEXT
 );
 -- digest_queue: created only if absent.
 -- Mirrors the constraints of the ad-hoc CREATE that used to run at FastAPI
 -- startup: job_contact_id is required, unique, and references job_contacts(id);
 -- created_at is filled in with the current UTC timestamp when the writer omits it.
 CREATE TABLE IF NOT EXISTS digest_queue (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_contact_id INTEGER NOT NULL UNIQUE REFERENCES job_contacts(id),
    created_at TEXT DEFAULT (datetime('now'))
 );
--- a/scripts/db_migrate.py
+++ b/scripts/db_migrate.py
@ -0,0 +1,73 @@
 """
 db_migrate.py — Rails-style numbered SQL migration runner for Peregrine user DBs.
 Migration files live in migrations/ (sibling to this script's parent directory),
 named NNN_description.sql (e.g. 001_baseline.sql). They are applied in sorted
 order and tracked in the schema_migrations table so each runs exactly once.
 Usage:
    from scripts.db_migrate import migrate_db
    migrate_db(Path("/path/to/user.db"))
 """
 import logging
 import sqlite3
 from pathlib import Path
 log = logging.getLogger(__name__)
 # Resolved at import time: peregrine repo root / migrations/
 _MIGRATIONS_DIR = Path(__file__).parent.parent / "migrations"
 # Tracking table: one row per applied migration, keyed by the file stem.
 _CREATE_MIGRATIONS_TABLE = """
 CREATE TABLE IF NOT EXISTS schema_migrations (
    version TEXT PRIMARY KEY,
    applied_at TEXT NOT NULL DEFAULT (datetime('now'))
 )
 """
 def migrate_db(db_path: Path) -> list[str]:
    """Apply any pending migrations to db_path and return the applied versions.

    Every ``*.sql`` file in ``_MIGRATIONS_DIR`` whose stem is not yet recorded
    in ``schema_migrations`` is executed in sorted filename order. A migration
    and its ``schema_migrations`` row are committed in one transaction, so a
    failing migration leaves neither partial schema changes nor a version
    record behind. Migrations committed earlier in the same call stay applied.

    Migration files must not contain their own ``BEGIN``/``COMMIT``: the
    runner opens the transaction itself and SQLite rejects a nested ``BEGIN``.

    Args:
        db_path: Location of the SQLite database (created if missing). A plain
            string path is accepted as well.

    Returns:
        The versions (file stems) applied by this call, in application order.
        Empty when nothing was pending or the migrations directory is missing.

    Raises:
        RuntimeError: If a migration fails; the original error is chained.
    """
    # Coerce so callers passing a str do not crash on db_path.name below.
    db_path = Path(db_path)
    applied: list[str] = []
    con = sqlite3.connect(db_path)
    try:
        con.execute(_CREATE_MIGRATIONS_TABLE)
        con.commit()
        if not _MIGRATIONS_DIR.is_dir():
            log.warning("migrations/ directory not found at %s — skipping", _MIGRATIONS_DIR)
            return applied
        migration_files = sorted(_MIGRATIONS_DIR.glob("*.sql"))
        if not migration_files:
            return applied
        already_applied = {
            row[0] for row in con.execute("SELECT version FROM schema_migrations")
        }
        for path in migration_files:
            version = path.stem  # e.g. "001_baseline"
            if version in already_applied:
                continue
            sql = path.read_text(encoding="utf-8")
            log.info("Applying migration %s to %s", version, db_path.name)
            try:
                # executescript() performs no transaction control of its own
                # (and commits any pending transaction first), so the BEGIN
                # has to be part of the script for rollback() to undo a
                # half-applied migration.
                con.executescript(f"BEGIN;\n{sql}")
                # Still inside the transaction opened above: the version row
                # is committed together with the schema changes.
                con.execute(
                    "INSERT INTO schema_migrations (version) VALUES (?)", (version,)
                )
                con.commit()
            except Exception as exc:
                con.rollback()
                log.error("Migration %s failed: %s", version, exc)
                raise RuntimeError(f"Migration {version} failed: {exc}") from exc
            applied.append(version)
            log.info("Migration %s applied successfully", version)
    finally:
        con.close()
    return applied
--- a/tests/test_db_migrate.py
+++ b/tests/test_db_migrate.py
@ -0,0 +1,148 @@
 """Tests for scripts/db_migrate.py — numbered SQL migration runner."""
 import sqlite3
 import textwrap
 from pathlib import Path
 import pytest
 from scripts.db_migrate import migrate_db
 # ── helpers ───────────────────────────────────────────────────────────────────
 def _applied(db_path: Path) -> list[str]:
    """Return the versions recorded in schema_migrations, sorted ascending."""
    query = "SELECT version FROM schema_migrations ORDER BY version"
    conn = sqlite3.connect(db_path)
    try:
        # Each row is a 1-tuple; unpack it straight into the result list.
        return [version for (version,) in conn.execute(query).fetchall()]
    finally:
        conn.close()
 def _tables(db_path: Path) -> set[str]:
    """Return the names of all tables in the database, minus SQLite's internal ones."""
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
        )
        names = set()
        for (table_name,) in cursor.fetchall():
            names.add(table_name)
        return names
    finally:
        conn.close()
 # ── tests ──────────────────────────────────────────────────────────────────────
 def test_creates_schema_migrations_table(tmp_path, monkeypatch):
    """Running against an empty DB creates the tracking table."""
    db = tmp_path / "test.db"
    empty_dir = tmp_path / "migrations"
    empty_dir.mkdir()  # empty migrations dir
    # Point the runner at the temp directory; monkeypatch restores the real
    # module-level _MIGRATIONS_DIR when the test finishes, pass or fail.
    monkeypatch.setattr("scripts.db_migrate._MIGRATIONS_DIR", empty_dir)
    migrate_db(db)
    assert "schema_migrations" in _tables(db)
 def test_applies_migration_file(tmp_path, monkeypatch):
    """A .sql file in migrations/ is applied and recorded."""
    db = tmp_path / "test.db"
    mdir = tmp_path / "migrations"
    mdir.mkdir()
    # Written as UTF-8 because that is how the runner reads migration files.
    (mdir / "001_test.sql").write_text(
        "CREATE TABLE IF NOT EXISTS widgets (id INTEGER PRIMARY KEY, name TEXT);",
        encoding="utf-8",
    )
    # monkeypatch restores the real _MIGRATIONS_DIR at teardown.
    monkeypatch.setattr("scripts.db_migrate._MIGRATIONS_DIR", mdir)
    applied = migrate_db(db)
    assert applied == ["001_test"]
    assert "widgets" in _tables(db)
    assert _applied(db) == ["001_test"]
 def test_idempotent_second_run(tmp_path, monkeypatch):
    """Running migrate_db twice does not re-apply migrations."""
    db = tmp_path / "test.db"
    mdir = tmp_path / "migrations"
    mdir.mkdir()
    (mdir / "001_test.sql").write_text(
        "CREATE TABLE IF NOT EXISTS widgets (id INTEGER PRIMARY KEY, name TEXT);",
        encoding="utf-8",
    )
    # monkeypatch restores the real _MIGRATIONS_DIR at teardown.
    monkeypatch.setattr("scripts.db_migrate._MIGRATIONS_DIR", mdir)
    migrate_db(db)
    applied = migrate_db(db)  # second run finds nothing pending
    assert applied == []
    assert _applied(db) == ["001_test"]
 def test_applies_only_new_migrations(tmp_path, monkeypatch):
    """Migrations already in schema_migrations are skipped; only new ones run."""
    db = tmp_path / "test.db"
    mdir = tmp_path / "migrations"
    mdir.mkdir()
    (mdir / "001_first.sql").write_text(
        "CREATE TABLE IF NOT EXISTS first_table (id INTEGER PRIMARY KEY);",
        encoding="utf-8",
    )
    # monkeypatch restores the real _MIGRATIONS_DIR at teardown.
    monkeypatch.setattr("scripts.db_migrate._MIGRATIONS_DIR", mdir)
    migrate_db(db)
    # Add a second migration after the first has already been recorded.
    (mdir / "002_second.sql").write_text(
        "CREATE TABLE IF NOT EXISTS second_table (id INTEGER PRIMARY KEY);",
        encoding="utf-8",
    )
    applied = migrate_db(db)
    assert applied == ["002_second"]
    assert set(_applied(db)) == {"001_first", "002_second"}
    assert "second_table" in _tables(db)
 def test_migration_failure_raises(tmp_path, monkeypatch):
    """A bad migration raises RuntimeError and does not record the version."""
    db = tmp_path / "test.db"
    mdir = tmp_path / "migrations"
    mdir.mkdir()
    (mdir / "001_bad.sql").write_text("THIS IS NOT VALID SQL !!!", encoding="utf-8")
    # monkeypatch restores the real _MIGRATIONS_DIR at teardown.
    monkeypatch.setattr("scripts.db_migrate._MIGRATIONS_DIR", mdir)
    with pytest.raises(RuntimeError, match="001_bad"):
        migrate_db(db)
    # The tracking table exists (it is created before any migration runs)
    # but must not contain the failed version.
    assert _applied(db) == []
 def test_baseline_migration_runs(tmp_path):
    """The repository's own 001_baseline.sql applies cleanly to a brand-new database."""
    fresh_db = tmp_path / "test.db"
    # No patching here: this exercises the real migrations/ directory.
    versions = migrate_db(fresh_db)
    assert "001_baseline" in versions
    required = {
        "schema_migrations",
        "jobs",
        "job_contacts",
        "company_research",
        "background_tasks",
        "survey_responses",
        "digest_queue",
    }
    assert required.issubset(_tables(fresh_db))