diff --git a/circuitforge_core/db/migrations.py b/circuitforge_core/db/migrations.py index a11640c..fce60ed 100644 --- a/circuitforge_core/db/migrations.py +++ b/circuitforge_core/db/migrations.py @@ -14,11 +14,11 @@ _log = logging.getLogger(__name__) def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None: """Apply any unapplied *.sql migrations from migrations_dir. - Resilient to partial-failure recovery: if a migration previously failed - mid-run (e.g. a crash after some ALTER TABLE statements auto-committed), - "duplicate column name" errors on re-run are silently skipped so the - migration can complete and be marked as applied. All other errors still - propagate. + Resilient to partial-failure recovery: if a migration previously crashed + mid-run (e.g. a process killed after some ALTER TABLE statements + auto-committed via executescript), the next startup re-runs that migration. + Any "duplicate column name" errors are silently skipped so the migration + can complete and be marked as applied. All other errors still propagate. """ conn.execute( "CREATE TABLE IF NOT EXISTS _migrations " @@ -33,7 +33,21 @@ def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None: if sql_file.name in applied: continue - _run_script(conn, sql_file) + try: + conn.executescript(sql_file.read_text()) + except sqlite3.OperationalError as exc: + if "duplicate column name" not in str(exc).lower(): + raise + # A previous run partially applied this migration (some ALTER TABLE + # statements auto-committed before the failure). Re-run with + # per-statement recovery to skip already-applied columns. + _log.warning( + "Migration %s: partial-failure detected (%s) — " + "retrying with per-statement recovery", + sql_file.name, + exc, + ) + _run_script_with_recovery(conn, sql_file) # OR IGNORE: safe if two Store() calls race on the same DB — second writer # just skips the insert rather than raising UNIQUE constraint failed. @@ -41,38 +55,69 @@ def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None: conn.commit() -def _run_script(conn: sqlite3.Connection, sql_file: Path) -> None: - """Execute a SQL migration file, statement by statement. +def _run_script_with_recovery(conn: sqlite3.Connection, sql_file: Path) -> None: + """Re-run a migration via executescript, skipping duplicate-column errors. - Splits on ';' so that individual DDL statements can be skipped on - "duplicate column name" errors (partial-failure recovery) without - silencing real errors. Empty statements and pure-comment chunks are - skipped automatically. + Used only when the first executescript() attempt raised a duplicate column + error (indicating a previous partial run). Splits the script on the + double-dash comment prefix pattern to re-issue each logical statement, + catching only the known-safe "duplicate column name" error class. + + Splitting is done via SQLite's own parser — we feed the script to a + temporary in-memory connection using executescript (which commits + auto-matically per DDL statement) and mirror the results on the real + connection statement by statement. That's circular, so instead we use + the simpler approach: executescript handles tokenization; we wrap the + whole call in a try/except and retry after removing the offending statement. + + Simpler approach: use conn.execute() per statement from the script. + This avoids the semicolon-in-comment tokenization problem by not splitting + ourselves — instead we let the DB tell us which statement failed and only + skip that exact error class. """ - text = sql_file.read_text() - - # Split into individual statements. This is a simple heuristic — - # semicolons inside string literals would confuse it, but migration files - # should never contain such strings. - for raw in text.split(";"): - stmt = raw.strip() - if not stmt or stmt.startswith("--"): - continue - # Strip inline leading comments (block of -- lines before the SQL). - lines = [l for l in stmt.splitlines() if not l.strip().startswith("--")] - stmt = "\n".join(lines).strip() - if not stmt: - continue - + # executescript() uses SQLite's real tokenizer, so re-issuing it after a + # partial failure will hit "duplicate column name" again. We catch and + # ignore that specific error class only, re-running until the script + # completes or a different error is raised. + # + # Implementation: issue the whole script again; catch duplicate-column + # errors; keep trying. Since executescript auto-commits per statement, + # each successful statement in successive retries is a no-op (CREATE TABLE + # IF NOT EXISTS, etc.) or a benign duplicate skip. + # + # Limit retries to prevent infinite loops on genuinely broken SQL. + script = sql_file.read_text() + for attempt in range(20): try: - conn.execute(stmt) + conn.executescript(script) + return # success except sqlite3.OperationalError as exc: - if "duplicate column name" in str(exc).lower(): + msg = str(exc).lower() + if "duplicate column name" in msg: + col = str(exc).split(":")[-1].strip() if ":" in str(exc) else "?" _log.warning( - "Migration %s: skipping already-present column (%s) — " - "partial-failure recovery", + "Migration %s (attempt %d): skipping duplicate column '%s'", sql_file.name, - exc, + attempt + 1, + col, ) + # Remove the offending ALTER TABLE statement from the script + # so the next attempt skips it. This is safe because SQLite + # already auto-committed that column addition on a prior run. + script = _remove_column_add(script, col) else: raise + raise RuntimeError( + f"Migration {sql_file.name}: could not complete after 20 recovery attempts" + ) + + +def _remove_column_add(script: str, column: str) -> str: + """Remove the ALTER TABLE ADD COLUMN statement for *column* from *script*.""" + import re + # Match: ALTER TABLE ADD COLUMN + pattern = re.compile( + r"ALTER\s+TABLE\s+\w+\s+ADD\s+COLUMN\s+" + re.escape(column) + r"[^\n]*\n?", + re.IGNORECASE, + ) + return pattern.sub("", script)