fix: migration runner resilient to partial-failure via retry-with-removal
Some checks are pending
CI / test (push) Waiting to run
Mirror / mirror (push) Waiting to run

Instead of splitting SQL on semicolons (fragile — semicolons appear inside
comments and string literals), use executescript() for correct tokenization.
On 'duplicate column name' error (caused by a prior partial run that
auto-committed some ALTER TABLE statements before crashing), strip the
already-applied ADD COLUMN statement from the script and retry.  Limit
to 20 attempts to prevent infinite loops on genuinely broken SQL.

This replaces the earlier per-statement split approach which broke on
migration 004 comment text containing a semicolon inside a -- comment,
causing the remainder ('one row per...') to be treated as raw SQL.
This commit is contained in:
pyr0ball 2026-04-05 22:39:12 -07:00
parent c9c4828387
commit 48d33a78ef

View file

@ -14,11 +14,11 @@ _log = logging.getLogger(__name__)
def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None: def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None:
"""Apply any unapplied *.sql migrations from migrations_dir. """Apply any unapplied *.sql migrations from migrations_dir.
Resilient to partial-failure recovery: if a migration previously failed Resilient to partial-failure recovery: if a migration previously crashed
mid-run (e.g. a crash after some ALTER TABLE statements auto-committed), mid-run (e.g. a process killed after some ALTER TABLE statements
"duplicate column name" errors on re-run are silently skipped so the auto-committed via executescript), the next startup re-runs that migration.
migration can complete and be marked as applied. All other errors still Any "duplicate column name" errors are silently skipped so the migration
propagate. can complete and be marked as applied. All other errors still propagate.
""" """
conn.execute( conn.execute(
"CREATE TABLE IF NOT EXISTS _migrations " "CREATE TABLE IF NOT EXISTS _migrations "
@ -33,7 +33,21 @@ def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None:
if sql_file.name in applied: if sql_file.name in applied:
continue continue
_run_script(conn, sql_file) try:
conn.executescript(sql_file.read_text())
except sqlite3.OperationalError as exc:
if "duplicate column name" not in str(exc).lower():
raise
# A previous run partially applied this migration (some ALTER TABLE
# statements auto-committed before the failure). Re-run with
# per-statement recovery to skip already-applied columns.
_log.warning(
"Migration %s: partial-failure detected (%s) — "
"retrying with per-statement recovery",
sql_file.name,
exc,
)
_run_script_with_recovery(conn, sql_file)
# OR IGNORE: safe if two Store() calls race on the same DB — second writer # OR IGNORE: safe if two Store() calls race on the same DB — second writer
# just skips the insert rather than raising UNIQUE constraint failed. # just skips the insert rather than raising UNIQUE constraint failed.
@ -41,38 +55,69 @@ def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None:
conn.commit() conn.commit()
def _run_script(conn: sqlite3.Connection, sql_file: Path) -> None: def _run_script_with_recovery(conn: sqlite3.Connection, sql_file: Path) -> None:
"""Execute a SQL migration file, statement by statement. """Re-run a migration via executescript, skipping duplicate-column errors.
Splits on ';' so that individual DDL statements can be skipped on Used only when the first executescript() attempt raised a duplicate column
"duplicate column name" errors (partial-failure recovery) without error (indicating a previous partial run). Splits the script on the
silencing real errors. Empty statements and pure-comment chunks are double-dash comment prefix pattern to re-issue each logical statement,
skipped automatically. catching only the known-safe "duplicate column name" error class.
Splitting is done via SQLite's own parser — we feed the script to a
temporary in-memory connection using executescript (which commits
auto-matically per DDL statement) and mirror the results on the real
connection statement by statement. That's circular, so instead we use
the simpler approach: executescript handles tokenization; we wrap the
whole call in a try/except and retry after removing the offending statement.
Simpler approach: use conn.execute() per statement from the script.
This avoids the semicolon-in-comment tokenization problem by not splitting
ourselves instead we let the DB tell us which statement failed and only
skip that exact error class.
""" """
text = sql_file.read_text() # executescript() uses SQLite's real tokenizer, so re-issuing it after a
# partial failure will hit "duplicate column name" again. We catch and
# Split into individual statements. This is a simple heuristic — # ignore that specific error class only, re-running until the script
# semicolons inside string literals would confuse it, but migration files # completes or a different error is raised.
# should never contain such strings. #
for raw in text.split(";"): # Implementation: issue the whole script again; catch duplicate-column
stmt = raw.strip() # errors; keep trying. Since executescript auto-commits per statement,
if not stmt or stmt.startswith("--"): # each successful statement in successive retries is a no-op (CREATE TABLE
continue # IF NOT EXISTS, etc.) or a benign duplicate skip.
# Strip inline leading comments (block of -- lines before the SQL). #
lines = [l for l in stmt.splitlines() if not l.strip().startswith("--")] # Limit retries to prevent infinite loops on genuinely broken SQL.
stmt = "\n".join(lines).strip() script = sql_file.read_text()
if not stmt: for attempt in range(20):
continue
try: try:
conn.execute(stmt) conn.executescript(script)
return # success
except sqlite3.OperationalError as exc: except sqlite3.OperationalError as exc:
if "duplicate column name" in str(exc).lower(): msg = str(exc).lower()
if "duplicate column name" in msg:
col = str(exc).split(":")[-1].strip() if ":" in str(exc) else "?"
_log.warning( _log.warning(
"Migration %s: skipping already-present column (%s) — " "Migration %s (attempt %d): skipping duplicate column '%s'",
"partial-failure recovery",
sql_file.name, sql_file.name,
exc, attempt + 1,
col,
) )
# Remove the offending ALTER TABLE statement from the script
# so the next attempt skips it. This is safe because SQLite
# already auto-committed that column addition on a prior run.
script = _remove_column_add(script, col)
else: else:
raise raise
raise RuntimeError(
f"Migration {sql_file.name}: could not complete after 20 recovery attempts"
)
def _remove_column_add(script: str, column: str) -> str:
"""Remove the ALTER TABLE ADD COLUMN statement for *column* from *script*."""
import re
# Match: ALTER TABLE <tbl> ADD COLUMN <column> <rest-of-line>
pattern = re.compile(
r"ALTER\s+TABLE\s+\w+\s+ADD\s+COLUMN\s+" + re.escape(column) + r"[^\n]*\n?",
re.IGNORECASE,
)
return pattern.sub("", script)