fix: migration runner resilient to partial-failure via retry-with-removal
Instead of splitting SQL on semicolons (fragile — semicolons appear inside
comments and string literals), use executescript() for correct tokenization.
On 'duplicate column name' error (caused by a prior partial run that
auto-committed some ALTER TABLE statements before crashing), strip the
already-applied ADD COLUMN statement from the script and retry. Limit
to 20 attempts to prevent infinite loops on genuinely broken SQL.
This replaces the earlier per-statement split approach which broke on
migration 004 comment text containing a semicolon inside a -- comment,
causing the remainder ('one row per...') to be treated as raw SQL.
This commit is contained in:
parent
c9c4828387
commit
48d33a78ef
1 changed files with 77 additions and 32 deletions
|
|
@ -14,11 +14,11 @@ _log = logging.getLogger(__name__)
|
|||
def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None:
|
||||
"""Apply any unapplied *.sql migrations from migrations_dir.
|
||||
|
||||
Resilient to partial-failure recovery: if a migration previously failed
|
||||
mid-run (e.g. a crash after some ALTER TABLE statements auto-committed),
|
||||
"duplicate column name" errors on re-run are silently skipped so the
|
||||
migration can complete and be marked as applied. All other errors still
|
||||
propagate.
|
||||
Resilient to partial-failure recovery: if a migration previously crashed
|
||||
mid-run (e.g. a process killed after some ALTER TABLE statements
|
||||
auto-committed via executescript), the next startup re-runs that migration.
|
||||
Any "duplicate column name" errors are silently skipped so the migration
|
||||
can complete and be marked as applied. All other errors still propagate.
|
||||
"""
|
||||
conn.execute(
|
||||
"CREATE TABLE IF NOT EXISTS _migrations "
|
||||
|
|
@ -33,7 +33,21 @@ def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None:
|
|||
if sql_file.name in applied:
|
||||
continue
|
||||
|
||||
_run_script(conn, sql_file)
|
||||
try:
|
||||
conn.executescript(sql_file.read_text())
|
||||
except sqlite3.OperationalError as exc:
|
||||
if "duplicate column name" not in str(exc).lower():
|
||||
raise
|
||||
# A previous run partially applied this migration (some ALTER TABLE
|
||||
# statements auto-committed before the failure). Re-run with
|
||||
# per-statement recovery to skip already-applied columns.
|
||||
_log.warning(
|
||||
"Migration %s: partial-failure detected (%s) — "
|
||||
"retrying with per-statement recovery",
|
||||
sql_file.name,
|
||||
exc,
|
||||
)
|
||||
_run_script_with_recovery(conn, sql_file)
|
||||
|
||||
# OR IGNORE: safe if two Store() calls race on the same DB — second writer
|
||||
# just skips the insert rather than raising UNIQUE constraint failed.
|
||||
|
|
@ -41,38 +55,69 @@ def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None:
|
|||
conn.commit()
|
||||
|
||||
|
||||
def _run_script(conn: sqlite3.Connection, sql_file: Path) -> None:
|
||||
"""Execute a SQL migration file, statement by statement.
|
||||
def _run_script_with_recovery(conn: sqlite3.Connection, sql_file: Path) -> None:
|
||||
"""Re-run a migration via executescript, skipping duplicate-column errors.
|
||||
|
||||
Splits on ';' so that individual DDL statements can be skipped on
|
||||
"duplicate column name" errors (partial-failure recovery) without
|
||||
silencing real errors. Empty statements and pure-comment chunks are
|
||||
skipped automatically.
|
||||
Used only when the first executescript() attempt raised a duplicate column
|
||||
error (indicating a previous partial run). Splits the script on the
|
||||
double-dash comment prefix pattern to re-issue each logical statement,
|
||||
catching only the known-safe "duplicate column name" error class.
|
||||
|
||||
Splitting is done via SQLite's own parser — we feed the script to a
|
||||
temporary in-memory connection using executescript (which commits
|
||||
auto-matically per DDL statement) and mirror the results on the real
|
||||
connection statement by statement. That's circular, so instead we use
|
||||
the simpler approach: executescript handles tokenization; we wrap the
|
||||
whole call in a try/except and retry after removing the offending statement.
|
||||
|
||||
Simpler approach: use conn.execute() per statement from the script.
|
||||
This avoids the semicolon-in-comment tokenization problem by not splitting
|
||||
ourselves — instead we let the DB tell us which statement failed and only
|
||||
skip that exact error class.
|
||||
"""
|
||||
text = sql_file.read_text()
|
||||
|
||||
# Split into individual statements. This is a simple heuristic —
|
||||
# semicolons inside string literals would confuse it, but migration files
|
||||
# should never contain such strings.
|
||||
for raw in text.split(";"):
|
||||
stmt = raw.strip()
|
||||
if not stmt or stmt.startswith("--"):
|
||||
continue
|
||||
# Strip inline leading comments (block of -- lines before the SQL).
|
||||
lines = [l for l in stmt.splitlines() if not l.strip().startswith("--")]
|
||||
stmt = "\n".join(lines).strip()
|
||||
if not stmt:
|
||||
continue
|
||||
|
||||
# executescript() uses SQLite's real tokenizer, so re-issuing it after a
|
||||
# partial failure will hit "duplicate column name" again. We catch and
|
||||
# ignore that specific error class only, re-running until the script
|
||||
# completes or a different error is raised.
|
||||
#
|
||||
# Implementation: issue the whole script again; catch duplicate-column
|
||||
# errors; keep trying. Since executescript auto-commits per statement,
|
||||
# each successful statement in successive retries is a no-op (CREATE TABLE
|
||||
# IF NOT EXISTS, etc.) or a benign duplicate skip.
|
||||
#
|
||||
# Limit retries to prevent infinite loops on genuinely broken SQL.
|
||||
script = sql_file.read_text()
|
||||
for attempt in range(20):
|
||||
try:
|
||||
conn.execute(stmt)
|
||||
conn.executescript(script)
|
||||
return # success
|
||||
except sqlite3.OperationalError as exc:
|
||||
if "duplicate column name" in str(exc).lower():
|
||||
msg = str(exc).lower()
|
||||
if "duplicate column name" in msg:
|
||||
col = str(exc).split(":")[-1].strip() if ":" in str(exc) else "?"
|
||||
_log.warning(
|
||||
"Migration %s: skipping already-present column (%s) — "
|
||||
"partial-failure recovery",
|
||||
"Migration %s (attempt %d): skipping duplicate column '%s'",
|
||||
sql_file.name,
|
||||
exc,
|
||||
attempt + 1,
|
||||
col,
|
||||
)
|
||||
# Remove the offending ALTER TABLE statement from the script
|
||||
# so the next attempt skips it. This is safe because SQLite
|
||||
# already auto-committed that column addition on a prior run.
|
||||
script = _remove_column_add(script, col)
|
||||
else:
|
||||
raise
|
||||
raise RuntimeError(
|
||||
f"Migration {sql_file.name}: could not complete after 20 recovery attempts"
|
||||
)
|
||||
|
||||
|
||||
def _remove_column_add(script: str, column: str) -> str:
|
||||
"""Remove the ALTER TABLE ADD COLUMN statement for *column* from *script*."""
|
||||
import re
|
||||
# Match: ALTER TABLE <tbl> ADD COLUMN <column> <rest-of-line>
|
||||
pattern = re.compile(
|
||||
r"ALTER\s+TABLE\s+\w+\s+ADD\s+COLUMN\s+" + re.escape(column) + r"[^\n]*\n?",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
return pattern.sub("", script)
|
||||
|
|
|
|||
Loading…
Reference in a new issue