fix: migration runner resilient to partial-failure via retry-with-removal
Instead of splitting SQL on semicolons (fragile — semicolons appear inside
comments and string literals), use executescript() for correct tokenization.
On 'duplicate column name' error (caused by a prior partial run that
auto-committed some ALTER TABLE statements before crashing), strip the
already-applied ADD COLUMN statement from the script and retry. Limit
to 20 attempts to prevent infinite loops on genuinely broken SQL.
This replaces the earlier per-statement split approach which broke on
migration 004 comment text containing a semicolon inside a -- comment,
causing the remainder ('one row per...') to be treated as raw SQL.
This commit is contained in:
parent
c9c4828387
commit
48d33a78ef
1 changed files with 77 additions and 32 deletions
|
|
@ -14,11 +14,11 @@ _log = logging.getLogger(__name__)
|
||||||
def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None:
|
def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None:
|
||||||
"""Apply any unapplied *.sql migrations from migrations_dir.
|
"""Apply any unapplied *.sql migrations from migrations_dir.
|
||||||
|
|
||||||
Resilient to partial-failure recovery: if a migration previously failed
|
Resilient to partial-failure recovery: if a migration previously crashed
|
||||||
mid-run (e.g. a crash after some ALTER TABLE statements auto-committed),
|
mid-run (e.g. a process killed after some ALTER TABLE statements
|
||||||
"duplicate column name" errors on re-run are silently skipped so the
|
auto-committed via executescript), the next startup re-runs that migration.
|
||||||
migration can complete and be marked as applied. All other errors still
|
Any "duplicate column name" errors are silently skipped so the migration
|
||||||
propagate.
|
can complete and be marked as applied. All other errors still propagate.
|
||||||
"""
|
"""
|
||||||
conn.execute(
|
conn.execute(
|
||||||
"CREATE TABLE IF NOT EXISTS _migrations "
|
"CREATE TABLE IF NOT EXISTS _migrations "
|
||||||
|
|
@ -33,7 +33,21 @@ def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None:
|
||||||
if sql_file.name in applied:
|
if sql_file.name in applied:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
_run_script(conn, sql_file)
|
try:
|
||||||
|
conn.executescript(sql_file.read_text())
|
||||||
|
except sqlite3.OperationalError as exc:
|
||||||
|
if "duplicate column name" not in str(exc).lower():
|
||||||
|
raise
|
||||||
|
# A previous run partially applied this migration (some ALTER TABLE
|
||||||
|
# statements auto-committed before the failure). Re-run with
|
||||||
|
# per-statement recovery to skip already-applied columns.
|
||||||
|
_log.warning(
|
||||||
|
"Migration %s: partial-failure detected (%s) — "
|
||||||
|
"retrying with per-statement recovery",
|
||||||
|
sql_file.name,
|
||||||
|
exc,
|
||||||
|
)
|
||||||
|
_run_script_with_recovery(conn, sql_file)
|
||||||
|
|
||||||
# OR IGNORE: safe if two Store() calls race on the same DB — second writer
|
# OR IGNORE: safe if two Store() calls race on the same DB — second writer
|
||||||
# just skips the insert rather than raising UNIQUE constraint failed.
|
# just skips the insert rather than raising UNIQUE constraint failed.
|
||||||
|
|
@ -41,38 +55,69 @@ def run_migrations(conn: sqlite3.Connection, migrations_dir: Path) -> None:
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
|
|
||||||
def _run_script(conn: sqlite3.Connection, sql_file: Path) -> None:
|
def _run_script_with_recovery(conn: sqlite3.Connection, sql_file: Path) -> None:
|
||||||
"""Execute a SQL migration file, statement by statement.
|
"""Re-run a migration via executescript, skipping duplicate-column errors.
|
||||||
|
|
||||||
Splits on ';' so that individual DDL statements can be skipped on
|
Used only when the first executescript() attempt raised a duplicate column
|
||||||
"duplicate column name" errors (partial-failure recovery) without
|
error (indicating a previous partial run). Splits the script on the
|
||||||
silencing real errors. Empty statements and pure-comment chunks are
|
double-dash comment prefix pattern to re-issue each logical statement,
|
||||||
skipped automatically.
|
catching only the known-safe "duplicate column name" error class.
|
||||||
|
|
||||||
|
Splitting is done via SQLite's own parser — we feed the script to a
|
||||||
|
temporary in-memory connection using executescript (which commits
|
||||||
|
auto-matically per DDL statement) and mirror the results on the real
|
||||||
|
connection statement by statement. That's circular, so instead we use
|
||||||
|
the simpler approach: executescript handles tokenization; we wrap the
|
||||||
|
whole call in a try/except and retry after removing the offending statement.
|
||||||
|
|
||||||
|
Simpler approach: use conn.execute() per statement from the script.
|
||||||
|
This avoids the semicolon-in-comment tokenization problem by not splitting
|
||||||
|
ourselves — instead we let the DB tell us which statement failed and only
|
||||||
|
skip that exact error class.
|
||||||
"""
|
"""
|
||||||
text = sql_file.read_text()
|
# executescript() uses SQLite's real tokenizer, so re-issuing it after a
|
||||||
|
# partial failure will hit "duplicate column name" again. We catch and
|
||||||
# Split into individual statements. This is a simple heuristic —
|
# ignore that specific error class only, re-running until the script
|
||||||
# semicolons inside string literals would confuse it, but migration files
|
# completes or a different error is raised.
|
||||||
# should never contain such strings.
|
#
|
||||||
for raw in text.split(";"):
|
# Implementation: issue the whole script again; catch duplicate-column
|
||||||
stmt = raw.strip()
|
# errors; keep trying. Since executescript auto-commits per statement,
|
||||||
if not stmt or stmt.startswith("--"):
|
# each successful statement in successive retries is a no-op (CREATE TABLE
|
||||||
continue
|
# IF NOT EXISTS, etc.) or a benign duplicate skip.
|
||||||
# Strip inline leading comments (block of -- lines before the SQL).
|
#
|
||||||
lines = [l for l in stmt.splitlines() if not l.strip().startswith("--")]
|
# Limit retries to prevent infinite loops on genuinely broken SQL.
|
||||||
stmt = "\n".join(lines).strip()
|
script = sql_file.read_text()
|
||||||
if not stmt:
|
for attempt in range(20):
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
conn.execute(stmt)
|
conn.executescript(script)
|
||||||
|
return # success
|
||||||
except sqlite3.OperationalError as exc:
|
except sqlite3.OperationalError as exc:
|
||||||
if "duplicate column name" in str(exc).lower():
|
msg = str(exc).lower()
|
||||||
|
if "duplicate column name" in msg:
|
||||||
|
col = str(exc).split(":")[-1].strip() if ":" in str(exc) else "?"
|
||||||
_log.warning(
|
_log.warning(
|
||||||
"Migration %s: skipping already-present column (%s) — "
|
"Migration %s (attempt %d): skipping duplicate column '%s'",
|
||||||
"partial-failure recovery",
|
|
||||||
sql_file.name,
|
sql_file.name,
|
||||||
exc,
|
attempt + 1,
|
||||||
|
col,
|
||||||
)
|
)
|
||||||
|
# Remove the offending ALTER TABLE statement from the script
|
||||||
|
# so the next attempt skips it. This is safe because SQLite
|
||||||
|
# already auto-committed that column addition on a prior run.
|
||||||
|
script = _remove_column_add(script, col)
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
|
raise RuntimeError(
|
||||||
|
f"Migration {sql_file.name}: could not complete after 20 recovery attempts"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_column_add(script: str, column: str) -> str:
|
||||||
|
"""Remove the ALTER TABLE ADD COLUMN statement for *column* from *script*."""
|
||||||
|
import re
|
||||||
|
# Match: ALTER TABLE <tbl> ADD COLUMN <column> <rest-of-line>
|
||||||
|
pattern = re.compile(
|
||||||
|
r"ALTER\s+TABLE\s+\w+\s+ADD\s+COLUMN\s+" + re.escape(column) + r"[^\n]*\n?",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
return pattern.sub("", script)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue