feat: backup/restore script with multi-instance and legacy support

- create_backup() / restore_backup() / list_backup_contents() public API
- --base-dir PATH flag: targets any instance root (default: this repo)
  --base-dir /devl/job-seeker backs up the legacy Conda install
- _DB_CANDIDATES fallback: data/staging.db (Peregrine) or staging.db at the repo root (legacy)
- Manifest records source label (dir name), source_path, created_at, files, includes_db
- Added config/resume_keywords.yaml and config/server.yaml to backup lists
- 21 tests covering create, list, restore, legacy DB path, overwrite, roundtrip
This commit is contained in:
pyr0ball 2026-03-04 10:52:51 -08:00
parent 0d6cf58271
commit 78b4df79d4
2 changed files with 508 additions and 0 deletions

277
scripts/backup.py Normal file
View file

@ -0,0 +1,277 @@
"""Config backup / restore / teleport for Peregrine.
Creates a portable zip of all gitignored configs + optionally the staging DB.
Intended for: machine migrations, Docker volume transfers, and safe wizard testing.
Supports both the Peregrine Docker instance and the legacy /devl/job-seeker install.
Usage (CLI):
conda run -n job-seeker python scripts/backup.py --create backup.zip
conda run -n job-seeker python scripts/backup.py --create backup.zip --no-db
conda run -n job-seeker python scripts/backup.py --create backup.zip --base-dir /devl/job-seeker
conda run -n job-seeker python scripts/backup.py --restore backup.zip
conda run -n job-seeker python scripts/backup.py --list backup.zip
Usage (programmatic called from Settings UI):
from scripts.backup import create_backup, restore_backup, list_backup_contents
zip_bytes = create_backup(base_dir, include_db=True)
info = list_backup_contents(zip_bytes)
result = restore_backup(zip_bytes, base_dir, include_db=True)
"""
from __future__ import annotations
import io
import json
import zipfile
from datetime import datetime
from pathlib import Path
# ---------------------------------------------------------------------------
# Files included in every backup (relative to repo root)
# ---------------------------------------------------------------------------

# Gitignored config files that hold secrets / personal data. Entries that do
# not exist in a given instance are silently skipped at backup time, so the
# same list can serve both the Peregrine and legacy installs.
_SECRET_CONFIGS = [
    "config/notion.yaml",
    "config/tokens.yaml",
    "config/email.yaml",
    "config/adzuna.yaml",
    "config/craigslist.yaml",
    "config/user.yaml",
    "config/plain_text_resume.yaml",
    "config/license.json",
    "config/user.yaml.working",
]

# Gitignored integration configs (glob pattern — each matching file is added)
_INTEGRATION_CONFIG_GLOB = "config/integrations/*.yaml"

# Non-secret committed configs worth preserving for portability
# (also present in the legacy /devl/job-seeker instance)
_EXTRA_CONFIGS = [
    "config/llm.yaml",
    "config/search_profiles.yaml",
    "config/resume_keywords.yaml",  # personal keyword list — present in both instances
    "config/skills_suggestions.yaml",
    "config/blocklist.yaml",
    "config/server.yaml",  # deployment config (base URL path, port) — Peregrine only
]

# Candidate DB paths (first one that exists wins): Peregrine layout first,
# then the legacy repo-root layout.
_DB_CANDIDATES = ["data/staging.db", "staging.db"]

# Name of the JSON manifest entry written into every archive.
_MANIFEST_NAME = "backup-manifest.json"
# ---------------------------------------------------------------------------
# Source detection
# ---------------------------------------------------------------------------
def _detect_source_label(base_dir: Path) -> str:
"""Return a human-readable label for the instance being backed up.
Uses the directory name stable as long as the repo root isn't renamed,
which is the normal case for both the Docker install (peregrine/) and the
legacy Conda install (job-seeker/).
Args:
base_dir: The root directory being backed up.
Returns:
A short identifier string, e.g. "peregrine" or "job-seeker".
"""
return base_dir.name
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def create_backup(
    base_dir: Path,
    include_db: bool = True,
    source_label: str | None = None,
) -> bytes:
    """Return a zip archive of the instance's configs (and optionally DB).

    Missing files are silently skipped, so the same config lists work for
    both the Peregrine and legacy instances.

    Args:
        base_dir: Repo root (parent of config/ and staging.db).
        include_db: If True, include the first existing _DB_CANDIDATES entry.
        source_label: Human-readable instance name stored in the manifest
            (e.g. "peregrine", "job-seeker"). Auto-detected from the
            directory name if None.

    Returns:
        Raw zip bytes, suitable for Path.write_bytes() or an HTTP response.
    """
    buf = io.BytesIO()
    included: list[str] = []
    with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:

        def _add(path: Path, rel: str) -> None:
            # Archive one file under its repo-relative name and record it.
            zf.write(path, rel)
            included.append(rel)

        # Gitignored secret configs
        for rel in _SECRET_CONFIGS:
            p = base_dir / rel
            if p.exists():
                _add(p, rel)
        # Integration configs (glob) — sorted so archives are deterministic
        for p in sorted(base_dir.glob(_INTEGRATION_CONFIG_GLOB)):
            _add(p, str(p.relative_to(base_dir)))
        # Extra non-secret configs
        for rel in _EXTRA_CONFIGS:
            p = base_dir / rel
            if p.exists():
                _add(p, rel)
        # Staging DB — first candidate path that exists wins
        if include_db:
            for candidate in _DB_CANDIDATES:
                p = base_dir / candidate
                if p.exists():
                    _add(p, candidate)
                    break
        # Manifest — written last so `included` is complete.
        manifest = {
            # Timezone-aware timestamp: backups travel between machines, and
            # a naive datetime.now() would lose the UTC offset.
            "created_at": datetime.now().astimezone().isoformat(),
            "source": source_label or _detect_source_label(base_dir),
            "source_path": str(base_dir.resolve()),
            "peregrine_version": "1.0",
            "files": included,
            # True only if a DB file actually made it into the archive.
            "includes_db": include_db and any(f.endswith(".db") for f in included),
        }
        zf.writestr(_MANIFEST_NAME, json.dumps(manifest, indent=2))
    return buf.getvalue()
def list_backup_contents(zip_bytes: bytes) -> dict:
    """Return manifest + file list from a backup zip (no extraction)."""
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        all_entries = zf.namelist()
        manifest: dict = {}
        if _MANIFEST_NAME in all_entries:
            manifest = json.loads(zf.read(_MANIFEST_NAME))
        # The manifest itself is metadata, not a backed-up file.
        names = [entry for entry in all_entries if entry != _MANIFEST_NAME]
        sizes = {info.filename: info.file_size for info in zf.infolist()}
    return {
        "manifest": manifest,
        "files": names,
        "sizes": sizes,
        "total_bytes": sum(sizes[entry] for entry in names if entry in sizes),
    }
def restore_backup(
    zip_bytes: bytes,
    base_dir: Path,
    include_db: bool = True,
    overwrite: bool = True,
) -> dict[str, list[str]]:
    """Extract a backup zip into base_dir.

    Entries whose path would land outside base_dir (absolute paths or ".."
    components — the classic "zip slip" attack) are skipped rather than
    written, since zips may arrive via upload from the Settings UI.

    Args:
        zip_bytes: Raw bytes of the backup zip.
        base_dir: Repo root to restore into.
        include_db: If False, skip any .db files.
        overwrite: If False, skip files that already exist.

    Returns:
        {"restored": [...], "skipped": [...]}
    """
    restored: list[str] = []
    skipped: list[str] = []
    root = base_dir.resolve()
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        for name in zf.namelist():
            if name == _MANIFEST_NAME:
                continue
            if not include_db and name.endswith(".db"):
                skipped.append(name)
                continue
            dest = base_dir / name
            # Zip-slip guard: refuse any entry that resolves outside base_dir.
            try:
                dest.resolve().relative_to(root)
            except ValueError:
                skipped.append(name)
                continue
            if dest.exists() and not overwrite:
                skipped.append(name)
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_bytes(zf.read(name))
            restored.append(name)
    return {"restored": restored, "skipped": skipped}
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: dispatch exactly one of --create / --restore / --list."""
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Peregrine config backup / restore / teleport")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--create", metavar="OUT.zip", help="Create a backup zip")
    group.add_argument("--restore", metavar="IN.zip", help="Restore from a backup zip")
    group.add_argument("--list", metavar="IN.zip", help="List contents of a backup zip")
    parser.add_argument("--no-db", action="store_true",
                        help="Exclude staging.db (--create/--restore)")
    parser.add_argument("--no-overwrite", action="store_true",
                        help="Skip files that already exist (--restore)")
    parser.add_argument(
        "--base-dir", metavar="PATH",
        help="Root of the instance to back up/restore (default: this repo root). "
             "Use /devl/job-seeker to target the legacy Conda install.",
    )
    args = parser.parse_args()
    # Default to the repo root (parent of scripts/) when --base-dir is absent.
    base_dir = Path(args.base_dir).resolve() if args.base_dir else Path(__file__).parent.parent

    if args.create:
        out = Path(args.create)
        data = create_backup(base_dir, include_db=not args.no_db)
        out.write_bytes(data)
        info = list_backup_contents(data)
        m = info["manifest"]
        print(f"Backup created: {out} ({len(data):,} bytes)")
        print(f"  Source: {m.get('source', '?')} ({base_dir})")
        print(f"  {len(info['files'])} files archived:")
        for name in info["files"]:
            size = info["sizes"].get(name, 0)
            print(f"    {name} ({size:,} bytes)")
    elif args.restore:
        in_path = Path(args.restore)
        if not in_path.exists():
            print(f"ERROR: {in_path} not found", file=sys.stderr)
            sys.exit(1)
        data = in_path.read_bytes()
        result = restore_backup(data, base_dir,
                                include_db=not args.no_db,
                                overwrite=not args.no_overwrite)
        print(f"Restored {len(result['restored'])} files:")
        for name in result["restored"]:
            # Fix: match the bulleted format of the skipped listing below
            # (previously printed with no indent or bullet).
            print(f"  - {name}")
        if result["skipped"]:
            print(f"Skipped {len(result['skipped'])} files:")
            for name in result["skipped"]:
                print(f"  - {name}")
    elif args.list:
        in_path = Path(args.list)
        if not in_path.exists():
            print(f"ERROR: {in_path} not found", file=sys.stderr)
            sys.exit(1)
        data = in_path.read_bytes()
        info = list_backup_contents(data)
        m = info["manifest"]
        if m:
            print(f"Created: {m.get('created_at', 'unknown')}")
            print(f"Source:  {m.get('source', '?')} ({m.get('source_path', '?')})")
            print(f"Has DB:  {m.get('includes_db', '?')}")
        print(f"\n{len(info['files'])} files ({info['total_bytes']:,} bytes uncompressed):")
        for name in info["files"]:
            size = info["sizes"].get(name, 0)
            print(f"    {name} ({size:,} bytes)")


if __name__ == "__main__":
    main()

231
tests/test_backup.py Normal file
View file

@ -0,0 +1,231 @@
"""Tests for scripts/backup.py — create, list, restore, and multi-instance support."""
from __future__ import annotations

import io
import json
import zipfile
from pathlib import Path

import pytest

from scripts.backup import (
    _detect_source_label,
    create_backup,
    list_backup_contents,
    restore_backup,
)
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
def _make_instance(tmp_path: Path, name: str, *, root_db: bool = False) -> Path:
"""Build a minimal fake instance directory for testing."""
base = tmp_path / name
base.mkdir()
# Secret configs
(base / "config").mkdir()
(base / "config" / "notion.yaml").write_text("token: secret")
(base / "config" / "email.yaml").write_text("user: test@example.com")
# Extra config
(base / "config" / "llm.yaml").write_text("backend: ollama")
(base / "config" / "resume_keywords.yaml").write_text("keywords: [python]")
(base / "config" / "server.yaml").write_text("port: 8502")
# DB — either at data/staging.db (Peregrine) or staging.db root (legacy)
if root_db:
(base / "staging.db").write_bytes(b"SQLite legacy")
else:
(base / "data").mkdir()
(base / "data" / "staging.db").write_bytes(b"SQLite peregrine")
return base
# ---------------------------------------------------------------------------
# create_backup
# ---------------------------------------------------------------------------
class TestCreateBackup:
    """create_backup(): archive validity, config/DB inclusion, manifest fields."""

    def test_returns_valid_zip(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base)
        # Use the io module directly instead of the __import__("io") hack.
        assert zipfile.is_zipfile(io.BytesIO(data))

    def test_includes_secret_configs(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base)
        info = list_backup_contents(data)
        assert "config/notion.yaml" in info["files"]
        assert "config/email.yaml" in info["files"]

    def test_includes_extra_configs(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base)
        info = list_backup_contents(data)
        assert "config/llm.yaml" in info["files"]
        assert "config/resume_keywords.yaml" in info["files"]
        assert "config/server.yaml" in info["files"]

    def test_includes_db_by_default(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base)
        info = list_backup_contents(data)
        assert info["manifest"]["includes_db"] is True
        assert any(f.endswith(".db") for f in info["files"])

    def test_excludes_db_when_flag_false(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base, include_db=False)
        info = list_backup_contents(data)
        assert info["manifest"]["includes_db"] is False
        assert not any(f.endswith(".db") for f in info["files"])

    def test_silently_skips_missing_files(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        # tokens.yaml not created in fixture — should not raise
        data = create_backup(base)
        info = list_backup_contents(data)
        assert "config/tokens.yaml" not in info["files"]

    def test_manifest_contains_source_label(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base)
        info = list_backup_contents(data)
        assert info["manifest"]["source"] == "peregrine"

    def test_source_label_override(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base, source_label="custom-label")
        info = list_backup_contents(data)
        assert info["manifest"]["source"] == "custom-label"
# ---------------------------------------------------------------------------
# Legacy instance (staging.db at repo root)
# ---------------------------------------------------------------------------
class TestLegacyInstance:
    """Backups of the legacy install, where staging.db lives at the repo root."""

    def test_picks_up_root_db(self, tmp_path):
        legacy = _make_instance(tmp_path, "job-seeker", root_db=True)
        archived = list_backup_contents(create_backup(legacy))["files"]
        assert "staging.db" in archived
        assert "data/staging.db" not in archived

    def test_source_label_is_job_seeker(self, tmp_path):
        legacy = _make_instance(tmp_path, "job-seeker", root_db=True)
        manifest = list_backup_contents(create_backup(legacy))["manifest"]
        assert manifest["source"] == "job-seeker"

    def test_missing_peregrine_only_configs_skipped(self, tmp_path):
        """Legacy doesn't have server.yaml, user.yaml, etc. — should not error."""
        legacy = _make_instance(tmp_path, "job-seeker", root_db=True)
        # Remove server.yaml to simulate legacy (it won't exist there)
        (legacy / "config" / "server.yaml").unlink()
        archived = list_backup_contents(create_backup(legacy))["files"]
        assert "config/server.yaml" not in archived
        assert "config/notion.yaml" in archived
# ---------------------------------------------------------------------------
# list_backup_contents
# ---------------------------------------------------------------------------
class TestListBackupContents:
    """list_backup_contents(): manifest parsing and size accounting."""

    def test_returns_manifest_and_files(self, tmp_path):
        info = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        for key in ("manifest", "files", "sizes", "total_bytes"):
            assert key in info

    def test_total_bytes_is_sum_of_file_sizes(self, tmp_path):
        info = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        expected = sum(info["sizes"][name] for name in info["files"] if name in info["sizes"])
        assert info["total_bytes"] == expected

    def test_manifest_not_in_files_list(self, tmp_path):
        info = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        # The manifest is metadata, never part of the restorable file set.
        assert "backup-manifest.json" not in info["files"]
# ---------------------------------------------------------------------------
# restore_backup
# ---------------------------------------------------------------------------
class TestRestoreBackup:
    """restore_backup(): extraction, DB filtering, overwrite semantics."""

    @staticmethod
    def _backup_and_target(tmp_path):
        # Build a source instance and an empty restore target, return both.
        src = _make_instance(tmp_path, "peregrine")
        dst = tmp_path / "restored"
        dst.mkdir()
        return create_backup(src), dst

    def test_restores_all_files(self, tmp_path):
        data, dst = self._backup_and_target(tmp_path)
        result = restore_backup(data, dst)
        assert len(result["restored"]) > 0
        assert (dst / "config" / "notion.yaml").exists()

    def test_skips_db_when_flag_false(self, tmp_path):
        data, dst = self._backup_and_target(tmp_path)
        result = restore_backup(data, dst, include_db=False)
        assert not any(name.endswith(".db") for name in result["restored"])
        assert any(name.endswith(".db") for name in result["skipped"])

    def test_no_overwrite_skips_existing(self, tmp_path):
        data, dst = self._backup_and_target(tmp_path)
        (dst / "config").mkdir()
        existing = dst / "config" / "notion.yaml"
        existing.write_text("original content")
        result = restore_backup(data, dst, overwrite=False)
        assert "config/notion.yaml" in result["skipped"]
        assert existing.read_text() == "original content"

    def test_overwrite_replaces_existing(self, tmp_path):
        data, dst = self._backup_and_target(tmp_path)
        (dst / "config").mkdir()
        (dst / "config" / "notion.yaml").write_text("stale content")
        restore_backup(data, dst, overwrite=True)
        assert (dst / "config" / "notion.yaml").read_text() == "token: secret"

    def test_roundtrip_preserves_content(self, tmp_path):
        src = _make_instance(tmp_path, "peregrine")
        original = (src / "config" / "notion.yaml").read_text()
        dst = tmp_path / "restored"
        dst.mkdir()
        restore_backup(create_backup(src), dst)
        assert (dst / "config" / "notion.yaml").read_text() == original
# ---------------------------------------------------------------------------
# _detect_source_label
# ---------------------------------------------------------------------------
class TestDetectSourceLabel:
    """_detect_source_label(): the label is simply the directory name."""

    def test_returns_directory_name(self, tmp_path):
        root = tmp_path / "peregrine"
        root.mkdir()
        assert _detect_source_label(root) == "peregrine"

    def test_legacy_label(self, tmp_path):
        root = tmp_path / "job-seeker"
        root.mkdir()
        assert _detect_source_label(root) == "job-seeker"