diff --git a/scripts/backup.py b/scripts/backup.py new file mode 100644 index 0000000..b20a465 --- /dev/null +++ b/scripts/backup.py @@ -0,0 +1,277 @@ +"""Config backup / restore / teleport for Peregrine. + +Creates a portable zip of all gitignored configs + optionally the staging DB. +Intended for: machine migrations, Docker volume transfers, and safe wizard testing. +Supports both the Peregrine Docker instance and the legacy /devl/job-seeker install. + +Usage (CLI): + conda run -n job-seeker python scripts/backup.py --create backup.zip + conda run -n job-seeker python scripts/backup.py --create backup.zip --no-db + conda run -n job-seeker python scripts/backup.py --create backup.zip --base-dir /devl/job-seeker + conda run -n job-seeker python scripts/backup.py --restore backup.zip + conda run -n job-seeker python scripts/backup.py --list backup.zip + +Usage (programmatic — called from Settings UI): + from scripts.backup import create_backup, restore_backup, list_backup_contents + zip_bytes = create_backup(base_dir, include_db=True) + info = list_backup_contents(zip_bytes) + result = restore_backup(zip_bytes, base_dir, include_db=True) +""" +from __future__ import annotations + +import io +import json +import zipfile +from datetime import datetime +from pathlib import Path + +# --------------------------------------------------------------------------- +# Files included in every backup (relative to repo root) +# --------------------------------------------------------------------------- + +# Gitignored config files that hold secrets / personal data +_SECRET_CONFIGS = [ + "config/notion.yaml", + "config/tokens.yaml", + "config/email.yaml", + "config/adzuna.yaml", + "config/craigslist.yaml", + "config/user.yaml", + "config/plain_text_resume.yaml", + "config/license.json", + "config/user.yaml.working", +] + +# Gitignored integration configs (glob pattern — each matching file is added) +_INTEGRATION_CONFIG_GLOB = "config/integrations/*.yaml" + +# Non-secret committed 
configs worth preserving for portability +# (also present in the legacy /devl/job-seeker instance) +_EXTRA_CONFIGS = [ + "config/llm.yaml", + "config/search_profiles.yaml", + "config/resume_keywords.yaml", # personal keyword list — present in both instances + "config/skills_suggestions.yaml", + "config/blocklist.yaml", + "config/server.yaml", # deployment config (base URL path, port) — Peregrine only +] + +# Candidate DB paths (first one that exists wins) +_DB_CANDIDATES = ["data/staging.db", "staging.db"] + +_MANIFEST_NAME = "backup-manifest.json" + + +# --------------------------------------------------------------------------- +# Source detection +# --------------------------------------------------------------------------- + +def _detect_source_label(base_dir: Path) -> str: + """Return a human-readable label for the instance being backed up. + + Uses the directory name — stable as long as the repo root isn't renamed, + which is the normal case for both the Docker install (peregrine/) and the + legacy Conda install (job-seeker/). + + Args: + base_dir: The root directory being backed up. + + Returns: + A short identifier string, e.g. "peregrine" or "job-seeker". + """ + return base_dir.name + + +# --------------------------------------------------------------------------- +# Public API +# --------------------------------------------------------------------------- + +def create_backup( + base_dir: Path, + include_db: bool = True, + source_label: str | None = None, +) -> bytes: + """Return a zip archive as raw bytes. + + Args: + base_dir: Repo root (parent of config/ and staging.db). + include_db: If True, include staging.db in the archive. + source_label: Human-readable instance name stored in the manifest + (e.g. "peregrine", "job-seeker"). Auto-detected if None. 
+ """ + buf = io.BytesIO() + included: list[str] = [] + + with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf: + # Gitignored secret configs + for rel in _SECRET_CONFIGS: + p = base_dir / rel + if p.exists(): + zf.write(p, rel) + included.append(rel) + + # Integration configs (glob) + for p in sorted((base_dir).glob(_INTEGRATION_CONFIG_GLOB)): + rel = str(p.relative_to(base_dir)) + zf.write(p, rel) + included.append(rel) + + # Extra non-secret configs + for rel in _EXTRA_CONFIGS: + p = base_dir / rel + if p.exists(): + zf.write(p, rel) + included.append(rel) + + # Staging DB + if include_db: + for candidate in _DB_CANDIDATES: + p = base_dir / candidate + if p.exists(): + zf.write(p, candidate) + included.append(candidate) + break + + # Manifest + manifest = { + "created_at": datetime.now().isoformat(), + "source": source_label or _detect_source_label(base_dir), + "source_path": str(base_dir.resolve()), + "peregrine_version": "1.0", + "files": included, + "includes_db": include_db and any(f.endswith(".db") for f in included), + } + zf.writestr(_MANIFEST_NAME, json.dumps(manifest, indent=2)) + + return buf.getvalue() + + +def list_backup_contents(zip_bytes: bytes) -> dict: + """Return manifest + file list from a backup zip (no extraction).""" + with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf: + names = [n for n in zf.namelist() if n != _MANIFEST_NAME] + manifest: dict = {} + if _MANIFEST_NAME in zf.namelist(): + manifest = json.loads(zf.read(_MANIFEST_NAME)) + sizes = {info.filename: info.file_size for info in zf.infolist()} + return { + "manifest": manifest, + "files": names, + "sizes": sizes, + "total_bytes": sum(sizes[n] for n in names if n in sizes), + } + + +def restore_backup( + zip_bytes: bytes, + base_dir: Path, + include_db: bool = True, + overwrite: bool = True, +) -> dict[str, list[str]]: + """Extract a backup zip into base_dir. + + Args: + zip_bytes: Raw bytes of the backup zip. + base_dir: Repo root to restore into. 
+ include_db: If False, skip any .db files. + overwrite: If False, skip files that already exist. + + Returns: + {"restored": [...], "skipped": [...]} + """ + restored: list[str] = [] + skipped: list[str] = [] + + with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf: + for name in zf.namelist(): + if name == _MANIFEST_NAME: + continue + if not include_db and name.endswith(".db"): + skipped.append(name) + continue + dest = base_dir / name + if dest.exists() and not overwrite: + skipped.append(name) + continue + dest.parent.mkdir(parents=True, exist_ok=True) + dest.write_bytes(zf.read(name)) + restored.append(name) + + return {"restored": restored, "skipped": skipped} + + +# --------------------------------------------------------------------------- +# CLI entry point +# --------------------------------------------------------------------------- + +def main() -> None: + import argparse + import sys + + parser = argparse.ArgumentParser(description="Peregrine config backup / restore / teleport") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("--create", metavar="OUT.zip", help="Create a backup zip") + group.add_argument("--restore", metavar="IN.zip", help="Restore from a backup zip") + group.add_argument("--list", metavar="IN.zip", help="List contents of a backup zip") + parser.add_argument("--no-db", action="store_true", help="Exclude staging.db (--create/--restore)") + parser.add_argument("--no-overwrite", action="store_true", + help="Skip files that already exist (--restore)") + parser.add_argument( + "--base-dir", metavar="PATH", + help="Root of the instance to back up/restore (default: this repo root). 
" + "Use /devl/job-seeker to target the legacy Conda install.", + ) + args = parser.parse_args() + + base_dir = Path(args.base_dir).resolve() if args.base_dir else Path(__file__).parent.parent + + if args.create: + out = Path(args.create) + data = create_backup(base_dir, include_db=not args.no_db) + out.write_bytes(data) + info = list_backup_contents(data) + m = info["manifest"] + print(f"Backup created: {out} ({len(data):,} bytes)") + print(f" Source: {m.get('source', '?')} ({base_dir})") + print(f" {len(info['files'])} files archived:") + for name in info["files"]: + size = info["sizes"].get(name, 0) + print(f" {name} ({size:,} bytes)") + + elif args.restore: + in_path = Path(args.restore) + if not in_path.exists(): + print(f"ERROR: {in_path} not found", file=sys.stderr) + sys.exit(1) + data = in_path.read_bytes() + result = restore_backup(data, base_dir, + include_db=not args.no_db, + overwrite=not args.no_overwrite) + print(f"Restored {len(result['restored'])} files:") + for name in result["restored"]: + print(f" ✓ {name}") + if result["skipped"]: + print(f"Skipped {len(result['skipped'])} files:") + for name in result["skipped"]: + print(f" - {name}") + + elif args.list: + in_path = Path(args.list) + if not in_path.exists(): + print(f"ERROR: {in_path} not found", file=sys.stderr) + sys.exit(1) + data = in_path.read_bytes() + info = list_backup_contents(data) + m = info["manifest"] + if m: + print(f"Created: {m.get('created_at', 'unknown')}") + print(f"Source: {m.get('source', '?')} ({m.get('source_path', '?')})") + print(f"Has DB: {m.get('includes_db', '?')}") + print(f"\n{len(info['files'])} files ({info['total_bytes']:,} bytes uncompressed):") + for name in info["files"]: + size = info["sizes"].get(name, 0) + print(f" {name} ({size:,} bytes)") + + +if __name__ == "__main__": + main() diff --git a/tests/test_backup.py b/tests/test_backup.py new file mode 100644 index 0000000..a96de42 --- /dev/null +++ b/tests/test_backup.py @@ -0,0 +1,231 @@ +"""Tests for 
scripts/backup.py — create, list, restore, and multi-instance support.""" +from __future__ import annotations + +import json +import zipfile +from pathlib import Path + +import pytest + +from scripts.backup import ( + _detect_source_label, + create_backup, + list_backup_contents, + restore_backup, +) + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + +def _make_instance(tmp_path: Path, name: str, *, root_db: bool = False) -> Path: + """Build a minimal fake instance directory for testing.""" + base = tmp_path / name + base.mkdir() + + # Secret configs + (base / "config").mkdir() + (base / "config" / "notion.yaml").write_text("token: secret") + (base / "config" / "email.yaml").write_text("user: test@example.com") + + # Extra config + (base / "config" / "llm.yaml").write_text("backend: ollama") + (base / "config" / "resume_keywords.yaml").write_text("keywords: [python]") + (base / "config" / "server.yaml").write_text("port: 8502") + + # DB — either at data/staging.db (Peregrine) or staging.db root (legacy) + if root_db: + (base / "staging.db").write_bytes(b"SQLite legacy") + else: + (base / "data").mkdir() + (base / "data" / "staging.db").write_bytes(b"SQLite peregrine") + + return base + + +# --------------------------------------------------------------------------- +# create_backup +# --------------------------------------------------------------------------- + +class TestCreateBackup: + def test_returns_valid_zip(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + data = create_backup(base) + assert zipfile.is_zipfile(__import__("io").BytesIO(data)) + + def test_includes_secret_configs(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + data = create_backup(base) + info = list_backup_contents(data) + assert "config/notion.yaml" in info["files"] + assert "config/email.yaml" in info["files"] + + def 
test_includes_extra_configs(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + data = create_backup(base) + info = list_backup_contents(data) + assert "config/llm.yaml" in info["files"] + assert "config/resume_keywords.yaml" in info["files"] + assert "config/server.yaml" in info["files"] + + def test_includes_db_by_default(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + data = create_backup(base) + info = list_backup_contents(data) + assert info["manifest"]["includes_db"] is True + assert any(f.endswith(".db") for f in info["files"]) + + def test_excludes_db_when_flag_false(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + data = create_backup(base, include_db=False) + info = list_backup_contents(data) + assert info["manifest"]["includes_db"] is False + assert not any(f.endswith(".db") for f in info["files"]) + + def test_silently_skips_missing_files(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + # tokens.yaml not created in fixture — should not raise + data = create_backup(base) + info = list_backup_contents(data) + assert "config/tokens.yaml" not in info["files"] + + def test_manifest_contains_source_label(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + data = create_backup(base) + info = list_backup_contents(data) + assert info["manifest"]["source"] == "peregrine" + + def test_source_label_override(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + data = create_backup(base, source_label="custom-label") + info = list_backup_contents(data) + assert info["manifest"]["source"] == "custom-label" + + +# --------------------------------------------------------------------------- +# Legacy instance (staging.db at repo root) +# --------------------------------------------------------------------------- + +class TestLegacyInstance: + def test_picks_up_root_db(self, tmp_path): + base = _make_instance(tmp_path, "job-seeker", root_db=True) + data = create_backup(base) + info 
= list_backup_contents(data) + assert "staging.db" in info["files"] + assert "data/staging.db" not in info["files"] + + def test_source_label_is_job_seeker(self, tmp_path): + base = _make_instance(tmp_path, "job-seeker", root_db=True) + data = create_backup(base) + info = list_backup_contents(data) + assert info["manifest"]["source"] == "job-seeker" + + def test_missing_peregrine_only_configs_skipped(self, tmp_path): + """Legacy doesn't have server.yaml, user.yaml, etc. — should not error.""" + base = _make_instance(tmp_path, "job-seeker", root_db=True) + # Remove server.yaml to simulate legacy (it won't exist there) + (base / "config" / "server.yaml").unlink() + data = create_backup(base) + info = list_backup_contents(data) + assert "config/server.yaml" not in info["files"] + assert "config/notion.yaml" in info["files"] + + +# --------------------------------------------------------------------------- +# list_backup_contents +# --------------------------------------------------------------------------- + +class TestListBackupContents: + def test_returns_manifest_and_files(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + data = create_backup(base) + info = list_backup_contents(data) + assert "manifest" in info + assert "files" in info + assert "sizes" in info + assert "total_bytes" in info + + def test_total_bytes_is_sum_of_file_sizes(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + data = create_backup(base) + info = list_backup_contents(data) + expected = sum(info["sizes"][f] for f in info["files"] if f in info["sizes"]) + assert info["total_bytes"] == expected + + def test_manifest_not_in_files_list(self, tmp_path): + base = _make_instance(tmp_path, "peregrine") + data = create_backup(base) + info = list_backup_contents(data) + assert "backup-manifest.json" not in info["files"] + + +# --------------------------------------------------------------------------- +# restore_backup +# 
# ---------------------------------------------------------------------------
# restore_backup
# ---------------------------------------------------------------------------

class TestRestoreBackup:
    def test_restores_all_files(self, tmp_path):
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        outcome = restore_backup(create_backup(source), target)
        assert len(outcome["restored"]) > 0
        assert (target / "config" / "notion.yaml").exists()

    def test_skips_db_when_flag_false(self, tmp_path):
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        outcome = restore_backup(create_backup(source), target, include_db=False)
        db_restored = [entry for entry in outcome["restored"] if entry.endswith(".db")]
        db_skipped = [entry for entry in outcome["skipped"] if entry.endswith(".db")]
        assert not db_restored
        assert db_skipped

    def test_no_overwrite_skips_existing(self, tmp_path):
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        (target / "config").mkdir()
        preexisting = target / "config" / "notion.yaml"
        preexisting.write_text("original content")
        outcome = restore_backup(create_backup(source), target, overwrite=False)
        assert "config/notion.yaml" in outcome["skipped"]
        assert preexisting.read_text() == "original content"

    def test_overwrite_replaces_existing(self, tmp_path):
        source = _make_instance(tmp_path, "peregrine")
        target = tmp_path / "restored"
        target.mkdir()
        (target / "config").mkdir()
        stale = target / "config" / "notion.yaml"
        stale.write_text("stale content")
        restore_backup(create_backup(source), target, overwrite=True)
        assert stale.read_text() == "token: secret"

    def test_roundtrip_preserves_content(self, tmp_path):
        source = _make_instance(tmp_path, "peregrine")
        expected = (source / "config" / "notion.yaml").read_text()
        target = tmp_path / "restored"
        target.mkdir()
        restore_backup(create_backup(source), target)
        assert (target / "config" / "notion.yaml").read_text() == expected
# ---------------------------------------------------------------------------
# _detect_source_label
# ---------------------------------------------------------------------------

class TestDetectSourceLabel:
    def test_returns_directory_name(self, tmp_path):
        instance_root = tmp_path / "peregrine"
        instance_root.mkdir()
        assert _detect_source_label(instance_root) == "peregrine"

    def test_legacy_label(self, tmp_path):
        instance_root = tmp_path / "job-seeker"
        instance_root.mkdir()
        assert _detect_source_label(instance_root) == "job-seeker"