feat: backup/restore script with multi-instance and legacy support
- create_backup() / restore_backup() / list_backup_contents() public API
- --base-dir PATH flag: targets any instance root (default: this repo);
  --base-dir /devl/job-seeker backs up the legacy Conda install
- _DB_CANDIDATES fallback: data/staging.db (Peregrine) or staging.db at the
  repo root (legacy)
- Manifest records source label (dir name), source_path, created_at, files,
  includes_db
- Added config/resume_keywords.yaml and config/server.yaml to backup lists
- 21 tests covering create, list, restore, legacy DB path, overwrite, roundtrip
This commit is contained in:
parent
5556817c9a
commit
636694edd1
2 changed files with 508 additions and 0 deletions
277
scripts/backup.py
Normal file
277
scripts/backup.py
Normal file
|
|
@ -0,0 +1,277 @@
|
|||
"""Config backup / restore / teleport for Peregrine.
|
||||
|
||||
Creates a portable zip of all gitignored configs + optionally the staging DB.
|
||||
Intended for: machine migrations, Docker volume transfers, and safe wizard testing.
|
||||
Supports both the Peregrine Docker instance and the legacy /devl/job-seeker install.
|
||||
|
||||
Usage (CLI):
|
||||
conda run -n job-seeker python scripts/backup.py --create backup.zip
|
||||
conda run -n job-seeker python scripts/backup.py --create backup.zip --no-db
|
||||
conda run -n job-seeker python scripts/backup.py --create backup.zip --base-dir /devl/job-seeker
|
||||
conda run -n job-seeker python scripts/backup.py --restore backup.zip
|
||||
conda run -n job-seeker python scripts/backup.py --list backup.zip
|
||||
|
||||
Usage (programmatic — called from Settings UI):
|
||||
from scripts.backup import create_backup, restore_backup, list_backup_contents
|
||||
zip_bytes = create_backup(base_dir, include_db=True)
|
||||
info = list_backup_contents(zip_bytes)
|
||||
result = restore_backup(zip_bytes, base_dir, include_db=True)
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import json
|
||||
import zipfile
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------------------------
# Files included in every backup (relative to repo root)
# ---------------------------------------------------------------------------

# Gitignored config files that hold secrets / personal data.
_SECRET_CONFIGS = [
    "config/notion.yaml",
    "config/tokens.yaml",
    "config/email.yaml",
    "config/adzuna.yaml",
    "config/craigslist.yaml",
    "config/user.yaml",
    "config/plain_text_resume.yaml",
    "config/license.json",
    "config/user.yaml.working",
]

# Gitignored integration configs (glob pattern — each matching file is added).
_INTEGRATION_CONFIG_GLOB = "config/integrations/*.yaml"

# Non-secret committed configs worth preserving for portability
# (also present in the legacy /devl/job-seeker instance).
_EXTRA_CONFIGS = [
    "config/llm.yaml",
    "config/search_profiles.yaml",
    "config/resume_keywords.yaml",  # personal keyword list — present in both instances
    "config/skills_suggestions.yaml",
    "config/blocklist.yaml",
    "config/server.yaml",  # deployment config (base URL path, port) — Peregrine only
]

# Candidate DB paths (first one that exists wins): data/staging.db is the
# Peregrine Docker layout, staging.db at the repo root is the legacy layout.
_DB_CANDIDATES = ["data/staging.db", "staging.db"]

# Name of the JSON metadata entry written into every archive.
_MANIFEST_NAME = "backup-manifest.json"


# ---------------------------------------------------------------------------
# Source detection
# ---------------------------------------------------------------------------


def _detect_source_label(base_dir: Path) -> str:
    """Return a human-readable label for the instance being backed up.

    Uses the directory name — stable as long as the repo root isn't renamed,
    which is the normal case for both the Docker install (peregrine/) and the
    legacy Conda install (job-seeker/).

    Args:
        base_dir: The root directory being backed up.

    Returns:
        A short identifier string, e.g. "peregrine" or "job-seeker".
    """
    return base_dir.name


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def create_backup(
    base_dir: Path,
    include_db: bool = True,
    source_label: str | None = None,
) -> bytes:
    """Return a zip archive of the instance's configs (and DB) as raw bytes.

    Missing files are silently skipped, so the same lists work for both the
    Peregrine and the legacy instance layouts.

    Args:
        base_dir: Repo root (parent of config/ and staging.db).
        include_db: If True, include the staging DB (first _DB_CANDIDATES
            entry that exists) in the archive.
        source_label: Human-readable instance name stored in the manifest
            (e.g. "peregrine", "job-seeker"). Auto-detected if None.

    Returns:
        The zip file contents, ready to write to disk or stream over HTTP.
    """
    buf = io.BytesIO()
    included: list[str] = []

    with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        # Gitignored secret configs
        for rel in _SECRET_CONFIGS:
            p = base_dir / rel
            if p.exists():
                zf.write(p, rel)
                included.append(rel)

        # Integration configs (glob); sorted for a deterministic archive order
        for p in sorted(base_dir.glob(_INTEGRATION_CONFIG_GLOB)):
            rel = str(p.relative_to(base_dir))
            zf.write(p, rel)
            included.append(rel)

        # Extra non-secret configs
        for rel in _EXTRA_CONFIGS:
            p = base_dir / rel
            if p.exists():
                zf.write(p, rel)
                included.append(rel)

        # Staging DB — first existing candidate wins
        if include_db:
            for candidate in _DB_CANDIDATES:
                p = base_dir / candidate
                if p.exists():
                    zf.write(p, candidate)
                    included.append(candidate)
                    break

        # Manifest — written last so it can summarize everything archived.
        manifest = {
            # Timezone-aware timestamp: backups travel between machines, so a
            # naive local time would be ambiguous at restore time.
            "created_at": datetime.now().astimezone().isoformat(),
            "source": source_label or _detect_source_label(base_dir),
            "source_path": str(base_dir.resolve()),
            "peregrine_version": "1.0",
            "files": included,
            "includes_db": include_db and any(f.endswith(".db") for f in included),
        }
        zf.writestr(_MANIFEST_NAME, json.dumps(manifest, indent=2))

    return buf.getvalue()
|
||||
|
||||
|
||||
def list_backup_contents(zip_bytes: bytes) -> dict:
    """Summarize a backup zip without extracting anything.

    Returns a dict with the parsed manifest (empty dict when absent), the
    archived file names (manifest entry excluded), per-entry uncompressed
    sizes, and the uncompressed total of the archived files.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as archive:
        all_names = archive.namelist()
        payload = [entry for entry in all_names if entry != _MANIFEST_NAME]
        if _MANIFEST_NAME in all_names:
            manifest: dict = json.loads(archive.read(_MANIFEST_NAME))
        else:
            manifest = {}
        size_by_name = {}
        for entry_info in archive.infolist():
            size_by_name[entry_info.filename] = entry_info.file_size
        total = sum(size_by_name[entry] for entry in payload if entry in size_by_name)
        return {
            "manifest": manifest,
            "files": payload,
            "sizes": size_by_name,
            "total_bytes": total,
        }
|
||||
|
||||
|
||||
def restore_backup(
    zip_bytes: bytes,
    base_dir: Path,
    include_db: bool = True,
    overwrite: bool = True,
) -> dict[str, list[str]]:
    """Extract a backup zip into base_dir.

    Args:
        zip_bytes: Raw bytes of the backup zip.
        base_dir: Repo root to restore into.
        include_db: If False, skip any .db files.
        overwrite: If False, skip files that already exist.

    Raises:
        ValueError: If an archive entry would resolve outside base_dir
            (zip-slip attack via "../" or absolute entry names).

    Returns:
        {"restored": [...], "skipped": [...]}
    """
    restored: list[str] = []
    skipped: list[str] = []
    root = base_dir.resolve()

    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        for name in zf.namelist():
            if name == _MANIFEST_NAME:
                continue
            if not include_db and name.endswith(".db"):
                skipped.append(name)
                continue
            dest = base_dir / name
            # Zip-slip guard: the archive may come from another machine, so
            # refuse any entry ("../x", "/etc/x") that escapes the restore root.
            resolved = dest.resolve()
            if resolved != root and root not in resolved.parents:
                raise ValueError(f"unsafe path in backup archive: {name}")
            if dest.exists() and not overwrite:
                skipped.append(name)
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_bytes(zf.read(name))
            restored.append(name)

    return {"restored": restored, "skipped": skipped}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _print_file_listing(info: dict) -> None:
    """Print one indented line per archived file with its uncompressed size."""
    for name in info["files"]:
        size = info["sizes"].get(name, 0)
        print(f"  {name} ({size:,} bytes)")


def main() -> None:
    """CLI entry point: parse flags and dispatch to create / restore / list."""
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Peregrine config backup / restore / teleport")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--create", metavar="OUT.zip", help="Create a backup zip")
    group.add_argument("--restore", metavar="IN.zip", help="Restore from a backup zip")
    group.add_argument("--list", metavar="IN.zip", help="List contents of a backup zip")
    parser.add_argument("--no-db", action="store_true", help="Exclude staging.db (--create/--restore)")
    parser.add_argument("--no-overwrite", action="store_true",
                        help="Skip files that already exist (--restore)")
    parser.add_argument(
        "--base-dir", metavar="PATH",
        help="Root of the instance to back up/restore (default: this repo root). "
             "Use /devl/job-seeker to target the legacy Conda install.",
    )
    args = parser.parse_args()

    # Default target is the repo root containing this script (scripts/..).
    if args.base_dir:
        base_dir = Path(args.base_dir).resolve()
    else:
        base_dir = Path(__file__).parent.parent

    if args.create:
        out_path = Path(args.create)
        blob = create_backup(base_dir, include_db=not args.no_db)
        out_path.write_bytes(blob)
        info = list_backup_contents(blob)
        m = info["manifest"]
        print(f"Backup created: {out_path} ({len(blob):,} bytes)")
        print(f"  Source: {m.get('source', '?')} ({base_dir})")
        print(f"  {len(info['files'])} files archived:")
        _print_file_listing(info)
        return

    if args.restore:
        in_path = Path(args.restore)
        if not in_path.exists():
            print(f"ERROR: {in_path} not found", file=sys.stderr)
            sys.exit(1)
        outcome = restore_backup(
            in_path.read_bytes(),
            base_dir,
            include_db=not args.no_db,
            overwrite=not args.no_overwrite,
        )
        print(f"Restored {len(outcome['restored'])} files:")
        for name in outcome["restored"]:
            print(f"  ✓ {name}")
        if outcome["skipped"]:
            print(f"Skipped {len(outcome['skipped'])} files:")
            for name in outcome["skipped"]:
                print(f"  - {name}")
        return

    if args.list:
        in_path = Path(args.list)
        if not in_path.exists():
            print(f"ERROR: {in_path} not found", file=sys.stderr)
            sys.exit(1)
        info = list_backup_contents(in_path.read_bytes())
        m = info["manifest"]
        if m:
            print(f"Created: {m.get('created_at', 'unknown')}")
            print(f"Source: {m.get('source', '?')} ({m.get('source_path', '?')})")
            print(f"Has DB: {m.get('includes_db', '?')}")
        print(f"\n{len(info['files'])} files ({info['total_bytes']:,} bytes uncompressed):")
        _print_file_listing(info)


if __name__ == "__main__":
    main()
|
||||
231
tests/test_backup.py
Normal file
231
tests/test_backup.py
Normal file
|
|
@ -0,0 +1,231 @@
|
|||
"""Tests for scripts/backup.py — create, list, restore, and multi-instance support."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from scripts.backup import (
|
||||
_detect_source_label,
|
||||
create_backup,
|
||||
list_backup_contents,
|
||||
restore_backup,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _make_instance(tmp_path: Path, name: str, *, root_db: bool = False) -> Path:
    """Build a minimal fake instance directory for testing.

    Lays down a config/ tree with a couple of secret and non-secret files,
    plus a staging DB either under data/ (Peregrine layout, default) or at
    the instance root (legacy layout, root_db=True).
    """
    base = tmp_path / name
    base.mkdir()

    cfg = base / "config"
    cfg.mkdir()
    # Secret configs
    (cfg / "notion.yaml").write_text("token: secret")
    (cfg / "email.yaml").write_text("user: test@example.com")
    # Extra (non-secret) configs
    (cfg / "llm.yaml").write_text("backend: ollama")
    (cfg / "resume_keywords.yaml").write_text("keywords: [python]")
    (cfg / "server.yaml").write_text("port: 8502")

    # DB location is what distinguishes legacy from Peregrine instances.
    if root_db:
        (base / "staging.db").write_bytes(b"SQLite legacy")
    else:
        data_dir = base / "data"
        data_dir.mkdir()
        (data_dir / "staging.db").write_bytes(b"SQLite peregrine")

    return base
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# create_backup
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestCreateBackup:
    """create_backup(): archive contents, DB handling, and manifest fields."""

    def test_returns_valid_zip(self, tmp_path):
        # Proper local import instead of the original __import__("io") hack.
        import io

        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base)
        assert zipfile.is_zipfile(io.BytesIO(data))

    def test_includes_secret_configs(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base)
        info = list_backup_contents(data)
        assert "config/notion.yaml" in info["files"]
        assert "config/email.yaml" in info["files"]

    def test_includes_extra_configs(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base)
        info = list_backup_contents(data)
        assert "config/llm.yaml" in info["files"]
        assert "config/resume_keywords.yaml" in info["files"]
        assert "config/server.yaml" in info["files"]

    def test_includes_db_by_default(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base)
        info = list_backup_contents(data)
        assert info["manifest"]["includes_db"] is True
        assert any(f.endswith(".db") for f in info["files"])

    def test_excludes_db_when_flag_false(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base, include_db=False)
        info = list_backup_contents(data)
        assert info["manifest"]["includes_db"] is False
        assert not any(f.endswith(".db") for f in info["files"])

    def test_silently_skips_missing_files(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        # tokens.yaml not created in fixture — should not raise
        data = create_backup(base)
        info = list_backup_contents(data)
        assert "config/tokens.yaml" not in info["files"]

    def test_manifest_contains_source_label(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base)
        info = list_backup_contents(data)
        assert info["manifest"]["source"] == "peregrine"

    def test_source_label_override(self, tmp_path):
        base = _make_instance(tmp_path, "peregrine")
        data = create_backup(base, source_label="custom-label")
        info = list_backup_contents(data)
        assert info["manifest"]["source"] == "custom-label"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Legacy instance (staging.db at repo root)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestLegacyInstance:
    """Backups taken from the legacy layout (staging.db at the repo root)."""

    def test_picks_up_root_db(self, tmp_path):
        legacy = _make_instance(tmp_path, "job-seeker", root_db=True)
        info = list_backup_contents(create_backup(legacy))
        archived = info["files"]
        assert "staging.db" in archived
        assert "data/staging.db" not in archived

    def test_source_label_is_job_seeker(self, tmp_path):
        legacy = _make_instance(tmp_path, "job-seeker", root_db=True)
        info = list_backup_contents(create_backup(legacy))
        assert info["manifest"]["source"] == "job-seeker"

    def test_missing_peregrine_only_configs_skipped(self, tmp_path):
        """Legacy doesn't have server.yaml, user.yaml, etc. — should not error."""
        legacy = _make_instance(tmp_path, "job-seeker", root_db=True)
        # Remove server.yaml to simulate legacy (it won't exist there)
        (legacy / "config" / "server.yaml").unlink()
        info = list_backup_contents(create_backup(legacy))
        assert "config/server.yaml" not in info["files"]
        assert "config/notion.yaml" in info["files"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# list_backup_contents
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestListBackupContents:
    """list_backup_contents(): summary dict shape and size accounting."""

    def test_returns_manifest_and_files(self, tmp_path):
        info = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        for key in ("manifest", "files", "sizes", "total_bytes"):
            assert key in info

    def test_total_bytes_is_sum_of_file_sizes(self, tmp_path):
        info = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        expected = sum(info["sizes"][f] for f in info["files"] if f in info["sizes"])
        assert info["total_bytes"] == expected

    def test_manifest_not_in_files_list(self, tmp_path):
        info = list_backup_contents(create_backup(_make_instance(tmp_path, "peregrine")))
        assert "backup-manifest.json" not in info["files"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# restore_backup
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestRestoreBackup:
    """restore_backup(): extraction, DB skipping, and overwrite semantics."""

    def _backup_and_dest(self, tmp_path):
        """Back up a fresh source instance; return (zip bytes, empty dest dir)."""
        src = _make_instance(tmp_path, "peregrine")
        dst = tmp_path / "restored"
        dst.mkdir()
        return create_backup(src), dst

    def test_restores_all_files(self, tmp_path):
        blob, dst = self._backup_and_dest(tmp_path)
        outcome = restore_backup(blob, dst)
        assert len(outcome["restored"]) > 0
        assert (dst / "config" / "notion.yaml").exists()

    def test_skips_db_when_flag_false(self, tmp_path):
        blob, dst = self._backup_and_dest(tmp_path)
        outcome = restore_backup(blob, dst, include_db=False)
        assert not any(f.endswith(".db") for f in outcome["restored"])
        assert any(f.endswith(".db") for f in outcome["skipped"])

    def test_no_overwrite_skips_existing(self, tmp_path):
        blob, dst = self._backup_and_dest(tmp_path)
        (dst / "config").mkdir()
        pre_existing = dst / "config" / "notion.yaml"
        pre_existing.write_text("original content")
        outcome = restore_backup(blob, dst, overwrite=False)
        assert "config/notion.yaml" in outcome["skipped"]
        assert pre_existing.read_text() == "original content"

    def test_overwrite_replaces_existing(self, tmp_path):
        blob, dst = self._backup_and_dest(tmp_path)
        (dst / "config").mkdir()
        (dst / "config" / "notion.yaml").write_text("stale content")
        restore_backup(blob, dst, overwrite=True)
        assert (dst / "config" / "notion.yaml").read_text() == "token: secret"

    def test_roundtrip_preserves_content(self, tmp_path):
        src = _make_instance(tmp_path, "peregrine")
        original = (src / "config" / "notion.yaml").read_text()
        dst = tmp_path / "restored"
        dst.mkdir()
        restore_backup(create_backup(src), dst)
        assert (dst / "config" / "notion.yaml").read_text() == original
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _detect_source_label
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestDetectSourceLabel:
    """_detect_source_label(): label is simply the root directory's name."""

    def test_returns_directory_name(self, tmp_path):
        root = tmp_path / "peregrine"
        root.mkdir()
        label = _detect_source_label(root)
        assert label == "peregrine"

    def test_legacy_label(self, tmp_path):
        root = tmp_path / "job-seeker"
        root.mkdir()
        label = _detect_source_label(root)
        assert label == "job-seeker"
|
||||
Loading…
Reference in a new issue