peregrine/scripts/backup.py
pyr0ball 636694edd1 feat: backup/restore script with multi-instance and legacy support
- create_backup() / restore_backup() / list_backup_contents() public API
- --base-dir PATH flag: targets any instance root (default: this repo)
  --base-dir /devl/job-seeker backs up the legacy Conda install
- _DB_CANDIDATES fallback: data/staging.db (Peregrine) or staging.db root (legacy)
- Manifest records source label (dir name), source_path, created_at, files, includes_db
- Added config/resume_keywords.yaml and config/server.yaml to backup lists
- 21 tests covering create, list, restore, legacy DB path, overwrite, roundtrip
2026-03-04 10:52:51 -08:00

277 lines
10 KiB
Python

"""Config backup / restore / teleport for Peregrine.
Creates a portable zip of all gitignored configs + optionally the staging DB.
Intended for: machine migrations, Docker volume transfers, and safe wizard testing.
Supports both the Peregrine Docker instance and the legacy /devl/job-seeker install.
Usage (CLI):
conda run -n job-seeker python scripts/backup.py --create backup.zip
conda run -n job-seeker python scripts/backup.py --create backup.zip --no-db
conda run -n job-seeker python scripts/backup.py --create backup.zip --base-dir /devl/job-seeker
conda run -n job-seeker python scripts/backup.py --restore backup.zip
conda run -n job-seeker python scripts/backup.py --list backup.zip
Usage (programmatic — called from Settings UI):
from scripts.backup import create_backup, restore_backup, list_backup_contents
zip_bytes = create_backup(base_dir, include_db=True)
info = list_backup_contents(zip_bytes)
result = restore_backup(zip_bytes, base_dir, include_db=True)
"""
from __future__ import annotations
import io
import json
import zipfile
from datetime import datetime
from pathlib import Path
# ---------------------------------------------------------------------------
# Files included in every backup (relative to repo root)
# ---------------------------------------------------------------------------
# Gitignored config files that hold secrets / personal data.
_SECRET_CONFIGS = [
    "config/notion.yaml",
    "config/tokens.yaml",
    "config/email.yaml",
    "config/adzuna.yaml",
    "config/craigslist.yaml",
    "config/user.yaml",
    "config/plain_text_resume.yaml",
    "config/license.json",
    "config/user.yaml.working",  # presumably a working copy of user.yaml — TODO confirm
]
# Gitignored integration configs (glob pattern — each matching file is added).
_INTEGRATION_CONFIG_GLOB = "config/integrations/*.yaml"
# Non-secret committed configs worth preserving for portability
# (also present in the legacy /devl/job-seeker instance).
_EXTRA_CONFIGS = [
    "config/llm.yaml",
    "config/search_profiles.yaml",
    "config/resume_keywords.yaml",  # personal keyword list — present in both instances
    "config/skills_suggestions.yaml",
    "config/blocklist.yaml",
    "config/server.yaml",  # deployment config (base URL path, port) — Peregrine only
]
# Candidate DB paths (first one that exists wins):
# data/staging.db is the Peregrine layout, staging.db the legacy root layout.
_DB_CANDIDATES = ["data/staging.db", "staging.db"]
# Name of the JSON manifest entry written into every archive.
_MANIFEST_NAME = "backup-manifest.json"
# ---------------------------------------------------------------------------
# Source detection
# ---------------------------------------------------------------------------
def _detect_source_label(base_dir: Path) -> str:
"""Return a human-readable label for the instance being backed up.
Uses the directory name — stable as long as the repo root isn't renamed,
which is the normal case for both the Docker install (peregrine/) and the
legacy Conda install (job-seeker/).
Args:
base_dir: The root directory being backed up.
Returns:
A short identifier string, e.g. "peregrine" or "job-seeker".
"""
return base_dir.name
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def create_backup(
    base_dir: Path,
    include_db: bool = True,
    source_label: str | None = None,
) -> bytes:
    """Return a portable zip archive of this instance's configs as raw bytes.

    Archive entry order is: secret configs, integration configs, extra
    configs, then (optionally) the staging DB, then the manifest.

    Args:
        base_dir: Repo root (parent of config/ and the staging DB).
        include_db: If True, include the first existing DB candidate
            (data/staging.db for Peregrine, staging.db for legacy).
        source_label: Human-readable instance name stored in the manifest
            (e.g. "peregrine", "job-seeker"). Auto-detected from the
            directory name if None.

    Returns:
        The zip file contents, including a backup-manifest.json entry.
    """
    buf = io.BytesIO()
    included: list[str] = []
    with zipfile.ZipFile(buf, "w", compression=zipfile.ZIP_DEFLATED) as zf:

        def _add_existing(rel_paths: list[str]) -> None:
            # Shared handling for the fixed-path config lists: archive
            # each file that exists, under its repo-relative name.
            for rel in rel_paths:
                p = base_dir / rel
                if p.exists():
                    zf.write(p, rel)
                    included.append(rel)

        # Gitignored secret configs
        _add_existing(_SECRET_CONFIGS)
        # Integration configs (glob). as_posix() keeps arcnames
        # forward-slashed so archives stay portable across platforms
        # (zip entry names must use "/", which str() breaks on Windows).
        for p in sorted(base_dir.glob(_INTEGRATION_CONFIG_GLOB)):
            rel = p.relative_to(base_dir).as_posix()
            zf.write(p, rel)
            included.append(rel)
        # Extra non-secret configs
        _add_existing(_EXTRA_CONFIGS)
        # Staging DB: first existing candidate wins.
        if include_db:
            for candidate in _DB_CANDIDATES:
                p = base_dir / candidate
                if p.exists():
                    zf.write(p, candidate)
                    included.append(candidate)
                    break
        # Manifest is written last so it reflects everything archived.
        manifest = {
            "created_at": datetime.now().isoformat(),
            "source": source_label or _detect_source_label(base_dir),
            "source_path": str(base_dir.resolve()),
            "peregrine_version": "1.0",
            "files": included,
            # True only when a DB file actually made it into the archive,
            # not merely when include_db was requested.
            "includes_db": include_db and any(f.endswith(".db") for f in included),
        }
        zf.writestr(_MANIFEST_NAME, json.dumps(manifest, indent=2))
    return buf.getvalue()
def list_backup_contents(zip_bytes: bytes) -> dict:
    """Summarise a backup zip without extracting anything.

    Args:
        zip_bytes: Raw bytes of a backup zip.

    Returns:
        Dict with the parsed manifest (empty dict when absent), the list
        of archived file names (manifest excluded), per-entry uncompressed
        sizes, and the total uncompressed byte count of the listed files.
    """
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        entries = zf.namelist()
        files = [name for name in entries if name != _MANIFEST_NAME]
        manifest: dict = {}
        if _MANIFEST_NAME in entries:
            manifest = json.loads(zf.read(_MANIFEST_NAME))
        sizes = {info.filename: info.file_size for info in zf.infolist()}
    return {
        "manifest": manifest,
        "files": files,
        "sizes": sizes,
        "total_bytes": sum(sizes[name] for name in files if name in sizes),
    }
def restore_backup(
    zip_bytes: bytes,
    base_dir: Path,
    include_db: bool = True,
    overwrite: bool = True,
) -> dict[str, list[str]]:
    """Extract a backup zip into base_dir.

    Entries whose names would escape base_dir — absolute paths or ".."
    traversal ("zip slip") — are skipped instead of written, since backup
    bytes may arrive from an upload rather than a trusted local file.
    Bare directory entries are ignored (parents are created on demand).

    Args:
        zip_bytes: Raw bytes of the backup zip.
        base_dir: Repo root to restore into.
        include_db: If False, skip any .db files.
        overwrite: If False, skip files that already exist.

    Returns:
        {"restored": [...], "skipped": [...]}
    """
    restored: list[str] = []
    skipped: list[str] = []
    root = base_dir.resolve()
    with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
        for name in zf.namelist():
            # Manifest is metadata only; trailing-slash names are
            # directory entries and carry no file data.
            if name == _MANIFEST_NAME or name.endswith("/"):
                continue
            if not include_db and name.endswith(".db"):
                skipped.append(name)
                continue
            dest = base_dir / name
            # Zip-slip guard: refuse any entry resolving outside base_dir.
            try:
                dest.resolve().relative_to(root)
            except ValueError:
                skipped.append(name)
                continue
            if dest.exists() and not overwrite:
                skipped.append(name)
                continue
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_bytes(zf.read(name))
            restored.append(name)
    return {"restored": restored, "skipped": skipped}
# ---------------------------------------------------------------------------
# CLI entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: dispatch to create / restore / list mode."""
    import argparse
    import sys

    parser = argparse.ArgumentParser(description="Peregrine config backup / restore / teleport")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--create", metavar="OUT.zip", help="Create a backup zip")
    group.add_argument("--restore", metavar="IN.zip", help="Restore from a backup zip")
    group.add_argument("--list", metavar="IN.zip", help="List contents of a backup zip")
    parser.add_argument("--no-db", action="store_true", help="Exclude staging.db (--create/--restore)")
    parser.add_argument("--no-overwrite", action="store_true",
                        help="Skip files that already exist (--restore)")
    parser.add_argument(
        "--base-dir", metavar="PATH",
        help="Root of the instance to back up/restore (default: this repo root). "
             "Use /devl/job-seeker to target the legacy Conda install.",
    )
    args = parser.parse_args()

    # Default instance root is this repo (scripts/ lives one level below it).
    if args.base_dir:
        base_dir = Path(args.base_dir).resolve()
    else:
        base_dir = Path(__file__).parent.parent

    def _input_path(raw: str) -> Path:
        # Shared existence check for --restore / --list inputs; exits on miss.
        path = Path(raw)
        if not path.exists():
            print(f"ERROR: {path} not found", file=sys.stderr)
            sys.exit(1)
        return path

    if args.create:
        out = Path(args.create)
        data = create_backup(base_dir, include_db=not args.no_db)
        out.write_bytes(data)
        info = list_backup_contents(data)
        m = info["manifest"]
        print(f"Backup created: {out} ({len(data):,} bytes)")
        print(f" Source: {m.get('source', '?')} ({base_dir})")
        print(f" {len(info['files'])} files archived:")
        for name in info["files"]:
            size = info["sizes"].get(name, 0)
            print(f" {name} ({size:,} bytes)")
    elif args.restore:
        data = _input_path(args.restore).read_bytes()
        result = restore_backup(data, base_dir,
                                include_db=not args.no_db,
                                overwrite=not args.no_overwrite)
        print(f"Restored {len(result['restored'])} files:")
        for name in result["restored"]:
            print(f"{name}")
        if result["skipped"]:
            print(f"Skipped {len(result['skipped'])} files:")
            for name in result["skipped"]:
                print(f" - {name}")
    elif args.list:
        data = _input_path(args.list).read_bytes()
        info = list_backup_contents(data)
        m = info["manifest"]
        if m:
            print(f"Created: {m.get('created_at', 'unknown')}")
            print(f"Source: {m.get('source', '?')} ({m.get('source_path', '?')})")
            print(f"Has DB: {m.get('includes_db', '?')}")
        print(f"\n{len(info['files'])} files ({info['total_bytes']:,} bytes uncompressed):")
        for name in info["files"]:
            size = info["sizes"].get(name, 0)
            print(f" {name} ({size:,} bytes)")
# Script entry: run the CLI only when executed directly, not when imported
# (e.g. by the Settings UI, which calls the public functions instead).
if __name__ == "__main__":
    main()