peregrine/scripts/preflight.py
pyr0ball f58063264b feat: add DUAL_GPU_MODE default, VRAM warning, and download size report to preflight
- Add _mixed_mode_vram_warning() to flag low VRAM on GPU 1 in mixed mode
- Wire download size report block into main() before closing border line
- Wire mixed-mode VRAM warning into report if triggered
- Write DUAL_GPU_MODE=ollama default to .env for new 2-GPU setups (no override if already set)
- Promote import os to top-level (was local import inside get_cpu_cores)
2026-02-27 00:17:00 -08:00

527 lines
22 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Peregrine preflight check.
Scans for port conflicts, assesses system resources (RAM / CPU / GPU),
recommends a Docker Compose profile, and calculates optional vLLM KV-cache
CPU offload when VRAM is tight. Writes resolved settings to .env so docker
compose picks them up automatically.
When a managed service (ollama, vllm, vision, searxng) is already running
on its configured port, preflight *adopts* it: the app is configured to reach
it via host.docker.internal, and a compose.override.yml is generated to
prevent Docker from starting a conflicting container.
Usage:
python scripts/preflight.py # full report + write .env
python scripts/preflight.py --check-only # report only, no .env write
python scripts/preflight.py --service streamlit # print resolved port, exit
python scripts/preflight.py --quiet # machine-readable, exit 0/1
Exit codes:
0 — all checks passed (or issues auto-resolved)
1 — manual action required (unresolvable port conflict on external service)
"""
import argparse
import os
import platform
import socket
import subprocess
import sys
from pathlib import Path
import yaml
# Repo root: this file lives in scripts/, so go up two levels.
ROOT = Path(__file__).parent.parent
USER_YAML = ROOT / "config" / "user.yaml"        # user-editable service/port config
LLM_YAML = ROOT / "config" / "llm.yaml"          # LLM backend base_url config (rewritten by preflight)
ENV_FILE = ROOT / ".env"                         # resolved settings consumed by docker compose
OVERRIDE_YML = ROOT / "compose.override.yml"     # auto-generated stub override for adopted services
# ── Service table ──────────────────────────────────────────────────────────────
# (yaml_key, default_port, env_var, docker_owned, adoptable)
#
# docker_owned — True if Docker Compose normally starts this service
# adoptable — True if an existing process on this port should be used instead
# of starting a Docker container (and the Docker service disabled)
_SERVICES: dict[str, tuple[str, int, str, bool, bool]] = {
    "streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False),
    "searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True),
    "vllm": ("vllm_port", 8000, "VLLM_PORT", True, True),
    "vision": ("vision_port", 8002, "VISION_PORT", True, True),
    "ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True),
    "ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
}
# LLM yaml backend keys → url suffix, keyed by service name
_LLM_BACKENDS: dict[str, list[tuple[str, str]]] = {
    "ollama": [("ollama", "/v1")],
    "ollama_research": [("ollama_research", "/v1")],
    # vllm serves both the main and research backends on the same endpoint
    "vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
    "vision": [("vision_service", "")],
}
# Docker-internal hostname:port for each service (when running in Docker)
_DOCKER_INTERNAL: dict[str, tuple[str, int]] = {
    "ollama": ("ollama", 11434),
    "ollama_research": ("ollama_research", 11434),  # container-internal port is always 11434
    "vllm": ("vllm", 8000),
    "vision": ("vision", 8002),
    "searxng": ("searxng", 8080),  # searxng internal port differs from host port
}
# ── System probes (stdlib only — no psutil) ───────────────────────────────────
def _sh(*cmd: str, timeout: int = 5) -> str:
try:
r = subprocess.run(list(cmd), capture_output=True, text=True, timeout=timeout)
return r.stdout.strip() if r.returncode == 0 else ""
except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
return ""
def get_ram_gb() -> tuple[float, float]:
    """Return (total_gb, available_gb). Returns (0, 0) if undetectable.

    Linux: parsed from /proc/meminfo (values are in kB).
    macOS: total from `sysctl hw.memsize`; available from `vm_stat`
    free + speculative page counts. The page size is read from vm_stat's
    header line instead of being hard-coded to 4096, because Apple Silicon
    Macs use 16 KiB pages — the old constant undercounted available RAM 4x.
    """
    os_name = platform.system()
    if os_name == "Linux":
        try:
            meminfo = Path("/proc/meminfo").read_text()
        except OSError:
            return 0.0, 0.0
        total = available = 0
        for line in meminfo.splitlines():
            if line.startswith("MemTotal:"):
                total = int(line.split()[1])       # kB
            elif line.startswith("MemAvailable:"):
                available = int(line.split()[1])   # kB
        return total / 1024 / 1024, available / 1024 / 1024
    elif os_name == "Darwin":
        total_bytes = _sh("sysctl", "-n", "hw.memsize")
        total = int(total_bytes) / 1024 ** 3 if total_bytes.isdigit() else 0.0
        vm = _sh("vm_stat")
        # Header: "Mach Virtual Memory Statistics: (page size of N bytes)"
        page_size = 4096  # fallback for unexpected output (Intel default)
        free_pages = 0
        for line in vm.splitlines():
            if "page size of" in line:
                digits = [tok for tok in line.split() if tok.isdigit()]
                if digits:
                    page_size = int(digits[0])
            elif "Pages free" in line or "Pages speculative" in line:
                try:
                    free_pages += int(line.split()[-1].rstrip("."))
                except ValueError:
                    pass
        available = free_pages * page_size / 1024 ** 3
        return total, available
    return 0.0, 0.0
def get_cpu_cores() -> int:
    """Return the number of logical CPU cores, falling back to 1."""
    cores = os.cpu_count()
    return cores if cores else 1
def get_gpus() -> list[dict]:
    """Return list of {name, vram_total_gb, vram_free_gb} via nvidia-smi."""
    raw = _sh(
        "nvidia-smi",
        "--query-gpu=name,memory.total,memory.free",
        "--format=csv,noheader,nounits",
    )
    gpus: list[dict] = []
    # One CSV row per GPU; nounits means the memory fields are plain MiB ints.
    for row in raw.splitlines():
        fields = [f.strip() for f in row.split(",")]
        if len(fields) != 3:
            continue
        try:
            total_mb, free_mb = int(fields[1]), int(fields[2])
        except ValueError:
            continue  # skip malformed rows rather than fail the probe
        gpus.append({
            "name": fields[0],
            "vram_total_gb": round(total_mb / 1024, 1),
            "vram_free_gb": round(free_mb / 1024, 1),
        })
    return gpus
# ── Port probes ───────────────────────────────────────────────────────────────
def _load_svc() -> dict:
    """Return the 'services' mapping from config/user.yaml, or {} if absent."""
    if not USER_YAML.exists():
        return {}
    parsed = yaml.safe_load(USER_YAML.read_text()) or {}
    return parsed.get("services", {})
def is_port_free(port: int) -> bool:
    """True when nothing on 127.0.0.1 accepts connections on *port*."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.settimeout(0.3)
        # connect_ex returns 0 on success, i.e. something is listening.
        outcome = probe.connect_ex(("127.0.0.1", port))
    return outcome != 0
def find_free_port(start: int, limit: int = 30) -> int:
    """Return the first free TCP port in [start, start + limit).

    Raises:
        RuntimeError: if every port in the scanned range is in use.
    """
    for port in range(start, start + limit):
        if is_port_free(port):
            return port
    # Fix: the original message ran the two endpoints together
    # ("80008029") — a separator character was missing/lost.
    raise RuntimeError(
        f"No free port found in range {start}-{start + limit - 1}"
    )
def check_ports(svc: dict) -> dict[str, dict]:
    """Resolve the final port for every managed service.

    For each _SERVICES entry: a free configured port is kept as-is; a busy
    port on an adoptable service is kept (the host process is adopted) and a
    spare "stub" port is chosen for the no-op container; a busy port on a
    non-adoptable service is reassigned to the next free one.
    """
    results: dict[str, dict] = {}
    for name, (yaml_key, default, env_var, docker_owned, adoptable) in _SERVICES.items():
        configured = int(svc.get(yaml_key, default))
        free = is_port_free(configured)
        external = False
        if free:
            # Port is free — Docker can start the service normally.
            resolved = stub_port = configured
        elif adoptable:
            # A compatible host service owns the port — adopt it.
            # resolved: actual external port (host.docker.internal URL)
            # stub_port: free port for the no-op stub container, avoiding
            # a binding conflict with the adopted process.
            resolved = configured
            stub_port = find_free_port(configured + 1)
            external = True
        else:
            # Busy and not adoptable (e.g. streamlit) — move elsewhere.
            resolved = stub_port = find_free_port(configured + 1)
        results[name] = {
            "configured": configured,
            "resolved": resolved,
            "stub_port": stub_port,
            "changed": resolved != configured,
            "docker_owned": docker_owned,
            "adoptable": adoptable,
            "free": free,
            "external": external,
            "env_var": env_var,
        }
    return results
# ── Recommendations ───────────────────────────────────────────────────────────
def recommend_profile(gpus: list[dict], ram_total_gb: float) -> str:
    """Pick a Docker Compose profile from detected GPUs and total RAM."""
    gpu_count = len(gpus)
    if gpu_count >= 2:
        return "dual-gpu"
    if gpu_count == 1:
        return "single-gpu"
    # No GPU: run locally on CPU only when there is a usable amount of RAM.
    return "cpu" if ram_total_gb >= 8 else "remote"
def calc_cpu_offload_gb(gpus: list[dict], ram_available_gb: float) -> int:
    """
    Suggest GBs of KV cache to offload from GPU VRAM → system RAM.

    Enabled when VRAM is tight (< 10 GB free on any GPU) and there is
    enough RAM headroom (> 4 GB available). Uses at most 25% of the
    RAM headroom above 4 GB, capped at 8 GB.
    """
    if not gpus:
        return 0
    if ram_available_gb < 4:
        return 0
    tightest = min(gpu["vram_free_gb"] for gpu in gpus)
    if tightest >= 10:
        return 0
    spare = ram_available_gb - 4.0  # keep 4 GB reserved for the OS
    return min(8, int(spare * 0.25))
def _download_size_mb(profile: str, dual_gpu_mode: str = "ollama") -> dict[str, int]:
"""
Return estimated first-run download sizes in MB, keyed by component name.
Profile-aware: only includes components that will actually be pulled.
"""
sizes: dict[str, int] = {
"searxng": 300,
"app": 1500,
}
if profile in ("cpu", "single-gpu", "dual-gpu"):
sizes["ollama"] = 800
sizes["llama3_2_3b"] = 2000
if profile in ("single-gpu", "dual-gpu"):
sizes["vision_image"] = 3000
sizes["moondream2"] = 1800
if profile == "dual-gpu" and dual_gpu_mode in ("vllm", "mixed"):
sizes["vllm_image"] = 10000
return sizes
def _mixed_mode_vram_warning(gpus: list[dict], dual_gpu_mode: str) -> str | None:
"""
Return a warning string if GPU 1 likely lacks VRAM for mixed mode, else None.
Only relevant when dual_gpu_mode == 'mixed' and at least 2 GPUs are present.
"""
if dual_gpu_mode != "mixed" or len(gpus) < 2:
return None
free = gpus[1]["vram_free_gb"]
if free < 12:
return (
f"⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {free:.1f} GB free — "
f"running ollama_research + vllm together may cause OOM. "
f"Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm."
)
return None
# ── Config writers ─────────────────────────────────────────────────────────────
def write_env(updates: dict[str, str]) -> None:
    """Merge *updates* into .env, preserving unrelated keys; output sorted by key."""
    merged: dict[str, str] = {}
    if ENV_FILE.exists():
        for raw in ENV_FILE.read_text().splitlines():
            entry = raw.strip()
            # Skip comments and anything that isn't KEY=VALUE.
            if entry.startswith("#") or "=" not in entry:
                continue
            key, _, value = entry.partition("=")
            merged[key.strip()] = value.strip()
    merged.update(updates)
    body = "\n".join(f"{k}={v}" for k, v in sorted(merged.items()))
    ENV_FILE.write_text(body + "\n")
def update_llm_yaml(ports: dict[str, dict]) -> None:
    """Rewrite base_url entries in config/llm.yaml to match adopted/internal services."""
    if not LLM_YAML.exists():
        return
    cfg = yaml.safe_load(LLM_YAML.read_text()) or {}
    backends = cfg.get("backends", {})
    dirty = False
    for svc_name, backend_list in _LLM_BACKENDS.items():
        info = ports.get(svc_name)
        if info is None:
            continue
        port = info["resolved"]
        if info["external"]:
            # Adopted host process — reach it from inside the container.
            host = f"host.docker.internal:{port}"
        elif svc_name in _DOCKER_INTERNAL:
            # Docker-managed — use the compose service name + internal port.
            docker_host, internal_port = _DOCKER_INTERNAL[svc_name]
            host = f"{docker_host}:{internal_port}"
        else:
            continue
        for backend_name, url_suffix in backend_list:
            if backend_name not in backends:
                continue
            new_url = f"http://{host}{url_suffix}"
            if backends[backend_name].get("base_url") != new_url:
                backends[backend_name]["base_url"] = new_url
                dirty = True
    # Only rewrite the file when something actually changed.
    if dirty:
        cfg["backends"] = backends
        LLM_YAML.write_text(
            yaml.dump(cfg, default_flow_style=False, allow_unicode=True,
                      sort_keys=False)
        )
def write_compose_override(ports: dict[str, dict]) -> None:
    """
    Generate compose.override.yml to stub out Docker services that are being
    adopted from external processes. Cleans up the file when nothing to disable.

    Stubbing strategy (not profiles): changing a service's profile to an unused
    value breaks depends_on references — Docker treats it as undefined. Instead
    we replace the service with a no-op stub that:
    - Stays alive (sleep infinity) so depends_on: service_started is satisfied
    - Reports healthy immediately so depends_on: service_healthy is satisfied
    - Binds no ports (no conflict with the external service on the host)
    """
    adopted = {
        name: info for name, info in ports.items()
        if info["external"] and info["docker_owned"]
    }
    if not adopted:
        # Nothing to stub — remove a stale override so compose runs everything.
        if OVERRIDE_YML.exists():
            OVERRIDE_YML.unlink()
        return
    out = [
        "# compose.override.yml — AUTO-GENERATED by preflight.py, do not edit manually.",
        "# Stubs out Docker services whose ports are already in use by host services.",
        "# Re-run preflight (make preflight) to regenerate after stopping host services.",
        "services:",
    ]
    for name, info in adopted.items():
        out += [
            f"  {name}:  # adopted — host service on :{info['resolved']}",
            '    entrypoint: ["/bin/sh", "-c", "sleep infinity"]',
            "    ports: []",
            "    healthcheck:",
            '      test: ["CMD", "true"]',
            "      interval: 1s",
            "      timeout: 1s",
            "      start_period: 0s",
            "      retries: 1",
        ]
    OVERRIDE_YML.write_text("\n".join(out) + "\n")
# ── Main ──────────────────────────────────────────────────────────────────────
def main() -> None:
    """CLI entry point.

    Prints the preflight report (ports, resources, recommendations,
    download-size estimates, VRAM warning), then — unless --check-only —
    writes .env, updates config/llm.yaml, and (re)generates
    compose.override.yml. Exits 1 only when an owned, non-adoptable port
    could not be resolved.
    """
    parser = argparse.ArgumentParser(description="Peregrine preflight check")
    parser.add_argument("--check-only", action="store_true",
                        help="Print report; don't write .env")
    parser.add_argument("--quiet", action="store_true",
                        help="Suppress output; rely on exit code")
    parser.add_argument("--service", metavar="NAME",
                        help="Print resolved port for one service and exit (e.g. streamlit)")
    args = parser.parse_args()
    svc = _load_svc()
    ports = check_ports(svc)
    # Single-service mode — used by manage.sh / manage-ui.sh
    if args.service:
        info = ports.get(args.service.lower())
        if info:
            print(info["resolved"])
        else:
            # Unknown service name: print the table default (8501 placeholder).
            _, default, *_ = _SERVICES.get(args.service.lower(), (None, 8501, None, None, None))
            print(default)
        return
    ram_total, ram_avail = get_ram_gb()
    cpu_cores = get_cpu_cores()
    gpus = get_gpus()
    profile = recommend_profile(gpus, ram_total)
    offload_gb = calc_cpu_offload_gb(gpus, ram_avail)
    if not args.quiet:
        print("╔══ Peregrine Preflight ══════════════════════════════╗")
        print("")
        print("║ Ports")
        for name, info in ports.items():
            if info["external"]:
                status = f"✓ adopted (using host service on :{info['resolved']})"
                tag = "extern"
            elif not info["docker_owned"]:
                status = "⚠ not responding" if info["free"] else "✓ reachable"
                tag = "extern"
            elif info["free"]:
                status = "✓ free"
                tag = "owned "
            elif info["changed"]:
                status = f"→ reassigned to :{info['resolved']}"
                tag = "owned "
            else:
                # Owned, busy, and not moved — this is the "stuck" case
                # that triggers exit code 1 below.
                status = "⚠ in use"
                tag = "owned "
            print(f"{name:<10} :{info['configured']} [{tag}] {status}")
        print("")
        print("║ Resources")
        print(f"║ CPU {cpu_cores} core{'s' if cpu_cores != 1 else ''}")
        if ram_total:
            print(f"║ RAM {ram_total:.0f} GB total / {ram_avail:.1f} GB available")
        else:
            print("║ RAM (undetectable)")
        if gpus:
            for i, g in enumerate(gpus):
                print(f"║ GPU {i} {g['name']}"
                      f"{g['vram_free_gb']:.1f} / {g['vram_total_gb']:.0f} GB VRAM free")
        else:
            print("║ GPU none detected")
        print("")
        print("║ Recommendations")
        print(f"║ Docker profile {profile}")
        if offload_gb > 0:
            print(f"║ vLLM KV offload {offload_gb} GB → RAM (CPU_OFFLOAD_GB={offload_gb})")
        else:
            print("║ vLLM KV offload not needed")
        reassigned = [n for n, i in ports.items() if i["changed"]]
        adopted = [n for n, i in ports.items() if i["external"]]
        if reassigned:
            print("")
            print("║ Port reassignments written to .env:")
            for name in reassigned:
                info = ports[name]
                print(f"{info['env_var']}={info['resolved']} (was :{info['configured']})")
        if adopted:
            print("")
            print("║ Adopted external services (Docker containers disabled):")
            for name in adopted:
                info = ports[name]
                print(f"{name} :{info['resolved']} → app will use host.docker.internal:{info['resolved']}")
        # ── Download size warning ──────────────────────────────────────────────
        # NOTE(review): mode is read from the process environment only; a
        # DUAL_GPU_MODE written to .env by a previous run is not re-read here
        # unless the shell exports it — confirm whether that is intended.
        dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
        sizes = _download_size_mb(profile, dual_gpu_mode)
        total_mb = sum(sizes.values())
        print("")
        print("║ Download sizes (first-run estimates)")
        print("║ Docker images")
        print(f"║ app (Python build) ~{sizes.get('app', 0):,} MB")
        if "searxng" in sizes:
            print(f"║ searxng/searxng ~{sizes['searxng']:,} MB")
        if "ollama" in sizes:
            shared_note = " (shared by ollama + ollama_research)" if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed") else ""
            print(f"║ ollama/ollama ~{sizes['ollama']:,} MB{shared_note}")
        if "vision_image" in sizes:
            print(f"║ vision service ~{sizes['vision_image']:,} MB (torch + moondream)")
        if "vllm_image" in sizes:
            print(f"║ vllm/vllm-openai ~{sizes['vllm_image']:,} MB")
        print("║ Model weights (lazy-loaded on first use)")
        if "llama3_2_3b" in sizes:
            print(f"║ llama3.2:3b ~{sizes['llama3_2_3b']:,} MB → OLLAMA_MODELS_DIR")
        if "moondream2" in sizes:
            print(f"║ moondream2 ~{sizes['moondream2']:,} MB → vision container cache")
        if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed"):
            print("║ Note: ollama + ollama_research share model dir — no double download")
        print(f"║ ⚠ Total first-run: ~{total_mb / 1024:.1f} GB (models persist between restarts)")
        # ── Mixed-mode VRAM warning ────────────────────────────────────────────
        vram_warn = _mixed_mode_vram_warning(gpus, dual_gpu_mode)
        if vram_warn:
            print("")
            print(f"{vram_warn}")
        print("╚════════════════════════════════════════════════════╝")
    if not args.check_only:
        # For adopted services, write stub_port to .env so the no-op container
        # binds a harmless free port instead of conflicting with the external service.
        env_updates: dict[str, str] = {i["env_var"]: str(i["stub_port"]) for i in ports.values()}
        env_updates["RECOMMENDED_PROFILE"] = profile
        if offload_gb > 0:
            env_updates["CPU_OFFLOAD_GB"] = str(offload_gb)
        # GPU info for the app container (which lacks nvidia-smi access)
        env_updates["PEREGRINE_GPU_COUNT"] = str(len(gpus))
        env_updates["PEREGRINE_GPU_NAMES"] = ",".join(g["name"] for g in gpus)
        # Write DUAL_GPU_MODE default for new 2-GPU setups (don't override user's choice)
        if len(gpus) >= 2:
            existing_env: dict[str, str] = {}
            if ENV_FILE.exists():
                for line in ENV_FILE.read_text().splitlines():
                    if "=" in line and not line.startswith("#"):
                        k, _, v = line.partition("=")
                        existing_env[k.strip()] = v.strip()
            if "DUAL_GPU_MODE" not in existing_env:
                env_updates["DUAL_GPU_MODE"] = "ollama"
        write_env(env_updates)
        update_llm_yaml(ports)
        write_compose_override(ports)
        if not args.quiet:
            artifacts = [str(ENV_FILE.relative_to(ROOT))]
            if OVERRIDE_YML.exists():
                artifacts.append(str(OVERRIDE_YML.relative_to(ROOT)))
            print(f" wrote {', '.join(artifacts)}")
    # Fail only when a non-adoptable owned port couldn't be reassigned
    stuck = [n for n, i in ports.items()
             if not i["free"] and not i["external"] and not i["changed"]]
    sys.exit(1 if stuck else 0)
# Script entry — run the full preflight when invoked directly.
if __name__ == "__main__":
    main()