When preflight.py adopts a host-running Ollama (or ollama_research) service, write OLLAMA_HOST (and OLLAMA_RESEARCH_HOST) into .env using host.docker.internal so LLMRouter's env-var auto-config resolves the correct address from inside the Docker container without requiring a config/llm.yaml to exist.
536 lines
22 KiB
Python
536 lines
22 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Peregrine preflight check.
|
||
|
||
Scans for port conflicts, assesses system resources (RAM / CPU / GPU),
|
||
recommends a Docker Compose profile, and calculates optional vLLM KV-cache
|
||
CPU offload when VRAM is tight. Writes resolved settings to .env so docker
|
||
compose picks them up automatically.
|
||
|
||
When a managed service (ollama, vllm, vision, searxng) is already running
|
||
on its configured port, preflight *adopts* it: the app is configured to reach
|
||
it via host.docker.internal, and a compose.override.yml is generated to
|
||
prevent Docker from starting a conflicting container.
|
||
|
||
Usage:
|
||
python scripts/preflight.py # full report + write .env
|
||
python scripts/preflight.py --check-only # report only, no .env write
|
||
python scripts/preflight.py --service streamlit # print resolved port, exit
|
||
python scripts/preflight.py --quiet # machine-readable, exit 0/1
|
||
|
||
Exit codes:
|
||
0 — all checks passed (or issues auto-resolved)
|
||
1 — manual action required (unresolvable port conflict on external service)
|
||
"""
|
||
import argparse
|
||
import os
|
||
import platform
|
||
import socket
|
||
import subprocess
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
import yaml
|
||
|
||
ROOT = Path(__file__).parent.parent
|
||
USER_YAML = ROOT / "config" / "user.yaml"
|
||
LLM_YAML = ROOT / "config" / "llm.yaml"
|
||
ENV_FILE = ROOT / ".env"
|
||
OVERRIDE_YML = ROOT / "compose.override.yml"
|
||
|
||
# ── Service table ──────────────────────────────────────────────────────────────
|
||
# (yaml_key, default_port, env_var, docker_owned, adoptable)
|
||
#
|
||
# docker_owned — True if Docker Compose normally starts this service
|
||
# adoptable — True if an existing process on this port should be used instead
|
||
# of starting a Docker container (and the Docker service disabled)
|
||
_SERVICES: dict[str, tuple[str, int, str, bool, bool]] = {
|
||
"streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False),
|
||
"searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True),
|
||
# vllm removed — now managed by cf-orch (host process), not a Docker service
|
||
"vision": ("vision_port", 8002, "VISION_PORT", True, True),
|
||
"ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True),
|
||
"ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
|
||
}
|
||
|
||
# LLM yaml backend keys → url suffix, keyed by service name
|
||
_LLM_BACKENDS: dict[str, list[tuple[str, str]]] = {
|
||
"ollama": [("ollama", "/v1")],
|
||
"ollama_research": [("ollama_research", "/v1")],
|
||
"vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
|
||
"vision": [("vision_service", "")],
|
||
}
|
||
|
||
# Docker-internal hostname:port for each service (when running in Docker)
|
||
_DOCKER_INTERNAL: dict[str, tuple[str, int]] = {
|
||
"ollama": ("ollama", 11434),
|
||
"ollama_research": ("ollama_research", 11434), # container-internal port is always 11434
|
||
"vision": ("vision", 8002),
|
||
"searxng": ("searxng", 8080), # searxng internal port differs from host port
|
||
}
|
||
|
||
|
||
# ── System probes (stdlib only — no psutil) ───────────────────────────────────
|
||
|
||
def _sh(*cmd: str, timeout: int = 5) -> str:
|
||
try:
|
||
r = subprocess.run(list(cmd), capture_output=True, text=True, timeout=timeout)
|
||
return r.stdout.strip() if r.returncode == 0 else ""
|
||
except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
|
||
return ""
|
||
|
||
|
||
def get_ram_gb() -> tuple[float, float]:
|
||
"""Return (total_gb, available_gb). Returns (0, 0) if undetectable."""
|
||
os_name = platform.system()
|
||
if os_name == "Linux":
|
||
try:
|
||
meminfo = Path("/proc/meminfo").read_text()
|
||
except OSError:
|
||
return 0.0, 0.0
|
||
total = available = 0
|
||
for line in meminfo.splitlines():
|
||
if line.startswith("MemTotal:"):
|
||
total = int(line.split()[1])
|
||
elif line.startswith("MemAvailable:"):
|
||
available = int(line.split()[1])
|
||
return total / 1024 / 1024, available / 1024 / 1024
|
||
elif os_name == "Darwin":
|
||
total_bytes = _sh("sysctl", "-n", "hw.memsize")
|
||
total = int(total_bytes) / 1024 ** 3 if total_bytes.isdigit() else 0.0
|
||
vm = _sh("vm_stat")
|
||
free_pages = 0
|
||
for line in vm.splitlines():
|
||
if "Pages free" in line or "Pages speculative" in line:
|
||
try:
|
||
free_pages += int(line.split()[-1].rstrip("."))
|
||
except ValueError:
|
||
pass
|
||
available = free_pages * 4096 / 1024 ** 3
|
||
return total, available
|
||
return 0.0, 0.0
|
||
|
||
|
||
def get_cpu_cores() -> int:
|
||
return os.cpu_count() or 1
|
||
|
||
|
||
def get_gpus() -> list[dict]:
|
||
"""Return list of {name, vram_total_gb, vram_free_gb} via nvidia-smi."""
|
||
out = _sh(
|
||
"nvidia-smi",
|
||
"--query-gpu=name,memory.total,memory.free",
|
||
"--format=csv,noheader,nounits",
|
||
)
|
||
if not out:
|
||
return []
|
||
gpus = []
|
||
for line in out.splitlines():
|
||
parts = [p.strip() for p in line.split(",")]
|
||
if len(parts) == 3:
|
||
try:
|
||
gpus.append({
|
||
"name": parts[0],
|
||
"vram_total_gb": round(int(parts[1]) / 1024, 1),
|
||
"vram_free_gb": round(int(parts[2]) / 1024, 1),
|
||
})
|
||
except ValueError:
|
||
pass
|
||
return gpus
|
||
|
||
|
||
# ── Port probes ───────────────────────────────────────────────────────────────
|
||
|
||
def _load_svc() -> dict:
|
||
if USER_YAML.exists():
|
||
return (yaml.safe_load(USER_YAML.read_text()) or {}).get("services", {})
|
||
return {}
|
||
|
||
|
||
def is_port_free(port: int) -> bool:
|
||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||
s.settimeout(0.3)
|
||
return s.connect_ex(("127.0.0.1", port)) != 0
|
||
|
||
|
||
def find_free_port(start: int, limit: int = 30) -> int:
|
||
for p in range(start, start + limit):
|
||
if is_port_free(p):
|
||
return p
|
||
raise RuntimeError(f"No free port found in range {start}–{start + limit - 1}")
|
||
|
||
|
||
def check_ports(svc: dict) -> dict[str, dict]:
|
||
results = {}
|
||
for name, (yaml_key, default, env_var, docker_owned, adoptable) in _SERVICES.items():
|
||
configured = int(svc.get(yaml_key, default))
|
||
free = is_port_free(configured)
|
||
|
||
if free:
|
||
# Port is free — start Docker service as normal
|
||
resolved = configured
|
||
stub_port = configured
|
||
external = False
|
||
elif adoptable:
|
||
# Port is in use by a compatible service — adopt it.
|
||
# resolved = actual external port (used for host.docker.internal URL)
|
||
# stub_port = free port for the no-op stub container (avoids binding conflict)
|
||
resolved = configured
|
||
stub_port = find_free_port(configured + 1)
|
||
external = True
|
||
else:
|
||
# Port in use, not adoptable (e.g. streamlit) — reassign
|
||
resolved = find_free_port(configured + 1)
|
||
stub_port = resolved
|
||
external = False
|
||
|
||
results[name] = {
|
||
"configured": configured,
|
||
"resolved": resolved,
|
||
"stub_port": stub_port,
|
||
"changed": resolved != configured,
|
||
"docker_owned": docker_owned,
|
||
"adoptable": adoptable,
|
||
"free": free,
|
||
"external": external,
|
||
"env_var": env_var,
|
||
}
|
||
return results
|
||
|
||
|
||
# ── Recommendations ───────────────────────────────────────────────────────────
|
||
|
||
def recommend_profile(gpus: list[dict], ram_total_gb: float) -> str:
|
||
if len(gpus) >= 2:
|
||
return "dual-gpu"
|
||
if len(gpus) == 1:
|
||
return "single-gpu"
|
||
if ram_total_gb >= 8:
|
||
return "cpu"
|
||
return "remote"
|
||
|
||
|
||
def calc_cpu_offload_gb(gpus: list[dict], ram_available_gb: float) -> int:
|
||
"""
|
||
Suggest GBs of KV cache to offload from GPU VRAM → system RAM.
|
||
|
||
Enabled when VRAM is tight (< 10 GB free on any GPU) and there is
|
||
enough RAM headroom (> 4 GB available). Uses at most 25% of the
|
||
RAM headroom above 4 GB, capped at 8 GB.
|
||
"""
|
||
if not gpus or ram_available_gb < 4:
|
||
return 0
|
||
min_vram_free = min(g["vram_free_gb"] for g in gpus)
|
||
if min_vram_free >= 10:
|
||
return 0
|
||
headroom = ram_available_gb - 4.0 # reserve 4 GB for OS
|
||
return min(int(headroom * 0.25), 8)
|
||
|
||
|
||
def _download_size_mb(profile: str, dual_gpu_mode: str = "ollama") -> dict[str, int]:
|
||
"""
|
||
Return estimated first-run download sizes in MB, keyed by component name.
|
||
Profile-aware: only includes components that will actually be pulled.
|
||
"""
|
||
sizes: dict[str, int] = {
|
||
"searxng": 300,
|
||
"app": 1500,
|
||
}
|
||
if profile in ("cpu", "single-gpu", "dual-gpu"):
|
||
sizes["ollama"] = 800
|
||
sizes["llama3_2_3b"] = 2000
|
||
if profile in ("single-gpu", "dual-gpu"):
|
||
sizes["vision_image"] = 3000
|
||
sizes["moondream2"] = 1800
|
||
if profile == "dual-gpu" and dual_gpu_mode in ("vllm", "mixed"):
|
||
sizes["vllm_image"] = 10000
|
||
return sizes
|
||
|
||
|
||
def _mixed_mode_vram_warning(gpus: list[dict], dual_gpu_mode: str) -> str | None:
|
||
"""
|
||
Return a warning string if GPU 1 likely lacks VRAM for mixed mode, else None.
|
||
Only relevant when dual_gpu_mode == 'mixed' and at least 2 GPUs are present.
|
||
"""
|
||
if dual_gpu_mode != "mixed" or len(gpus) < 2:
|
||
return None
|
||
free = gpus[1]["vram_free_gb"]
|
||
if free < 12:
|
||
return (
|
||
f"⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {free:.1f} GB free — "
|
||
f"running ollama_research + vllm together may cause OOM. "
|
||
f"Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm."
|
||
)
|
||
return None
|
||
|
||
|
||
# ── Config writers ─────────────────────────────────────────────────────────────
|
||
|
||
def write_env(updates: dict[str, str]) -> None:
|
||
existing: dict[str, str] = {}
|
||
if ENV_FILE.exists():
|
||
for line in ENV_FILE.read_text().splitlines():
|
||
line = line.strip()
|
||
if "=" in line and not line.startswith("#"):
|
||
k, _, v = line.partition("=")
|
||
existing[k.strip()] = v.strip()
|
||
existing.update(updates)
|
||
ENV_FILE.write_text(
|
||
"\n".join(f"{k}={v}" for k, v in sorted(existing.items())) + "\n"
|
||
)
|
||
|
||
|
||
def update_llm_yaml(ports: dict[str, dict]) -> None:
|
||
"""Rewrite base_url entries in config/llm.yaml to match adopted/internal services."""
|
||
if not LLM_YAML.exists():
|
||
return
|
||
cfg = yaml.safe_load(LLM_YAML.read_text()) or {}
|
||
backends = cfg.get("backends", {})
|
||
changed = False
|
||
|
||
for svc_name, backend_list in _LLM_BACKENDS.items():
|
||
if svc_name not in ports:
|
||
continue
|
||
info = ports[svc_name]
|
||
port = info["resolved"]
|
||
|
||
if info["external"]:
|
||
# Reach the host service from inside the Docker container
|
||
host = f"host.docker.internal:{port}"
|
||
elif svc_name in _DOCKER_INTERNAL:
|
||
# Use Docker service name + internal port
|
||
docker_host, internal_port = _DOCKER_INTERNAL[svc_name]
|
||
host = f"{docker_host}:{internal_port}"
|
||
else:
|
||
continue
|
||
|
||
for backend_name, url_suffix in backend_list:
|
||
if backend_name in backends:
|
||
new_url = f"http://{host}{url_suffix}"
|
||
if backends[backend_name].get("base_url") != new_url:
|
||
backends[backend_name]["base_url"] = new_url
|
||
changed = True
|
||
|
||
if changed:
|
||
cfg["backends"] = backends
|
||
LLM_YAML.write_text(yaml.dump(cfg, default_flow_style=False, allow_unicode=True,
|
||
sort_keys=False))
|
||
|
||
|
||
def write_compose_override(ports: dict[str, dict]) -> None:
|
||
"""
|
||
Generate compose.override.yml to stub out Docker services that are being
|
||
adopted from external processes. Cleans up the file when nothing to disable.
|
||
|
||
Stubbing strategy (not profiles): changing a service's profile to an unused
|
||
value breaks depends_on references — Docker treats it as undefined. Instead
|
||
we replace the service with a no-op stub that:
|
||
- Stays alive (sleep infinity) so depends_on: service_started is satisfied
|
||
- Reports healthy immediately so depends_on: service_healthy is satisfied
|
||
- Binds no ports (no conflict with the external service on the host)
|
||
"""
|
||
to_disable = {
|
||
name: info for name, info in ports.items()
|
||
if info["external"] and info["docker_owned"]
|
||
}
|
||
|
||
if not to_disable:
|
||
if OVERRIDE_YML.exists():
|
||
OVERRIDE_YML.unlink()
|
||
return
|
||
|
||
lines = [
|
||
"# compose.override.yml — AUTO-GENERATED by preflight.py, do not edit manually.",
|
||
"# Stubs out Docker services whose ports are already in use by host services.",
|
||
"# Re-run preflight (make preflight) to regenerate after stopping host services.",
|
||
"services:",
|
||
]
|
||
for name, info in to_disable.items():
|
||
lines += [
|
||
f" {name}: # adopted — host service on :{info['resolved']}",
|
||
f" entrypoint: [\"/bin/sh\", \"-c\", \"sleep infinity\"]",
|
||
f" ports: []",
|
||
f" healthcheck:",
|
||
f" test: [\"CMD\", \"true\"]",
|
||
f" interval: 1s",
|
||
f" timeout: 1s",
|
||
f" start_period: 0s",
|
||
f" retries: 1",
|
||
]
|
||
|
||
OVERRIDE_YML.write_text("\n".join(lines) + "\n")
|
||
|
||
|
||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||
|
||
def main() -> None:
|
||
parser = argparse.ArgumentParser(description="Peregrine preflight check")
|
||
parser.add_argument("--check-only", action="store_true",
|
||
help="Print report; don't write .env")
|
||
parser.add_argument("--quiet", action="store_true",
|
||
help="Suppress output; rely on exit code")
|
||
parser.add_argument("--service", metavar="NAME",
|
||
help="Print resolved port for one service and exit (e.g. streamlit)")
|
||
args = parser.parse_args()
|
||
|
||
svc = _load_svc()
|
||
ports = check_ports(svc)
|
||
|
||
# Single-service mode — used by manage.sh / manage-ui.sh
|
||
if args.service:
|
||
info = ports.get(args.service.lower())
|
||
if info:
|
||
print(info["resolved"])
|
||
else:
|
||
_, default, *_ = _SERVICES.get(args.service.lower(), (None, 8501, None, None, None))
|
||
print(default)
|
||
return
|
||
|
||
ram_total, ram_avail = get_ram_gb()
|
||
cpu_cores = get_cpu_cores()
|
||
gpus = get_gpus()
|
||
profile = recommend_profile(gpus, ram_total)
|
||
offload_gb = calc_cpu_offload_gb(gpus, ram_avail)
|
||
|
||
if not args.quiet:
|
||
print("╔══ Peregrine Preflight ══════════════════════════════╗")
|
||
print("║")
|
||
print("║ Ports")
|
||
for name, info in ports.items():
|
||
if info["external"]:
|
||
status = f"✓ adopted (using host service on :{info['resolved']})"
|
||
tag = "extern"
|
||
elif not info["docker_owned"]:
|
||
status = "⚠ not responding" if info["free"] else "✓ reachable"
|
||
tag = "extern"
|
||
elif info["free"]:
|
||
status = "✓ free"
|
||
tag = "owned "
|
||
elif info["changed"]:
|
||
status = f"→ reassigned to :{info['resolved']}"
|
||
tag = "owned "
|
||
else:
|
||
status = "⚠ in use"
|
||
tag = "owned "
|
||
print(f"║ {name:<10} :{info['configured']} [{tag}] {status}")
|
||
|
||
print("║")
|
||
print("║ Resources")
|
||
print(f"║ CPU {cpu_cores} core{'s' if cpu_cores != 1 else ''}")
|
||
if ram_total:
|
||
print(f"║ RAM {ram_total:.0f} GB total / {ram_avail:.1f} GB available")
|
||
else:
|
||
print("║ RAM (undetectable)")
|
||
if gpus:
|
||
for i, g in enumerate(gpus):
|
||
print(f"║ GPU {i} {g['name']} — "
|
||
f"{g['vram_free_gb']:.1f} / {g['vram_total_gb']:.0f} GB VRAM free")
|
||
else:
|
||
print("║ GPU none detected")
|
||
|
||
print("║")
|
||
print("║ Recommendations")
|
||
print(f"║ Docker profile {profile}")
|
||
if offload_gb > 0:
|
||
print(f"║ vLLM KV offload {offload_gb} GB → RAM (CPU_OFFLOAD_GB={offload_gb})")
|
||
else:
|
||
print("║ vLLM KV offload not needed")
|
||
|
||
reassigned = [n for n, i in ports.items() if i["changed"]]
|
||
adopted = [n for n, i in ports.items() if i["external"]]
|
||
|
||
if reassigned:
|
||
print("║")
|
||
print("║ Port reassignments written to .env:")
|
||
for name in reassigned:
|
||
info = ports[name]
|
||
print(f"║ {info['env_var']}={info['resolved']} (was :{info['configured']})")
|
||
|
||
if adopted:
|
||
print("║")
|
||
print("║ Adopted external services (Docker containers disabled):")
|
||
for name in adopted:
|
||
info = ports[name]
|
||
print(f"║ {name} :{info['resolved']} → app will use host.docker.internal:{info['resolved']}")
|
||
|
||
# ── Download size warning ──────────────────────────────────────────────
|
||
dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
|
||
sizes = _download_size_mb(profile, dual_gpu_mode)
|
||
total_mb = sum(sizes.values())
|
||
print("║")
|
||
print("║ Download sizes (first-run estimates)")
|
||
print("║ Docker images")
|
||
print(f"║ app (Python build) ~{sizes.get('app', 0):,} MB")
|
||
if "searxng" in sizes:
|
||
print(f"║ searxng/searxng ~{sizes['searxng']:,} MB")
|
||
if "ollama" in sizes:
|
||
shared_note = " (shared by ollama + ollama_research)" if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed") else ""
|
||
print(f"║ ollama/ollama ~{sizes['ollama']:,} MB{shared_note}")
|
||
if "vision_image" in sizes:
|
||
print(f"║ vision service ~{sizes['vision_image']:,} MB (torch + moondream)")
|
||
if "vllm_image" in sizes:
|
||
print(f"║ vllm/vllm-openai ~{sizes['vllm_image']:,} MB")
|
||
print("║ Model weights (lazy-loaded on first use)")
|
||
if "llama3_2_3b" in sizes:
|
||
print(f"║ llama3.2:3b ~{sizes['llama3_2_3b']:,} MB → OLLAMA_MODELS_DIR")
|
||
if "moondream2" in sizes:
|
||
print(f"║ moondream2 ~{sizes['moondream2']:,} MB → vision container cache")
|
||
if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed"):
|
||
print("║ Note: ollama + ollama_research share model dir — no double download")
|
||
print(f"║ ⚠ Total first-run: ~{total_mb / 1024:.1f} GB (models persist between restarts)")
|
||
|
||
# ── Mixed-mode VRAM warning ────────────────────────────────────────────
|
||
vram_warn = _mixed_mode_vram_warning(gpus, dual_gpu_mode)
|
||
if vram_warn:
|
||
print("║")
|
||
print(f"║ {vram_warn}")
|
||
|
||
print("╚════════════════════════════════════════════════════╝")
|
||
|
||
if not args.check_only:
|
||
# For adopted services, write stub_port to .env so the no-op container
|
||
# binds a harmless free port instead of conflicting with the external service.
|
||
env_updates: dict[str, str] = {i["env_var"]: str(i["stub_port"]) for i in ports.values()}
|
||
env_updates["RECOMMENDED_PROFILE"] = profile
|
||
# When Ollama is adopted from the host process, write OLLAMA_HOST so
|
||
# LLMRouter's env-var auto-config finds it without needing config/llm.yaml.
|
||
ollama_info = ports.get("ollama")
|
||
if ollama_info and ollama_info.get("external"):
|
||
env_updates["OLLAMA_HOST"] = f"http://host.docker.internal:{ollama_info['resolved']}"
|
||
|
||
ollama_research_info = ports.get("ollama_research")
|
||
if ollama_research_info and ollama_research_info.get("external"):
|
||
env_updates["OLLAMA_RESEARCH_HOST"] = f"http://host.docker.internal:{ollama_research_info['resolved']}"
|
||
|
||
if offload_gb > 0:
|
||
env_updates["CPU_OFFLOAD_GB"] = str(offload_gb)
|
||
# GPU info for the app container (which lacks nvidia-smi access)
|
||
env_updates["PEREGRINE_GPU_COUNT"] = str(len(gpus))
|
||
env_updates["PEREGRINE_GPU_NAMES"] = ",".join(g["name"] for g in gpus)
|
||
# Write DUAL_GPU_MODE default for new 2-GPU setups (don't override user's choice)
|
||
if len(gpus) >= 2:
|
||
existing_env: dict[str, str] = {}
|
||
if ENV_FILE.exists():
|
||
for line in ENV_FILE.read_text().splitlines():
|
||
if "=" in line and not line.startswith("#"):
|
||
k, _, v = line.partition("=")
|
||
existing_env[k.strip()] = v.strip()
|
||
if "DUAL_GPU_MODE" not in existing_env:
|
||
env_updates["DUAL_GPU_MODE"] = "ollama"
|
||
write_env(env_updates)
|
||
update_llm_yaml(ports)
|
||
write_compose_override(ports)
|
||
if not args.quiet:
|
||
artifacts = [str(ENV_FILE.relative_to(ROOT))]
|
||
if OVERRIDE_YML.exists():
|
||
artifacts.append(str(OVERRIDE_YML.relative_to(ROOT)))
|
||
print(f" wrote {', '.join(artifacts)}")
|
||
|
||
# Fail only when a non-adoptable owned port couldn't be reassigned
|
||
stuck = [n for n, i in ports.items()
|
||
if not i["free"] and not i["external"] and not i["changed"]]
|
||
sys.exit(1 if stuck else 0)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|