feat: startup preflight — port collision avoidance + resource checks
scripts/preflight.py (stdlib-only, no psutil):
- Port probing: owned services auto-reassign to next free port; external
services (Ollama) show ✓ reachable / ⚠ not responding
- System resources: CPU cores, RAM (total + available), GPU VRAM via
nvidia-smi; works on Linux + macOS
- Profile recommendation: remote / cpu / single-gpu / dual-gpu
- vLLM KV cache offload: calculates CPU_OFFLOAD_GB when VRAM < 10 GB
free and RAM headroom > 4 GB (uses up to 25% of available headroom)
- Writes resolved values to .env for docker compose; single-service mode
(--service streamlit) for scripted port queries
- Exit 0 unless an owned port genuinely can't be resolved
scripts/manage-ui.sh:
- Calls preflight.py --service streamlit before bind; falls back to
pure-bash port scan if Python/yaml unavailable
compose.yml:
- vllm command: adds --cpu-offload-gb ${CPU_OFFLOAD_GB:-0}
Makefile:
- start / restart depend on preflight target
- PYTHON variable for env portability
- test target uses PYTHON variable
This commit is contained in:
parent
c7fb9a00f1
commit
e332b8a069
4 changed files with 339 additions and 5 deletions
14
Makefile
14
Makefile
|
|
@ -1,27 +1,31 @@
|
|||
# Makefile — Peregrine convenience targets
|
||||
# Usage: make <target>
|
||||
|
||||
.PHONY: setup start stop restart logs test clean
|
||||
.PHONY: setup preflight start stop restart logs test clean help
|
||||
|
||||
PROFILE ?= remote
|
||||
PYTHON ?= python3
|
||||
|
||||
setup: ## Install dependencies (Docker, NVIDIA toolkit)
|
||||
@bash setup.sh
|
||||
|
||||
start: ## Start Peregrine (PROFILE=remote|cpu|single-gpu|dual-gpu)
|
||||
preflight: ## Check ports + system resources; write .env
|
||||
@$(PYTHON) scripts/preflight.py
|
||||
|
||||
start: preflight ## Preflight check then start Peregrine (PROFILE=remote|cpu|single-gpu|dual-gpu)
|
||||
docker compose --profile $(PROFILE) up -d
|
||||
|
||||
stop: ## Stop all Peregrine services
|
||||
docker compose down
|
||||
|
||||
restart: ## Restart all services
|
||||
restart: preflight ## Preflight check then restart all services
|
||||
docker compose down && docker compose --profile $(PROFILE) up -d
|
||||
|
||||
logs: ## Tail app logs
|
||||
docker compose logs -f app
|
||||
|
||||
test: ## Run the test suite (requires conda env)
|
||||
/devl/miniconda3/envs/job-seeker/bin/pytest tests/ -v
|
||||
test: ## Run the test suite
|
||||
$(PYTHON) -m pytest tests/ -v
|
||||
|
||||
clean: ## Remove containers, images, and data volumes (DESTRUCTIVE)
|
||||
@echo "WARNING: This will delete all Peregrine containers and data."
|
||||
|
|
|
|||
|
|
@ -91,6 +91,7 @@ services:
|
|||
--gpu-memory-utilization 0.75
|
||||
--enforce-eager
|
||||
--max-num-seqs 8
|
||||
--cpu-offload-gb ${CPU_OFFLOAD_GB:-0}
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
|
|
|
|||
|
|
@ -22,12 +22,40 @@ PID_FILE="$REPO_DIR/.streamlit.pid"
|
|||
LOG_FILE="$REPO_DIR/.streamlit.log"
|
||||
PORT="${STREAMLIT_PORT:-8501}"
|
||||
|
||||
_resolve_port() {
    # Ask preflight.py for the next free port near the configured port.
    # Falls back to a pure-bash scan if Python/yaml is not available.
    local python_bin
    for python_bin in python3 python; do
        # NOTE: previous version used `&>/dev/null 2>&1`, which is redundant —
        # `&>` already redirects both stdout and stderr.
        if command -v "$python_bin" &>/dev/null && \
           "$python_bin" -c "import yaml" &>/dev/null; then
            local resolved
            resolved=$("$python_bin" "$REPO_DIR/scripts/preflight.py" --service streamlit 2>/dev/null)
            # Only trust output that is a bare port number.
            if [[ -n "$resolved" && "$resolved" =~ ^[0-9]+$ ]]; then
                echo "$resolved"; return
            fi
        fi
    done
    # Pure-bash fallback: scan for a free port.
    # /dev/tcp connect succeeding means the port is in use; try the next one.
    local p="$PORT"
    while (echo >/dev/tcp/127.0.0.1/"$p") 2>/dev/null; do
        ((p++))
        # Give up after 20 ports above the configured one; last candidate is
        # echoed untested (same best-effort as before).
        [[ $p -gt $((PORT + 20)) ]] && break
    done
    echo "$p"
}
|
||||
|
||||
start() {
|
||||
if is_running; then
|
||||
echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload."
|
||||
return 0
|
||||
fi
|
||||
|
||||
PORT=$(_resolve_port)
|
||||
if [[ "$PORT" != "${STREAMLIT_PORT:-8501}" ]]; then
|
||||
echo "Port ${STREAMLIT_PORT:-8501} in use — using $PORT instead."
|
||||
fi
|
||||
|
||||
echo "Starting Streamlit on http://localhost:$PORT …"
|
||||
"$STREAMLIT_BIN" run "$APP_ENTRY" \
|
||||
--server.port "$PORT" \
|
||||
|
|
|
|||
301
scripts/preflight.py
Normal file
301
scripts/preflight.py
Normal file
|
|
@ -0,0 +1,301 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Peregrine preflight check.
|
||||
|
||||
Scans for port conflicts, assesses system resources (RAM / CPU / GPU),
|
||||
recommends a Docker Compose profile, and calculates optional vLLM KV-cache
|
||||
CPU offload when VRAM is tight. Writes resolved settings to .env so docker
|
||||
compose picks them up automatically.
|
||||
|
||||
Usage:
|
||||
python scripts/preflight.py # full report + write .env
|
||||
python scripts/preflight.py --check-only # report only, no .env write
|
||||
python scripts/preflight.py --service streamlit # print resolved port, exit
|
||||
python scripts/preflight.py --quiet # machine-readable, exit 0/1
|
||||
|
||||
Exit codes:
|
||||
0 — all checks passed (or issues auto-resolved)
|
||||
1 — manual action required (unresolvable port conflict on external service)
|
||||
"""
|
||||
import argparse
|
||||
import platform
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
# Repository root (this script lives in scripts/, one level below it).
ROOT = Path(__file__).parent.parent
# Optional user-editable service configuration.
USER_YAML = ROOT / "config" / "user.yaml"
# Resolved settings are written here for docker compose to pick up.
ENV_FILE = ROOT / ".env"

# ── Port table ────────────────────────────────────────────────────────────────
# (yaml_key, default, env_var, peregrine_owns_it)
# "Owned" services may be auto-reassigned to the next free port on conflict;
# non-owned (external) services such as Ollama are only probed for reachability.
_PORTS: dict[str, tuple[str, int, str, bool]] = {
    "streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True),
    "searxng": ("searxng_port", 8888, "SEARXNG_PORT", True),
    "vllm": ("vllm_port", 8000, "VLLM_PORT", True),
    "vision": ("vision_port", 8002, "VISION_PORT", True),
    "ollama": ("ollama_port", 11434, "OLLAMA_PORT", False),
}
|
||||
|
||||
|
||||
# ── System probes (stdlib only — no psutil) ───────────────────────────────────
|
||||
|
||||
def _sh(*cmd: str, timeout: int = 5) -> str:
    """Run *cmd* and return its stripped stdout; "" on any failure.

    Failure covers a missing binary, a timeout, a nonzero exit status,
    or any other OS-level error.
    """
    try:
        proc = subprocess.run(
            list(cmd), capture_output=True, text=True, timeout=timeout
        )
    except (FileNotFoundError, subprocess.TimeoutExpired, OSError):
        return ""
    if proc.returncode != 0:
        return ""
    return proc.stdout.strip()
|
||||
|
||||
|
||||
def get_ram_gb() -> tuple[float, float]:
    """Return (total_gb, available_gb) of system RAM.

    Linux reads /proc/meminfo; macOS combines ``sysctl hw.memsize`` with
    ``vm_stat``. Returns (0.0, 0.0) if memory cannot be detected.
    """
    os_name = platform.system()
    if os_name == "Linux":
        try:
            meminfo = Path("/proc/meminfo").read_text()
        except OSError:
            return 0.0, 0.0
        total = available = 0
        for line in meminfo.splitlines():
            if line.startswith("MemTotal:"):
                total = int(line.split()[1])  # value is in kB
            elif line.startswith("MemAvailable:"):
                available = int(line.split()[1])  # value is in kB
        return total / 1024 / 1024, available / 1024 / 1024
    elif os_name == "Darwin":
        total_bytes = _sh("sysctl", "-n", "hw.memsize")
        total = int(total_bytes) / 1024 ** 3 if total_bytes.isdigit() else 0.0
        vm = _sh("vm_stat")
        # BUGFIX: the page size was hard-coded to 4096, but Apple Silicon
        # uses 16384-byte pages — available RAM was underestimated 4x.
        # vm_stat's header line reports the actual size:
        # "Mach Virtual Memory Statistics: (page size of 16384 bytes)"
        page_size = 4096
        header = vm.splitlines()[0] if vm else ""
        if "page size of" in header:
            try:
                page_size = int(header.split("page size of")[1].split()[0])
            except (IndexError, ValueError):
                pass
        free_pages = 0
        for line in vm.splitlines():
            if "Pages free" in line or "Pages speculative" in line:
                try:
                    free_pages += int(line.split()[-1].rstrip("."))
                except ValueError:
                    pass
        available = free_pages * page_size / 1024 ** 3
        return total, available
    return 0.0, 0.0
|
||||
|
||||
|
||||
def get_cpu_cores() -> int:
    """Return the number of usable CPU cores (always >= 1).

    Prefers the scheduler affinity mask where available (Linux), which
    respects cgroup/container CPU limits; falls back to os.cpu_count()
    on platforms without sched_getaffinity (e.g. macOS).
    """
    import os
    try:
        return len(os.sched_getaffinity(0)) or 1
    except AttributeError:
        return os.cpu_count() or 1
|
||||
|
||||
|
||||
def get_gpus() -> list[dict]:
    """Return one {name, vram_total_gb, vram_free_gb} dict per NVIDIA GPU.

    Queries nvidia-smi; returns an empty list when the tool is absent
    or produces no parseable output. VRAM figures are GB, rounded to
    one decimal place (nvidia-smi reports MiB).
    """
    raw = _sh(
        "nvidia-smi",
        "--query-gpu=name,memory.total,memory.free",
        "--format=csv,noheader,nounits",
    )
    gpus: list[dict] = []
    for row in raw.splitlines():
        fields = [f.strip() for f in row.split(",")]
        if len(fields) != 3:
            continue  # malformed row — skip silently
        try:
            total_mib = int(fields[1])
            free_mib = int(fields[2])
        except ValueError:
            continue
        gpus.append({
            "name": fields[0],
            "vram_total_gb": round(total_mib / 1024, 1),
            "vram_free_gb": round(free_mib / 1024, 1),
        })
    return gpus
|
||||
|
||||
|
||||
# ── Port probes ───────────────────────────────────────────────────────────────
|
||||
|
||||
def _load_svc() -> dict:
    """Load the ``services`` section of config/user.yaml ({} when absent)."""
    if not USER_YAML.exists():
        return {}
    data = yaml.safe_load(USER_YAML.read_text()) or {}
    return data.get("services", {})
|
||||
|
||||
|
||||
def is_port_free(port: int) -> bool:
    """Return True when nothing accepts TCP connections on 127.0.0.1:port."""
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    probe.settimeout(0.3)  # fail fast — this runs once per candidate port
    try:
        # connect_ex returns 0 on success, i.e. something is listening.
        return probe.connect_ex(("127.0.0.1", port)) != 0
    finally:
        probe.close()
|
||||
|
||||
|
||||
def find_free_port(start: int, limit: int = 30) -> int:
    """Return the first free port in [start, start + limit).

    Raises RuntimeError when every port in the range is occupied.
    """
    for offset in range(limit):
        candidate = start + offset
        if is_port_free(candidate):
            return candidate
    raise RuntimeError(f"No free port found in range {start}–{start + limit - 1}")
|
||||
|
||||
|
||||
def check_ports(svc: dict) -> dict[str, dict]:
    """Probe every service port in _PORTS and resolve conflicts.

    For each service returns a dict with:
      configured — port from user.yaml (or the table default)
      resolved   — same port, or the next free one for a busy owned port
      changed    — whether a reassignment happened
      owned/free/env_var — bookkeeping used by main() for report + .env

    BUGFIX: previously an owned port with no free neighbour within range
    let find_free_port's RuntimeError escape as an uncaught traceback,
    contradicting the documented "exit 1" contract. Now the port keeps
    its configured value with changed=False, which main() reports as an
    unresolved owned port and turns into a clean exit code 1.
    """
    results = {}
    for name, (yaml_key, default, env_var, owned) in _PORTS.items():
        configured = int(svc.get(yaml_key, default))
        free = is_port_free(configured)
        resolved = configured
        if owned and not free:
            try:
                resolved = find_free_port(configured + 1)
            except RuntimeError:
                resolved = configured  # unresolvable — surfaced via exit code
        results[name] = {
            "configured": configured,
            "resolved": resolved,
            "changed": resolved != configured,
            "owned": owned,
            "free": free,
            "env_var": env_var,
        }
    return results
|
||||
|
||||
|
||||
# ── Recommendations ───────────────────────────────────────────────────────────
|
||||
|
||||
def recommend_profile(gpus: list[dict], ram_total_gb: float) -> str:
    """Map detected hardware to a Docker Compose profile name.

    Two or more GPUs → "dual-gpu"; one → "single-gpu"; no GPU but at
    least 8 GB RAM → "cpu"; otherwise → "remote".
    """
    gpu_count = len(gpus)
    if gpu_count >= 2:
        return "dual-gpu"
    if gpu_count == 1:
        return "single-gpu"
    return "cpu" if ram_total_gb >= 8 else "remote"
|
||||
|
||||
|
||||
def calc_cpu_offload_gb(gpus: list[dict], ram_available_gb: float) -> int:
    """
    Suggest GBs of KV cache to offload from GPU VRAM → system RAM.

    Enabled when VRAM is tight (< 10 GB free on any GPU) and there is
    enough RAM headroom (> 4 GB available). Uses at most 25% of the
    RAM headroom above 4 GB, capped at 8 GB.
    """
    if not gpus:
        return 0
    if ram_available_gb < 4:
        return 0
    tightest_vram = min(gpu["vram_free_gb"] for gpu in gpus)
    if tightest_vram >= 10:
        return 0
    spare_ram = ram_available_gb - 4.0  # reserve 4 GB for OS
    return min(int(spare_ram * 0.25), 8)
|
||||
|
||||
|
||||
# ── .env writer ───────────────────────────────────────────────────────────────
|
||||
|
||||
def write_env(updates: dict[str, str]) -> None:
    """Merge *updates* into ENV_FILE, creating it if missing.

    Existing KEY=VALUE lines are kept (comments are dropped), updated
    keys win, and the file is rewritten with keys sorted.
    """
    merged: dict[str, str] = {}
    if ENV_FILE.exists():
        for raw in ENV_FILE.read_text().splitlines():
            entry = raw.strip()
            if entry.startswith("#") or "=" not in entry:
                continue
            key, _, value = entry.partition("=")
            merged[key.strip()] = value.strip()
    merged.update(updates)
    body = "\n".join(f"{k}={v}" for k, v in sorted(merged.items()))
    ENV_FILE.write_text(body + "\n")
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Run the preflight checks, print a report, and write .env.

    Exits 0 when all checks pass or issues were auto-resolved; exits 1
    when an owned port could not be reassigned, or when --service names
    an unknown service.
    """
    parser = argparse.ArgumentParser(description="Peregrine preflight check")
    parser.add_argument("--check-only", action="store_true",
                        help="Print report; don't write .env")
    parser.add_argument("--quiet", action="store_true",
                        help="Suppress output; rely on exit code")
    parser.add_argument("--service", metavar="NAME",
                        help="Print resolved port for one service and exit (e.g. streamlit)")
    args = parser.parse_args()

    svc = _load_svc()
    ports = check_ports(svc)

    # Single-service mode — used by manage-ui.sh
    if args.service:
        key = args.service.lower()
        info = ports.get(key)
        if info is None:
            # BUGFIX: the old fallback indexed _PORTS[key] directly, which
            # raised an uncaught KeyError for any unknown service name.
            print(f"unknown service: {args.service}", file=sys.stderr)
            sys.exit(1)
        print(info["resolved"])
        return

    ram_total, ram_avail = get_ram_gb()
    cpu_cores = get_cpu_cores()
    gpus = get_gpus()
    profile = recommend_profile(gpus, ram_total)
    offload_gb = calc_cpu_offload_gb(gpus, ram_avail)

    if not args.quiet:
        reassigned = [n for n, i in ports.items() if i["changed"]]

        print("╔══ Peregrine Preflight ══════════════════════════════╗")
        print("║")
        print("║ Ports")
        for name, info in ports.items():
            tag = "owned " if info["owned"] else "extern"
            if not info["owned"]:
                # external: in-use means the service is reachable
                status = "✓ reachable" if not info["free"] else "⚠ not responding"
            elif info["free"]:
                status = "✓ free"
            elif info["changed"]:
                status = f"→ reassigned to :{info['resolved']}"
            else:
                status = "⚠ in use"
            print(f"║ {name:<10} :{info['configured']} [{tag}] {status}")

        print("║")
        print("║ Resources")
        print(f"║ CPU {cpu_cores} core{'s' if cpu_cores != 1 else ''}")
        if ram_total:
            print(f"║ RAM {ram_total:.0f} GB total / {ram_avail:.1f} GB available")
        else:
            print("║ RAM (undetectable)")
        if gpus:
            for i, g in enumerate(gpus):
                print(f"║ GPU {i} {g['name']} — "
                      f"{g['vram_free_gb']:.1f} / {g['vram_total_gb']:.0f} GB VRAM free")
        else:
            print("║ GPU none detected")

        print("║")
        print("║ Recommendations")
        print(f"║ Docker profile {profile}")
        if offload_gb > 0:
            print(f"║ vLLM KV offload {offload_gb} GB → RAM (CPU_OFFLOAD_GB={offload_gb})")
        else:
            print("║ vLLM KV offload not needed")

        if reassigned:
            print("║")
            print("║ Port reassignments written to .env:")
            for name in reassigned:
                info = ports[name]
                print(f"║ {info['env_var']}={info['resolved']} (was :{info['configured']})")

        # External services: in-use = ✓ running; free = warn (may be down)
        ext_down = [n for n, i in ports.items() if not i["owned"] and i["free"]]
        if ext_down:
            print("║")
            print("║ ⚠ External services not detected on configured port:")
            for name in ext_down:
                info = ports[name]
                svc_key = _PORTS[name][0]
                print(f"║ {name} :{info['configured']} — nothing listening "
                      f"(start the service or update services.{svc_key} in user.yaml)")

        print("╚════════════════════════════════════════════════════╝")

    if not args.check_only:
        env_updates: dict[str, str] = {i["env_var"]: str(i["resolved"]) for i in ports.values()}
        env_updates["RECOMMENDED_PROFILE"] = profile
        # BUGFIX: always write CPU_OFFLOAD_GB (including 0). Previously a
        # stale nonzero value persisted in .env after VRAM freed up, and
        # compose kept offloading; 0 matches the compose default (:-0).
        env_updates["CPU_OFFLOAD_GB"] = str(offload_gb)
        write_env(env_updates)
        if not args.quiet:
            print(f" wrote {ENV_FILE.relative_to(ROOT)}")

    # Fail only when an owned port can't be resolved (shouldn't happen in practice)
    owned_stuck = [n for n, i in ports.items() if i["owned"] and not i["free"] and not i["changed"]]
    sys.exit(1 if owned_stuck else 0)
|
||||
|
||||
|
||||
# Script entry point: run preflight when executed directly (not on import).
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue