peregrine/scripts/preflight.py

#!/usr/bin/env python3
"""
Peregrine preflight check.

Scans for port conflicts, assesses system resources (RAM / CPU / GPU),
recommends a Docker Compose profile, and calculates optional vLLM KV-cache
CPU offload when VRAM is tight.  Writes resolved settings to .env so docker
compose picks them up automatically.

When an adoptable service (searxng, vision, ollama, ollama_research) is already
running on its configured port, preflight *adopts* it: the app is configured to
reach it via host.docker.internal, and a compose.override.yml is generated to
prevent Docker from starting a conflicting container.

Usage:
    python scripts/preflight.py              # full report + write .env
    python scripts/preflight.py --check-only # report only, no .env write
    python scripts/preflight.py --service streamlit  # print resolved port, exit
    python scripts/preflight.py --quiet      # machine-readable, exit 0/1

Exit codes:
  0 — all checks passed (or issues auto-resolved)
  1 — manual action required (unresolvable port conflict on external service)
"""
import argparse
import os
import platform
import socket
import subprocess
import sys
from pathlib import Path

import yaml

# Project root: the parent of the directory that contains this script.
ROOT = Path(__file__).parent.parent
# Files read and/or rewritten by preflight, all resolved relative to ROOT.
USER_YAML    = ROOT / "config" / "user.yaml"   # source of the optional `services` port overrides
LLM_YAML     = ROOT / "config" / "llm.yaml"    # backend base_url entries rewritten by update_llm_yaml()
ENV_FILE     = ROOT / ".env"                   # resolved settings written by write_env()
OVERRIDE_YML = ROOT / "compose.override.yml"   # generated/removed by write_compose_override()

# ── Service table ──────────────────────────────────────────────────────────────
# Maps service name → (yaml_key, default_port, env_var, docker_owned, adoptable)
#
# yaml_key      — key looked up in the `services` mapping of config/user.yaml
# default_port  — port used when yaml_key is absent from that mapping
# env_var       — name of the .env variable that receives the port
# docker_owned  — True if Docker Compose normally starts this service
#                 (currently True for every entry in the table)
# adoptable     — True if an existing process on this port should be used instead
#                 of starting a Docker container (and the Docker service disabled)
_SERVICES: dict[str, tuple[str, int, str, bool, bool]] = {
    "streamlit":       ("streamlit_port",       8501,  "STREAMLIT_PORT",       True,  False),
    "searxng":         ("searxng_port",          8888,  "SEARXNG_PORT",         True,  True),
    # vllm removed — now managed by cf-orch (host process), not a Docker service
    "vision":          ("vision_port",           8002,  "VISION_PORT",          True,  True),
    "ollama":          ("ollama_port",          11434,  "OLLAMA_PORT",          True,  True),
    "ollama_research": ("ollama_research_port", 11435,  "OLLAMA_RESEARCH_PORT", True,  True),
}

# Service name → list of (backend key in config/llm.yaml, URL suffix appended
# after host:port).  update_llm_yaml() rewrites the base_url of each listed
# backend that exists in the file.
# NOTE(review): "vllm" has no entry in _SERVICES, so update_llm_yaml() skips it
# whenever it is handed the result of check_ports() — confirm this is intended.
_LLM_BACKENDS: dict[str, list[tuple[str, str]]] = {
    "ollama":          [("ollama", "/v1")],
    "ollama_research": [("ollama_research", "/v1")],
    "vllm":            [("vllm", "/v1"), ("vllm_research", "/v1")],
    "vision":          [("vision_service", "")],
}

# Docker-internal (hostname, port) for each service (when running in Docker).
# update_llm_yaml() uses these to build base_url for services that were NOT
# adopted from the host; the port here is the container-side port, which is
# independent of the host port resolved by check_ports().
_DOCKER_INTERNAL: dict[str, tuple[str, int]] = {
    "ollama":          ("ollama",          11434),
    "ollama_research": ("ollama_research", 11434),  # container-internal port is always 11434
    "vision":          ("vision",           8002),
    "searxng":         ("searxng",          8080),  # searxng internal port differs from host port
}


# ── System probes (stdlib only — no psutil) ───────────────────────────────────

def _sh(*cmd: str, timeout: int = 5) -> str:
    """Run *cmd* (no shell) and return its stdout with surrounding whitespace removed.

    Returns an empty string when the command exits non-zero, cannot be
    launched (missing binary or other OSError), or runs longer than
    *timeout* seconds.
    """
    argv = [*cmd]
    try:
        completed = subprocess.run(
            argv,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except (OSError, subprocess.TimeoutExpired):
        # FileNotFoundError is an OSError subclass, so a missing binary lands here too.
        return ""
    if completed.returncode != 0:
        return ""
    return completed.stdout.strip()


def _vm_stat_available_gb(vm_stat_output: str) -> float:
    """Return free + speculative memory in GB from macOS ``vm_stat`` output.

    The page size is taken from the "page size of N bytes" header that
    ``vm_stat`` prints, because it is not 4096 on every Mac (Apple Silicon
    reports 16384).  Falls back to 4096 when the header is absent.
    """
    page_size = 4096
    free_pages = 0
    for line in vm_stat_output.splitlines():
        if "page size of" in line:
            tokens = line.partition("page size of")[2].split()
            if tokens and tokens[0].isdigit():
                page_size = int(tokens[0])
        elif "Pages free" in line or "Pages speculative" in line:
            try:
                # Counts are printed with a trailing period, e.g. "Pages free:  1234."
                free_pages += int(line.split()[-1].rstrip("."))
            except ValueError:
                pass
    return free_pages * page_size / 1024 ** 3


def get_ram_gb() -> tuple[float, float]:
    """Return (total_gb, available_gb).  Returns (0, 0) if undetectable.

    Linux:  reads MemTotal / MemAvailable (kB) from /proc/meminfo.
    macOS:  total from ``sysctl -n hw.memsize``; available is the sum of the
            free and speculative page counts from ``vm_stat`` multiplied by
            the page size that ``vm_stat`` itself reports.
    Other platforms: (0.0, 0.0).
    """
    os_name = platform.system()
    if os_name == "Linux":
        try:
            meminfo = Path("/proc/meminfo").read_text()
        except OSError:
            return 0.0, 0.0
        total_kb = available_kb = 0
        for line in meminfo.splitlines():
            if line.startswith("MemTotal:"):
                total_kb = int(line.split()[1])
            elif line.startswith("MemAvailable:"):
                available_kb = int(line.split()[1])
        return total_kb / 1024 / 1024, available_kb / 1024 / 1024
    if os_name == "Darwin":
        total_bytes = _sh("sysctl", "-n", "hw.memsize")
        total_gb = int(total_bytes) / 1024 ** 3 if total_bytes.isdigit() else 0.0
        return total_gb, _vm_stat_available_gb(_sh("vm_stat"))
    return 0.0, 0.0


def get_cpu_cores() -> int:
    """Return the CPU count from ``os.cpu_count()``, or 1 when it is unavailable."""
    detected = os.cpu_count()
    if not detected:
        return 1
    return detected


def get_gpus() -> list[dict]:
    """Return one ``{name, vram_total_gb, vram_free_gb}`` dict per GPU via nvidia-smi.

    Returns an empty list when nvidia-smi produces no output.  Rows that do
    not split into exactly three comma-separated fields, or whose memory
    fields are not integers, are skipped.
    """
    raw = _sh(
        "nvidia-smi",
        "--query-gpu=name,memory.total,memory.free",
        "--format=csv,noheader,nounits",
    )
    if not raw:
        return []

    def _parse_row(row: str) -> dict | None:
        fields = [field.strip() for field in row.split(",")]
        if len(fields) != 3:
            return None
        name, total_raw, free_raw = fields
        try:
            # Memory figures are divided by 1024 to get GB, rounded to one decimal.
            return {
                "name": name,
                "vram_total_gb": round(int(total_raw) / 1024, 1),
                "vram_free_gb":  round(int(free_raw) / 1024, 1),
            }
        except ValueError:
            return None

    parsed_rows = (_parse_row(row) for row in raw.splitlines())
    return [gpu for gpu in parsed_rows if gpu is not None]


# ── Port probes ───────────────────────────────────────────────────────────────

def _load_svc() -> dict:
    """Return the ``services`` mapping from config/user.yaml, or {} when unavailable.

    An empty dict is returned when the file does not exist, when the document
    is empty or not a mapping, and when ``services`` is missing, null, or not
    a mapping — so callers can always call ``.get()`` on the result.
    """
    if not USER_YAML.exists():
        return {}
    document = yaml.safe_load(USER_YAML.read_text())
    if not isinstance(document, dict):
        return {}
    services = document.get("services")
    # `services:` with no value parses to None; treat it like a missing key.
    return services if isinstance(services, dict) else {}


def is_port_free(port: int) -> bool:
    """Return True when a TCP connect to 127.0.0.1:*port* does not succeed.

    The probe uses a 0.3 s timeout; any non-zero ``connect_ex`` result is
    treated as "free".  Only the IPv4 loopback address is checked.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.settimeout(0.3)
        status = probe.connect_ex(("127.0.0.1", port))
    finally:
        probe.close()
    return status != 0


def find_free_port(start: int, limit: int = 30) -> int:
    """Return the first free port in ``start .. start + limit - 1``.

    The scan never goes past 65535: probing a larger number makes the socket
    layer raise OverflowError rather than report the port as busy, which
    would bypass the RuntimeError this function promises.

    Raises:
        RuntimeError: no port in the (clamped) range is free.
    """
    last = min(start + limit - 1, 65535)
    for p in range(start, last + 1):
        if is_port_free(p):
            return p
    raise RuntimeError(f"No free port found in range {start}–{last}")


def check_ports(svc: dict) -> dict[str, dict]:
    """Probe every service port in ``_SERVICES`` and decide how to resolve it.

    Args:
        svc: the ``services`` mapping from user.yaml (yaml_key → port).  A
            missing or null entry falls back to the service's default port.

    Returns:
        service name → dict with keys:
          configured   — port from config (or the default)
          resolved     — port the app should use to reach the service
          stub_port    — port written to .env for the Docker service
          changed      — True when resolved differs from configured
          docker_owned / adoptable / env_var — copied from ``_SERVICES``
          free         — result of the probe on the configured port
          external     — True when an existing host service was adopted

    Raises:
        RuntimeError: propagated from ``find_free_port`` when no replacement
            port can be found.
    """
    # Resolve all configured ports up front so that a replacement port picked
    # for one service can never land on a port another service is configured for.
    configured_ports: dict[str, int] = {}
    for name, (yaml_key, default, *_unused) in _SERVICES.items():
        raw = svc.get(yaml_key)
        configured_ports[name] = default if raw is None else int(raw)

    # Ports already spoken for in this run.  Nothing is actually bound during
    # preflight, so is_port_free() alone cannot see these reservations.
    claimed: set[int] = set(configured_ports.values())

    def _claim_free_port(start: int) -> int:
        """Return the first free port >= start that no service has claimed yet."""
        candidate = find_free_port(start)
        while candidate in claimed:
            candidate = find_free_port(candidate + 1)
        claimed.add(candidate)
        return candidate

    results: dict[str, dict] = {}
    for name, (_yaml_key, _default, env_var, docker_owned, adoptable) in _SERVICES.items():
        configured = configured_ports[name]
        free = is_port_free(configured)

        if free:
            # Port is free — start Docker service as normal
            resolved = configured
            stub_port = configured
            external = False
        elif adoptable:
            # Port is in use by a compatible service — adopt it.
            # resolved = actual external port (used for host.docker.internal URL)
            # stub_port = free port for the no-op stub container (avoids binding conflict)
            resolved = configured
            stub_port = _claim_free_port(configured + 1)
            external = True
        else:
            # Port in use, not adoptable (e.g. streamlit) — reassign
            resolved = _claim_free_port(configured + 1)
            stub_port = resolved
            external = False

        results[name] = {
            "configured":   configured,
            "resolved":     resolved,
            "stub_port":    stub_port,
            "changed":      resolved != configured,
            "docker_owned": docker_owned,
            "adoptable":    adoptable,
            "free":         free,
            "external":     external,
            "env_var":      env_var,
        }
    return results


# ── Recommendations ───────────────────────────────────────────────────────────

def recommend_profile(gpus: list[dict], ram_total_gb: float) -> str:
    """Choose a Docker Compose profile name from the GPU count and total RAM.

    Two or more GPUs → "dual-gpu"; exactly one → "single-gpu"; with no GPU,
    "cpu" when at least 8 GB of RAM is present, otherwise "remote".
    """
    gpu_count = len(gpus)
    if gpu_count == 0:
        return "cpu" if ram_total_gb >= 8 else "remote"
    return "single-gpu" if gpu_count == 1 else "dual-gpu"


def calc_cpu_offload_gb(gpus: list[dict], ram_available_gb: float) -> int:
    """
    Suggest how many GB of KV cache to move from GPU VRAM into system RAM.

    Returns 0 when there are no GPUs, when less than 4 GB of RAM is
    available, or when every GPU has at least 10 GB of VRAM free.
    Otherwise 4 GB of RAM is set aside for the OS and a quarter of the
    remainder (truncated to whole GB) is suggested, never more than 8 GB.
    """
    if not gpus:
        return 0
    if ram_available_gb < 4:
        return 0

    tightest_vram = min(gpu["vram_free_gb"] for gpu in gpus)
    if tightest_vram >= 10:
        return 0

    spare_ram = ram_available_gb - 4.0  # keep 4 GB back for the OS
    suggestion = int(spare_ram * 0.25)
    return suggestion if suggestion < 8 else 8


def _download_size_mb(profile: str, dual_gpu_mode: str = "ollama") -> dict[str, int]:
    """
    Estimate first-run download sizes in MB, keyed by component name.

    searxng and the app image are always listed.  Ollama and its default
    model are added for the cpu / single-gpu / dual-gpu profiles, the vision
    image and its model for the two GPU profiles, and the vLLM image only for
    dual-gpu when *dual_gpu_mode* is "vllm" or "mixed".
    """
    runs_local_llm = profile in ("cpu", "single-gpu", "dual-gpu")
    has_gpu = profile in ("single-gpu", "dual-gpu")
    pulls_vllm = profile == "dual-gpu" and dual_gpu_mode in ("vllm", "mixed")

    estimate: dict[str, int] = {"searxng": 300, "app": 1500}
    if runs_local_llm:
        estimate.update(ollama=800, llama3_2_3b=2000)
    if has_gpu:
        estimate.update(vision_image=3000, moondream2=1800)
    if pulls_vllm:
        estimate["vllm_image"] = 10000
    return estimate


def _mixed_mode_vram_warning(gpus: list[dict], dual_gpu_mode: str) -> str | None:
    """
    Return a warning string if GPU 1 likely lacks VRAM for mixed mode, else None.
    Only relevant when dual_gpu_mode == 'mixed' and at least 2 GPUs are present.
    """
    if dual_gpu_mode != "mixed" or len(gpus) < 2:
        return None
    free = gpus[1]["vram_free_gb"]
    if free < 12:
        return (
            f"⚠  DUAL_GPU_MODE=mixed: GPU 1 has only {free:.1f} GB free — "
            f"running ollama_research + vllm together may cause OOM. "
            f"Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm."
        )
    return None


# ── Config writers ─────────────────────────────────────────────────────────────

def write_env(updates: dict[str, str]) -> None:
    """Merge *updates* into the .env file without disturbing the rest of it.

    Existing lines — including comments, blank lines and keys that are not
    being updated — are kept verbatim and in their original order.  A line
    whose key appears in *updates* is rewritten in place as ``KEY=value``
    (later duplicates of that key are dropped).  Keys not yet present are
    appended at the end in sorted order.  The file is created when missing.
    """
    pending = dict(updates)  # keys still waiting to be written
    out_lines: list[str] = []

    if ENV_FILE.exists():
        for raw_line in ENV_FILE.read_text().splitlines():
            stripped = raw_line.strip()
            if "=" in stripped and not stripped.startswith("#"):
                key = stripped.partition("=")[0].strip()
                if key in updates:
                    if key in pending:
                        out_lines.append(f"{key}={pending.pop(key)}")
                    # else: duplicate of a key already rewritten above — drop it
                    continue
            out_lines.append(raw_line)

    out_lines.extend(f"{k}={v}" for k, v in sorted(pending.items()))
    ENV_FILE.write_text("\n".join(out_lines) + "\n")


def update_llm_yaml(ports: dict[str, dict]) -> None:
    """Rewrite base_url entries in config/llm.yaml to match adopted/internal services.

    For each service listed in ``_LLM_BACKENDS`` that is present in *ports*:
      - adopted (``external``) services point at ``host.docker.internal:<resolved>``
      - otherwise the Docker service name and container port from
        ``_DOCKER_INTERNAL`` are used
    Only backends that already exist in the file as mappings are touched, and
    the file is rewritten only when at least one base_url actually changes.
    Does nothing when the file is missing or has no usable ``backends`` mapping.

    NOTE(review): the file is round-tripped through safe_load/dump, so any
    comments in llm.yaml are presumably lost on rewrite — confirm acceptable.
    """
    if not LLM_YAML.exists():
        return
    cfg = yaml.safe_load(LLM_YAML.read_text()) or {}
    # `backends:` with no value parses to None; treat it like a missing key.
    backends = cfg.get("backends") or {}
    if not isinstance(backends, dict):
        return
    changed = False

    for svc_name, backend_list in _LLM_BACKENDS.items():
        info = ports.get(svc_name)
        if info is None:
            continue

        if info["external"]:
            # Reach the host service from inside the Docker container
            host = f"host.docker.internal:{info['resolved']}"
        elif svc_name in _DOCKER_INTERNAL:
            # Use Docker service name + internal port
            docker_host, internal_port = _DOCKER_INTERNAL[svc_name]
            host = f"{docker_host}:{internal_port}"
        else:
            continue

        for backend_name, url_suffix in backend_list:
            entry = backends.get(backend_name)
            if not isinstance(entry, dict):
                # Absent, or present but empty/malformed — nothing to rewrite.
                continue
            new_url = f"http://{host}{url_suffix}"
            if entry.get("base_url") != new_url:
                entry["base_url"] = new_url
                changed = True

    if changed:
        cfg["backends"] = backends
        LLM_YAML.write_text(yaml.dump(cfg, default_flow_style=False, allow_unicode=True,
                                      sort_keys=False))


def write_compose_override(ports: dict[str, dict]) -> None:
    """
    Write compose.override.yml with a stub for every adopted Docker service,
    or delete the file when no service needs stubbing.

    A service is stubbed when its entry in *ports* is both ``external`` and
    ``docker_owned``.  Services are stubbed rather than moved to an unused
    profile because a profile change breaks depends_on references — Docker
    treats the service as undefined.  Each stub:
      - overrides the entrypoint with ``sleep infinity`` so the container
        stays up and depends_on: service_started is satisfied
      - has a healthcheck that runs ``true`` so depends_on: service_healthy
        is satisfied
      - declares ``ports: []``
    """
    adopted: dict[str, dict] = {}
    for svc_name, details in ports.items():
        if details["external"] and details["docker_owned"]:
            adopted[svc_name] = details

    if not adopted:
        # Nothing to stub: remove any override left over from an earlier run.
        if OVERRIDE_YML.exists():
            OVERRIDE_YML.unlink()
        return

    # Identical for every stubbed service; only the service header line varies.
    stub_body = [
        '    entrypoint: ["/bin/sh", "-c", "sleep infinity"]',
        "    ports: []",
        "    healthcheck:",
        '      test: ["CMD", "true"]',
        "      interval: 1s",
        "      timeout: 1s",
        "      start_period: 0s",
        "      retries: 1",
    ]

    out = [
        "# compose.override.yml — AUTO-GENERATED by preflight.py, do not edit manually.",
        "# Stubs out Docker services whose ports are already in use by host services.",
        "# Re-run preflight (make preflight) to regenerate after stopping host services.",
        "services:",
    ]
    for svc_name, details in adopted.items():
        out.append(f"  {svc_name}:  # adopted — host service on :{details['resolved']}")
        out.extend(stub_body)

    OVERRIDE_YML.write_text("\n".join(out) + "\n")


# ── Main ──────────────────────────────────────────────────────────────────────

def main() -> None:
    """Command-line entry point for the preflight check.

    Steps, in order:
      1. Parse ``--check-only``, ``--quiet`` and ``--service NAME``.
      2. Load the services config and probe every service port.
      3. With ``--service``: print that service's resolved port and return
         (no hardware probing, no file writes, no explicit ``sys.exit``).
      4. Probe RAM / CPU / GPUs and derive the recommended profile and the
         KV-cache offload size.
      5. Unless ``--quiet``: print the boxed report.
      6. Unless ``--check-only``: write .env, update config/llm.yaml and
         generate (or remove) compose.override.yml.
      7. Exit with status 1 if any port is left "stuck", else 0.
    """
    parser = argparse.ArgumentParser(description="Peregrine preflight check")
    parser.add_argument("--check-only", action="store_true",
                        help="Print report; don't write .env")
    parser.add_argument("--quiet", action="store_true",
                        help="Suppress output; rely on exit code")
    parser.add_argument("--service", metavar="NAME",
                        help="Print resolved port for one service and exit (e.g. streamlit)")
    args = parser.parse_args()

    svc = _load_svc()
    ports = check_ports(svc)

    # Single-service mode — used by manage.sh / manage-ui.sh
    if args.service:
        info = ports.get(args.service.lower())
        if info:
            print(info["resolved"])
        else:
            # ports has an entry for every key of _SERVICES, so this branch is
            # only reached for an unknown service name; the fallback tuple then
            # makes it print 8501.
            _, default, *_ = _SERVICES.get(args.service.lower(), (None, 8501, None, None, None))
            print(default)
        return

    ram_total, ram_avail = get_ram_gb()
    cpu_cores = get_cpu_cores()
    gpus = get_gpus()
    profile = recommend_profile(gpus, ram_total)
    offload_gb = calc_cpu_offload_gb(gpus, ram_avail)

    if not args.quiet:
        print("╔══ Peregrine Preflight ══════════════════════════════╗")
        print("║")
        print("║  Ports")
        for name, info in ports.items():
            # Branch order matters: adoption wins, then externally-owned
            # services, then the three states of a Docker-owned port.
            if info["external"]:
                status = f"✓ adopted  (using host service on :{info['resolved']})"
                tag = "extern"
            elif not info["docker_owned"]:
                status = "⚠ not responding" if info["free"] else "✓ reachable"
                tag = "extern"
            elif info["free"]:
                status = "✓ free"
                tag = "owned "
            elif info["changed"]:
                status = f"→ reassigned to :{info['resolved']}"
                tag = "owned "
            else:
                status = "⚠ in use"
                tag = "owned "
            print(f"║    {name:<10} :{info['configured']}  [{tag}]  {status}")

        print("║")
        print("║  Resources")
        print(f"║    CPU      {cpu_cores} core{'s' if cpu_cores != 1 else ''}")
        if ram_total:
            print(f"║    RAM      {ram_total:.0f} GB total  /  {ram_avail:.1f} GB available")
        else:
            print("║    RAM      (undetectable)")
        if gpus:
            for i, g in enumerate(gpus):
                print(f"║    GPU {i}    {g['name']}  —  "
                      f"{g['vram_free_gb']:.1f} / {g['vram_total_gb']:.0f} GB VRAM free")
        else:
            print("║    GPU      none detected")

        print("║")
        print("║  Recommendations")
        print(f"║    Docker profile   {profile}")
        if offload_gb > 0:
            print(f"║    vLLM KV offload  {offload_gb} GB → RAM  (CPU_OFFLOAD_GB={offload_gb})")
        else:
            print("║    vLLM KV offload  not needed")

        reassigned = [n for n, i in ports.items() if i["changed"]]
        adopted    = [n for n, i in ports.items() if i["external"]]

        # NOTE(review): this section is printed whenever ports were reassigned,
        # including under --check-only, where nothing is written to .env.
        if reassigned:
            print("║")
            print("║  Port reassignments written to .env:")
            for name in reassigned:
                info = ports[name]
                print(f"║    {info['env_var']}={info['resolved']}  (was :{info['configured']})")

        if adopted:
            print("║")
            print("║  Adopted external services (Docker containers disabled):")
            for name in adopted:
                info = ports[name]
                print(f"║    {name} :{info['resolved']}  → app will use host.docker.internal:{info['resolved']}")

        # ── Download size warning ──────────────────────────────────────────────
        # NOTE(review): DUAL_GPU_MODE is read from the process environment here,
        # while the default written further down is decided from the .env file —
        # the report can therefore differ from the value stored in .env.
        dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
        sizes = _download_size_mb(profile, dual_gpu_mode)
        total_mb = sum(sizes.values())
        print("║")
        print("║  Download sizes (first-run estimates)")
        print("║    Docker images")
        print(f"║      app (Python build)   ~{sizes.get('app', 0):,} MB")
        if "searxng" in sizes:
            print(f"║      searxng/searxng       ~{sizes['searxng']:,} MB")
        if "ollama" in sizes:
            shared_note = "  (shared by ollama + ollama_research)" if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed") else ""
            print(f"║      ollama/ollama         ~{sizes['ollama']:,} MB{shared_note}")
        if "vision_image" in sizes:
            print(f"║      vision service        ~{sizes['vision_image']:,} MB  (torch + moondream)")
        if "vllm_image" in sizes:
            print(f"║      vllm/vllm-openai      ~{sizes['vllm_image']:,} MB")
        print("║    Model weights  (lazy-loaded on first use)")
        if "llama3_2_3b" in sizes:
            print(f"║      llama3.2:3b            ~{sizes['llama3_2_3b']:,} MB  → OLLAMA_MODELS_DIR")
        if "moondream2" in sizes:
            print(f"║      moondream2             ~{sizes['moondream2']:,} MB  → vision container cache")
        if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed"):
            print("║    Note: ollama + ollama_research share model dir — no double download")
        print(f"║  ⚠  Total first-run: ~{total_mb / 1024:.1f} GB  (models persist between restarts)")

        # ── Mixed-mode VRAM warning ────────────────────────────────────────────
        vram_warn = _mixed_mode_vram_warning(gpus, dual_gpu_mode)
        if vram_warn:
            print("║")
            print(f"║  {vram_warn}")

        print("╚════════════════════════════════════════════════════╝")

    if not args.check_only:
        # For adopted services, write stub_port to .env so the no-op container
        # binds a harmless free port instead of conflicting with the external service.
        # (For non-adopted services stub_port equals the resolved port.)
        env_updates: dict[str, str] = {i["env_var"]: str(i["stub_port"]) for i in ports.values()}
        env_updates["RECOMMENDED_PROFILE"] = profile
        # When Ollama is adopted from the host process, write OLLAMA_HOST so
        # LLMRouter's env-var auto-config finds it without needing config/llm.yaml.
        # NOTE(review): OLLAMA_HOST / OLLAMA_RESEARCH_HOST / CPU_OFFLOAD_GB are only
        # ever added here; write_env() merges into the existing file, so values from
        # an earlier run stay in .env once they no longer apply — confirm intended.
        ollama_info = ports.get("ollama")
        if ollama_info and ollama_info.get("external"):
            env_updates["OLLAMA_HOST"] = f"http://host.docker.internal:{ollama_info['resolved']}"

        ollama_research_info = ports.get("ollama_research")
        if ollama_research_info and ollama_research_info.get("external"):
            env_updates["OLLAMA_RESEARCH_HOST"] = f"http://host.docker.internal:{ollama_research_info['resolved']}"

        if offload_gb > 0:
            env_updates["CPU_OFFLOAD_GB"] = str(offload_gb)
        # GPU info for the app container (which lacks nvidia-smi access)
        env_updates["PEREGRINE_GPU_COUNT"] = str(len(gpus))
        env_updates["PEREGRINE_GPU_NAMES"] = ",".join(g["name"] for g in gpus)
        # Write DUAL_GPU_MODE default for new 2-GPU setups (don't override user's choice)
        if len(gpus) >= 2:
            # Re-read .env to see whether the user already set DUAL_GPU_MODE.
            existing_env: dict[str, str] = {}
            if ENV_FILE.exists():
                for line in ENV_FILE.read_text().splitlines():
                    if "=" in line and not line.startswith("#"):
                        k, _, v = line.partition("=")
                        existing_env[k.strip()] = v.strip()
            if "DUAL_GPU_MODE" not in existing_env:
                env_updates["DUAL_GPU_MODE"] = "ollama"
        write_env(env_updates)
        update_llm_yaml(ports)
        write_compose_override(ports)
        if not args.quiet:
            # List only the artifacts that exist after this run; the override
            # file is removed when no service was adopted.
            artifacts = [str(ENV_FILE.relative_to(ROOT))]
            if OVERRIDE_YML.exists():
                artifacts.append(str(OVERRIDE_YML.relative_to(ROOT)))
            print(f"  wrote {', '.join(artifacts)}")

    # Fail only when a non-adoptable owned port couldn't be reassigned
    # NOTE(review): as check_ports() is written in this file, every busy port ends
    # up either external or changed, so this list appears to always be empty; an
    # exhausted port search surfaces as RuntimeError from find_free_port() instead.
    stuck = [n for n, i in ports.items()
             if not i["free"] and not i["external"] and not i["changed"]]
    sys.exit(1 if stuck else 0)


# Script entry point: run the preflight only when executed directly, not on import.
if __name__ == "__main__":
    main()