feat(orch): replace Ouro/vllm-Docker with generic HF inference server; add ProcessSpec
- Add circuitforge_core/resources/inference/llm_server.py: generic OpenAI-compatible FastAPI server for any HuggingFace causal LM (Phi-4-mini-instruct, Qwen2.5-3B-Instruct)
- Add service_manager.py + service_probe.py: ProcessSpec start/stop/is_running support (Popen-based; socket probe confirms readiness before marking running)
- Update all 4 public GPU profiles to use ProcessSpec → llm_server instead of Docker vllm: 6gb (max_mb 5500), 8gb (max_mb 6500), 16gb/24gb (max_mb 9000)
- Model candidates: Phi-4-mini-instruct first (7.2GB), Qwen2.5-3B-Instruct fallback (5.8GB)
- Remove ouro_server.py (Ouro is incompatible with transformers 5.x; the vllm Docker image is also incompatible)
- Add 17 tests for ServiceManager ProcessSpec (start/stop/is_running/list/get_url)
This commit is contained in:
parent
27999925cf
commit
c78341fc6f
13 changed files with 707 additions and 20 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -5,6 +5,9 @@ __pycache__/
|
||||||
dist/
|
dist/
|
||||||
.pytest_cache/
|
.pytest_cache/
|
||||||
.superpowers/
|
.superpowers/
|
||||||
|
.coverage
|
||||||
|
build/
|
||||||
|
"<MagicMock*"
|
||||||
|
|
||||||
# cf-orch private profiles (commit on personal/heimdall branch only)
|
# cf-orch private profiles (commit on personal/heimdall branch only)
|
||||||
circuitforge_core/resources/profiles/private/
|
circuitforge_core/resources/profiles/private/
|
||||||
|
|
|
||||||
|
|
@ -27,4 +27,8 @@ def get_connection(db_path: Path, key: str = "") -> sqlite3.Connection:
|
||||||
return conn
|
return conn
|
||||||
# timeout=30: retry for up to 30s when another writer holds the lock (WAL mode
|
# timeout=30: retry for up to 30s when another writer holds the lock (WAL mode
|
||||||
# allows concurrent readers but only one writer at a time).
|
# allows concurrent readers but only one writer at a time).
|
||||||
return sqlite3.connect(str(db_path), timeout=30)
|
# check_same_thread=False: each Store is owned by exactly one request; FastAPI
|
||||||
|
# uses asyncio.to_thread() to run sync DB calls in a worker thread, crossing
|
||||||
|
# the thread boundary that sqlite3 guards by default. Since no two threads share
|
||||||
|
# the same connection, disabling the check is safe.
|
||||||
|
return sqlite3.connect(str(db_path), timeout=30, check_same_thread=False)
|
||||||
|
|
|
||||||
|
|
@ -3,11 +3,12 @@ from __future__ import annotations
|
||||||
import logging
|
import logging
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI, HTTPException
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from circuitforge_core.resources.agent.eviction_executor import EvictionExecutor
|
from circuitforge_core.resources.agent.eviction_executor import EvictionExecutor
|
||||||
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
|
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
|
||||||
|
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -17,10 +18,16 @@ class EvictRequest(BaseModel):
|
||||||
grace_period_s: float = 5.0
|
grace_period_s: float = 5.0
|
||||||
|
|
||||||
|
|
||||||
|
class ServiceStartRequest(BaseModel):
    """Request body for POST /services/{service}/start."""

    # GPU index passed through to ServiceManager.start.
    gpu_id: int = 0
    # Template parameters forwarded to the service's command/args template
    # (e.g. {"model": "..."}). Pydantic deep-copies field defaults, so the
    # mutable {} default is safe here (unlike a plain function default).
    params: dict[str, str] = {}
||||||
|
|
||||||
|
|
||||||
def create_agent_app(
|
def create_agent_app(
|
||||||
node_id: str,
|
node_id: str,
|
||||||
monitor: GpuMonitor | None = None,
|
monitor: GpuMonitor | None = None,
|
||||||
executor: EvictionExecutor | None = None,
|
executor: EvictionExecutor | None = None,
|
||||||
|
service_manager: ServiceManager | None = None,
|
||||||
) -> FastAPI:
|
) -> FastAPI:
|
||||||
_monitor = monitor or GpuMonitor()
|
_monitor = monitor or GpuMonitor()
|
||||||
_executor = executor or EvictionExecutor()
|
_executor = executor or EvictionExecutor()
|
||||||
|
|
@ -57,4 +64,38 @@ def create_agent_app(
|
||||||
"message": result.message,
|
"message": result.message,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@app.get("/resident-info")
|
||||||
|
def resident_info() -> dict[str, Any]:
|
||||||
|
"""Return which models are currently loaded in each running managed service."""
|
||||||
|
if service_manager is None:
|
||||||
|
return {"residents": []}
|
||||||
|
from circuitforge_core.resources.agent.service_probe import probe_all
|
||||||
|
return {"residents": probe_all(service_manager)}
|
||||||
|
|
||||||
|
if service_manager is not None:
|
||||||
|
@app.get("/services")
|
||||||
|
def list_services() -> dict:
|
||||||
|
return {"running": service_manager.list_running()}
|
||||||
|
|
||||||
|
@app.get("/services/{service}")
|
||||||
|
def service_status(service: str) -> dict:
|
||||||
|
running = service_manager.is_running(service)
|
||||||
|
url = service_manager.get_url(service) if running else None
|
||||||
|
return {"service": service, "running": running, "url": url}
|
||||||
|
|
||||||
|
@app.post("/services/{service}/start")
|
||||||
|
def start_service(service: str, req: ServiceStartRequest) -> dict:
|
||||||
|
try:
|
||||||
|
url = service_manager.start(service, req.gpu_id, req.params)
|
||||||
|
return {"service": service, "url": url, "running": True}
|
||||||
|
except (ValueError, NotImplementedError) as exc:
|
||||||
|
raise HTTPException(status_code=422, detail=str(exc))
|
||||||
|
except Exception as exc:
|
||||||
|
raise HTTPException(status_code=500, detail=f"Failed to start {service}: {exc}")
|
||||||
|
|
||||||
|
@app.post("/services/{service}/stop")
|
||||||
|
def stop_service(service: str) -> dict:
|
||||||
|
stopped = service_manager.stop(service)
|
||||||
|
return {"service": service, "stopped": stopped}
|
||||||
|
|
||||||
return app
|
return app
|
||||||
|
|
|
||||||
169
circuitforge_core/resources/agent/service_manager.py
Normal file
169
circuitforge_core/resources/agent/service_manager.py
Normal file
|
|
@ -0,0 +1,169 @@
|
||||||
|
"""
|
||||||
|
ServiceManager — start/stop Docker containers and processes for cf-orch managed services.
|
||||||
|
|
||||||
|
Container naming convention: cf-orch-{service}-{node_id}
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from circuitforge_core.resources.profiles.schema import DockerSpec, GpuProfile, ProcessSpec
|
||||||
|
|
||||||
|
|
||||||
|
def _expand_volume(v: str) -> str:
|
||||||
|
"""Expand bash-style volume strings including ${VAR:-default} and $VAR."""
|
||||||
|
def _sub(m: re.Match) -> str: # type: ignore[type-arg]
|
||||||
|
var, default = m.group(1), m.group(2) or ""
|
||||||
|
return os.environ.get(var) or default
|
||||||
|
v = re.sub(r"\$\{(\w+)(?::-(.*?))?\}", _sub, v)
|
||||||
|
v = re.sub(r"\$(\w+)", lambda m: os.environ.get(m.group(1), m.group(0)), v)
|
||||||
|
return v
|
||||||
|
|
||||||
|
|
||||||
|
class ServiceManager:
    """Start, stop, and monitor cf-orch managed services on one node.

    Two kinds of managed specs are supported:

    - ``DockerSpec``: the service runs as a Docker container named
      ``cf-orch-{service}-{node_id}``; state is queried via ``docker inspect``.
    - ``ProcessSpec``: the service runs as a local child process (Popen);
      a socket probe on ``spec.host_port`` confirms readiness, so a process
      that started but is not yet listening counts as not running.
    """

    def __init__(
        self,
        node_id: str,
        profile: GpuProfile,
        advertise_host: str = "127.0.0.1",
    ) -> None:
        self.node_id = node_id
        self.profile = profile
        # Host name/IP used when building URLs returned to callers.
        self.advertise_host = advertise_host
        # ProcessSpec services only: service name -> subprocess.Popen handle.
        self._procs: dict[str, Any] = {}

    def container_name(self, service: str) -> str:
        """Return the Docker container name for *service* on this node."""
        return f"cf-orch-{service}-{self.node_id}"

    def _get_spec(self, service: str) -> DockerSpec | ProcessSpec | None:
        """Return the managed spec for *service*, or None if unknown/unmanaged."""
        svc = self.profile.services.get(service)
        if svc is None:
            return None
        return svc.managed

    def is_running(self, service: str) -> bool:
        """Return True when *service* is up and (for processes) accepting connections.

        Docker: ``docker inspect`` must report State.Running.
        Process: the child must be alive AND its host_port must accept a TCP
        connection — the probe distinguishes "started" from "ready".
        """
        spec = self._get_spec(service)
        if spec is None:
            return False
        if isinstance(spec, DockerSpec):
            try:
                result = subprocess.run(
                    [
                        "docker",
                        "inspect",
                        "--format",
                        "{{.State.Running}}",
                        self.container_name(service),
                    ],
                    capture_output=True,
                    text=True,
                    check=True,
                )
                return result.stdout.strip() == "true"
            except (subprocess.CalledProcessError, FileNotFoundError):
                # Container does not exist, or the docker binary is not
                # installed — either way the service is not running.
                return False
        if isinstance(spec, ProcessSpec):
            proc = self._procs.get(service)
            if proc is None or proc.poll() is not None:
                return False
            import socket

            try:
                with socket.create_connection(("127.0.0.1", spec.host_port), timeout=1):
                    return True
            except OSError:
                return False
        return False

    def start(self, service: str, gpu_id: int, params: dict[str, str]) -> str:
        """Start *service* on *gpu_id* and return its base URL.

        Idempotent: when the service is already running the existing URL is
        returned and nothing is launched.

        Raises:
            ValueError: *service* is not in the profile or has no managed spec.
            NotImplementedError: the managed spec type is unknown.
        """
        spec = self._get_spec(service)
        if spec is None:
            raise ValueError(f"Service {service!r} not in profile or has no managed spec")

        if self.is_running(service):
            return f"http://{self.advertise_host}:{spec.host_port}"

        if isinstance(spec, DockerSpec):
            expanded_volumes = [_expand_volume(v) for v in spec.volumes]

            # defaultdict(str): unknown {placeholders} in the template expand
            # to "" instead of raising KeyError inside format_map.
            filler: dict[str, str] = defaultdict(str, params)
            expanded_command = spec.command_template.format_map(filler).split()

            cmd = [
                "docker", "run", "-d", "--rm",
                "--name", self.container_name(service),
                "--runtime", spec.runtime,
                "--gpus", f"device={gpu_id}",
                "--ipc", spec.ipc,
                "-p", f"{spec.host_port}:{spec.port}",
            ]
            for vol in expanded_volumes:
                cmd += ["-v", vol]
            for key, val in spec.env.items():
                cmd += ["-e", f"{key}={val}"]
            cmd.append(spec.image)
            cmd.extend(expanded_command)

            subprocess.run(cmd, check=True, capture_output=True, text=True)
            return f"http://{self.advertise_host}:{spec.host_port}"

        if isinstance(spec, ProcessSpec):
            filler = defaultdict(str, params)
            # Callers may override port/gpu_id via params; otherwise the spec
            # and the requested GPU fill them in.
            filler.setdefault("port", str(spec.port))
            filler.setdefault("gpu_id", str(gpu_id))
            args_expanded = spec.args_template.format_map(filler).split()

            cmd = [spec.exec_path] + args_expanded
            proc = subprocess.Popen(
                cmd,
                cwd=spec.cwd or None,
                env=os.environ.copy(),
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            self._procs[service] = proc
            return f"http://{self.advertise_host}:{spec.host_port}"

        raise NotImplementedError(f"Unknown spec type: {type(spec)}")

    def stop(self, service: str) -> bool:
        """Stop *service*. Returns True on success (idempotent for processes)."""
        spec = self._get_spec(service)
        if spec is None:
            return False
        if isinstance(spec, DockerSpec):
            try:
                subprocess.run(
                    ["docker", "stop", self.container_name(service)],
                    check=True,
                    capture_output=True,
                    text=True,
                )
                return True
            except (subprocess.CalledProcessError, FileNotFoundError):
                return False
        if isinstance(spec, ProcessSpec):
            proc = self._procs.pop(service, None)
            if proc is not None:
                proc.terminate()
                try:
                    proc.wait(timeout=10)
                except Exception:
                    # Did not exit within the grace period — escalate.
                    proc.kill()
            # Idempotent: True even when no process was tracked.
            return True
        return False

    def list_running(self) -> list[str]:
        """Return the names of all profile services that are currently running."""
        return [svc for svc in self.profile.services if self.is_running(svc)]

    def get_url(self, service: str) -> str | None:
        """Return the base URL for *service*, or None when it is not running."""
        spec = self._get_spec(service)
        if spec is None or not self.is_running(service):
            return None
        return f"http://{self.advertise_host}:{spec.host_port}"
|
||||||
123
circuitforge_core/resources/agent/service_probe.py
Normal file
123
circuitforge_core/resources/agent/service_probe.py
Normal file
|
|
@ -0,0 +1,123 @@
|
||||||
|
"""
|
||||||
|
Probe running services to detect which models are currently loaded in VRAM.
|
||||||
|
|
||||||
|
Two probe strategies run together:
|
||||||
|
|
||||||
|
1. Well-known ports — always checked, regardless of who started the service.
|
||||||
|
Catches ollama, vLLM, etc. running outside cf-orch management.
|
||||||
|
|
||||||
|
2. Managed services — services cf-orch started via ServiceManager.
|
||||||
|
Checked on their configured host_port, deduplicates with well-known results.
|
||||||
|
|
||||||
|
Each service exposes a different introspection API:
|
||||||
|
- vllm: GET /v1/models → {"data": [{"id": "<model-name>"}]}
|
||||||
|
- ollama: GET /api/ps → {"models": [{"name": "<model>", "size_vram": <bytes>}]}
|
||||||
|
|
||||||
|
ollama can have multiple models loaded simultaneously; each is reported as a
|
||||||
|
separate entry so the dashboard shows per-model residency.
|
||||||
|
|
||||||
|
The probe is best-effort: a timeout or connection refusal means model_name=None
|
||||||
|
but the service is still reported as resident.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import urllib.request
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from circuitforge_core.resources.profiles.schema import DockerSpec
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_PROBE_TIMEOUT_S = 2.0
|
||||||
|
|
||||||
|
# Well-known service ports probed on every heartbeat.
|
||||||
|
# port → service name (the name doubles as the prober key in _PROBERS)
|
||||||
|
_WELL_KNOWN_PORTS: dict[int, str] = {
|
||||||
|
11434: "ollama",
|
||||||
|
8000: "vllm",
|
||||||
|
8080: "vllm", # common alt vLLM port
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_json(url: str) -> dict[str, Any] | None:
    """GET *url* and decode its JSON body.

    Best-effort: any failure (connection refused, timeout, malformed URL,
    invalid JSON) is logged at debug level and reported as None.
    """
    try:
        with urllib.request.urlopen(url, timeout=_PROBE_TIMEOUT_S) as response:
            payload = response.read()
            return json.loads(payload)
    except Exception as exc:
        logger.debug("Probe %s: %s", url, exc)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _probe_vllm(port: int) -> list[str]:
    """Return model ids served by an OpenAI-compatible server on *port* (may be empty)."""
    payload = _fetch_json(f"http://127.0.0.1:{port}/v1/models")
    if not payload:
        return []
    entries = payload.get("data") or []
    return [entry["id"] for entry in entries if entry.get("id")]
|
||||||
|
|
||||||
|
|
||||||
|
def _probe_ollama(port: int) -> list[str]:
    """Return names of models ollama currently has loaded in memory.

    /api/ps reports models *resident in memory*, not merely downloaded.
    """
    payload = _fetch_json(f"http://127.0.0.1:{port}/api/ps")
    if not payload:
        return []
    entries = payload.get("models") or []
    return [entry["name"] for entry in entries if entry.get("name")]
|
||||||
|
|
||||||
|
|
||||||
|
_PROBERS: dict[str, Any] = {
|
||||||
|
"vllm": _probe_vllm,
|
||||||
|
"ollama": _probe_ollama,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def probe_all(service_manager: Any) -> list[dict[str, Any]]:
    """
    Probe all services — both well-known ports and cf-orch managed services.

    Returns a list of dicts: [{"service": str, "model_name": str | None}].
    Multiple loaded models in one service (e.g. two ollama models) each get
    their own entry, disambiguated as "ollama/0", "ollama/1", etc.
    """
    results: list[dict[str, Any]] = []
    seen_ports: set[int] = set()

    def _append_models(name: str, models: list[str]) -> None:
        # One entry per loaded model; >1 model gets "/0", "/1", ... suffixes.
        if len(models) == 1:
            results.append({"service": name, "model_name": models[0]})
        else:
            for i, model in enumerate(models):
                results.append({"service": f"{name}/{i}", "model_name": model})

    # ── 1. Well-known ports ──────────────────────────────────────────
    for port, service in _WELL_KNOWN_PORTS.items():
        prober = _PROBERS.get(service)
        if prober is None:
            continue
        models = prober(port)
        if not models:
            continue  # nothing on this port right now
        seen_ports.add(port)
        _append_models(service, models)

    # ── 2. Managed services (cf-orch started) ───────────────────────
    if service_manager is not None:
        for service in service_manager.list_running():
            spec = service_manager._get_spec(service)
            # BUG FIX: the original only probed DockerSpec specs, so
            # ProcessSpec-managed services (the new llm_server) were never
            # reported. Duck-type on host_port so both spec kinds are probed.
            host_port = getattr(spec, "host_port", None)
            if host_port is None:
                continue
            if host_port in seen_ports:
                continue  # already captured by well-known probe
            prober = _PROBERS.get(service)
            if prober is None:
                # Running but we have no introspection API for it.
                results.append({"service": service, "model_name": None})
                continue
            models = prober(host_port)
            seen_ports.add(host_port)
            if not models:
                # Probe failed or no model loaded — still report residency.
                results.append({"service": service, "model_name": None})
            else:
                _append_models(service, models)

    return results
|
||||||
0
circuitforge_core/resources/inference/__init__.py
Normal file
0
circuitforge_core/resources/inference/__init__.py
Normal file
135
circuitforge_core/resources/inference/llm_server.py
Normal file
135
circuitforge_core/resources/inference/llm_server.py
Normal file
|
|
@ -0,0 +1,135 @@
|
||||||
|
"""Generic OpenAI-compatible inference server for HuggingFace causal LMs."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import uvicorn
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
|
||||||
|
_model: Any = None
|
||||||
|
_tokenizer: Any = None
|
||||||
|
_model_id: str = ""
|
||||||
|
_device: str = "cpu"
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    # Intentionally empty: the model is loaded in main() *before* uvicorn.run,
    # so a failed load aborts the process instead of serving 503s.
    yield


app = FastAPI(lifespan=lifespan)
|
||||||
|
|
||||||
|
|
||||||
|
class Message(BaseModel):
    """One chat turn in OpenAI chat-completions format."""

    role: str
    content: str


class ChatRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-compatible subset)."""

    # Accepted for API compatibility but ignored — this server hosts exactly
    # one model (see chat_completions, which never reads it).
    model: str | None = None
    messages: list[Message]
    max_tokens: int | None = 512
    temperature: float | None = 0.7
    # Streaming is rejected with HTTP 501 by chat_completions.
    stream: bool | None = False
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe; also reports the loaded model id ("" before load)."""
    return {"status": "ok", "model": _model_id}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/v1/models")
def list_models() -> dict[str, Any]:
    """OpenAI-compatible model listing; always exactly the one hosted model."""
    return {
        "object": "list",
        "data": [{"id": _model_id, "object": "model", "owned_by": "cf-orch"}],
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest) -> dict[str, Any]:
    """OpenAI-compatible chat completion (non-streaming only).

    Raises:
        HTTPException(503): model not loaded yet.
        HTTPException(501): streaming was requested (unsupported).
        HTTPException(500): applying the chat template failed.
    """
    if _model is None:
        raise HTTPException(503, detail="Model not loaded")
    if req.stream:
        raise HTTPException(501, detail="Streaming not supported")

    conversation = [{"role": m.role, "content": m.content} for m in req.messages]
    try:
        input_ids = _tokenizer.apply_chat_template(
            conversation,
            return_tensors="pt",
            add_generation_prompt=True,
        ).to(_device)
    except Exception as exc:
        # Chain the cause so the original traceback survives in server logs.
        raise HTTPException(500, detail=f"Tokenisation failed: {exc}") from exc

    max_new = req.max_tokens or 512
    temp = req.temperature if req.temperature is not None else 0.7
    gen_kwargs: dict[str, Any] = {
        "max_new_tokens": max_new,
        # temperature == 0 means greedy decoding: sampling is disabled and
        # temperature is only passed to generate() when actually sampling.
        "do_sample": temp > 0,
        "pad_token_id": _tokenizer.eos_token_id,
    }
    if temp > 0:
        gen_kwargs["temperature"] = temp

    with torch.inference_mode():
        output_ids = _model.generate(input_ids, **gen_kwargs)

    # generate() returns prompt + completion; decode only the new suffix.
    new_tokens = output_ids[0][input_ids.shape[-1]:]
    reply = _tokenizer.decode(new_tokens, skip_special_tokens=True)

    return {
        "id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
        "object": "chat.completion",
        "created": int(time.time()),
        "model": _model_id,
        "choices": [
            {
                "index": 0,
                "message": {"role": "assistant", "content": reply},
                "finish_reason": "stop",
            }
        ],
        "usage": {
            "prompt_tokens": input_ids.shape[-1],
            "completion_tokens": len(new_tokens),
            "total_tokens": input_ids.shape[-1] + len(new_tokens),
        },
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _load_model(model_path: str, gpu_id: int) -> None:
    """Load tokenizer and model into module globals, onto cuda:{gpu_id} or CPU.

    Uses float16 on GPU and float32 on CPU.
    """
    global _model, _tokenizer, _model_id, _device
    _device = f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu"
    # The model id reported by /health and /v1/models is the load path.
    _model_id = model_path
    # NOTE(review): trust_remote_code=True executes model-repo code —
    # presumably required by the target checkpoints; confirm they need it.
    _tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # NOTE(review): `torch_dtype` is being renamed to `dtype` in newer
    # transformers releases — verify against the pinned transformers version.
    _model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.float16 if "cuda" in _device else torch.float32,
        device_map={"": _device},
        trust_remote_code=True,
    )
    _model.eval()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: parse args, load the model, then serve via uvicorn.

    The model is loaded *before* uvicorn starts so a startup failure exits
    the process instead of serving a half-initialised API.
    """
    parser = argparse.ArgumentParser(description="cf-orch generic LLM inference server")
    parser.add_argument("--model", required=True)
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--gpu-id", type=int, default=0)
    args = parser.parse_args()
    _load_model(args.model, args.gpu_id)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")


if __name__ == "__main__":
    main()
|
||||||
|
|
@ -3,6 +3,7 @@ from __future__ import annotations
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
@dataclass(frozen=True)
|
@dataclass(frozen=True)
|
||||||
|
|
@ -48,6 +49,15 @@ class GpuInfo:
|
||||||
vram_free_mb: int
|
vram_free_mb: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
|
||||||
|
class ResidentAllocation:
|
||||||
|
"""A model that is loaded and warm in VRAM but not actively serving a request."""
|
||||||
|
service: str
|
||||||
|
node_id: str
|
||||||
|
model_name: Optional[str] # None if service is running but model probe failed
|
||||||
|
first_seen: float = field(default_factory=time.time)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class NodeInfo:
|
class NodeInfo:
|
||||||
node_id: str
|
node_id: str
|
||||||
|
|
|
||||||
|
|
@ -4,9 +4,16 @@ vram_total_mb: 16384
|
||||||
eviction_timeout_s: 10.0
|
eviction_timeout_s: 10.0
|
||||||
services:
|
services:
|
||||||
vllm:
|
vllm:
|
||||||
max_mb: 12288
|
max_mb: 9000
|
||||||
priority: 1
|
priority: 1
|
||||||
idle_stop_after_s: 600
|
idle_stop_after_s: 600
|
||||||
|
managed:
|
||||||
|
type: process
|
||||||
|
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||||
|
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||||
|
port: 8000
|
||||||
|
host_port: 8000
|
||||||
|
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||||
ollama:
|
ollama:
|
||||||
max_mb: 12288
|
max_mb: 12288
|
||||||
priority: 1
|
priority: 1
|
||||||
|
|
|
||||||
|
|
@ -4,9 +4,16 @@ vram_total_mb: 24576
|
||||||
eviction_timeout_s: 10.0
|
eviction_timeout_s: 10.0
|
||||||
services:
|
services:
|
||||||
vllm:
|
vllm:
|
||||||
max_mb: 20480
|
max_mb: 9000
|
||||||
priority: 1
|
priority: 1
|
||||||
idle_stop_after_s: 600
|
idle_stop_after_s: 600
|
||||||
|
managed:
|
||||||
|
type: process
|
||||||
|
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||||
|
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||||
|
port: 8000
|
||||||
|
host_port: 8000
|
||||||
|
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||||
ollama:
|
ollama:
|
||||||
max_mb: 18432
|
max_mb: 18432
|
||||||
priority: 1
|
priority: 1
|
||||||
|
|
|
||||||
|
|
@ -4,19 +4,16 @@ vram_total_mb: 6144
|
||||||
eviction_timeout_s: 10.0
|
eviction_timeout_s: 10.0
|
||||||
services:
|
services:
|
||||||
vllm:
|
vllm:
|
||||||
max_mb: 4096
|
max_mb: 5500
|
||||||
priority: 1
|
priority: 1
|
||||||
idle_stop_after_s: 600
|
idle_stop_after_s: 600
|
||||||
managed:
|
managed:
|
||||||
type: docker
|
type: process
|
||||||
image: "vllm/vllm-openai:v0.9.2"
|
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||||
|
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||||
port: 8000
|
port: 8000
|
||||||
host_port: 8000
|
host_port: 8000
|
||||||
command_template: "--model /models/{model} --trust-remote-code --max-model-len {max_model_len} --gpu-memory-utilization {gpu_mem_util} --enforce-eager --max-num-seqs 8"
|
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||||
volumes:
|
|
||||||
- "${VLLM_MODELS_DIR:-/Library/Assets/LLM/vllm/models}:/models"
|
|
||||||
runtime: nvidia
|
|
||||||
ipc: host
|
|
||||||
ollama:
|
ollama:
|
||||||
max_mb: 3584
|
max_mb: 3584
|
||||||
priority: 1
|
priority: 1
|
||||||
|
|
|
||||||
|
|
@ -4,19 +4,16 @@ vram_total_mb: 8192
|
||||||
eviction_timeout_s: 10.0
|
eviction_timeout_s: 10.0
|
||||||
services:
|
services:
|
||||||
vllm:
|
vllm:
|
||||||
max_mb: 5120
|
max_mb: 6500
|
||||||
priority: 1
|
priority: 1
|
||||||
idle_stop_after_s: 600
|
idle_stop_after_s: 600
|
||||||
managed:
|
managed:
|
||||||
type: docker
|
type: process
|
||||||
image: "vllm/vllm-openai:v0.9.2"
|
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||||
|
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||||
port: 8000
|
port: 8000
|
||||||
host_port: 8000
|
host_port: 8000
|
||||||
command_template: "--model /models/{model} --trust-remote-code --max-model-len {max_model_len} --gpu-memory-utilization {gpu_mem_util} --enforce-eager --max-num-seqs 8"
|
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||||
volumes:
|
|
||||||
- "${VLLM_MODELS_DIR:-/Library/Assets/LLM/vllm/models}:/models"
|
|
||||||
runtime: nvidia
|
|
||||||
ipc: host
|
|
||||||
ollama:
|
ollama:
|
||||||
max_mb: 4096
|
max_mb: 4096
|
||||||
priority: 1
|
priority: 1
|
||||||
|
|
|
||||||
194
tests/test_resources/test_service_manager.py
Normal file
194
tests/test_resources/test_service_manager.py
Normal file
|
|
@ -0,0 +1,194 @@
|
||||||
|
"""Tests for ServiceManager ProcessSpec support."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
||||||
|
from circuitforge_core.resources.profiles.schema import (
|
||||||
|
GpuProfile,
|
||||||
|
ProcessSpec,
|
||||||
|
ServiceProfile,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _make_profile(args_template: str = "--port {port} --gpu-id {gpu_id}") -> GpuProfile:
    """Build a minimal GpuProfile: one ProcessSpec service plus one unmanaged one."""
    return GpuProfile(
        schema_version=1,
        name="test",
        vram_total_mb=8192,
        services={
            "vllm": ServiceProfile(
                max_mb=5120,
                priority=1,
                managed=ProcessSpec(
                    exec_path="/usr/bin/python",
                    args_template=args_template,
                    port=8000,
                    host_port=8000,
                    cwd="/tmp",
                ),
            ),
            # Exercises the "known service without a managed spec" code paths.
            "no_managed": ServiceProfile(max_mb=1024, priority=2),
        },
    )
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def manager():
    # Fresh ServiceManager per test so _procs state never leaks between tests.
    return ServiceManager(node_id="test-node", profile=_make_profile(), advertise_host="127.0.0.1")
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# is_running
# ---------------------------------------------------------------------------


def test_is_running_returns_false_when_no_proc(manager):
    # No Popen handle tracked for the service → not running.
    assert manager.is_running("vllm") is False


def test_is_running_returns_false_when_proc_exited(manager):
    mock_proc = MagicMock()
    mock_proc.poll.return_value = 1  # exited
    manager._procs["vllm"] = mock_proc
    assert manager.is_running("vllm") is False


def test_is_running_returns_false_when_port_not_listening(manager):
    # Process alive but the socket probe fails → "started" != "ready".
    mock_proc = MagicMock()
    mock_proc.poll.return_value = None  # still running
    manager._procs["vllm"] = mock_proc

    with patch("socket.create_connection", side_effect=OSError("refused")):
        assert manager.is_running("vllm") is False


def test_is_running_returns_true_when_proc_alive_and_port_open(manager):
    mock_proc = MagicMock()
    mock_proc.poll.return_value = None  # still running
    manager._procs["vllm"] = mock_proc

    # Fake a context-manager socket so `with create_connection(...)` works.
    mock_socket = MagicMock()
    mock_socket.__enter__ = MagicMock(return_value=mock_socket)
    mock_socket.__exit__ = MagicMock(return_value=False)
    with patch("socket.create_connection", return_value=mock_socket):
        assert manager.is_running("vllm") is True


def test_is_running_unknown_service_returns_false(manager):
    assert manager.is_running("nonexistent") is False


def test_is_running_no_managed_spec_returns_false(manager):
    # Service exists in the profile but has no `managed` spec.
    assert manager.is_running("no_managed") is False
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# start
# ---------------------------------------------------------------------------


def test_start_launches_process_and_returns_url(manager):
    with patch("subprocess.Popen") as mock_popen, \
            patch.object(manager, "is_running", return_value=False):
        mock_popen.return_value = MagicMock()
        url = manager.start("vllm", gpu_id=0, params={"model": "mymodel"})

    assert url == "http://127.0.0.1:8000"
    mock_popen.assert_called_once()
    call_args = mock_popen.call_args
    cmd = call_args[0][0]
    # args_template placeholders {port}/{gpu_id} must be filled in.
    assert cmd[0] == "/usr/bin/python"
    assert "--port" in cmd
    assert "8000" in cmd
    assert "--gpu-id" in cmd
    assert "0" in cmd


def test_start_returns_url_immediately_when_already_running(manager):
    # Idempotent: no second Popen when the service is already up.
    with patch.object(manager, "is_running", return_value=True):
        with patch("subprocess.Popen") as mock_popen:
            url = manager.start("vllm", gpu_id=0, params={})

    assert url == "http://127.0.0.1:8000"
    mock_popen.assert_not_called()


def test_start_raises_for_unknown_service(manager):
    with pytest.raises(ValueError, match="not in profile"):
        manager.start("nonexistent", gpu_id=0, params={})


def test_start_stores_proc_in_procs(manager):
    # The Popen handle must be tracked so stop()/is_running() can find it.
    mock_proc = MagicMock()
    with patch("subprocess.Popen", return_value=mock_proc), \
            patch.object(manager, "is_running", return_value=False):
        manager.start("vllm", gpu_id=0, params={})

    assert manager._procs["vllm"] is mock_proc
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# stop
# ---------------------------------------------------------------------------


def test_stop_terminates_running_process(manager):
    mock_proc = MagicMock()
    manager._procs["vllm"] = mock_proc

    result = manager.stop("vllm")

    assert result is True
    mock_proc.terminate.assert_called_once()
    mock_proc.wait.assert_called_once()
    # Handle must be dropped so a later start() can relaunch.
    assert "vllm" not in manager._procs


def test_stop_kills_process_that_wont_terminate(manager):
    # wait() raising (e.g. timeout) escalates to kill().
    mock_proc = MagicMock()
    mock_proc.wait.side_effect = Exception("timeout")
    manager._procs["vllm"] = mock_proc

    result = manager.stop("vllm")

    assert result is True
    mock_proc.kill.assert_called_once()


def test_stop_returns_true_when_no_proc_tracked(manager):
    # No proc in _procs — still returns True (idempotent stop)
    result = manager.stop("vllm")
    assert result is True


def test_stop_returns_false_for_unknown_service(manager):
    # Unknown service has no managed spec at all → False, not an exception.
    result = manager.stop("nonexistent")
    assert result is False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# list_running / get_url
# ---------------------------------------------------------------------------


def test_list_running_returns_running_services(manager):
    # Only services whose is_running() is True appear in the listing.
    def _is_running(svc: str) -> bool:
        return svc == "vllm"

    with patch.object(manager, "is_running", side_effect=_is_running):
        running = manager.list_running()

    assert running == ["vllm"]


def test_get_url_returns_none_when_not_running(manager):
    with patch.object(manager, "is_running", return_value=False):
        assert manager.get_url("vllm") is None


def test_get_url_returns_url_when_running(manager):
    with patch.object(manager, "is_running", return_value=True):
        assert manager.get_url("vllm") == "http://127.0.0.1:8000"
|
||||||
Loading…
Reference in a new issue