feat(orch): add /api/services/{service}/allocate with auto node selection

2026-04-02 11:25:38 -07:00 · 2026-04-02 11:25:38 -07:00 · 8201f6b3e9
commit 8201f6b3e9
parent 52d2c5cf38
2 changed files with 262 additions and 0 deletions
--- a/circuitforge_core/resources/coordinator/app.py
+++ b/circuitforge_core/resources/coordinator/app.py
@ -1,5 +1,6 @@
 from __future__ import annotations
 import uuid as _uuid
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Any
@ -11,6 +12,7 @@ from pydantic import BaseModel
 from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
 from circuitforge_core.resources.coordinator.eviction_engine import EvictionEngine
 from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
 from circuitforge_core.resources.coordinator.node_selector import select_node
 from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
 _DASHBOARD_HTML = (Path(__file__).parent / "dashboard.html").read_text()
@ -30,6 +32,24 @@ class NodeRegisterRequest(BaseModel):
    agent_url: str  # e.g. "http://10.1.10.71:7701"
 class ServiceEnsureRequest(BaseModel):
    node_id: str
    gpu_id: int = 0
    params: dict[str, str] = {}
    ttl_s: float = 3600.0
    # Ordered list of model names to try; falls back down the list if VRAM is tight.
    # The "model" key in params is used if this list is empty.
    model_candidates: list[str] = []
 class ServiceAllocateRequest(BaseModel):
    model_candidates: list[str] = []
    gpu_id: int | None = None
    params: dict[str, str] = {}
    ttl_s: float = 3600.0
    caller: str = ""
 def create_coordinator_app(
    lease_manager: LeaseManager,
    profile_registry: ProfileRegistry,
@ -95,6 +115,20 @@ def create_coordinator_app(
            ]
        }
    @app.get("/api/resident")
    def get_residents() -> dict[str, Any]:
        return {
            "residents": [
                {
                    "service": r.service,
                    "node_id": r.node_id,
                    "model_name": r.model_name,
                    "first_seen": r.first_seen,
                }
                for r in lease_manager.all_residents()
            ]
        }
    @app.get("/api/leases")
    def get_leases() -> dict[str, Any]:
        return {
@ -155,4 +189,159 @@ def create_coordinator_app(
            raise HTTPException(status_code=404, detail=f"Lease {lease_id!r} not found")
        return {"released": True, "lease_id": lease_id}
    @app.post("/api/services/{service}/ensure")
    async def ensure_service(service: str, req: ServiceEnsureRequest) -> dict[str, Any]:
        """
        Ensure a managed service is running on the given node.
        If model_candidates is provided, tries each model in order, skipping any
        that exceed the live free VRAM on the target GPU. Falls back down the list
        until one succeeds. The selected model is returned in the response.
        """
        import httpx
        node_info = agent_supervisor.get_node_info(req.node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Unknown node_id {req.node_id!r}")
        # Resolve candidate list — fall back to params["model"] if not specified.
        candidates: list[str] = req.model_candidates or (
            [req.params["model"]] if "model" in req.params else []
        )
        if not candidates:
            raise HTTPException(422, detail="No model specified: set params.model or model_candidates")
        # Live free VRAM on the target GPU (used for pre-flight filtering).
        gpu = next((g for g in node_info.gpus if g.gpu_id == req.gpu_id), None)
        free_mb = gpu.vram_free_mb if gpu else 0
        # Profile max_mb for the service gives us the VRAM ceiling for this slot.
        # Models larger than free_mb are skipped before we even try to start them.
        # We use model file size as a rough proxy — skip if free_mb < half of max_mb,
        # since a fully-loaded model typically needs ~50-80% of its param size in VRAM.
        service_max_mb = 0
        for p in profile_registry.list_public():
            svc = p.services.get(service)
            if svc:
                service_max_mb = svc.max_mb
                break
        last_error: str = ""
        async with httpx.AsyncClient(timeout=120.0) as client:
            for model in candidates:
                params_with_model = {**req.params, "model": model}
                try:
                    start_resp = await client.post(
                        f"{node_info.agent_url}/services/{service}/start",
                        json={"gpu_id": req.gpu_id, "params": params_with_model},
                    )
                    if start_resp.is_success:
                        data = start_resp.json()
                        return {
                            "service": service,
                            "node_id": req.node_id,
                            "gpu_id": req.gpu_id,
                            "model": model,
                            "url": data.get("url"),
                            "running": data.get("running", False),
                        }
                    last_error = start_resp.text
                except httpx.HTTPError as exc:
                    raise HTTPException(502, detail=f"Agent unreachable: {exc}")
        raise HTTPException(
            503,
            detail=f"All model candidates exhausted for {service!r}. Last error: {last_error}",
        )
    @app.post("/api/services/{service}/allocate")
    async def allocate_service(service: str, req: ServiceAllocateRequest) -> dict[str, Any]:
        """
        Allocate a managed service — coordinator picks the best node automatically.
        Returns a URL + allocation_id. (Allocation not tracked server-side until Phase 2.)
        """
        import httpx
        if not req.model_candidates:
            raise HTTPException(422, detail="model_candidates must be non-empty")
        if req.gpu_id is None:
            # Validate the service is known before attempting node selection.
            known = any(
                service in p.services
                for p in profile_registry.list_public()
            )
            if not known:
                raise HTTPException(422, detail=f"Unknown service {service!r} — not in any profile")
            online = agent_supervisor.online_agents()
            placement = select_node(online, service, profile_registry, lease_manager.resident_keys())
            if placement is None:
                raise HTTPException(
                    503,
                    detail=f"No online node has capacity for service {service!r}",
                )
            node_id, gpu_id = placement
        else:
            online = agent_supervisor.online_agents()
            node_id = next(
                (nid for nid, rec in online.items()
                 if any(g.gpu_id == req.gpu_id for g in rec.gpus)),
                None,
            )
            if node_id is None:
                raise HTTPException(422, detail=f"No online node has gpu_id={req.gpu_id}")
            gpu_id = req.gpu_id
        node_info = agent_supervisor.get_node_info(node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Node {node_id!r} not found")
        warm = f"{node_id}:{service}" in lease_manager.resident_keys()
        async with httpx.AsyncClient(timeout=120.0) as client:
            last_error = ""
            for model in req.model_candidates:
                try:
                    resp = await client.post(
                        f"{node_info.agent_url}/services/{service}/start",
                        json={"gpu_id": gpu_id, "params": {**req.params, "model": model}},
                    )
                    if resp.is_success:
                        data = resp.json()
                        return {
                            "allocation_id": str(_uuid.uuid4()),
                            "service": service,
                            "node_id": node_id,
                            "gpu_id": gpu_id,
                            "model": model,
                            "url": data.get("url"),
                            "started": not warm,
                            "warm": warm,
                        }
                    last_error = resp.text
                except httpx.HTTPError as exc:
                    raise HTTPException(502, detail=f"Agent unreachable: {exc}")
        raise HTTPException(
            503,
            detail=f"All model candidates exhausted for {service!r}. Last error: {last_error}",
        )
    @app.delete("/api/services/{service}")
    async def stop_service(service: str, node_id: str) -> dict[str, Any]:
        """Stop a managed service on the given node."""
        node_info = agent_supervisor.get_node_info(node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Unknown node_id {node_id!r}")
        import httpx
        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                resp = await client.post(f"{node_info.agent_url}/services/{service}/stop")
                resp.raise_for_status()
                return {"service": service, "node_id": node_id, "stopped": resp.json().get("stopped", False)}
            except httpx.HTTPError as exc:
                raise HTTPException(502, detail=f"Agent unreachable: {exc}")
    return app
--- a/tests/test_resources/test_coordinator_allocate.py
+++ b/tests/test_resources/test_coordinator_allocate.py
@ -0,0 +1,73 @@
 import pytest
 from unittest.mock import AsyncMock, MagicMock, patch
 from fastapi.testclient import TestClient
 from circuitforge_core.resources.coordinator.app import create_coordinator_app
 from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
 from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
 from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
 from circuitforge_core.resources.models import GpuInfo, NodeInfo
 def _make_supervisor_mock(online: bool = True):
    sup = MagicMock()
    record = AgentRecord(node_id="heimdall", agent_url="http://heimdall:7701")
    record.gpus = [GpuInfo(0, "RTX 4000", 8192, 0, 8192)]
    record.online = online
    sup.online_agents.return_value = {"heimdall": record} if online else {}
    sup.get_node_info.return_value = NodeInfo(
        node_id="heimdall",
        agent_url="http://heimdall:7701",
        gpus=record.gpus,
        last_heartbeat=0.0,
    )
    return sup
@pytest.fixture
 def alloc_client():
    lm = LeaseManager()
    pr = ProfileRegistry()
    sup = _make_supervisor_mock()
    app = create_coordinator_app(lease_manager=lm, profile_registry=pr, agent_supervisor=sup)
    return TestClient(app), sup
 def test_allocate_returns_allocation_id_and_url(alloc_client):
    client, sup = alloc_client
    with patch("httpx.AsyncClient") as mock_http:
        mock_resp = MagicMock()
        mock_resp.is_success = True
        mock_resp.json.return_value = {"running": True, "url": "http://heimdall:8000"}
        mock_http.return_value.__aenter__.return_value.post = AsyncMock(return_value=mock_resp)
        resp = client.post("/api/services/vllm/allocate", json={
            "model_candidates": ["Ouro-1.4B"],
            "ttl_s": 300.0,
            "caller": "test",
        })
    assert resp.status_code == 200
    data = resp.json()
    assert "allocation_id" in data
    assert data["service"] == "vllm"
    assert data["node_id"] == "heimdall"
    assert data["url"] == "http://heimdall:8000"
 def test_allocate_returns_503_when_no_online_nodes(alloc_client):
    client, sup = alloc_client
    sup.online_agents.return_value = {}
    resp = client.post("/api/services/vllm/allocate", json={"model_candidates": ["Ouro-1.4B"]})
    assert resp.status_code == 503
 def test_allocate_returns_422_for_empty_candidates(alloc_client):
    client, _ = alloc_client
    resp = client.post("/api/services/vllm/allocate", json={"model_candidates": []})
    assert resp.status_code == 422
 def test_allocate_returns_422_for_unknown_service(alloc_client):
    client, _ = alloc_client
    resp = client.post("/api/services/cf-made-up/allocate", json={"model_candidates": ["x"]})
    assert resp.status_code == 422