feat(orch): add NodeSelector — warm-first GPU scoring
This commit is contained in:
parent
427182aae7
commit
13eb0c85f1
2 changed files with 126 additions and 0 deletions
70
circuitforge_core/resources/coordinator/node_selector.py
Normal file
70
circuitforge_core/resources/coordinator/node_selector.py
Normal file
|
|
@ -0,0 +1,70 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
|
||||||
|
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||||
|
|
||||||
|
# Virtual free-VRAM credit (MB) granted to warm candidates (service already
# resident on the node). NOTE(review): with the current sort key
# (warm, effective_free_mb), warmth already dominates the comparison, so this
# bonus never changes the ranking — confirm whether it is still intended.
_WARM_BONUS_MB = 1000
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class _Scored:
    """Internal per-(node, GPU) scoring record built by select_node."""

    node_id: str  # agent/node identifier the GPU belongs to
    gpu_id: int  # GPU index on that node
    vram_free_mb: int  # raw free VRAM reported for the GPU
    effective_free_mb: int  # free VRAM plus _WARM_BONUS_MB when warm; sort key
    can_fit: bool  # free VRAM >= half the service's declared max_mb
    warm: bool  # service already resident on this node
|
||||||
|
|
||||||
|
|
||||||
|
def select_node(
    agents: "dict[str, AgentRecord]",
    service: str,
    profile_registry: "ProfileRegistry",
    resident_keys: set[str],
) -> tuple[str, int] | None:
    """
    Pick the best (node_id, gpu_id) for the requested service.

    Warm nodes (service already running) get priority, then sorted by free VRAM.
    Returns None if no suitable node exists.

    Args:
        agents: node_id -> AgentRecord for every known agent.
        service: name of the service to place.
        profile_registry: registry consulted for the service's max_mb.
        resident_keys: set of "node_id:service" keys for already-resident services.
    """
    # The service's VRAM requirement does not depend on the node being
    # examined — resolve it once instead of once per node (previously
    # recomputed inside the loop).
    service_max_mb = _find_service_max_mb(service, profile_registry)
    if service_max_mb is None:
        # No public profile declares this service, so no node can host it.
        return None

    candidates: list[_Scored] = []
    for node_id, record in agents.items():
        if not record.online:
            continue
        # Warmth is a per-node property — compute once per node, not per GPU.
        warm = f"{node_id}:{service}" in resident_keys
        for gpu in record.gpus:
            effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
            # Heuristic: a cold start is viable when at least half of the
            # service's declared max VRAM is currently free.
            can_fit = gpu.vram_free_mb >= service_max_mb // 2
            candidates.append(_Scored(
                node_id=node_id,
                gpu_id=gpu.gpu_id,
                vram_free_mb=gpu.vram_free_mb,
                effective_free_mb=effective,
                can_fit=can_fit,
                warm=warm,
            ))

    if not candidates:
        return None

    # Warm nodes are always eligible (they already have the service resident).
    # Cold nodes must pass the can_fit threshold. If no node passes either
    # criterion, fall back to the full candidate set.
    preferred = [c for c in candidates if c.warm or c.can_fit]
    pool = preferred if preferred else candidates

    # Warm nodes take priority; within the same warmth tier, prefer more free VRAM.
    best = max(pool, key=lambda c: (c.warm, c.effective_free_mb))
    return best.node_id, best.gpu_id
|
||||||
|
|
||||||
|
|
||||||
|
def _find_service_max_mb(service: str, profile_registry: "ProfileRegistry") -> int | None:
    """Return max_mb for *service* from the first public profile declaring it.

    Returns None when no public profile knows the service.
    """
    entries = (
        profile.services.get(service)
        for profile in profile_registry.list_public()
    )
    return next((entry.max_mb for entry in entries if entry is not None), None)
|
||||||
56
tests/test_resources/test_node_selector.py
Normal file
56
tests/test_resources/test_node_selector.py
Normal file
|
|
@ -0,0 +1,56 @@
|
||||||
|
import pytest
|
||||||
|
from circuitforge_core.resources.coordinator.node_selector import select_node
|
||||||
|
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
|
||||||
|
from circuitforge_core.resources.models import GpuInfo
|
||||||
|
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||||
|
|
||||||
|
|
||||||
|
def _make_agent(node_id: str, free_mb: int, online: bool = True) -> AgentRecord:
    """Build a minimal single-GPU AgentRecord fixture for selector tests."""
    record = AgentRecord(node_id=node_id, agent_url=f"http://{node_id}:7701")
    gpu = GpuInfo(
        gpu_id=0,
        name="RTX",
        vram_total_mb=8192,
        vram_used_mb=8192 - free_mb,
        vram_free_mb=free_mb,
    )
    record.gpus = [gpu]
    record.online = online
    return record
|
||||||
|
|
||||||
|
|
||||||
|
def test_selects_node_with_most_free_vram():
    """With no warm nodes, the GPU with the most free VRAM wins."""
    agents = {
        name: _make_agent(name, free_mb=mb)
        for name, mb in [("a", 2000), ("b", 6000)]
    }
    registry = ProfileRegistry()
    chosen = select_node(agents, "vllm", registry, resident_keys=set())
    assert chosen == ("b", 0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_prefers_warm_node_even_with_less_free_vram():
    """A node with the service already resident beats a colder, roomier one."""
    agents = {
        "a": _make_agent("a", free_mb=2000),
        "b": _make_agent("b", free_mb=6000),
    }
    warm_keys = {"a:vllm"}
    registry = ProfileRegistry()
    assert select_node(agents, "vllm", registry, resident_keys=warm_keys) == ("a", 0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_excludes_offline_nodes():
    """Offline agents never win, regardless of reported free VRAM."""
    offline = _make_agent("a", free_mb=8000, online=False)
    online = _make_agent("b", free_mb=2000, online=True)
    registry = ProfileRegistry()
    result = select_node(
        {"a": offline, "b": online}, "vllm", registry, resident_keys=set()
    )
    assert result == ("b", 0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_returns_none_when_no_node_has_profile_for_service():
    """An unknown service name yields no placement."""
    agents = {"a": _make_agent("a", free_mb=8000)}
    registry = ProfileRegistry()
    outcome = select_node(
        agents, "cf-nonexistent-service", registry, resident_keys=set()
    )
    assert outcome is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_returns_none_when_no_agents():
    """An empty fleet yields no placement."""
    assert select_node({}, "vllm", ProfileRegistry(), resident_keys=set()) is None
|
||||||
Loading…
Reference in a new issue