diff --git a/circuitforge_core/resources/coordinator/app.py b/circuitforge_core/resources/coordinator/app.py
index c51f32d..b893652 100644
--- a/circuitforge_core/resources/coordinator/app.py
+++ b/circuitforge_core/resources/coordinator/app.py
@@ -227,12 +227,12 @@ def create_coordinator_app(
             service_max_mb = svc.max_mb
             break
 
-    # Filter candidates by VRAM headroom — skip models where free VRAM
-    # is less than half of the service's max_mb ceiling.
-    if service_max_mb > 0 and free_mb < service_max_mb // 2:
+    # Filter candidates by VRAM headroom — require free VRAM >= service ceiling
+    # so the model can actually load without competing for VRAM with other processes.
+    if service_max_mb > 0 and free_mb < service_max_mb:
         raise HTTPException(
             503,
-            detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need at least {service_max_mb // 2}MB",
+            detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need {service_max_mb}MB",
         )
 
     last_error: str = ""
diff --git a/circuitforge_core/resources/coordinator/node_selector.py b/circuitforge_core/resources/coordinator/node_selector.py
index 9cdb9f4..52ab224 100644
--- a/circuitforge_core/resources/coordinator/node_selector.py
+++ b/circuitforge_core/resources/coordinator/node_selector.py
@@ -42,7 +42,7 @@ def select_node(
     for gpu in record.gpus:
         warm = f"{node_id}:{service}" in resident_keys
         effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
-        can_fit = gpu.vram_free_mb >= service_max_mb // 2
+        can_fit = gpu.vram_free_mb >= service_max_mb
         candidates.append(_Scored(
             node_id=node_id,
             gpu_id=gpu.gpu_id,
diff --git a/tests/test_resources/test_coordinator_app.py b/tests/test_resources/test_coordinator_app.py
index 49eacbf..598ea50 100644
--- a/tests/test_resources/test_coordinator_app.py
+++ b/tests/test_resources/test_coordinator_app.py
@@ -149,8 +149,8 @@ def test_single_gpu_8gb_profile_has_idle_stop_after_s():
 
 
 def test_ensure_service_returns_503_when_vram_too_low():
-    """VRAM pre-flight guard fires before any HTTP request when free VRAM < max_mb // 2."""
-    # vllm max_mb = 5120 → threshold = 2560 MB; 100 MB free triggers 503.
+    """VRAM pre-flight guard fires before any HTTP request when free VRAM < service max_mb."""
+    # Threshold = full max_mb (not half); 100 MB free on any profile triggers 503.
     lease_manager = LeaseManager()
     lease_manager.register_gpu("low-vram-node", 0, 512)
     profile_registry = ProfileRegistry()
diff --git a/tests/test_resources/test_node_selector.py b/tests/test_resources/test_node_selector.py
index 9e18a3a..50b500e 100644
--- a/tests/test_resources/test_node_selector.py
+++ b/tests/test_resources/test_node_selector.py
@@ -54,3 +54,29 @@ def test_returns_none_when_no_agents():
     registry = ProfileRegistry()
     result = select_node({}, "vllm", registry, resident_keys=set())
     assert result is None
+
+
+def test_prefers_node_that_fully_fits_service_over_one_that_does_not():
+    """can_fit requires free_mb >= service max_mb (full ceiling, not half).
+    9500 MB is above every profile ceiling (the largest is 9000); 1000 MB is below all of them.
+    """
+    agents = {
+        "a": _make_agent("a", free_mb=1000),
+        "b": _make_agent("b", free_mb=9500),
+    }
+    registry = ProfileRegistry()
+    result = select_node(agents, "vllm", registry, resident_keys=set())
+    # "b" is the only node in the preferred (can_fit) pool
+    assert result == ("b", 0)
+
+
+def test_falls_back_to_best_effort_when_no_node_fully_fits():
+    """When nothing can_fit, select_node returns the best-VRAM node as fallback."""
+    agents = {
+        "a": _make_agent("a", free_mb=1000),
+        "b": _make_agent("b", free_mb=2000),
+    }
+    registry = ProfileRegistry()
+    # Neither has enough free VRAM; fallback picks highest effective_free_mb
+    result = select_node(agents, "vllm", registry, resident_keys=set())
+    assert result == ("b", 0)
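
Reviewer note: the two new tests pin down a two-tier selection policy: prefer GPUs whose free VRAM covers the full service ceiling, and only fall back to the highest-effective-VRAM candidate when none does. Below is a minimal, self-contained sketch of that policy as the tests describe it; Scored and pick are hypothetical stand-ins for the repo's _Scored records and the scoring logic inside select_node, not its actual code.

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class Scored:
        node_id: str
        gpu_id: int
        can_fit: bool            # free VRAM >= full service ceiling (the new rule)
        effective_free_mb: int   # free VRAM plus any warm-residency bonus

    def pick(candidates: list[Scored]) -> Optional[tuple[str, int]]:
        """Prefer fully-fitting GPUs; otherwise best-effort by effective free VRAM."""
        if not candidates:
            return None
        # Tier 1: only candidates that fully fit; Tier 2: everyone, as fallback.
        pool = [c for c in candidates if c.can_fit] or candidates
        best = max(pool, key=lambda c: c.effective_free_mb)
        return (best.node_id, best.gpu_id)

    # Mirrors the two new tests: "b" wins the preferred (can_fit) pool when it
    # fully fits, and wins the best-effort fallback when nothing fits.
    assert pick([Scored("a", 0, False, 1000), Scored("b", 0, True, 9500)]) == ("b", 0)
    assert pick([Scored("a", 0, False, 1000), Scored("b", 0, False, 2000)]) == ("b", 0)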