fix(orch): tighten VRAM pre-flight to require full max_mb free (not half)
max_mb // 2 was too loose — Qwen2.5-3B needs ~5.9 GB on an 8 GB card, but the threshold only required 3.25 GB free, allowing Ollama to hold 4.5 GB while a load attempt was still dispatched (causing an OOM crash).

- node_selector: can_fit = free_mb >= service_max_mb (was // 2)
- coordinator /start: same threshold fix + updated error message
- tests: two new node_selector tests pin the full-ceiling semantics; updated stale docstring in coordinator app test
This commit is contained in:
parent
2d095f0090
commit
bd132851ec
4 changed files with 33 additions and 7 deletions
|
|
@ -227,12 +227,12 @@ def create_coordinator_app(
|
|||
service_max_mb = svc.max_mb
|
||||
break
|
||||
|
||||
# Filter candidates by VRAM headroom — skip models where free VRAM
|
||||
# is less than half of the service's max_mb ceiling.
|
||||
if service_max_mb > 0 and free_mb < service_max_mb // 2:
|
||||
# Filter candidates by VRAM headroom — require free VRAM >= service ceiling
|
||||
# so the model can actually load without competing for VRAM with other processes.
|
||||
if service_max_mb > 0 and free_mb < service_max_mb:
|
||||
raise HTTPException(
|
||||
503,
|
||||
detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need at least {service_max_mb // 2}MB",
|
||||
detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need {service_max_mb}MB",
|
||||
)
|
||||
|
||||
last_error: str = ""
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ def select_node(
|
|||
for gpu in record.gpus:
|
||||
warm = f"{node_id}:{service}" in resident_keys
|
||||
effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
|
||||
can_fit = gpu.vram_free_mb >= service_max_mb // 2
|
||||
can_fit = gpu.vram_free_mb >= service_max_mb
|
||||
candidates.append(_Scored(
|
||||
node_id=node_id,
|
||||
gpu_id=gpu.gpu_id,
|
||||
|
|
|
|||
|
|
@ -149,8 +149,8 @@ def test_single_gpu_8gb_profile_has_idle_stop_after_s():
|
|||
|
||||
|
||||
def test_ensure_service_returns_503_when_vram_too_low():
|
||||
"""VRAM pre-flight guard fires before any HTTP request when free VRAM < max_mb // 2."""
|
||||
# vllm max_mb = 5120 → threshold = 2560 MB; 100 MB free triggers 503.
|
||||
"""VRAM pre-flight guard fires before any HTTP request when free VRAM < service max_mb."""
|
||||
# Threshold = full max_mb (not half); 100 MB free on any profile triggers 503.
|
||||
lease_manager = LeaseManager()
|
||||
lease_manager.register_gpu("low-vram-node", 0, 512)
|
||||
profile_registry = ProfileRegistry()
|
||||
|
|
|
|||
|
|
@ -54,3 +54,29 @@ def test_returns_none_when_no_agents():
|
|||
registry = ProfileRegistry()
|
||||
result = select_node({}, "vllm", registry, resident_keys=set())
|
||||
assert result is None
|
||||
|
||||
|
||||
def test_prefers_node_that_fully_fits_service_over_one_that_does_not():
    """can_fit requires free_mb >= service max_mb (full ceiling, not half).

    9500 MB guarantees above all profile ceilings (max is 9000); 1000 MB is below all.
    """
    # One node below every profile ceiling, one comfortably above all of them.
    fleet = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=9500),
    }
    profiles = ProfileRegistry()

    chosen = select_node(fleet, "vllm", profiles, resident_keys=set())

    # "b" is the only node in the preferred (can_fit) pool
    assert chosen == ("b", 0)
|
||||
|
||||
|
||||
def test_falls_back_to_best_effort_when_no_node_fully_fits():
    """When nothing can_fit, select_node returns the best-VRAM node as fallback."""
    # Both nodes sit below the service's max_mb ceiling, so can_fit is False
    # everywhere and the best-effort path must kick in.
    fleet = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=2000),
    }
    profiles = ProfileRegistry()

    # Neither has enough free VRAM; fallback picks highest effective_free_mb
    chosen = select_node(fleet, "vllm", profiles, resident_keys=set())
    assert chosen == ("b", 0)
|
||||
|
|
|
|||
Loading…
Reference in a new issue