fix(orch): tighten VRAM pre-flight to require full max_mb free (not half)

max_mb // 2 was too loose — Qwen2.5-3B needs ~5.9 GB on an 8 GB card
but the threshold only required 3.25 GB free, allowing Ollama to hold
4.5 GB while a load attempt was still dispatched (causing OOM crash).

- node_selector: can_fit = free_mb >= service_max_mb (was // 2)
- coordinator /start: same threshold fix + updated error message
- tests: two new node_selector tests pin the full-ceiling semantics;
  updated stale docstring in coordinator app test
This commit is contained in:
pyr0ball 2026-04-02 16:44:36 -07:00
parent 2d095f0090
commit bd132851ec
4 changed files with 33 additions and 7 deletions

View file

@@ -227,12 +227,12 @@ def create_coordinator_app(
service_max_mb = svc.max_mb
break
# Filter candidates by VRAM headroom — skip models where free VRAM
# is less than half of the service's max_mb ceiling.
if service_max_mb > 0 and free_mb < service_max_mb // 2:
# Filter candidates by VRAM headroom — require free VRAM >= service ceiling
# so the model can actually load without competing for VRAM with other processes.
if service_max_mb > 0 and free_mb < service_max_mb:
raise HTTPException(
503,
detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need at least {service_max_mb // 2}MB",
detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need {service_max_mb}MB",
)
last_error: str = ""

View file

@@ -42,7 +42,7 @@ def select_node(
for gpu in record.gpus:
warm = f"{node_id}:{service}" in resident_keys
effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
can_fit = gpu.vram_free_mb >= service_max_mb // 2
can_fit = gpu.vram_free_mb >= service_max_mb
candidates.append(_Scored(
node_id=node_id,
gpu_id=gpu.gpu_id,

View file

@@ -149,8 +149,8 @@ def test_single_gpu_8gb_profile_has_idle_stop_after_s():
def test_ensure_service_returns_503_when_vram_too_low():
"""VRAM pre-flight guard fires before any HTTP request when free VRAM < max_mb // 2."""
# vllm max_mb = 5120 → threshold = 2560 MB; 100 MB free triggers 503.
"""VRAM pre-flight guard fires before any HTTP request when free VRAM < service max_mb."""
# Threshold = full max_mb (not half); 100 MB free on any profile triggers 503.
lease_manager = LeaseManager()
lease_manager.register_gpu("low-vram-node", 0, 512)
profile_registry = ProfileRegistry()

View file

@@ -54,3 +54,29 @@ def test_returns_none_when_no_agents():
registry = ProfileRegistry()
result = select_node({}, "vllm", registry, resident_keys=set())
assert result is None
def test_prefers_node_that_fully_fits_service_over_one_that_does_not():
"""can_fit requires free_mb >= service max_mb (full ceiling, not half).
9500 MB guarantees above all profile ceilings (max is 9000); 1000 MB is below all.
"""
agents = {
"a": _make_agent("a", free_mb=1000),
"b": _make_agent("b", free_mb=9500),
}
registry = ProfileRegistry()
result = select_node(agents, "vllm", registry, resident_keys=set())
# "b" is the only node in the preferred (can_fit) pool
assert result == ("b", 0)
def test_falls_back_to_best_effort_when_no_node_fully_fits():
"""When nothing can_fit, select_node returns the best-VRAM node as fallback."""
agents = {
"a": _make_agent("a", free_mb=1000),
"b": _make_agent("b", free_mb=2000),
}
registry = ProfileRegistry()
# Neither has enough free VRAM; fallback picks highest effective_free_mb
result = select_node(agents, "vllm", registry, resident_keys=set())
assert result == ("b", 0)