fix(orch): tighten VRAM pre-flight to require full max_mb free (not half)
max_mb // 2 was too loose — Qwen2.5-3B needs ~5.9 GB on an 8 GB card, but the threshold only required 3.25 GB free, allowing Ollama to hold 4.5 GB while a load attempt was still dispatched (causing an OOM crash).

- node_selector: can_fit = free_mb >= service_max_mb (was // 2)
- coordinator /start: same threshold fix + updated error message
- tests: two new node_selector tests pin the full-ceiling semantics; updated stale docstring in coordinator app test
This commit is contained in:
parent
2d095f0090
commit
bd132851ec
4 changed files with 33 additions and 7 deletions
|
|
@ -227,12 +227,12 @@ def create_coordinator_app(
|
|||
service_max_mb = svc.max_mb
|
||||
break
|
||||
|
||||
# Filter candidates by VRAM headroom — skip models where free VRAM
|
||||
# is less than half of the service's max_mb ceiling.
|
||||
if service_max_mb > 0 and free_mb < service_max_mb // 2:
|
||||
# Filter candidates by VRAM headroom — require free VRAM >= service ceiling
|
||||
# so the model can actually load without competing for VRAM with other processes.
|
||||
if service_max_mb > 0 and free_mb < service_max_mb:
|
||||
raise HTTPException(
|
||||
503,
|
||||
detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need at least {service_max_mb // 2}MB",
|
||||
detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need {service_max_mb}MB",
|
||||
)
|
||||
|
||||
last_error: str = ""
|
||||
|
|
|
|||
|
|
@ -42,7 +42,7 @@ def select_node(
|
|||
for gpu in record.gpus:
|
||||
warm = f"{node_id}:{service}" in resident_keys
|
||||
effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
|
||||
can_fit = gpu.vram_free_mb >= service_max_mb // 2
|
||||
can_fit = gpu.vram_free_mb >= service_max_mb
|
||||
candidates.append(_Scored(
|
||||
node_id=node_id,
|
||||
gpu_id=gpu.gpu_id,
|
||||
|
|
|
|||
|
|
@ -149,8 +149,8 @@ def test_single_gpu_8gb_profile_has_idle_stop_after_s():
|
|||
|
||||
|
||||
def test_ensure_service_returns_503_when_vram_too_low():
|
||||
"""VRAM pre-flight guard fires before any HTTP request when free VRAM < max_mb // 2."""
|
||||
# vllm max_mb = 5120 → threshold = 2560 MB; 100 MB free triggers 503.
|
||||
"""VRAM pre-flight guard fires before any HTTP request when free VRAM < service max_mb."""
|
||||
# Threshold = full max_mb (not half); 100 MB free on any profile triggers 503.
|
||||
lease_manager = LeaseManager()
|
||||
lease_manager.register_gpu("low-vram-node", 0, 512)
|
||||
profile_registry = ProfileRegistry()
|
||||
|
|
|
|||
|
|
@ -54,3 +54,29 @@ def test_returns_none_when_no_agents():
|
|||
registry = ProfileRegistry()
|
||||
result = select_node({}, "vllm", registry, resident_keys=set())
|
||||
assert result is None
|
||||
|
||||
|
||||
def test_prefers_node_that_fully_fits_service_over_one_that_does_not():
    """can_fit requires free_mb >= service max_mb (full ceiling, not half).

    9500 MB guarantees above all profile ceilings (max is 9000); 1000 MB is below all.
    """
    # One node below every profile ceiling, one comfortably above all of them.
    fleet = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=9500),
    }
    profiles = ProfileRegistry()

    chosen = select_node(fleet, "vllm", profiles, resident_keys=set())

    # "b" is the only node in the preferred (can_fit) pool
    assert chosen == ("b", 0)
|
||||
|
||||
|
||||
def test_falls_back_to_best_effort_when_no_node_fully_fits():
    """When nothing can_fit, select_node returns the best-VRAM node as fallback."""
    # Both nodes sit below the service's max_mb ceiling, so can_fit is False
    # everywhere and the best-effort path must kick in.
    fleet = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=2000),
    }
    profiles = ProfileRegistry()

    # Neither has enough free VRAM; fallback picks highest effective_free_mb
    chosen = select_node(fleet, "vllm", profiles, resident_keys=set())
    assert chosen == ("b", 0)
|
||||
|
|
|
|||
Loading…
Reference in a new issue