refactor(orch): hoist service_max_mb lookup; freeze _Scored dataclass; clarify warm-fallback comments

This commit is contained in:
pyr0ball 2026-04-02 11:21:20 -07:00
parent 13eb0c85f1
commit d600fb6651

View file

@ -10,7 +10,7 @@ if TYPE_CHECKING:
_WARM_BONUS_MB = 1000 _WARM_BONUS_MB = 1000
@dataclass @dataclass(frozen=True)
class _Scored: class _Scored:
node_id: str node_id: str
gpu_id: int gpu_id: int
@ -31,13 +31,14 @@ def select_node(
Warm nodes (service already running) get priority, then sorted by free VRAM. Warm nodes (service already running) get priority, then sorted by free VRAM.
Returns None if no suitable node exists. Returns None if no suitable node exists.
""" """
service_max_mb = _find_service_max_mb(service, profile_registry)
if service_max_mb is None:
return None # service not in any profile
candidates: list[_Scored] = [] candidates: list[_Scored] = []
for node_id, record in agents.items(): for node_id, record in agents.items():
if not record.online: if not record.online:
continue continue
service_max_mb = _find_service_max_mb(service, profile_registry)
if service_max_mb is None:
continue
for gpu in record.gpus: for gpu in record.gpus:
warm = f"{node_id}:{service}" in resident_keys warm = f"{node_id}:{service}" in resident_keys
effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0) effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
@ -52,12 +53,15 @@ def select_node(
)) ))
if not candidates: if not candidates:
return None return None
# Warm nodes are always eligible (they already have the service resident). # Prefer: (1) warm nodes (model already resident — no cold start)
# Cold nodes must pass the can_fit threshold. If no node passes either # (2) cold nodes that can fit the service (free >= half of max_mb)
# criterion, fall back to the full candidate set. # Fallback: best-effort node when nothing fits and nothing is warm
# (coordinator will attempt to start the service anyway; it may evict or fail)
# Note: resident_keys are per-node, not per-GPU. On multi-GPU nodes, the warm
# bonus applies to all GPUs on the node. This is a known coarseness —
# per-GPU resident tracking requires a resident_key format change.
preferred = [c for c in candidates if c.warm or c.can_fit] preferred = [c for c in candidates if c.warm or c.can_fit]
pool = preferred if preferred else candidates pool = preferred if preferred else candidates
# Warm nodes take priority; within the same warmth tier, prefer more free VRAM.
best = max(pool, key=lambda c: (c.warm, c.effective_free_mb)) best = max(pool, key=lambda c: (c.warm, c.effective_free_mb))
return best.node_id, best.gpu_id return best.node_id, best.gpu_id