refactor(orch): hoist service_max_mb lookup; clarify warm-fallback comments
This commit is contained in:
parent
13eb0c85f1
commit
d600fb6651
1 changed file with 12 additions and 8 deletions
|
|
@ -10,7 +10,7 @@ if TYPE_CHECKING:
|
||||||
_WARM_BONUS_MB = 1000  # MB added to a GPU's vram_free_mb when the node is warm, so warmth wins ties in scoring
|
_WARM_BONUS_MB = 1000  # MB added to a GPU's vram_free_mb when the node is warm, so warmth wins ties in scoring
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass(frozen=True)
|
||||||
class _Scored:
|
class _Scored:
|
||||||
node_id: str
|
node_id: str
|
||||||
gpu_id: int
|
gpu_id: int
|
||||||
|
|
@ -31,13 +31,14 @@ def select_node(
|
||||||
Warm nodes (service already running) get priority, then sorted by free VRAM.
|
Warm nodes (service already running) get priority, then sorted by free VRAM.
|
||||||
Returns None if no suitable node exists.
|
Returns None if no suitable node exists.
|
||||||
"""
|
"""
|
||||||
|
service_max_mb = _find_service_max_mb(service, profile_registry)
|
||||||
|
if service_max_mb is None:
|
||||||
|
return None # service not in any profile
|
||||||
|
|
||||||
candidates: list[_Scored] = []
|
candidates: list[_Scored] = []
|
||||||
for node_id, record in agents.items():
|
for node_id, record in agents.items():
|
||||||
if not record.online:
|
if not record.online:
|
||||||
continue
|
continue
|
||||||
service_max_mb = _find_service_max_mb(service, profile_registry)
|
|
||||||
if service_max_mb is None:
|
|
||||||
continue
|
|
||||||
for gpu in record.gpus:
|
for gpu in record.gpus:
|
||||||
warm = f"{node_id}:{service}" in resident_keys
|
warm = f"{node_id}:{service}" in resident_keys
|
||||||
effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
|
effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
|
||||||
|
|
@ -52,12 +53,15 @@ def select_node(
|
||||||
))
|
))
|
||||||
if not candidates:
|
if not candidates:
|
||||||
return None
|
return None
|
||||||
# Warm nodes are always eligible (they already have the service resident).
|
# Prefer: (1) warm nodes (model already resident — no cold start)
|
||||||
# Cold nodes must pass the can_fit threshold. If no node passes either
|
# (2) cold nodes that can fit the service (free >= half of max_mb)
|
||||||
# criterion, fall back to the full candidate set.
|
# Fallback: best-effort node when nothing fits and nothing is warm
|
||||||
|
# (coordinator will attempt to start the service anyway; it may evict or fail)
|
||||||
|
# Note: resident_keys are per-node, not per-GPU. On multi-GPU nodes, the warm
|
||||||
|
# bonus applies to all GPUs on the node. This is a known coarseness —
|
||||||
|
# per-GPU resident tracking requires a resident_key format change.
|
||||||
preferred = [c for c in candidates if c.warm or c.can_fit]
|
preferred = [c for c in candidates if c.warm or c.can_fit]
|
||||||
pool = preferred if preferred else candidates
|
pool = preferred if preferred else candidates
|
||||||
# Warm nodes take priority; within the same warmth tier, prefer more free VRAM.
|
|
||||||
best = max(pool, key=lambda c: (c.warm, c.effective_free_mb))
|
best = max(pool, key=lambda c: (c.warm, c.effective_free_mb))
|
||||||
return best.node_id, best.gpu_id
|
return best.node_id, best.gpu_id
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue