fix(orch): hoist service-known check; capture resident_keys once in allocate

This commit is contained in:
pyr0ball 2026-04-02 11:45:48 -07:00
parent defaf39883
commit f741e6a80b

View file

@@ -265,17 +265,15 @@ def create_coordinator_app(
if not req.model_candidates: if not req.model_candidates:
raise HTTPException(422, detail="model_candidates must be non-empty") raise HTTPException(422, detail="model_candidates must be non-empty")
if req.gpu_id is None: # Validate service is known in at least one profile, regardless of gpu_id
# Validate the service is known before attempting node selection. if not any(service in p.services for p in profile_registry.list_public()):
known = any( raise HTTPException(422, detail=f"Unknown service {service!r} — not in any profile")
service in p.services
for p in profile_registry.list_public()
)
if not known:
raise HTTPException(422, detail=f"Unknown service {service!r} — not in any profile")
residents = lease_manager.resident_keys()
if req.gpu_id is None:
online = agent_supervisor.online_agents() online = agent_supervisor.online_agents()
placement = select_node(online, service, profile_registry, lease_manager.resident_keys()) placement = select_node(online, service, profile_registry, residents)
if placement is None: if placement is None:
raise HTTPException( raise HTTPException(
503, 503,
@@ -297,7 +295,7 @@ def create_coordinator_app(
if node_info is None: if node_info is None:
raise HTTPException(422, detail=f"Node {node_id!r} not found") raise HTTPException(422, detail=f"Node {node_id!r} not found")
warm = f"{node_id}:{service}" in lease_manager.resident_keys() warm = f"{node_id}:{service}" in residents
async with httpx.AsyncClient(timeout=120.0) as client: async with httpx.AsyncClient(timeout=120.0) as client:
last_error = "" last_error = ""