From f741e6a80bf1d20f3f83ea7e69dad83f1070f380 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Thu, 2 Apr 2026 11:45:48 -0700 Subject: [PATCH] fix(orch): hoist service-known check; capture resident_keys once in allocate --- circuitforge_core/resources/coordinator/app.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/circuitforge_core/resources/coordinator/app.py b/circuitforge_core/resources/coordinator/app.py index 3bc6eb5..6f4e66e 100644 --- a/circuitforge_core/resources/coordinator/app.py +++ b/circuitforge_core/resources/coordinator/app.py @@ -265,17 +265,15 @@ def create_coordinator_app( if not req.model_candidates: raise HTTPException(422, detail="model_candidates must be non-empty") - if req.gpu_id is None: - # Validate the service is known before attempting node selection. - known = any( - service in p.services - for p in profile_registry.list_public() - ) - if not known: - raise HTTPException(422, detail=f"Unknown service {service!r} — not in any profile") + # Validate service is known in at least one profile, regardless of gpu_id + if not any(service in p.services for p in profile_registry.list_public()): + raise HTTPException(422, detail=f"Unknown service {service!r} — not in any profile") + residents = lease_manager.resident_keys() + + if req.gpu_id is None: online = agent_supervisor.online_agents() - placement = select_node(online, service, profile_registry, lease_manager.resident_keys()) + placement = select_node(online, service, profile_registry, residents) if placement is None: raise HTTPException( 503, @@ -297,7 +295,7 @@ def create_coordinator_app( if node_info is None: raise HTTPException(422, detail=f"Node {node_id!r} not found") - warm = f"{node_id}:{service}" in lease_manager.resident_keys() + warm = f"{node_id}:{service}" in residents async with httpx.AsyncClient(timeout=120.0) as client: last_error = ""