feat(orch): background health probe loop — starting → running transition

Coordinator now polls all 'starting' instances every 5 s via GET /health. On 200: state → running. After 300 s without a healthy response: state → stopped. Closes #10.
2026-04-02 17:18:16 -07:00 · 2026-04-02 17:18:16 -07:00 · a7290c1240
commit a7290c1240
parent bd132851ec
1 changed files with 57 additions and 2 deletions
--- a/circuitforge_core/resources/coordinator/app.py
+++ b/circuitforge_core/resources/coordinator/app.py
@ -1,9 +1,14 @@
 from __future__ import annotations

+import logging
+import time
+import urllib.request
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Any

+logger = logging.getLogger(__name__)
+
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import HTMLResponse
 from pydantic import BaseModel
@ -17,6 +22,54 @@ from circuitforge_core.resources.coordinator.service_registry import ServiceRegi

 _DASHBOARD_HTML = (Path(__file__).parent / "dashboard.html").read_text()

+_PROBE_INTERVAL_S = 5.0    # how often to poll starting instances
+_PROBE_TIMEOUT_S = 300.0   # give up and mark stopped after this many seconds
+
+
+async def _run_instance_probe_loop(service_registry: ServiceRegistry) -> None:
+    """
+    Background loop: transition 'starting' instances to 'running' once their
+    /health endpoint responds, or to 'stopped' after PROBE_TIMEOUT_S.
+    """
+    import asyncio
+
+    start_times: dict[str, float] = {}  # instance key → time first seen as starting
+
+    while True:
+        await asyncio.sleep(_PROBE_INTERVAL_S)
+        now = time.time()
+        for inst in service_registry.all_instances():
+            if inst.state != "starting":
+                start_times.pop(f"{inst.service}:{inst.node_id}:{inst.gpu_id}", None)
+                continue
+            key = f"{inst.service}:{inst.node_id}:{inst.gpu_id}"
+            start_times.setdefault(key, now)
+
+            healthy = False
+            if inst.url:
+                try:
+                    with urllib.request.urlopen(
+                        inst.url.rstrip("/") + "/health", timeout=2.0
+                    ) as resp:
+                        healthy = resp.status == 200
+                except Exception:
+                    pass
+
+            if healthy:
+                service_registry.upsert_instance(
+                    service=inst.service, node_id=inst.node_id, gpu_id=inst.gpu_id,
+                    state="running", model=inst.model, url=inst.url,
+                )
+                start_times.pop(key, None)
+                logger.info("Instance %s/%s gpu=%s transitioned to running", inst.service, inst.node_id, inst.gpu_id)
+            elif now - start_times[key] > _PROBE_TIMEOUT_S:
+                service_registry.upsert_instance(
+                    service=inst.service, node_id=inst.node_id, gpu_id=inst.gpu_id,
+                    state="stopped", model=inst.model, url=inst.url,
+                )
+                start_times.pop(key, None)
+                logger.warning("Instance %s/%s gpu=%s timed out in starting state — marked stopped", inst.service, inst.node_id, inst.gpu_id)
+

 class LeaseRequest(BaseModel):
    node_id: str
@ -61,10 +114,12 @@ def create_coordinator_app(
    @asynccontextmanager
    async def _lifespan(app: FastAPI):  # type: ignore[type-arg]
        import asyncio
-        task = asyncio.create_task(agent_supervisor.run_heartbeat_loop())
+        heartbeat_task = asyncio.create_task(agent_supervisor.run_heartbeat_loop())
+        probe_task = asyncio.create_task(_run_instance_probe_loop(service_registry))
        yield
        agent_supervisor.stop()
-        task.cancel()
+        heartbeat_task.cancel()
+        probe_task.cancel()

    app = FastAPI(title="cf-orch-coordinator", lifespan=_lifespan)