feat: cf-orch agent registration + VRAM lease wiring
Merges feature/orch-agent-registration into main: agent self-registration and a coordinator heartbeat loop; TaskScheduler now acquires/releases a cf-orch VRAM lease per batch worker; shutdown() joins batch worker threads for clean teardown. 94 tests passing.
This commit is contained in:
commit
427182aae7
3 changed files with 159 additions and 8 deletions
|
|
@ -32,9 +32,14 @@ def start(
|
|||
profile: Annotated[Optional[Path], typer.Option(help="Profile YAML path")] = None,
|
||||
host: str = "0.0.0.0",
|
||||
port: int = 7700,
|
||||
node_id: str = "local",
|
||||
agent_port: int = 7701,
|
||||
) -> None:
|
||||
"""Start the cf-orch coordinator (auto-detects GPU profile if not specified)."""
|
||||
"""Start the cf-orch coordinator (auto-detects GPU profile if not specified).
|
||||
|
||||
Automatically pre-registers the local agent so its GPUs appear on the
|
||||
dashboard immediately. Remote nodes self-register via POST /api/nodes.
|
||||
"""
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
||||
|
|
@ -52,8 +57,6 @@ def start(
|
|||
"Warning: no GPUs detected via nvidia-smi — coordinator running with 0 VRAM"
|
||||
)
|
||||
else:
|
||||
for gpu in gpus:
|
||||
lease_manager.register_gpu("local", gpu.gpu_id, gpu.vram_total_mb)
|
||||
typer.echo(f"Detected {len(gpus)} GPU(s)")
|
||||
|
||||
if profile:
|
||||
|
|
@ -67,6 +70,11 @@ def start(
|
|||
)
|
||||
typer.echo(f"Auto-selected profile: {active_profile.name}")
|
||||
|
||||
# Pre-register the local agent — the heartbeat loop will poll it for live GPU data.
|
||||
local_agent_url = f"http://127.0.0.1:{agent_port}"
|
||||
supervisor.register(node_id, local_agent_url)
|
||||
typer.echo(f"Registered local node '{node_id}' → {local_agent_url}")
|
||||
|
||||
coordinator_app = create_coordinator_app(
|
||||
lease_manager=lease_manager,
|
||||
profile_registry=profile_registry,
|
||||
|
|
@ -83,10 +91,47 @@ def agent(
|
|||
node_id: str = "local",
|
||||
host: str = "0.0.0.0",
|
||||
port: int = 7701,
|
||||
advertise_host: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Start a cf-orch node agent (for remote nodes like Navi, Huginn)."""
|
||||
"""Start a cf-orch node agent and self-register with the coordinator.
|
||||
|
||||
The agent starts its HTTP server, then POSTs its URL to the coordinator
|
||||
so it appears on the dashboard without manual configuration.
|
||||
|
||||
Use --advertise-host to override the IP the coordinator should use to
|
||||
reach this agent (e.g. on a multi-homed or NATted host).
|
||||
"""
|
||||
import asyncio
|
||||
import threading
|
||||
import httpx
|
||||
from circuitforge_core.resources.agent.app import create_agent_app
|
||||
|
||||
# The URL the coordinator should use to reach this agent.
|
||||
reach_host = advertise_host or ("127.0.0.1" if host in ("0.0.0.0", "::") else host)
|
||||
agent_url = f"http://{reach_host}:{port}"
|
||||
|
||||
def _register_in_background() -> None:
    """POST registration to coordinator after a short delay (uvicorn needs ~1s to bind)."""
    import time
    # Fixed delay so uvicorn can bind its socket before we advertise this agent's URL.
    time.sleep(2.0)
    try:
        # NOTE(review): `coordinator` is a closure variable from the enclosing agent()
        # command — its parameter list is not fully visible here; confirm it is the
        # coordinator base URL (e.g. "http://host:7700").
        resp = httpx.post(
            f"{coordinator}/api/nodes",
            json={"node_id": node_id, "agent_url": agent_url},
            timeout=5.0,
        )
        if resp.is_success:
            typer.echo(f"Registered with coordinator at {coordinator} as '{node_id}'")
        else:
            # Non-2xx is non-fatal: the agent still serves requests; it just will
            # not appear on the dashboard until registration succeeds.
            typer.echo(
                f"Warning: coordinator registration returned {resp.status_code}", err=True
            )
    except Exception as exc:
        # Best-effort: coordinator may be down or unreachable; warn and continue.
        typer.echo(f"Warning: could not reach coordinator at {coordinator}: {exc}", err=True)
|
||||
|
||||
# Fire registration in a daemon thread so uvicorn.run() can start blocking immediately.
|
||||
threading.Thread(target=_register_in_background, daemon=True).start()
|
||||
|
||||
agent_app = create_agent_app(node_id=node_id)
|
||||
typer.echo(f"Starting cf-orch agent [{node_id}] on {host}:{port}")
|
||||
uvicorn.run(agent_app, host=host, port=port)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
|
@ -7,13 +8,13 @@ from fastapi import FastAPI, HTTPException
|
|||
from fastapi.responses import HTMLResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
_DASHBOARD_HTML = (Path(__file__).parent / "dashboard.html").read_text()
|
||||
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
||||
from circuitforge_core.resources.coordinator.eviction_engine import EvictionEngine
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
|
||||
_DASHBOARD_HTML = (Path(__file__).parent / "dashboard.html").read_text()
|
||||
|
||||
|
||||
class LeaseRequest(BaseModel):
|
||||
node_id: str
|
||||
|
|
@ -24,6 +25,11 @@ class LeaseRequest(BaseModel):
|
|||
ttl_s: float = 0.0
|
||||
|
||||
|
||||
class NodeRegisterRequest(BaseModel):
    """Request body for POST /api/nodes: an agent announcing itself to the coordinator."""

    node_id: str  # stable identifier the coordinator keys this node by (e.g. "local")
    agent_url: str  # e.g. "http://10.1.10.71:7701"
|
||||
|
||||
|
||||
def create_coordinator_app(
|
||||
lease_manager: LeaseManager,
|
||||
profile_registry: ProfileRegistry,
|
||||
|
|
@ -31,7 +37,15 @@ def create_coordinator_app(
|
|||
) -> FastAPI:
|
||||
eviction_engine = EvictionEngine(lease_manager=lease_manager)
|
||||
|
||||
app = FastAPI(title="cf-orch-coordinator")
|
||||
@asynccontextmanager
|
||||
async def _lifespan(app: FastAPI): # type: ignore[type-arg]
|
||||
import asyncio
|
||||
task = asyncio.create_task(agent_supervisor.run_heartbeat_loop())
|
||||
yield
|
||||
agent_supervisor.stop()
|
||||
task.cancel()
|
||||
|
||||
app = FastAPI(title="cf-orch-coordinator", lifespan=_lifespan)
|
||||
|
||||
@app.get("/", response_class=HTMLResponse, include_in_schema=False)
|
||||
def dashboard() -> HTMLResponse:
|
||||
|
|
@ -65,6 +79,13 @@ def create_coordinator_app(
|
|||
]
|
||||
}
|
||||
|
||||
@app.post("/api/nodes")
async def register_node(req: NodeRegisterRequest) -> dict[str, Any]:
    """Agents call this to self-register. Coordinator immediately polls for GPU info."""
    # Record the agent URL so the supervisor's heartbeat loop includes this node.
    agent_supervisor.register(req.node_id, req.agent_url)
    # Poll right away so the node's GPUs show up without waiting for the next heartbeat.
    await agent_supervisor.poll_agent(req.node_id)
    return {"registered": True, "node_id": req.node_id}
|
||||
|
||||
@app.get("/api/profiles")
|
||||
def get_profiles() -> dict[str, Any]:
|
||||
return {
|
||||
|
|
|
|||
|
|
@ -127,6 +127,9 @@ class TaskScheduler:
|
|||
vram_budgets: dict[str, float],
|
||||
available_vram_gb: Optional[float] = None,
|
||||
max_queue_depth: int = _DEFAULT_MAX_QUEUE_DEPTH,
|
||||
coordinator_url: str = "http://localhost:7700",
|
||||
service_name: str = "peregrine",
|
||||
lease_priority: int = 2,
|
||||
) -> None:
|
||||
self._db_path = db_path
|
||||
self._run_task = run_task_fn
|
||||
|
|
@ -134,6 +137,10 @@ class TaskScheduler:
|
|||
self._budgets: dict[str, float] = dict(vram_budgets)
|
||||
self._max_queue_depth = max_queue_depth
|
||||
|
||||
self._coordinator_url = coordinator_url.rstrip("/")
|
||||
self._service_name = service_name
|
||||
self._lease_priority = lease_priority
|
||||
|
||||
self._lock = threading.Lock()
|
||||
self._wake = threading.Event()
|
||||
self._stop = threading.Event()
|
||||
|
|
@ -196,11 +203,21 @@ class TaskScheduler:
|
|||
self._wake.set()
|
||||
|
||||
def shutdown(self, timeout: float = 5.0) -> None:
|
||||
"""Signal the scheduler to stop and wait for it to exit."""
|
||||
"""Signal the scheduler to stop and wait for it to exit.
|
||||
|
||||
Joins both the scheduler loop thread and any active batch worker
|
||||
threads so callers can rely on clean state (e.g. _reserved_vram == 0)
|
||||
immediately after this returns.
|
||||
"""
|
||||
self._stop.set()
|
||||
self._wake.set()
|
||||
if self._thread and self._thread.is_alive():
|
||||
self._thread.join(timeout=timeout)
|
||||
# Join active batch workers so _reserved_vram is settled on return
|
||||
with self._lock:
|
||||
workers = list(self._active.values())
|
||||
for worker in workers:
|
||||
worker.join(timeout=timeout)
|
||||
|
||||
def _scheduler_loop(self) -> None:
|
||||
while not self._stop.is_set():
|
||||
|
|
@ -240,8 +257,70 @@ class TaskScheduler:
|
|||
self._reserved_vram += budget
|
||||
thread.start()
|
||||
|
||||
def _acquire_lease(self, task_type: str) -> Optional[str]:
    """Request a VRAM lease from the coordinator.

    Best-effort: returns the lease_id on success, or None when httpx is
    unavailable, the task type has no VRAM budget, the coordinator is
    unreachable, or the lease is refused.
    """
    if httpx is None:
        return None
    budget_gb = self._budgets.get(task_type, 0.0)
    if budget_gb <= 0:
        return None
    requested_mb = int(budget_gb * 1024)
    try:
        # Scan all registered nodes and target the GPU with the most free VRAM.
        nodes_resp = httpx.get(f"{self._coordinator_url}/api/nodes", timeout=2.0)
        if nodes_resp.status_code != 200:
            return None
        nodes = nodes_resp.json().get("nodes", [])
        if not nodes:
            return None
        target: Optional[tuple[str, str]] = None
        target_free: Optional[int] = None
        for node in nodes:
            for gpu in node.get("gpus", []):
                free_mb = gpu.get("vram_free_mb", 0)
                if target_free is None or free_mb > target_free:
                    target = (node["node_id"], gpu["gpu_id"])
                    target_free = free_mb
        if target is None:
            return None
        node_id, gpu_id = target
        lease_resp = httpx.post(
            f"{self._coordinator_url}/api/leases",
            json={
                "node_id": node_id,
                "gpu_id": gpu_id,
                "mb": requested_mb,
                "service": self._service_name,
                "priority": self._lease_priority,
            },
            timeout=3.0,
        )
        if lease_resp.status_code != 200:
            return None
        lease_id = lease_resp.json()["lease"]["lease_id"]
        logger.debug(
            "Acquired VRAM lease %s for task_type=%s (%d MB)",
            lease_id, task_type, requested_mb,
        )
        return lease_id
    except Exception as exc:
        # Leasing is advisory: scheduling proceeds even without a lease.
        logger.debug("Lease acquire failed (non-fatal): %s", exc)
        return None
|
||||
|
||||
def _release_lease(self, lease_id: str) -> None:
    """Release a coordinator VRAM lease. Best-effort; failures are logged only."""
    if not lease_id or httpx is None:
        return
    release_url = f"{self._coordinator_url}/api/leases/{lease_id}"
    try:
        httpx.delete(release_url, timeout=3.0)
    except Exception as exc:
        # The coordinator reaps expired leases anyway, so a failed
        # release is harmless — record it for debugging only.
        logger.debug("Lease release failed (non-fatal): %s", exc)
    else:
        logger.debug("Released VRAM lease %s", lease_id)
|
||||
|
||||
def _batch_worker(self, task_type: str) -> None:
|
||||
"""Serial consumer for one task type. Runs until the type's deque is empty."""
|
||||
lease_id: Optional[str] = self._acquire_lease(task_type)
|
||||
try:
|
||||
while True:
|
||||
with self._lock:
|
||||
|
|
@ -253,6 +332,8 @@ class TaskScheduler:
|
|||
self._db_path, task.id, task_type, task.job_id, task.params
|
||||
)
|
||||
finally:
|
||||
if lease_id:
|
||||
self._release_lease(lease_id)
|
||||
with self._lock:
|
||||
self._active.pop(task_type, None)
|
||||
self._reserved_vram -= self._budgets.get(task_type, 0.0)
|
||||
|
|
@ -298,6 +379,8 @@ def get_scheduler(
|
|||
task_types: Optional[frozenset[str]] = None,
|
||||
vram_budgets: Optional[dict[str, float]] = None,
|
||||
max_queue_depth: int = _DEFAULT_MAX_QUEUE_DEPTH,
|
||||
coordinator_url: str = "http://localhost:7700",
|
||||
service_name: str = "peregrine",
|
||||
) -> TaskScheduler:
|
||||
"""Return the process-level TaskScheduler singleton.
|
||||
|
||||
|
|
@ -324,6 +407,8 @@ def get_scheduler(
|
|||
task_types=task_types,
|
||||
vram_budgets=vram_budgets,
|
||||
max_queue_depth=max_queue_depth,
|
||||
coordinator_url=coordinator_url,
|
||||
service_name=service_name,
|
||||
)
|
||||
candidate.start()
|
||||
with _scheduler_lock:
|
||||
|
|
|
|||
Loading…
Reference in a new issue