feat!: strip resources/ from MIT core — moves to circuitforge-orch (v0.8.0)
BREAKING CHANGE: circuitforge_core.resources is no longer available. Import CFOrchClient from circuitforge_orch.client instead. cf-orch CLI entry point is now in the circuitforge-orch package.
This commit is contained in:
parent
2259382d0b
commit
c244260d1c
63 changed files with 34 additions and 6571 deletions
22
README.md
22
README.md
|
|
@ -2,15 +2,29 @@
|
|||
|
||||
Shared scaffold for CircuitForge products.
|
||||
|
||||
**Current version: 0.8.0**
|
||||
|
||||
## Modules
|
||||
|
||||
### Implemented
|
||||
|
||||
- `circuitforge_core.db` — SQLite connection factory and migration runner
|
||||
- `circuitforge_core.llm` — LLM router with fallback chain
|
||||
- `circuitforge_core.llm` — LLM router with fallback chain (Ollama, vLLM, Anthropic, OpenAI-compatible)
|
||||
- `circuitforge_core.tiers` — Tier system with BYOK and local vision unlocks
|
||||
- `circuitforge_core.config` — Env validation and .env loader
|
||||
- `circuitforge_core.vision` — Vision router stub (v0.2+)
|
||||
- `circuitforge_core.wizard` — First-run wizard base class stub
|
||||
- `circuitforge_core.pipeline` — Staging queue stub (v0.2+)
|
||||
- `circuitforge_core.hardware` — Hardware detection and LLM backend profile generation (VRAM tiers, GPU/CPU auto-select)
|
||||
- `circuitforge_core.documents` — Document ingestion pipeline: PDF, DOCX, and image OCR → `StructuredDocument`
|
||||
- `circuitforge_core.affiliates` — Affiliate URL wrapping with opt-out, BYOK user IDs, and CF env-var fallback (`wrap_url`)
|
||||
- `circuitforge_core.preferences` — User preference store (local YAML file, pluggable backend); dot-path get/set API
|
||||
- `circuitforge_core.tasks` — VRAM-aware LLM task scheduler; shared slot manager across services (`TaskScheduler`)
|
||||
- `circuitforge_core.manage` — Cross-platform product process manager (Docker and native modes)
|
||||
- `circuitforge_core.resources` — moved to the `circuitforge-orch` package in v0.8.0 (import `CFOrchClient` from `circuitforge_orch.client`)
|
||||
|
||||
### Stubs (in-tree, not yet implemented)
|
||||
|
||||
- `circuitforge_core.vision` — Vision router base class (planned: moondream2 / Claude vision dispatch)
|
||||
- `circuitforge_core.wizard` — First-run wizard base class (products subclass `BaseWizard`)
|
||||
- `circuitforge_core.pipeline` — Staging queue base (`StagingDB`; products provide concrete schema)
|
||||
|
||||
## Install
|
||||
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
__version__ = "0.7.0"
|
||||
__version__ = "0.8.0"
|
||||
|
|
|
|||
|
|
@ -56,6 +56,12 @@ def _build_ebay_url(url: str, affiliate_id: str) -> str:
|
|||
return f"{url}{sep}{params}"
|
||||
|
||||
|
||||
def _build_instacart_url(url: str, affiliate_id: str) -> str:
|
||||
"""Append Instacart affiliate parameter to a search URL."""
|
||||
sep = "&" if "?" in url else "?"
|
||||
return f"{url}{sep}aff={affiliate_id}"
|
||||
|
||||
|
||||
def _build_amazon_url(url: str, affiliate_id: str) -> str:
|
||||
"""Merge an Amazon Associates tag into a product URL's query string."""
|
||||
parsed = urlparse(url)
|
||||
|
|
@ -101,3 +107,10 @@ register_program(AffiliateProgram(
|
|||
env_var="AMAZON_ASSOCIATES_TAG",
|
||||
build_url=_build_amazon_url,
|
||||
))
|
||||
|
||||
# Instacart: simple query-parameter program — the affiliate ID comes from
# INSTACART_AFFILIATE_ID and is appended as `aff=<id>` by _build_instacart_url.
register_program(AffiliateProgram(
    name="Instacart",
    retailer_key="instacart",
    env_var="INSTACART_AFFILIATE_ID",
    build_url=_build_instacart_url,
))
|
||||
|
|
|
|||
|
|
@ -1 +0,0 @@
|
|||
from circuitforge_core.resources.client import CFOrchClient, Allocation # noqa: F401
|
||||
|
|
@ -1,105 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
from circuitforge_core.resources.agent.eviction_executor import EvictionExecutor
|
||||
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
|
||||
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EvictRequest(BaseModel):
    """Request body for POST /evict: target PID plus SIGTERM grace window."""

    # PID of the process to evict from the GPU.
    pid: int
    # Seconds to wait for a clean exit after SIGTERM before escalating.
    grace_period_s: float = 5.0
|
||||
|
||||
|
||||
class ServiceStartRequest(BaseModel):
    """Request body for POST /services/{service}/start."""

    # GPU index the service should be started on.
    gpu_id: int = 0
    # Template parameters substituted into the service's command/args template.
    params: dict[str, str] = {}
|
||||
|
||||
|
||||
def create_agent_app(
    node_id: str,
    monitor: GpuMonitor | None = None,
    executor: EvictionExecutor | None = None,
    service_manager: ServiceManager | None = None,
) -> FastAPI:
    """Build the FastAPI app for a cf-orch node agent.

    Routes: /health, /gpu-info, /evict, /resident-info always; the
    /services/* management routes only when a ServiceManager is supplied.

    Args:
        node_id: Identifier reported by /health and embedded in the app title.
        monitor: GPU poller; a default GpuMonitor is created when omitted.
        executor: Eviction backend; a default EvictionExecutor when omitted.
        service_manager: Optional manager enabling the /services routes.
    """
    _monitor = monitor or GpuMonitor()
    _executor = executor or EvictionExecutor()

    app = FastAPI(title=f"cf-orch-agent [{node_id}]")

    @app.get("/health")
    def health() -> dict[str, Any]:
        return {"status": "ok", "node_id": node_id}

    @app.get("/gpu-info")
    def gpu_info() -> dict[str, Any]:
        # Fresh nvidia-smi poll on every request — no caching here.
        gpus = _monitor.poll()
        return {
            "node_id": node_id,
            "gpus": [
                {
                    "gpu_id": g.gpu_id,
                    "name": g.name,
                    "vram_total_mb": g.vram_total_mb,
                    "vram_used_mb": g.vram_used_mb,
                    "vram_free_mb": g.vram_free_mb,
                }
                for g in gpus
            ],
        }

    @app.post("/evict")
    def evict(req: EvictRequest) -> dict[str, Any]:
        result = _executor.evict_pid(pid=req.pid, grace_period_s=req.grace_period_s)
        return {
            "success": result.success,
            "method": result.method,
            "message": result.message,
        }

    @app.get("/resident-info")
    def resident_info() -> dict[str, Any]:
        """Return which models are currently loaded in each running managed service."""
        if service_manager is None:
            return {"residents": []}
        # Imported lazily: probe_all is only needed when a manager exists.
        from circuitforge_core.resources.agent.service_probe import probe_all
        return {"residents": probe_all(service_manager)}

    if service_manager is not None:
        @app.get("/services")
        def list_services() -> dict:
            return {"running": service_manager.list_running()}

        @app.get("/services/{service}")
        def service_status(service: str) -> dict:
            running = service_manager.is_running(service)
            url = service_manager.get_url(service) if running else None
            return {"service": service, "running": running, "url": url}

        @app.post("/services/{service}/start")
        def start_service(service: str, req: ServiceStartRequest) -> dict:
            try:
                already_running = service_manager.is_running(service)
                url = service_manager.start(service, req.gpu_id, req.params)
                # adopted=True signals the coordinator to treat this instance as
                # immediately running rather than waiting for the probe loop.
                adopted = already_running and service_manager.is_running(service)
                return {"service": service, "url": url, "running": True, "adopted": adopted}
            except (ValueError, NotImplementedError) as exc:
                # Unknown service / unsupported spec → client error. Chain the
                # cause (PEP 3134) so the original traceback isn't lost.
                raise HTTPException(status_code=422, detail=str(exc)) from exc
            except Exception as exc:
                raise HTTPException(
                    status_code=500, detail=f"Failed to start {service}: {exc}"
                ) from exc

        @app.post("/services/{service}/stop")
        def stop_service(service: str) -> dict:
            stopped = service_manager.stop(service)
            return {"service": service, "stopped": stopped}

    return app
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
import psutil
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Default seconds to wait after SIGTERM before escalating to SIGKILL.
_DEFAULT_GRACE_S = 5.0


@dataclass(frozen=True)
class EvictionResult:
    """Immutable outcome of one eviction attempt."""

    # True when the target process is gone (or was already gone).
    success: bool
    method: str  # "sigterm", "sigkill", "already_gone", "not_found", "error"
    # Human-readable explanation of what happened.
    message: str
|
||||
|
||||
|
||||
class EvictionExecutor:
    """Terminates processes with SIGTERM, escalating to SIGKILL after a grace period."""

    def __init__(self, grace_period_s: float = _DEFAULT_GRACE_S) -> None:
        # Default grace window used when evict_pid() is called without one.
        self._default_grace = grace_period_s

    def evict_pid(
        self,
        pid: int,
        grace_period_s: float | None = None,
    ) -> EvictionResult:
        """Signal *pid* to exit, SIGKILL-ing it if still alive after the grace period.

        Args:
            pid: Target process id; values <= 0 are rejected outright.
            grace_period_s: Seconds to wait after SIGTERM; falls back to the
                constructor default when None.

        Returns:
            EvictionResult describing which method ended the process (or why
            the attempt failed). Never raises.
        """
        grace = grace_period_s if grace_period_s is not None else self._default_grace

        # Guard: pid 0 / negative values signal process groups — never do that.
        if pid <= 0:
            return EvictionResult(
                success=False, method="error",
                message=f"Refusing to signal invalid PID {pid}"
            )

        if not psutil.pid_exists(pid):
            return EvictionResult(
                success=False, method="not_found",
                message=f"PID {pid} not found"
            )

        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            # Raced with the process exiting between the existence check and the kill.
            return EvictionResult(
                success=True, method="already_gone",
                message=f"PID {pid} vanished before SIGTERM"
            )
        except PermissionError as exc:
            return EvictionResult(
                success=False, method="error",
                message=f"Permission denied terminating PID {pid}: {exc}"
            )

        # Wait for grace period
        deadline = time.monotonic() + grace
        while time.monotonic() < deadline:
            if not psutil.pid_exists(pid):
                logger.info("PID %d exited cleanly after SIGTERM", pid)
                return EvictionResult(
                    success=True, method="sigterm",
                    message=f"PID {pid} exited after SIGTERM"
                )
            time.sleep(0.05)

        # Escalate to SIGKILL
        if psutil.pid_exists(pid):
            try:
                os.kill(pid, signal.SIGKILL)
                logger.warning("PID %d required SIGKILL", pid)
                return EvictionResult(
                    success=True, method="sigkill",
                    message=f"PID {pid} killed with SIGKILL"
                )
            except ProcessLookupError:
                # Died on its own right at the deadline — fall through to success.
                pass

        return EvictionResult(
            success=True, method="sigkill",
            message=f"PID {pid} is gone"
        )
|
||||
|
|
@ -1,52 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
from circuitforge_core.resources.models import GpuInfo
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Query command producing one CSV row per GPU: index,name,total,used,free (MB).
_NVIDIA_SMI_CMD = [
    "nvidia-smi",
    "--query-gpu=index,name,memory.total,memory.used,memory.free",
    "--format=csv,noheader,nounits",
]


class GpuMonitor:
    """Polls nvidia-smi for per-GPU VRAM usage on this node."""

    def poll(self) -> list[GpuInfo]:
        """Run nvidia-smi and return one GpuInfo per GPU; [] when unavailable.

        All failure modes (missing binary, timeout, nonzero exit) degrade to an
        empty list so callers can treat the node as GPU-less.
        """
        try:
            result = subprocess.run(
                _NVIDIA_SMI_CMD,
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired) as exc:
            # No NVIDIA tooling installed, or the driver is hung.
            logger.warning("nvidia-smi unavailable: %s", exc)
            return []

        if result.returncode != 0:
            logger.warning("nvidia-smi exited %d", result.returncode)
            return []

        return self._parse(result.stdout)

    def _parse(self, output: str) -> list[GpuInfo]:
        """Parse `csv,noheader,nounits` output into GpuInfo records, skipping bad rows."""
        gpus: list[GpuInfo] = []
        for line in output.strip().splitlines():
            parts = [p.strip() for p in line.split(",")]
            if len(parts) != 5:
                # Not an index,name,total,used,free row — ignore.
                continue
            try:
                gpus.append(GpuInfo(
                    gpu_id=int(parts[0]),
                    name=parts[1],
                    vram_total_mb=int(parts[2]),
                    vram_used_mb=int(parts[3]),
                    vram_free_mb=int(parts[4]),
                ))
            except ValueError:
                logger.debug("Skipping malformed nvidia-smi line: %r", line)
        return gpus
|
||||
|
|
@ -1,186 +0,0 @@
|
|||
"""
|
||||
ServiceManager — start/stop Docker containers and processes for cf-orch managed services.
|
||||
|
||||
Container naming convention: cf-orch-{service}-{node_id}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
|
||||
from circuitforge_core.resources.profiles.schema import DockerSpec, GpuProfile, ProcessSpec
|
||||
|
||||
|
||||
def _expand_volume(v: str) -> str:
|
||||
"""Expand bash-style volume strings including ${VAR:-default} and $VAR."""
|
||||
def _sub(m: re.Match) -> str: # type: ignore[type-arg]
|
||||
var, default = m.group(1), m.group(2) or ""
|
||||
return os.environ.get(var) or default
|
||||
v = re.sub(r"\$\{(\w+)(?::-(.*?))?\}", _sub, v)
|
||||
v = re.sub(r"\$(\w+)", lambda m: os.environ.get(m.group(1), m.group(0)), v)
|
||||
return v
|
||||
|
||||
|
||||
class ServiceManager:
    """Start/stop Docker containers and native processes for cf-orch managed services.

    Service specs come from the GpuProfile; Docker containers follow the naming
    convention cf-orch-{service}-{node_id}.
    """

    def __init__(
        self,
        node_id: str,
        profile: GpuProfile,
        advertise_host: str = "127.0.0.1",
    ) -> None:
        self.node_id = node_id
        self.profile = profile
        # Host embedded in the URLs handed back to callers.
        self.advertise_host = advertise_host
        # service name → Popen handle for processes we spawned ourselves.
        self._procs: dict[str, Any] = {}

    def container_name(self, service: str) -> str:
        """Docker container name for *service* on this node."""
        return f"cf-orch-{service}-{self.node_id}"

    def _get_spec(self, service: str) -> DockerSpec | ProcessSpec | None:
        """Look up the managed spec for *service*; None when absent from the profile."""
        svc = self.profile.services.get(service)
        if svc is None:
            return None
        return svc.managed

    def is_running(self, service: str) -> bool:
        """True when *service* is observably up (container running / port open / healthy)."""
        spec = self._get_spec(service)
        if spec is None:
            return False
        if isinstance(spec, DockerSpec):
            try:
                result = subprocess.run(
                    [
                        "docker",
                        "inspect",
                        "--format",
                        "{{.State.Running}}",
                        self.container_name(service),
                    ],
                    capture_output=True,
                    text=True,
                    check=True,
                )
                return result.stdout.strip() == "true"
            except subprocess.CalledProcessError:
                # docker inspect fails when the container doesn't exist.
                return False
        if isinstance(spec, ProcessSpec):
            # For adopt=True services, check the health endpoint regardless of whether
            # we spawned the process (it may be a system daemon we didn't start).
            if spec.adopt:
                return self._probe_health(spec.host_port, spec.health_path)
            proc = self._procs.get(service)
            if proc is None or proc.poll() is not None:
                return False
            # Process is alive — confirm it is actually listening.
            import socket
            try:
                with socket.create_connection(("127.0.0.1", spec.host_port), timeout=1):
                    return True
            except OSError:
                return False
        return False

    def _probe_health(self, port: int, health_path: str = "/health") -> bool:
        """Return True if the service at localhost:port responds 200 on health_path."""
        import urllib.request
        try:
            url = f"http://127.0.0.1:{port}{health_path}"
            with urllib.request.urlopen(url, timeout=2.0) as resp:
                return resp.status == 200
        except Exception:
            return False

    def start(self, service: str, gpu_id: int, params: dict[str, str]) -> str:
        """Start *service* on *gpu_id* and return its base URL.

        Idempotent: returns the URL immediately when the service already runs.

        Raises:
            ValueError: service is not in the profile or has no managed spec.
            NotImplementedError: the spec type is unrecognized.
            subprocess.CalledProcessError: `docker run` failed.
        """
        spec = self._get_spec(service)
        if spec is None:
            raise ValueError(f"Service {service!r} not in profile or has no managed spec")

        if self.is_running(service):
            return f"http://{self.advertise_host}:{spec.host_port}"

        if isinstance(spec, DockerSpec):
            expanded_volumes = [_expand_volume(v) for v in spec.volumes]

            # defaultdict(str) makes unresolved {placeholders} expand to "".
            filler: dict[str, str] = defaultdict(str, params)
            expanded_command = spec.command_template.format_map(filler).split()

            cmd = [
                "docker", "run", "-d", "--rm",
                "--name", self.container_name(service),
                "--runtime", spec.runtime,
                "--gpus", f"device={gpu_id}",
                "--ipc", spec.ipc,
                "-p", f"{spec.host_port}:{spec.port}",
            ]
            for vol in expanded_volumes:
                cmd += ["-v", vol]
            for key, val in spec.env.items():
                cmd += ["-e", f"{key}={val}"]
            cmd.append(spec.image)
            cmd.extend(expanded_command)

            subprocess.run(cmd, check=True, capture_output=True, text=True)
            return f"http://{self.advertise_host}:{spec.host_port}"

        if isinstance(spec, ProcessSpec):
            # adopt=True: if the service is already healthy, claim it without spawning.
            if spec.adopt and self._probe_health(spec.host_port, spec.health_path):
                return f"http://{self.advertise_host}:{spec.host_port}"

            filler = defaultdict(str, params)
            filler.setdefault("port", str(spec.port))
            filler.setdefault("gpu_id", str(gpu_id))
            args_expanded = spec.args_template.format_map(filler).split()

            cmd = [spec.exec_path] + args_expanded
            # Inherit the parent environment (os is imported at module level).
            env = dict(os.environ)
            proc = subprocess.Popen(
                cmd,
                cwd=spec.cwd or None,
                env=env,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            self._procs[service] = proc
            return f"http://{self.advertise_host}:{spec.host_port}"

        raise NotImplementedError(f"Unknown spec type: {type(spec)}")

    def stop(self, service: str) -> bool:
        """Stop *service*; True when something was actually stopped."""
        spec = self._get_spec(service)
        if spec is None:
            return False
        if isinstance(spec, DockerSpec):
            try:
                subprocess.run(
                    ["docker", "stop", self.container_name(service)],
                    check=True,
                    capture_output=True,
                    text=True,
                )
                return True
            except subprocess.CalledProcessError:
                return False
        if isinstance(spec, ProcessSpec):
            proc = self._procs.pop(service, None)
            if proc is not None:
                proc.terminate()
                try:
                    proc.wait(timeout=10)
                except Exception:
                    # Didn't exit in time — force it.
                    proc.kill()
                return True
        return False

    def list_running(self) -> list[str]:
        """Names of all profile services currently observed running."""
        return [svc for svc in self.profile.services if self.is_running(svc)]

    def get_url(self, service: str) -> str | None:
        """Base URL for *service*, or None when unknown or not running."""
        spec = self._get_spec(service)
        if spec is None or not self.is_running(service):
            return None
        return f"http://{self.advertise_host}:{spec.host_port}"
|
||||
|
|
@ -1,123 +0,0 @@
|
|||
"""
|
||||
Probe running services to detect which models are currently loaded in VRAM.
|
||||
|
||||
Two probe strategies run together:
|
||||
|
||||
1. Well-known ports — always checked, regardless of who started the service.
|
||||
Catches ollama, vLLM, etc. running outside cf-orch management.
|
||||
|
||||
2. Managed services — services cf-orch started via ServiceManager.
|
||||
Checked on their configured host_port, deduplicates with well-known results.
|
||||
|
||||
Each service exposes a different introspection API:
|
||||
- vllm: GET /v1/models → {"data": [{"id": "<model-name>"}]}
|
||||
- ollama: GET /api/ps → {"models": [{"name": "<model>", "size_vram": <bytes>}]}
|
||||
|
||||
ollama can have multiple models loaded simultaneously; each is reported as a
|
||||
separate entry so the dashboard shows per-model residency.
|
||||
|
||||
The probe is best-effort: a timeout or connection refusal means model_name=None
|
||||
but the service is still reported as resident.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import urllib.request
|
||||
from typing import Any
|
||||
|
||||
from circuitforge_core.resources.profiles.schema import DockerSpec
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Per-request timeout for probe HTTP calls.
_PROBE_TIMEOUT_S = 2.0

# Well-known service ports probed on every heartbeat.
# port → service name (used as the key into _PROBERS).
_WELL_KNOWN_PORTS: dict[int, str] = {
    11434: "ollama",
    8000: "vllm",
    8080: "vllm",  # common alt vLLM port
}
|
||||
|
||||
|
||||
def _fetch_json(url: str) -> dict[str, Any] | None:
    """GET a URL and parse JSON; returns None on any error."""
    try:
        with urllib.request.urlopen(url, timeout=_PROBE_TIMEOUT_S) as response:
            body = response.read()
        return json.loads(body)
    except Exception as exc:
        # Best-effort probe: any failure (refused, timeout, bad JSON) → None.
        logger.debug("Probe %s: %s", url, exc)
        return None
|
||||
|
||||
|
||||
def _probe_vllm(port: int) -> list[str]:
    """List model IDs served by a vLLM instance on *port* (empty on failure)."""
    payload = _fetch_json(f"http://127.0.0.1:{port}/v1/models")
    if not payload or not payload.get("data"):
        return []
    return [entry["id"] for entry in payload["data"] if entry.get("id")]
|
||||
|
||||
|
||||
def _probe_ollama(port: int) -> list[str]:
    """List models an ollama instance currently has loaded (empty on failure)."""
    # /api/ps lists models currently *loaded in memory*, not just downloaded.
    payload = _fetch_json(f"http://127.0.0.1:{port}/api/ps")
    if not payload or not payload.get("models"):
        return []
    return [entry["name"] for entry in payload["models"] if entry.get("name")]
|
||||
|
||||
|
||||
# Dispatch table: service name → probe function returning loaded model names.
_PROBERS: dict[str, Any] = {
    "vllm": _probe_vllm,
    "ollama": _probe_ollama,
}
|
||||
|
||||
|
||||
def probe_all(service_manager: Any) -> list[dict[str, Any]]:
    """
    Probe all services — both well-known ports and cf-orch managed services.

    Returns a list of dicts: [{"service": str, "model_name": str | None}].
    Multiple loaded models in one service (e.g. two ollama models) each get
    their own entry, disambiguated as "ollama/0", "ollama/1", etc.
    """
    results: list[dict[str, Any]] = []
    # Ports already reported, so the managed pass doesn't duplicate entries.
    seen_ports: set[int] = set()

    # ── 1. Well-known ports ──────────────────────────────────────────
    for port, service in _WELL_KNOWN_PORTS.items():
        prober = _PROBERS.get(service)
        if prober is None:
            continue
        models = prober(port)
        if not models:
            continue  # nothing on this port right now
        seen_ports.add(port)
        if len(models) == 1:
            results.append({"service": service, "model_name": models[0]})
        else:
            # Disambiguate multiple resident models as "<service>/<i>".
            for i, model in enumerate(models):
                results.append({"service": f"{service}/{i}", "model_name": model})

    # ── 2. Managed services (cf-orch started) ───────────────────────
    if service_manager is not None:
        for service in service_manager.list_running():
            spec = service_manager._get_spec(service)
            if not isinstance(spec, DockerSpec):
                continue
            if spec.host_port in seen_ports:
                continue  # already captured by well-known probe
            prober = _PROBERS.get(service)
            if prober is None:
                # Running but unprobeable: report residency without a model name.
                results.append({"service": service, "model_name": None})
                continue
            models = prober(spec.host_port)
            seen_ports.add(spec.host_port)
            if not models:
                results.append({"service": service, "model_name": None})
            elif len(models) == 1:
                results.append({"service": service, "model_name": models[0]})
            else:
                for i, model in enumerate(models):
                    results.append({"service": f"{service}/{i}", "model_name": model})

    return results
|
||||
|
|
@ -1,234 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Optional
|
||||
|
||||
import typer
|
||||
import uvicorn
|
||||
|
||||
logger = logging.getLogger(__name__)

# Top-level typer application; commands register themselves via @app.command().
app = typer.Typer(name="cf-orch", help="CircuitForge GPU resource orchestrator")

# Target path for the unit file written by `cf-orch install-service`.
_SYSTEMD_UNIT_PATH = Path("/etc/systemd/system/cf-orch.service")

# systemd unit template; {python} is filled with sys.executable at install time.
_SYSTEMD_UNIT_TEMPLATE = """\
[Unit]
Description=CircuitForge GPU Resource Orchestrator
After=network.target

[Service]
Type=simple
ExecStart={python} -m circuitforge_core.resources.cli start
Restart=on-failure
RestartSec=5

[Install]
WantedBy=multi-user.target
"""
|
||||
|
||||
|
||||
@app.command()
def start(
    profile: Annotated[Optional[Path], typer.Option(help="Profile YAML path")] = None,
    host: str = "0.0.0.0",
    port: int = 7700,
    node_id: str = "local",
    agent_port: int = 7701,
) -> None:
    """Start the cf-orch coordinator (auto-detects GPU profile if not specified).

    Automatically pre-registers the local agent so its GPUs appear on the
    dashboard immediately. Remote nodes self-register via POST /api/nodes.
    """
    # Coordinator dependencies are imported inside the command, so importing
    # this module (e.g. for --help) stays cheap.
    from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
    from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
    from circuitforge_core.resources.coordinator.app import create_coordinator_app
    from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
    from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor

    from circuitforge_core.resources.coordinator.node_store import NodeStore

    lease_manager = LeaseManager()
    profile_registry = ProfileRegistry()
    service_registry = ServiceRegistry()
    node_store = NodeStore()
    supervisor = AgentSupervisor(
        lease_manager=lease_manager,
        service_registry=service_registry,
        profile_registry=profile_registry,
        node_store=node_store,
    )
    # Re-load nodes persisted by a previous coordinator run.
    restored = supervisor.restore_from_store()
    if restored:
        typer.echo(f"Restored {restored} known node(s) from previous session")

    monitor = GpuMonitor()
    gpus = monitor.poll()
    if not gpus:
        typer.echo(
            "Warning: no GPUs detected via nvidia-smi — coordinator running with 0 VRAM"
        )
    else:
        typer.echo(f"Detected {len(gpus)} GPU(s)")

    if profile:
        active_profile = profile_registry.load(profile)
        typer.echo(f"Using profile: {active_profile.name} (from {profile})")
    else:
        # No GPUs → fall back to the last public profile as a default.
        active_profile = (
            profile_registry.auto_detect(gpus)
            if gpus
            else profile_registry.list_public()[-1]
        )
        typer.echo(f"Auto-selected profile: {active_profile.name}")
    # NOTE(review): active_profile is only echoed here and never passed to
    # create_coordinator_app — confirm the coordinator resolves its own profile.

    # Pre-register the local agent — the heartbeat loop will poll it for live GPU data.
    local_agent_url = f"http://127.0.0.1:{agent_port}"
    supervisor.register(node_id, local_agent_url)
    typer.echo(f"Registered local node '{node_id}' → {local_agent_url}")

    coordinator_app = create_coordinator_app(
        lease_manager=lease_manager,
        profile_registry=profile_registry,
        agent_supervisor=supervisor,
        service_registry=service_registry,
    )

    typer.echo(f"Starting cf-orch coordinator on {host}:{port}")
    # Blocks until the server is shut down.
    uvicorn.run(coordinator_app, host=host, port=port)
|
||||
|
||||
|
||||
@app.command()
def agent(
    coordinator: str = "http://localhost:7700",
    node_id: str = "local",
    host: str = "0.0.0.0",
    port: int = 7701,
    advertise_host: Optional[str] = None,
    profile: Annotated[Optional[Path], typer.Option(help="Profile YAML path")] = None,
) -> None:
    """Start a cf-orch node agent and self-register with the coordinator.

    The agent starts its HTTP server, then POSTs its URL to the coordinator
    so it appears on the dashboard without manual configuration.

    Use --advertise-host to override the IP the coordinator should use to
    reach this agent (e.g. on a multi-homed or NATted host).
    """
    import threading

    import httpx

    from circuitforge_core.resources.agent.app import create_agent_app
    from circuitforge_core.resources.agent.service_manager import ServiceManager
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry

    # The URL the coordinator should use to reach this agent.
    reach_host = advertise_host or ("127.0.0.1" if host in ("0.0.0.0", "::") else host)
    agent_url = f"http://{reach_host}:{port}"

    _RECONNECT_INTERVAL_S = 30.0

    def _reconnect_loop() -> None:
        """
        Persistently re-register this agent with the coordinator.

        Runs as a daemon thread for the lifetime of the agent process:
        - Waits 2 s on first run (uvicorn needs time to bind)
        - Re-registers every 30 s thereafter
        - If the coordinator is down, silently retries — no crashing
        - When the coordinator restarts, the agent re-appears within one cycle

        This means coordinator restarts require no manual intervention on agent hosts.
        """
        import time
        first = True
        while True:
            time.sleep(2.0 if first else _RECONNECT_INTERVAL_S)
            first = False
            try:
                resp = httpx.post(
                    f"{coordinator}/api/nodes",
                    json={"node_id": node_id, "agent_url": agent_url},
                    timeout=5.0,
                )
                if resp.is_success:
                    logger.debug("Registered with coordinator at %s as '%s'", coordinator, node_id)
                else:
                    logger.warning(
                        "Coordinator registration returned %s", resp.status_code
                    )
            except Exception as exc:
                # Expected when the coordinator is down — retry next cycle.
                logger.debug("Coordinator at %s unreachable, will retry: %s", coordinator, exc)

    # Fire reconnect loop in a daemon thread so uvicorn.run() can start blocking immediately.
    threading.Thread(target=_reconnect_loop, daemon=True, name="cf-orch-reconnect").start()
    typer.echo(f"Reconnect loop started — will register with {coordinator} every {int(_RECONNECT_INTERVAL_S)}s")

    # Best-effort: the agent still serves /health and /gpu-info without a
    # ServiceManager, so any failure here only disables the /services routes.
    service_manager = None
    try:
        from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
        pr = ProfileRegistry()
        gpus = GpuMonitor().poll()
        p = pr.load(Path(profile)) if profile else pr.auto_detect(gpus)
        service_manager = ServiceManager(node_id=node_id, profile=p, advertise_host=reach_host)
        typer.echo(f"ServiceManager ready with profile: {p.name}")
    except Exception as exc:
        typer.echo(f"Warning: ServiceManager unavailable ({exc})", err=True)

    agent_app = create_agent_app(node_id=node_id, service_manager=service_manager)
    typer.echo(f"Starting cf-orch agent [{node_id}] on {host}:{port}")
    # Blocks until the server is shut down.
    uvicorn.run(agent_app, host=host, port=port)
|
||||
|
||||
|
||||
@app.command()
def status(coordinator: str = "http://localhost:7700") -> None:
    """Show GPU and lease status from the coordinator."""
    import httpx

    try:
        resp = httpx.get(f"{coordinator}/api/nodes", timeout=5.0)
        resp.raise_for_status()
        nodes = resp.json().get("nodes", [])
        # One section per node, one line per GPU.
        for node in nodes:
            typer.echo(f"\nNode: {node['node_id']}")
            for gpu in node.get("gpus", []):
                typer.echo(
                    f" GPU {gpu['gpu_id']}: {gpu['name']} — "
                    f"{gpu['vram_used_mb']}/{gpu['vram_total_mb']} MB used"
                )
    except Exception as exc:
        # Any failure (network, HTTP error, bad payload) → nonzero exit.
        typer.echo(f"Coordinator unreachable at {coordinator}: {exc}", err=True)
        raise typer.Exit(1)
|
||||
|
||||
|
||||
@app.command("install-service")
|
||||
def install_service(
|
||||
dry_run: bool = typer.Option(
|
||||
False, "--dry-run", help="Print unit file without writing"
|
||||
),
|
||||
) -> None:
|
||||
"""Write a systemd unit file for cf-orch (requires root)."""
|
||||
python = sys.executable
|
||||
unit_content = _SYSTEMD_UNIT_TEMPLATE.format(python=python)
|
||||
if dry_run:
|
||||
typer.echo(f"Would write to {_SYSTEMD_UNIT_PATH}:\n")
|
||||
typer.echo(unit_content)
|
||||
return
|
||||
try:
|
||||
_SYSTEMD_UNIT_PATH.write_text(unit_content)
|
||||
typer.echo(f"Written: {_SYSTEMD_UNIT_PATH}")
|
||||
typer.echo(
|
||||
"Run: sudo systemctl daemon-reload && sudo systemctl enable --now cf-orch"
|
||||
)
|
||||
except PermissionError:
|
||||
typer.echo(
|
||||
f"Permission denied writing to {_SYSTEMD_UNIT_PATH}. Run as root.", err=True
|
||||
)
|
||||
raise typer.Exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
app()
|
||||
|
|
@ -1,143 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
from contextlib import contextmanager, asynccontextmanager
|
||||
from dataclasses import dataclass
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class Allocation:
    """A GPU allocation granted by the cf-orch coordinator."""

    allocation_id: str
    # Service the allocation is for (e.g. "vllm").
    service: str
    node_id: str
    gpu_id: int
    # Model chosen by the coordinator, when one was requested/resolved.
    model: str | None
    # Inference endpoint for the allocated service.
    url: str
    # Coordinator-reported flags; presumably "service was (re)started for this
    # request" and "requested model already resident" — confirm against the
    # coordinator's allocation response schema.
    started: bool
    warm: bool
|
||||
|
||||
|
||||
class CFOrchClient:
    """
    HTTP client for the cf-orch coordinator's allocation API.

    Synchronous callers (e.g. an LLM router) use the context manager:

        client = CFOrchClient(os.environ["CF_ORCH_URL"])
        with client.allocate("vllm", model_candidates=["Ouro-1.4B"]) as alloc:
            ...  # alloc.url is the inference endpoint

    Async callers (e.g. FastAPI apps) use the async variant:

        async with client.allocate_async("vllm", model_candidates=["Ouro-1.4B"]) as alloc:
            ...

    Authentication: pass api_key explicitly or set the CF_LICENSE_KEY env var.
    When a key is present, every request carries
    "Authorization: Bearer <key>". The hosted CircuitForge coordinator
    (orch.circuitforge.tech) requires it; local self-hosted coordinators
    may run without it.

    Raises ValueError immediately if coordinator_url is empty.
    """

    def __init__(self, coordinator_url: str, api_key: str | None = None) -> None:
        if not coordinator_url:
            raise ValueError("coordinator_url is empty — cf-orch not configured")
        self._url = coordinator_url.rstrip("/")
        self._api_key = api_key or os.environ.get("CF_LICENSE_KEY", "")

    def _allocate_endpoint(self, service: str) -> str:
        # POST target for creating an allocation.
        return f"{self._url}/api/services/{service}/allocate"

    def _release_endpoint(self, service: str, allocation_id: str) -> str:
        # DELETE target for releasing an existing allocation.
        return f"{self._url}/api/services/{service}/allocations/{allocation_id}"

    def _headers(self) -> dict[str, str]:
        # Bearer auth only when a key is configured; otherwise no extra headers.
        return {"Authorization": f"Bearer {self._api_key}"} if self._api_key else {}

    def _build_body(self, model_candidates: list[str] | None, ttl_s: float, caller: str) -> dict:
        # JSON body shared by the sync and async allocation paths.
        return {
            "model_candidates": model_candidates or [],
            "ttl_s": ttl_s,
            "caller": caller,
        }

    def _parse_allocation(self, data: dict, service: str) -> Allocation:
        # Map the coordinator's JSON payload onto the Allocation dataclass.
        fields = {
            "allocation_id": data["allocation_id"],
            "service": service,
            "node_id": data["node_id"],
            "gpu_id": data["gpu_id"],
            "model": data.get("model"),
            "url": data["url"],
            "started": data.get("started", False),
            "warm": data.get("warm", False),
        }
        return Allocation(**fields)

    def _allocation_failure(self, service: str, resp) -> RuntimeError:
        # Build the error raised when the coordinator refuses an allocation.
        return RuntimeError(
            f"cf-orch allocation failed for {service!r}: "
            f"HTTP {resp.status_code} — {resp.text[:200]}"
        )

    @contextmanager
    def allocate(
        self,
        service: str,
        *,
        model_candidates: list[str] | None = None,
        ttl_s: float = 3600.0,
        caller: str = "",
    ):
        """Sync context manager: allocate on enter, best-effort release on exit."""
        resp = httpx.post(
            self._allocate_endpoint(service),
            json=self._build_body(model_candidates, ttl_s, caller),
            headers=self._headers(),
            timeout=120.0,
        )
        if not resp.is_success:
            raise self._allocation_failure(service, resp)
        alloc = self._parse_allocation(resp.json(), service)
        try:
            yield alloc
        finally:
            # Release failures are logged but never raised to the caller.
            try:
                httpx.delete(
                    self._release_endpoint(service, alloc.allocation_id),
                    headers=self._headers(),
                    timeout=10.0,
                )
            except Exception as exc:
                logger.debug("cf-orch release failed (non-fatal): %s", exc)

    @asynccontextmanager
    async def allocate_async(
        self,
        service: str,
        *,
        model_candidates: list[str] | None = None,
        ttl_s: float = 3600.0,
        caller: str = "",
    ):
        """Async context manager: allocate on enter, best-effort release on exit."""
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                self._allocate_endpoint(service),
                json=self._build_body(model_candidates, ttl_s, caller),
                headers=self._headers(),
            )
            if not resp.is_success:
                raise self._allocation_failure(service, resp)
            alloc = self._parse_allocation(resp.json(), service)
            try:
                yield alloc
            finally:
                # Release failures are logged but never raised to the caller.
                try:
                    await client.delete(
                        self._release_endpoint(service, alloc.allocation_id),
                        headers=self._headers(),
                        timeout=10.0,
                    )
                except Exception as exc:
                    logger.debug("cf-orch async release failed (non-fatal): %s", exc)
|
||||
|
|
@ -1,44 +0,0 @@
|
|||
# circuitforge_core/resources/compose.yml
|
||||
# One-command cf-orch deployment for Docker self-hosters:
|
||||
# docker compose -f path/to/compose.yml up cf-orch-coordinator
|
||||
|
||||
services:
|
||||
cf-orch-coordinator:
|
||||
image: python:3.12-slim
|
||||
command: >
|
||||
sh -c "pip install 'circuitforge-core[orch]' &&
|
||||
cf-orch start --host 0.0.0.0 --port 7700"
|
||||
ports:
|
||||
- "7700:7700"
|
||||
volumes:
|
||||
- /run/docker.sock:/var/run/docker.sock:ro
|
||||
- cf-orch-data:/data
|
||||
environment:
|
||||
- CFORCH_PROFILE=${CFORCH_PROFILE:-}
|
||||
restart: unless-stopped
|
||||
devices:
|
||||
- /dev/nvidia0:/dev/nvidia0
|
||||
- /dev/nvidiactl:/dev/nvidiactl
|
||||
runtime: nvidia
|
||||
|
||||
cf-orch-agent:
|
||||
image: python:3.12-slim
|
||||
command: >
|
||||
sh -c "pip install 'circuitforge-core[orch]' &&
|
||||
cf-orch agent --coordinator http://cf-orch-coordinator:7700
|
||||
--node-id ${CFORCH_NODE_ID:-local}
|
||||
--host 0.0.0.0 --port 7701"
|
||||
ports:
|
||||
- "7701:7701"
|
||||
depends_on:
|
||||
- cf-orch-coordinator
|
||||
environment:
|
||||
- CFORCH_NODE_ID=${CFORCH_NODE_ID:-local}
|
||||
restart: unless-stopped
|
||||
devices:
|
||||
- /dev/nvidia0:/dev/nvidia0
|
||||
- /dev/nvidiactl:/dev/nvidiactl
|
||||
runtime: nvidia
|
||||
|
||||
volumes:
|
||||
cf-orch-data:
|
||||
|
|
@ -1,209 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import httpx
|
||||
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.node_store import NodeStore
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo, ResidentAllocation
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Seconds between coordinator → agent polling rounds.
_HEARTBEAT_INTERVAL_S = 10.0
# Per-request timeout (seconds) when polling an agent.
_AGENT_TIMEOUT_S = 5.0
|
||||
|
||||
|
||||
@dataclass
class AgentRecord:
    """Coordinator-side bookkeeping for one registered agent node."""

    node_id: str  # stable identifier the agent registered under
    agent_url: str  # base URL of the agent's HTTP API
    last_seen: float = field(default_factory=time.time)  # time of last successful poll
    gpus: list[GpuInfo] = field(default_factory=list)  # inventory from the last /gpu-info poll
    online: bool = False  # set True/False by poll_agent() depending on reachability
|
||||
|
||||
|
||||
class AgentSupervisor:
    """
    Tracks agent nodes for the coordinator: registration, heartbeat polling,
    GPU inventory, and the periodic TTL/idle sweep.

    All state lives in the in-memory ``_agents`` dict; an optional NodeStore
    persists node_id → agent_url pairs across coordinator restarts.
    """

    def __init__(
        self,
        lease_manager: LeaseManager,
        service_registry: ServiceRegistry | None = None,
        profile_registry: ProfileRegistry | None = None,
        node_store: NodeStore | None = None,
    ) -> None:
        # node_id → AgentRecord; the single source of truth for known agents.
        self._agents: dict[str, AgentRecord] = {}
        self._lease_manager = lease_manager
        # Loop flag for run_heartbeat_loop(); cleared by stop().
        self._running = False
        # Optional collaborators — sweeps are skipped when these are None.
        self._service_registry = service_registry
        self._profile_registry = profile_registry
        self._node_store = node_store
        # Counts heartbeat rounds; the idle sweep runs every third tick.
        self._heartbeat_tick = 0

    def restore_from_store(self) -> int:
        """
        Load previously-known nodes from NodeStore into the in-memory registry.

        All restored nodes start as offline=False. The heartbeat loop will poll
        them on its first tick and promote any that respond to online=True.

        Returns the number of nodes restored.
        """
        if self._node_store is None:
            return 0
        restored = 0
        for node_id, agent_url in self._node_store.all():
            # Never clobber an agent that already registered this session.
            if node_id not in self._agents:
                self._agents[node_id] = AgentRecord(
                    node_id=node_id, agent_url=agent_url, online=False
                )
                restored += 1
        if restored:
            logger.info("NodeStore: restored %d known node(s) from previous session", restored)
        return restored

    def register(self, node_id: str, agent_url: str) -> None:
        """Add a new agent, or update the URL of a known one; persist to NodeStore."""
        if node_id not in self._agents:
            self._agents[node_id] = AgentRecord(node_id=node_id, agent_url=agent_url)
            logger.info("Registered agent node: %s @ %s", node_id, agent_url)
        else:
            # Re-registration with a new URL (e.g. agent restarted on another port).
            if self._agents[node_id].agent_url != agent_url:
                self._agents[node_id].agent_url = agent_url
                logger.info("Updated agent URL for %s → %s", node_id, agent_url)
        if self._node_store is not None:
            self._node_store.upsert(node_id, agent_url)

    def get_node_info(self, node_id: str) -> NodeInfo | None:
        """Return a NodeInfo snapshot for one node, or None when unknown."""
        record = self._agents.get(node_id)
        if record is None:
            return None
        return NodeInfo(
            node_id=record.node_id,
            agent_url=record.agent_url,
            gpus=record.gpus,
            last_heartbeat=record.last_seen,
        )

    def all_nodes(self) -> list[NodeInfo]:
        """Return NodeInfo snapshots for every known node (online or not)."""
        return [
            NodeInfo(
                node_id=r.node_id,
                agent_url=r.agent_url,
                gpus=r.gpus,
                last_heartbeat=r.last_seen,
            )
            for r in self._agents.values()
        ]

    def online_agents(self) -> "dict[str, AgentRecord]":
        """Return only currently-online agents, keyed by node_id."""
        return {nid: rec for nid, rec in self._agents.items() if rec.online}

    async def poll_agent(self, node_id: str) -> bool:
        """
        Poll one agent for GPU and resident info.

        On success: refreshes the record's GPU list, last_seen, and online flag,
        registers each GPU with the lease manager, and syncs the node's
        residents. On any failure the node is marked offline. Returns whether
        the poll succeeded.
        """
        record = self._agents.get(node_id)
        if record is None:
            return False
        try:
            async with httpx.AsyncClient(timeout=_AGENT_TIMEOUT_S) as client:
                gpu_resp = await client.get(f"{record.agent_url}/gpu-info")
                gpu_resp.raise_for_status()

                # Resident-info is best-effort — older agents may not have the endpoint.
                try:
                    res_resp = await client.get(f"{record.agent_url}/resident-info")
                    resident_data = res_resp.json() if res_resp.is_success else {}
                except Exception:
                    resident_data = {}

            data = gpu_resp.json()
            gpus = [
                GpuInfo(
                    gpu_id=g["gpu_id"],
                    name=g["name"],
                    vram_total_mb=g["vram_total_mb"],
                    vram_used_mb=g["vram_used_mb"],
                    vram_free_mb=g["vram_free_mb"],
                )
                for g in data.get("gpus", [])
            ]
            record.gpus = gpus
            record.last_seen = time.time()
            record.online = True
            # Keep the lease manager's GPU capacity table in sync.
            for gpu in gpus:
                self._lease_manager.register_gpu(node_id, gpu.gpu_id, gpu.vram_total_mb)

            residents = [
                (r["service"], r.get("model_name"))
                for r in resident_data.get("residents", [])
            ]
            self._lease_manager.set_residents_for_node(node_id, residents)

            return True
        except Exception as exc:
            # Any HTTP/parse error demotes the node to offline until the next poll.
            logger.warning("Agent %s unreachable: %s", node_id, exc)
            record.online = False
            return False

    async def poll_all(self) -> None:
        """Poll every known agent concurrently."""
        await asyncio.gather(*[self.poll_agent(nid) for nid in self._agents])

    def _build_idle_stop_config(self) -> dict[str, int]:
        """
        Collect per-service idle-stop timeouts from all public profiles.

        When several profiles define a timeout for the same service, the
        smallest positive value wins.
        """
        if self._profile_registry is None:
            return {}
        config: dict[str, int] = {}
        for profile in self._profile_registry.list_public():
            for svc_name, svc in profile.services.items():
                if svc.idle_stop_after_s > 0:
                    existing = config.get(svc_name, 0)
                    config[svc_name] = min(existing, svc.idle_stop_after_s) if existing > 0 else svc.idle_stop_after_s
        return config

    async def _http_post(self, url: str) -> bool:
        """POST to *url*; return success as a bool, never raising."""
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url)
                return resp.is_success
        except Exception as exc:
            logger.warning("HTTP POST %s failed: %s", url, exc)
            return False

    async def _run_idle_sweep(self) -> None:
        """
        Expire overdue allocations, then stop service instances that have been
        idle past their profile-configured timeout.
        """
        if self._service_registry is None:
            return
        expired = self._service_registry.sweep_expired_allocations()
        if expired:
            logger.info("TTL sweep: expired %d allocation(s): %s", len(expired), expired)
        idle_stop_config = self._build_idle_stop_config()
        if not idle_stop_config:
            return
        timed_out = self._service_registry.idle_past_timeout(idle_stop_config)
        for instance in timed_out:
            node_info = self.get_node_info(instance.node_id)
            if node_info is None:
                # Node vanished from the registry; nothing we can stop.
                continue
            stop_url = f"{node_info.agent_url}/services/{instance.service}/stop"
            logger.info(
                "Idle sweep: stopping %s on %s gpu%s (idle timeout)",
                instance.service, instance.node_id, instance.gpu_id,
            )
            success = await self._http_post(stop_url)
            # Only record the stop when the agent acknowledged it.
            if success:
                self._service_registry.mark_stopped(
                    instance.service, instance.node_id, instance.gpu_id
                )

    async def run_heartbeat_loop(self) -> None:
        """Poll all agents every _HEARTBEAT_INTERVAL_S; idle-sweep every 3rd round."""
        self._running = True
        while self._running:
            await self.poll_all()
            self._heartbeat_tick += 1
            if self._heartbeat_tick % 3 == 0:
                await self._run_idle_sweep()
            await asyncio.sleep(_HEARTBEAT_INTERVAL_S)

    def stop(self) -> None:
        """Ask run_heartbeat_loop() to exit after its current iteration."""
        self._running = False
|
||||
|
|
@ -1,509 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import time
|
||||
import urllib.request
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from fastapi.responses import HTMLResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
||||
from circuitforge_core.resources.coordinator.eviction_engine import EvictionEngine
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.node_selector import select_node
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.profiles.schema import ProcessSpec
|
||||
|
||||
# Read once at import time; a missing dashboard.html fails fast at startup.
_DASHBOARD_HTML = (Path(__file__).parent / "dashboard.html").read_text()
|
||||
|
||||
|
||||
def _get_health_path(profile_registry: ProfileRegistry, service: str) -> str:
    """Look up the health-check path for *service*.

    Scans public profiles in registry order and returns the ``health_path`` of
    the first one that defines the service as a managed process. Falls back to
    "/health" when no profile does.
    """
    specs = (
        profile.services.get(service)
        for profile in profile_registry.list_public()
    )
    for spec in specs:
        if spec and isinstance(spec.managed, ProcessSpec):
            return spec.managed.health_path
    return "/health"
|
||||
|
||||
# Tuning for the health-probe loop that watches instances in "starting" state.
_PROBE_INTERVAL_S = 5.0  # how often to poll starting instances
_PROBE_TIMEOUT_S = 300.0  # give up and mark stopped after this many seconds
|
||||
|
||||
|
||||
async def _run_instance_probe_loop(service_registry: ServiceRegistry) -> None:
|
||||
"""
|
||||
Background loop: transition 'starting' instances to 'running' once their
|
||||
/health endpoint responds, or to 'stopped' after PROBE_TIMEOUT_S.
|
||||
"""
|
||||
import asyncio
|
||||
|
||||
start_times: dict[str, float] = {} # instance key → time first seen as starting
|
||||
|
||||
while True:
|
||||
await asyncio.sleep(_PROBE_INTERVAL_S)
|
||||
now = time.time()
|
||||
for inst in service_registry.all_instances():
|
||||
if inst.state != "starting":
|
||||
start_times.pop(f"{inst.service}:{inst.node_id}:{inst.gpu_id}", None)
|
||||
continue
|
||||
key = f"{inst.service}:{inst.node_id}:{inst.gpu_id}"
|
||||
start_times.setdefault(key, now)
|
||||
|
||||
healthy = False
|
||||
if inst.url:
|
||||
try:
|
||||
with urllib.request.urlopen(
|
||||
inst.url.rstrip("/") + inst.health_path, timeout=2.0
|
||||
) as resp:
|
||||
healthy = resp.status == 200
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if healthy:
|
||||
service_registry.upsert_instance(
|
||||
service=inst.service, node_id=inst.node_id, gpu_id=inst.gpu_id,
|
||||
state="running", model=inst.model, url=inst.url,
|
||||
)
|
||||
start_times.pop(key, None)
|
||||
logger.info("Instance %s/%s gpu=%s transitioned to running", inst.service, inst.node_id, inst.gpu_id)
|
||||
elif now - start_times[key] > _PROBE_TIMEOUT_S:
|
||||
service_registry.upsert_instance(
|
||||
service=inst.service, node_id=inst.node_id, gpu_id=inst.gpu_id,
|
||||
state="stopped", model=inst.model, url=inst.url,
|
||||
)
|
||||
start_times.pop(key, None)
|
||||
logger.warning("Instance %s/%s gpu=%s timed out in starting state — marked stopped", inst.service, inst.node_id, inst.gpu_id)
|
||||
|
||||
|
||||
class LeaseRequest(BaseModel):
    """Body for POST /api/leases — request a VRAM lease on a specific GPU."""

    node_id: str  # target node; must already be registered
    gpu_id: int  # GPU index on that node
    mb: int  # VRAM megabytes requested
    service: str  # service that will hold the lease
    priority: int = 2  # eviction priority (semantics defined by the EvictionEngine — confirm ordering there)
    ttl_s: float = 0.0  # lease lifetime in seconds; presumably 0 means no expiry — verify in LeaseManager
|
||||
|
||||
|
||||
class NodeRegisterRequest(BaseModel):
    """Body for POST /api/nodes — agent self-registration."""

    node_id: str  # stable identifier for the node
    agent_url: str  # e.g. "http://10.1.10.71:7701"
|
||||
|
||||
|
||||
class ServiceEnsureRequest(BaseModel):
    """Body for POST /api/services/{service}/ensure — start a service on a given node."""

    node_id: str  # target node; must already be registered
    gpu_id: int = 0  # GPU index on the node
    params: dict[str, str] = {}  # extra start params forwarded to the agent; "model" is set per candidate
    ttl_s: float = 3600.0  # requested allocation lifetime in seconds
    # Ordered list of model names to try; falls back down the list if VRAM is tight.
    # The "model" key in params is used if this list is empty.
    model_candidates: list[str] = []
|
||||
|
||||
|
||||
class ServiceAllocateRequest(BaseModel):
    """Body for POST /api/services/{service}/allocate — coordinator picks the node."""

    model_candidates: list[str] = []  # ordered preference list; must be non-empty
    gpu_id: int | None = None  # pin to a specific GPU; None lets the coordinator choose
    params: dict[str, str] = {}  # extra start params forwarded to the agent
    ttl_s: float = 3600.0  # allocation lifetime in seconds
    caller: str = ""  # free-form caller identifier recorded with the allocation
|
||||
|
||||
|
||||
def create_coordinator_app(
    lease_manager: LeaseManager,
    profile_registry: ProfileRegistry,
    agent_supervisor: AgentSupervisor,
    service_registry: ServiceRegistry,
) -> FastAPI:
    """Build the cf-orch coordinator FastAPI app.

    Wires the lease manager, profile registry, agent supervisor, and service
    registry into HTTP routes, and runs the heartbeat + instance-probe
    background tasks for the lifetime of the app.
    """
    # Eviction decisions are delegated here; the engine shares our lease manager.
    eviction_engine = EvictionEngine(lease_manager=lease_manager)

    @asynccontextmanager
    async def _lifespan(app: FastAPI):  # type: ignore[type-arg]
        import asyncio

        # Background tasks started on app startup, cancelled on shutdown.
        heartbeat_task = asyncio.create_task(agent_supervisor.run_heartbeat_loop())
        probe_task = asyncio.create_task(_run_instance_probe_loop(service_registry))
        yield
        # Clear the supervisor's loop flag first, then cancel both tasks.
        agent_supervisor.stop()
        heartbeat_task.cancel()
        probe_task.cancel()

    app = FastAPI(title="cf-orch-coordinator", lifespan=_lifespan)

    # Optional Heimdall auth — enabled when HEIMDALL_URL env var is set.
    # Self-hosted coordinators skip this entirely; the CF-hosted public endpoint
    # (orch.circuitforge.tech) sets HEIMDALL_URL to gate paid+ access.
    from circuitforge_core.resources.coordinator.auth import HeimdallAuthMiddleware
    _auth = HeimdallAuthMiddleware.from_env()
    if _auth is not None:
        app.middleware("http")(_auth)

    @app.get("/", response_class=HTMLResponse, include_in_schema=False)
    def dashboard() -> HTMLResponse:
        """Serve the static status dashboard read at module import."""
        return HTMLResponse(content=_DASHBOARD_HTML)

    @app.get("/api/health")
    def health() -> dict[str, Any]:
        """Liveness probe; always unauthenticated."""
        return {"status": "ok"}

    @app.get("/api/nodes")
    def get_nodes() -> dict[str, Any]:
        """List all known nodes (online or not) with their GPU inventory."""
        nodes = agent_supervisor.all_nodes()
        return {
            "nodes": [
                {
                    "node_id": n.node_id,
                    "agent_url": n.agent_url,
                    "last_heartbeat": n.last_heartbeat,
                    "gpus": [
                        {
                            "gpu_id": g.gpu_id,
                            "name": g.name,
                            "vram_total_mb": g.vram_total_mb,
                            "vram_used_mb": g.vram_used_mb,
                            "vram_free_mb": g.vram_free_mb,
                        }
                        for g in n.gpus
                    ],
                }
                for n in nodes
            ]
        }

    @app.post("/api/nodes")
    async def register_node(req: NodeRegisterRequest) -> dict[str, Any]:
        """Agents call this to self-register. Coordinator immediately polls for GPU info."""
        agent_supervisor.register(req.node_id, req.agent_url)
        await agent_supervisor.poll_agent(req.node_id)
        return {"registered": True, "node_id": req.node_id}

    @app.get("/api/profiles")
    def get_profiles() -> dict[str, Any]:
        """List the public GPU profiles the coordinator knows about."""
        return {
            "profiles": [
                {"name": p.name, "vram_total_mb": p.vram_total_mb}
                for p in profile_registry.list_public()
            ]
        }

    @app.get("/api/resident")
    def get_residents() -> dict[str, Any]:
        """List services currently resident (loaded) on agent GPUs."""
        return {
            "residents": [
                {
                    "service": r.service,
                    "node_id": r.node_id,
                    "model_name": r.model_name,
                    "first_seen": r.first_seen,
                }
                for r in lease_manager.all_residents()
            ]
        }

    @app.get("/api/leases")
    def get_leases() -> dict[str, Any]:
        """List all active VRAM leases."""
        return {
            "leases": [
                {
                    "lease_id": lease.lease_id,
                    "node_id": lease.node_id,
                    "gpu_id": lease.gpu_id,
                    "mb_granted": lease.mb_granted,
                    "holder_service": lease.holder_service,
                    "priority": lease.priority,
                    "expires_at": lease.expires_at,
                }
                for lease in lease_manager.all_leases()
            ]
        }

    @app.post("/api/leases")
    async def request_lease(req: LeaseRequest) -> dict[str, Any]:
        """Request a VRAM lease; the eviction engine may evict lower-priority holders."""
        node_info = agent_supervisor.get_node_info(req.node_id)
        if node_info is None:
            raise HTTPException(
                status_code=422,
                detail=f"Unknown node_id {req.node_id!r} — node not registered",
            )
        agent_url = node_info.agent_url

        lease = await eviction_engine.request_lease(
            node_id=req.node_id,
            gpu_id=req.gpu_id,
            mb=req.mb,
            service=req.service,
            priority=req.priority,
            agent_url=agent_url,
            ttl_s=req.ttl_s,
        )
        # None means even eviction could not free enough VRAM.
        if lease is None:
            raise HTTPException(
                status_code=503,
                detail="Insufficient VRAM — no eviction candidates available",
            )
        return {
            "lease": {
                "lease_id": lease.lease_id,
                "node_id": lease.node_id,
                "gpu_id": lease.gpu_id,
                "mb_granted": lease.mb_granted,
                "holder_service": lease.holder_service,
                "priority": lease.priority,
                "expires_at": lease.expires_at,
            }
        }

    @app.delete("/api/leases/{lease_id}")
    async def release_lease(lease_id: str) -> dict[str, Any]:
        """Release a VRAM lease by id."""
        released = await lease_manager.release(lease_id)
        if not released:
            raise HTTPException(status_code=404, detail=f"Lease {lease_id!r} not found")
        return {"released": True, "lease_id": lease_id}

    @app.post("/api/services/{service}/ensure")
    async def ensure_service(service: str, req: ServiceEnsureRequest) -> dict[str, Any]:
        """
        Ensure a managed service is running on the given node.

        If model_candidates is provided, tries each model in order, skipping any
        that exceed the live free VRAM on the target GPU. Falls back down the list
        until one succeeds. The selected model is returned in the response.
        """
        import httpx

        node_info = agent_supervisor.get_node_info(req.node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Unknown node_id {req.node_id!r}")

        # Resolve candidate list — fall back to params["model"] if not specified.
        candidates: list[str] = req.model_candidates or (
            [req.params["model"]] if "model" in req.params else []
        )
        if not candidates:
            raise HTTPException(422, detail="No model specified: set params.model or model_candidates")

        # Live free VRAM on the target GPU (used for pre-flight filtering).
        gpu = next((g for g in node_info.gpus if g.gpu_id == req.gpu_id), None)
        free_mb = gpu.vram_free_mb if gpu else 0

        # Profile max_mb for the service gives us the VRAM ceiling for this slot;
        # the first profile that defines the service wins.
        service_max_mb = 0
        for p in profile_registry.list_public():
            svc = p.services.get(service)
            if svc:
                service_max_mb = svc.max_mb
                break

        # Pre-flight VRAM gate: require free VRAM >= the service's profile ceiling
        # so the model can load without competing for VRAM with other processes.
        # (Filtering is per-slot, not per-candidate: one check for the service.)
        if service_max_mb > 0 and free_mb < service_max_mb:
            raise HTTPException(
                503,
                detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need {service_max_mb}MB",
            )

        last_error: str = ""
        async with httpx.AsyncClient(timeout=120.0) as client:
            # Try candidates in order; remember the last agent error for the 503.
            for model in candidates:
                params_with_model = {**req.params, "model": model}
                try:
                    start_resp = await client.post(
                        f"{node_info.agent_url}/services/{service}/start",
                        json={"gpu_id": req.gpu_id, "params": params_with_model},
                    )
                    if start_resp.is_success:
                        data = start_resp.json()
                        return {
                            "service": service,
                            "node_id": req.node_id,
                            "gpu_id": req.gpu_id,
                            "model": model,
                            "url": data.get("url"),
                            "running": data.get("running", False),
                        }
                    last_error = start_resp.text
                except httpx.HTTPError as exc:
                    # Transport-level failure aborts the whole attempt (502),
                    # unlike an agent-side error which falls through to the next model.
                    raise HTTPException(502, detail=f"Agent unreachable: {exc}")

        raise HTTPException(
            503,
            detail=f"All model candidates exhausted for {service!r}. Last error: {last_error}",
        )

    @app.post("/api/services/{service}/allocate")
    async def allocate_service(service: str, req: ServiceAllocateRequest) -> dict[str, Any]:
        """
        Allocate a managed service — coordinator picks the best node automatically.
        Returns a URL + allocation_id. (Allocation not tracked server-side until Phase 2.)
        """
        import httpx

        if not req.model_candidates:
            raise HTTPException(422, detail="model_candidates must be non-empty")

        # Validate service is known in at least one profile, regardless of gpu_id
        if not any(service in p.services for p in profile_registry.list_public()):
            raise HTTPException(422, detail=f"Unknown service {service!r} — not in any profile")

        residents = lease_manager.resident_keys()

        if req.gpu_id is None:
            # Automatic placement across online nodes.
            online = agent_supervisor.online_agents()
            placement = select_node(online, service, profile_registry, residents)
            if placement is None:
                raise HTTPException(
                    503,
                    detail=f"No online node has capacity for service {service!r}",
                )
            node_id, gpu_id = placement
        else:
            # Caller pinned a GPU — find the first online node exposing it.
            online = agent_supervisor.online_agents()
            node_id = next(
                (nid for nid, rec in online.items()
                 if any(g.gpu_id == req.gpu_id for g in rec.gpus)),
                None,
            )
            if node_id is None:
                raise HTTPException(422, detail=f"No online node has gpu_id={req.gpu_id}")
            gpu_id = req.gpu_id

        node_info = agent_supervisor.get_node_info(node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Node {node_id!r} not found")

        # "Warm" = the service is already resident on this node.
        warm = f"{node_id}:{service}" in residents

        async with httpx.AsyncClient(timeout=120.0) as client:
            last_error = ""
            for model in req.model_candidates:
                try:
                    resp = await client.post(
                        f"{node_info.agent_url}/services/{service}/start",
                        json={"gpu_id": gpu_id, "params": {**req.params, "model": model}},
                    )
                    if resp.is_success:
                        data = resp.json()
                        svc_url = data.get("url", "")
                        alloc = service_registry.allocate(
                            service=service,
                            node_id=node_id,
                            gpu_id=gpu_id,
                            model=model,
                            caller=req.caller,
                            url=svc_url,
                            ttl_s=req.ttl_s,
                        )
                        # Seed the instance state for first-time starts.
                        # adopted=True means the agent found it already running.
                        adopted = data.get("adopted", False)
                        instance_state = "running" if (warm or adopted) else "starting"
                        health_path = _get_health_path(profile_registry, service)
                        service_registry.upsert_instance(
                            service=service,
                            node_id=node_id,
                            gpu_id=gpu_id,
                            state=instance_state,
                            model=model,
                            url=svc_url,
                            health_path=health_path,
                        )
                        return {
                            "allocation_id": alloc.allocation_id,
                            "service": service,
                            "node_id": node_id,
                            "gpu_id": gpu_id,
                            "model": model,
                            "url": data.get("url"),
                            "started": not warm,
                            "warm": warm,
                        }
                    last_error = resp.text
                except httpx.HTTPError as exc:
                    raise HTTPException(502, detail=f"Agent unreachable: {exc}")

        raise HTTPException(
            503,
            detail=f"All model candidates exhausted for {service!r}. Last error: {last_error}",
        )

    @app.delete("/api/services/{service}/allocations/{allocation_id}")
    async def release_allocation(service: str, allocation_id: str) -> dict[str, Any]:
        """Release one service allocation; 404 unless it exists under *service*."""
        existing = service_registry.get_allocation(allocation_id)
        if existing is None or existing.service != service:
            raise HTTPException(404, detail=f"Allocation {allocation_id!r} not found for service {service!r}")
        released = service_registry.release(allocation_id)
        if not released:
            raise HTTPException(404, detail=f"Allocation {allocation_id!r} not found")
        return {"released": True, "allocation_id": allocation_id}

    @app.get("/api/services/{service}/status")
    def get_service_status(service: str) -> dict[str, Any]:
        """Report all instances and allocations for one service."""
        instances = [i for i in service_registry.all_instances() if i.service == service]
        allocations = [a for a in service_registry.all_allocations() if a.service == service]
        return {
            "service": service,
            "instances": [
                {
                    "node_id": i.node_id,
                    "gpu_id": i.gpu_id,
                    "state": i.state,
                    "model": i.model,
                    "url": i.url,
                    "idle_since": i.idle_since,
                }
                for i in instances
            ],
            "allocations": [
                {
                    "allocation_id": a.allocation_id,
                    "node_id": a.node_id,
                    "gpu_id": a.gpu_id,
                    "model": a.model,
                    "caller": a.caller,
                    "url": a.url,
                    "expires_at": a.expires_at,
                }
                for a in allocations
            ],
        }

    @app.get("/api/services")
    def list_services() -> dict[str, Any]:
        """List every known service instance across all nodes."""
        instances = service_registry.all_instances()
        return {
            "services": [
                {
                    "service": i.service,
                    "node_id": i.node_id,
                    "gpu_id": i.gpu_id,
                    "state": i.state,
                    "model": i.model,
                    "url": i.url,
                }
                for i in instances
            ]
        }

    @app.delete("/api/services/{service}")
    async def stop_service(service: str, node_id: str) -> dict[str, Any]:
        """Stop a managed service on the given node."""
        node_info = agent_supervisor.get_node_info(node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Unknown node_id {node_id!r}")

        import httpx
        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                resp = await client.post(f"{node_info.agent_url}/services/{service}/stop")
                resp.raise_for_status()
                return {"service": service, "node_id": node_id, "stopped": resp.json().get("stopped", False)}
            except httpx.HTTPError as exc:
                raise HTTPException(502, detail=f"Agent unreachable: {exc}")

    return app
|
||||
|
|
@ -1,197 +0,0 @@
|
|||
"""
|
||||
cf-orch coordinator auth middleware.
|
||||
|
||||
When HEIMDALL_URL is set, all /api/* requests (except /api/health) must carry:
|
||||
Authorization: Bearer <CF license key>
|
||||
|
||||
The key is validated against Heimdall and the result cached for
|
||||
CACHE_TTL_S seconds (default 300 / 5 min). This keeps Heimdall out of the
|
||||
per-allocation hot path while keeping revocation latency bounded.
|
||||
|
||||
When HEIMDALL_URL is not set, auth is disabled — self-hosted deployments work
|
||||
with no configuration change.
|
||||
|
||||
Environment variables
|
||||
---------------------
|
||||
HEIMDALL_URL Heimdall base URL, e.g. https://license.circuitforge.tech
|
||||
When absent, auth is skipped entirely.
|
||||
HEIMDALL_MIN_TIER Minimum tier required (default: "paid").
|
||||
Accepted values: free, paid, premium, ultra.
|
||||
CF_ORCH_AUTH_SECRET Shared secret sent to Heimdall so it can distinguish
|
||||
coordinator service calls from end-user requests.
|
||||
Must match the COORDINATOR_SECRET env var on Heimdall.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from threading import Lock
|
||||
|
||||
import httpx
|
||||
from fastapi import Request
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Unauthenticated paths — health check must always be accessible for monitoring.
|
||||
_EXEMPT_PATHS: frozenset[str] = frozenset({"/api/health", "/", "/openapi.json", "/docs", "/redoc"})
|
||||
|
||||
_TIER_ORDER: dict[str, int] = {"free": 0, "paid": 1, "premium": 2, "ultra": 3}
|
||||
|
||||
CACHE_TTL_S: float = 300.0 # 5 minutes — matches Kiwi cloud session TTL
|
||||
|
||||
|
||||
@dataclass
class _CacheEntry:
    """Cached Heimdall validation result for a single license key."""

    # Whether Heimdall accepted the key at validation time.
    valid: bool
    # Tier string returned by Heimdall (e.g. "paid"); empty when validation failed.
    tier: str
    # User id returned by Heimdall for this key; empty when validation failed.
    user_id: str
    # time.monotonic() deadline after which this entry is considered stale.
    expires_at: float
|
||||
|
||||
|
||||
class _ValidationCache:
    """Thread-safe TTL cache for Heimdall validation results."""

    def __init__(self, ttl_s: float = CACHE_TTL_S) -> None:
        self._ttl = ttl_s
        self._store: dict[str, _CacheEntry] = {}
        self._lock = Lock()

    def get(self, key: str) -> _CacheEntry | None:
        """Return the live entry for *key*, or None when absent or expired."""
        with self._lock:
            entry = self._store.get(key)
            if entry is not None and time.monotonic() <= entry.expires_at:
                return entry
            return None

    def set(self, key: str, valid: bool, tier: str, user_id: str) -> None:
        """Store a validation result stamped with a fresh expiry deadline."""
        entry = _CacheEntry(
            valid=valid,
            tier=tier,
            user_id=user_id,
            expires_at=time.monotonic() + self._ttl,
        )
        with self._lock:
            self._store[key] = entry

    def evict(self, key: str) -> None:
        """Drop *key* from the cache; a no-op when it is not cached."""
        with self._lock:
            self._store.pop(key, None)

    def prune(self) -> int:
        """Remove expired entries. Returns count removed."""
        now = time.monotonic()
        with self._lock:
            stale = [key for key, entry in self._store.items() if now > entry.expires_at]
            for key in stale:
                del self._store[key]
            return len(stale)
|
||||
|
||||
|
||||
class HeimdallAuthMiddleware:
    """
    ASGI middleware that validates CF license keys against Heimdall.

    Attach to a FastAPI app via app.middleware("http"):

        middleware = HeimdallAuthMiddleware.from_env()
        if middleware:
            app.middleware("http")(middleware)
    """

    def __init__(
        self,
        heimdall_url: str,
        min_tier: str = "paid",
        auth_secret: str = "",
        cache_ttl_s: float = CACHE_TTL_S,
    ) -> None:
        """
        Args:
            heimdall_url: Heimdall base URL; trailing slashes are stripped.
            min_tier: Minimum tier name required; unknown names rank as "paid".
            auth_secret: Shared coordinator secret forwarded to Heimdall.
            cache_ttl_s: Lifetime of cached validation results.
        """
        self._heimdall = heimdall_url.rstrip("/")
        # Unknown tier names fall back to rank 1 ("paid") rather than erroring.
        self._min_tier_rank = _TIER_ORDER.get(min_tier, 1)
        self._min_tier = min_tier
        self._auth_secret = auth_secret
        self._cache = _ValidationCache(ttl_s=cache_ttl_s)
        logger.info(
            "[cf-orch auth] Heimdall auth enabled — url=%s min_tier=%s ttl=%ss",
            self._heimdall, min_tier, cache_ttl_s,
        )

    @classmethod
    def from_env(cls) -> "HeimdallAuthMiddleware | None":
        """Return a configured middleware instance, or None if HEIMDALL_URL is not set."""
        url = os.environ.get("HEIMDALL_URL", "")
        if not url:
            logger.info("[cf-orch auth] HEIMDALL_URL not set — auth disabled (self-hosted mode)")
            return None
        return cls(
            heimdall_url=url,
            min_tier=os.environ.get("HEIMDALL_MIN_TIER", "paid"),
            auth_secret=os.environ.get("CF_ORCH_AUTH_SECRET", ""),
        )

    def _validate_against_heimdall(self, license_key: str) -> tuple[bool, str, str]:
        """
        Call Heimdall's /licenses/verify endpoint.

        Returns (valid, tier, user_id).
        On any network or parse error, returns (False, "", "") — fail closed.

        Blocking: performs a synchronous HTTP call with a 5 s timeout; callers
        on the event loop must run this in a worker thread (see __call__).
        """
        try:
            headers: dict[str, str] = {"Content-Type": "application/json"}
            if self._auth_secret:
                # Lets Heimdall distinguish coordinator calls from end users.
                headers["X-Coordinator-Secret"] = self._auth_secret
            resp = httpx.post(
                f"{self._heimdall}/licenses/verify",
                json={"key": license_key, "min_tier": self._min_tier},
                headers=headers,
                timeout=5.0,
            )
            if resp.status_code == 200:
                data = resp.json()
                return data.get("valid", False), data.get("tier", ""), data.get("user_id", "")
            # 401/403 from Heimdall = key invalid/insufficient tier
            logger.debug("[cf-orch auth] Heimdall returned %s for key ...%s", resp.status_code, license_key[-6:])
            return False, "", ""
        except Exception as exc:
            logger.warning("[cf-orch auth] Heimdall unreachable — failing closed: %s", exc)
            return False, "", ""

    def _tier_denial(self, valid: bool, tier: str) -> tuple[bool, str]:
        """Map a (valid, tier) pair to the (authorized, reason_if_denied) contract."""
        if not valid:
            return False, "license key invalid or expired"
        if _TIER_ORDER.get(tier, -1) < self._min_tier_rank:
            return False, f"feature requires {self._min_tier} tier (have: {tier})"
        return True, ""

    def _check_key(self, license_key: str) -> tuple[bool, str]:
        """
        Validate key (cache-first). Returns (authorized, reason_if_denied).

        Negative results are cached too, so a bad key does not hammer Heimdall.
        Blocking on cache miss — see _validate_against_heimdall.
        """
        cached = self._cache.get(license_key)
        if cached is not None:
            return self._tier_denial(cached.valid, cached.tier)

        valid, tier, user_id = self._validate_against_heimdall(license_key)
        self._cache.set(license_key, valid=valid, tier=tier, user_id=user_id)
        return self._tier_denial(valid, tier)

    async def __call__(self, request: Request, call_next):  # type: ignore[no-untyped-def]
        """Enforce Bearer license auth on all non-exempt paths."""
        import asyncio

        if request.url.path in _EXEMPT_PATHS:
            return await call_next(request)

        auth_header = request.headers.get("Authorization", "")
        if not auth_header.startswith("Bearer "):
            return JSONResponse(
                status_code=401,
                content={"detail": "Authorization: Bearer <license_key> required"},
            )

        license_key = auth_header.removeprefix("Bearer ").strip()
        # BUGFIX: _check_key does a synchronous HTTP call (up to 5 s) on cache
        # miss; running it directly here stalled the entire event loop. Offload
        # to a worker thread so other requests keep flowing.
        authorized, reason = await asyncio.to_thread(self._check_key, license_key)
        if not authorized:
            return JSONResponse(status_code=403, content={"detail": reason})

        return await call_next(request)
|
||||
|
|
@ -1,473 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>cf-orch · dashboard</title>
|
||||
<style>
|
||||
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
|
||||
|
||||
:root {
|
||||
--bg: #0d1117;
|
||||
--bg2: #161b22;
|
||||
--bg3: #1c2129;
|
||||
--border: #30363d;
|
||||
--border-dim: #21262d;
|
||||
--text: #e6edf3;
|
||||
--muted: #8b949e;
|
||||
--dim: #4d5763;
|
||||
--indigo: #818cf8;
|
||||
--cyan: #22d3ee;
|
||||
--green: #4ade80;
|
||||
--amber: #fbbf24;
|
||||
--red: #f85149;
|
||||
--orange: #fb923c;
|
||||
--radius: 6px;
|
||||
--radius-sm: 3px;
|
||||
--font: 'JetBrains Mono', 'Fira Code', ui-monospace, monospace;
|
||||
}
|
||||
|
||||
body { background: var(--bg); color: var(--text); font-family: var(--font); font-size: 13px; line-height: 1.5; padding: 1rem; }
|
||||
|
||||
/* header */
|
||||
header { display: flex; align-items: center; gap: 1rem; margin-bottom: 1rem; padding-bottom: 0.75rem; border-bottom: 1px solid var(--border); }
|
||||
.logo { color: var(--indigo); font-size: 1.1em; font-weight: 700; }
|
||||
#refresh-badge { margin-left: auto; font-size: 0.75em; color: var(--dim); }
|
||||
#refresh-badge span { color: var(--green); }
|
||||
|
||||
/* section labels */
|
||||
.section-label { font-size: 0.72em; font-weight: 600; text-transform: uppercase; letter-spacing: 0.07em; color: var(--dim); margin-bottom: 0.5rem; }
|
||||
|
||||
/* health strip */
|
||||
#health-strip { display: flex; flex-wrap: wrap; gap: 0.4rem; margin-bottom: 1rem; padding: 0.6rem 0.75rem; background: var(--bg2); border: 1px solid var(--border); border-radius: var(--radius); min-height: 36px; }
|
||||
.pill { display: inline-flex; align-items: center; gap: 0.3rem; padding: 2px 10px; border-radius: 99px; font-size: 0.8em; font-weight: 600; }
|
||||
.pill.ok { background: rgba(74,222,128,.12); color: var(--green); }
|
||||
.pill.err { background: rgba(248,81,73,.12); color: var(--red); }
|
||||
.pill.off { background: rgba(139,148,158,.1); color: var(--dim); }
|
||||
|
||||
/* GPU grid */
|
||||
#gpu-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 0.6rem; margin-bottom: 1rem; }
|
||||
.gpu-card { background: var(--bg3); border: 1px solid var(--border); border-radius: var(--radius); padding: 0.7rem 0.8rem; }
|
||||
.gpu-card.offline { border-color: #7c2d12; opacity: 0.7; }
|
||||
.gpu-node { font-size: 0.75em; font-weight: 700; color: var(--indigo); margin-bottom: 1px; }
|
||||
.gpu-offline .gpu-node { color: var(--orange); }
|
||||
.gpu-name { font-size: 0.78em; color: var(--text); margin-bottom: 0.4rem; }
|
||||
.vram-track { position: relative; background: var(--bg); border-radius: var(--radius-sm); height: 6px; margin-bottom: 0.3rem; overflow: hidden; }
|
||||
.vram-leased { position: absolute; left: 0; top: 0; height: 100%; background: var(--cyan); transition: width 0.4s; }
|
||||
.vram-resident { position: absolute; top: 0; height: 100%; background: var(--amber); transition: left 0.4s, width 0.4s; }
|
||||
.vram-label { font-size: 0.72em; color: var(--muted); margin-bottom: 0.25rem; }
|
||||
.gpu-status { font-size: 0.72em; }
|
||||
.gpu-status.idle { color: var(--green); }
|
||||
.gpu-status.busy { color: var(--amber); }
|
||||
.gpu-status.full { color: var(--red); }
|
||||
.gpu-status.offline { color: var(--orange); }
|
||||
.spark-track { height: 24px; background: var(--bg); border-radius: var(--radius-sm); margin-top: 0.4rem; overflow: hidden; }
|
||||
|
||||
/* shared table base */
|
||||
.cf-table { width: 100%; border-collapse: collapse; background: var(--bg2); border: 1px solid var(--border); border-radius: var(--radius); overflow: hidden; margin-bottom: 1rem; }
|
||||
.cf-table th { background: var(--bg3); color: var(--dim); font-size: 0.72em; font-weight: 600; text-transform: uppercase; letter-spacing: 0.05em; padding: 0.4rem 0.6rem; text-align: left; border-bottom: 1px solid var(--border); }
|
||||
.cf-table td { padding: 0.35rem 0.6rem; border-bottom: 1px solid var(--border-dim); font-size: 0.8em; vertical-align: middle; }
|
||||
.cf-table tr:last-child td { border-bottom: none; }
|
||||
.td-service { color: var(--indigo); font-weight: 600; }
|
||||
.td-node { color: var(--muted); }
|
||||
.td-mb { color: var(--text); }
|
||||
.td-priority { color: var(--amber); }
|
||||
.td-model { color: var(--cyan); font-size: 0.75em; }
|
||||
.td-warm { color: var(--amber); }
|
||||
.td-none { color: var(--dim); font-style: italic; }
|
||||
.ttl-wrap { display: flex; align-items: center; gap: 0.5rem; }
|
||||
.ttl-label { color: var(--cyan); font-variant-numeric: tabular-nums; white-space: nowrap; }
|
||||
.ttl-track { flex: 1; background: var(--bg); border-radius: var(--radius-sm); height: 4px; }
|
||||
.ttl-fill { height: 100%; border-radius: var(--radius-sm); background: var(--cyan); transition: width 0.4s; }
|
||||
|
||||
/* service state classes */
|
||||
.state-running { color: #2ecc40; }
|
||||
.state-idle { color: #ff851b; }
|
||||
.state-stopped { color: #aaa; }
|
||||
.state-starting { color: #0074d9; }
|
||||
.state-unknown { color: #ff4136; }
|
||||
|
||||
/* error */
|
||||
#error-banner { display: none; background: rgba(248,81,73,.1); border: 1px solid var(--red); border-radius: var(--radius); color: var(--red); padding: 0.5rem 0.75rem; font-size: 0.82em; margin-bottom: 1rem; }
|
||||
|
||||
/* footer */
|
||||
footer { border-top: 1px solid var(--border); padding-top: 0.5rem; color: var(--dim); font-size: 0.72em; display: flex; gap: 1.5rem; }
|
||||
footer a { color: var(--indigo); text-decoration: none; }
|
||||
footer a:hover { text-decoration: underline; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<header>
|
||||
<span class="logo">cf-orch</span>
|
||||
<span id="cluster-label" style="color:var(--muted)">coordinator</span>
|
||||
<div id="refresh-badge">auto-refresh <span id="countdown">5</span>s</div>
|
||||
</header>
|
||||
|
||||
<div id="error-banner"></div>
|
||||
|
||||
<div class="section-label">Services</div>
|
||||
<div id="health-strip"></div>
|
||||
|
||||
<div class="section-label">GPU Nodes</div>
|
||||
<div id="gpu-grid"></div>
|
||||
|
||||
<div id="services-section">
|
||||
<div class="section-label">Service Instances</div>
|
||||
<table class="cf-table" id="services-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Service</th><th>Node</th><th>GPU</th><th>State</th><th>Model</th><th>URL</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="services-body"></tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<div class="section-label">Active Leases</div>
|
||||
<table class="cf-table" id="leases-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Service</th><th>Node / GPU</th><th>VRAM</th><th>Priority</th><th>TTL / Expires</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="leases-body"></tbody>
|
||||
</table>
|
||||
|
||||
<div class="section-label">Warm Models</div>
|
||||
<table class="cf-table" id="resident-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Service</th><th>Node</th><th>Model</th><th>Warm Since</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="resident-body"></tbody>
|
||||
</table>
|
||||
|
||||
<footer>
|
||||
<span>cf-orch · circuitforge-core</span>
|
||||
<a href="/api/nodes" target="_blank">/api/nodes</a>
|
||||
<a href="/api/leases" target="_blank">/api/leases</a>
|
||||
<a href="/api/resident" target="_blank">/api/resident</a>
|
||||
<a href="/api/services" target="_blank">/api/services</a>
|
||||
<a href="/api/health" target="_blank">/api/health</a>
|
||||
</footer>
|
||||
|
||||
<script>
|
||||
"use strict";
|
||||
|
||||
// ── helpers ──────────────────────────────────────────────────────
|
||||
|
||||
/** Create an element; opts may carry cls (space-separated), text, style, attr. */
function el(tag, opts) {
  const node = document.createElement(tag);
  const o = opts || {};
  if (o.cls) {
    for (const c of o.cls.split(' ')) {
      if (c) node.classList.add(c);
    }
  }
  if (o.text != null) node.textContent = o.text;
  if (o.style) Object.assign(node.style, o.style);
  if (o.attr) {
    for (const [name, value] of Object.entries(o.attr)) {
      node.setAttribute(name, value);
    }
  }
  return node;
}
|
||||
|
||||
/** Append each truthy child to parent; falsy entries are skipped. Returns parent. */
function append(parent, ...children) {
  for (const child of children) {
    if (child) parent.appendChild(child);
  }
  return parent;
}
|
||||
|
||||
/** Remove every existing child of parent, then append the given children. */
function setChildren(parent, ...children) {
  for (const stale of Array.from(parent.childNodes)) {
    parent.removeChild(stale);
  }
  append(parent, ...children);
}
|
||||
|
||||
/** Build a sparkline SVG element (no innerHTML). */
function buildSparkline(history, totalMb) {
  const ns = 'http://www.w3.org/2000/svg';
  const svg = document.createElementNS(ns, 'svg');
  svg.setAttribute('width', '100%');
  svg.setAttribute('height', '16');
  svg.setAttribute('viewBox', '0 0 100 16');

  // Fewer than two samples: render a flat baseline instead of a polyline.
  if (!history || history.length < 2) {
    const line = document.createElementNS(ns, 'line');
    line.setAttribute('x1', '0'); line.setAttribute('y1', '14');
    line.setAttribute('x2', '100'); line.setAttribute('y2', '14');
    line.setAttribute('stroke', '#30363d'); line.setAttribute('stroke-width', '1');
    svg.appendChild(line);
    return svg;
  }

  // Scale x across the 100-unit viewBox; y is mapped into a 12-unit band
  // with y=14 as the zero baseline. Math.max(totalMb, 1) avoids divide-by-zero.
  const max = Math.max(totalMb, 1);
  const pts = history.map((v, i) => {
    const x = (i / (history.length - 1)) * 100;
    const y = 14 - ((v / max) * 12);
    return x.toFixed(1) + ',' + y.toFixed(1);
  }).join(' ');

  const poly = document.createElementNS(ns, 'polyline');
  poly.setAttribute('points', pts);
  poly.setAttribute('fill', 'none');
  poly.setAttribute('stroke', '#818cf8');
  poly.setAttribute('stroke-width', '1.5');
  poly.setAttribute('stroke-linejoin', 'round');
  svg.appendChild(poly);
  return svg;
}
|
||||
|
||||
/** VRAM fill colour for a utilisation fraction: red ≥ 0.9, amber ≥ 0.7, else cyan. */
function vramColor(pct) {
  return pct >= 0.9 ? '#f85149'
       : pct >= 0.7 ? '#fbbf24'
       : '#22d3ee';
}
|
||||
|
||||
// ── sparkline history ────────────────────────────────────────────
|
||||
// keyed "nodeId:gpuId" → array of vram_used_mb, max 20 samples
|
||||
const sparkHistory = {};
|
||||
|
||||
// ── countdown ────────────────────────────────────────────────────
|
||||
let countdown = 5;
|
||||
setInterval(() => {
|
||||
countdown = countdown <= 1 ? 5 : countdown - 1;
|
||||
document.getElementById('countdown').textContent = countdown;
|
||||
}, 1000);
|
||||
|
||||
// ── state class helper ───────────────────────────────────────────
/** CSS class for a service lifecycle state; anything unrecognised maps to state-unknown. */
function stateClass(state) {
  const map = { running: 'state-running', idle: 'state-idle', stopped: 'state-stopped', starting: 'state-starting' };
  // BUGFIX: a bare map[state] lookup also hits keys inherited from
  // Object.prototype ('constructor', 'toString', ...), returning a function
  // instead of a class name for such states. Guard with hasOwnProperty.
  return Object.prototype.hasOwnProperty.call(map, state) ? map[state] : 'state-unknown';
}
|
||||
|
||||
// ── render: services table ───────────────────────────────────────
/** Rebuild the service-instances table body from /api/services data. */
function renderServices(services) {
  const tbody = document.getElementById('services-body');
  // Missing or empty list → single full-width placeholder row.
  if (!services || services.length === 0) {
    const tr = document.createElement('tr');
    const td = el('td', { cls: 'td-none', text: 'No service instances registered.' });
    td.setAttribute('colspan', '6');
    tr.appendChild(td);
    setChildren(tbody, tr);
    return;
  }

  const rows = services.map(svc => {
    const tr = document.createElement('tr');
    // One <td> per column, in table-header order; '—' (U+2014) marks absent values.
    const fields = [
      { text: svc.service, cls: 'td-service' },
      { text: svc.node_id, cls: 'td-node' },
      { text: String(svc.gpu_id), cls: 'td-mb' },
      { text: svc.state, cls: stateClass(svc.state) },
      { text: svc.model || '\u2014', cls: 'td-model' },
      { text: svc.url || '\u2014', cls: 'td-node' },
    ];
    fields.forEach(f => tr.appendChild(el('td', { cls: f.cls, text: f.text })));
    return tr;
  });

  setChildren(tbody, ...rows);
}
|
||||
|
||||
// ── render: health strip ─────────────────────────────────────────
/** Show a single coordinator pill: green ● when reachable, red ✕ otherwise. */
function renderHealth(ok) {
  const strip = document.getElementById('health-strip');
  const pill = el('span', { cls: 'pill ' + (ok ? 'ok' : 'err'), text: (ok ? '● ' : '✕ ') + 'coordinator' });
  setChildren(strip, pill);
}
|
||||
|
||||
// ── render: GPU grid ─────────────────────────────────────────────
// leasedByGpu: "nodeId:gpuId" → total MB currently leased (from active leases)
/** Rebuild the GPU card grid: stacked VRAM bar, breakdown label, status, sparkline. */
// NOTE(review): CSS defines .gpu-card.offline / .gpu-offline styling but no
// card is ever given those classes here — confirm whether offline rendering
// was meant to be wired up.
function renderNodes(nodes, leasedByGpu) {
  const grid = document.getElementById('gpu-grid');
  if (!nodes || nodes.length === 0) {
    setChildren(grid, el('div', { text: 'No nodes registered.', style: { color: 'var(--dim)', fontSize: '0.8em', padding: '0.5rem' } }));
    return;
  }

  const cards = [];
  for (const node of nodes) {
    for (const gpu of node.gpus) {
      const key = node.node_id + ':' + gpu.gpu_id;
      const total = gpu.vram_total_mb || 1; // guard divide-by-zero below
      const used = gpu.vram_used_mb;
      const leased = leasedByGpu[key] || 0;
      // Resident = nvidia-smi used minus actively leased; clamped to [0, used].
      const resident = Math.max(0, Math.min(used - leased, used));
      const pct = used / total;

      // Rolling per-GPU usage history feeds the sparkline (max 20 samples).
      if (!sparkHistory[key]) sparkHistory[key] = [];
      sparkHistory[key].push(used);
      if (sparkHistory[key].length > 20) sparkHistory[key].shift();

      const statusCls = pct >= 0.9 ? 'full' : pct >= 0.1 ? 'busy' : 'idle';
      const statusText = pct >= 0.9 ? 'saturated' : pct >= 0.1 ? Math.round(pct * 100) + '% used' : 'idle';

      const card = el('div', { cls: 'gpu-card' });
      const nodeLabel = el('div', { cls: 'gpu-node', text: node.node_id.toUpperCase() + ' · GPU ' + gpu.gpu_id });
      const nameLine = el('div', { cls: 'gpu-name', text: gpu.name || 'Unknown GPU' });

      // Stacked bar: cyan (leased) → amber (resident) → dark bg (free).
      const leasedPct = (leased / total * 100).toFixed(1);
      const residentPct = (resident / total * 100).toFixed(1);
      const track = el('div', { cls: 'vram-track' });
      const fillLeased = el('div', { cls: 'vram-leased', style: { width: leasedPct + '%' } });
      const fillResident = el('div', { cls: 'vram-resident', style: { left: leasedPct + '%', width: residentPct + '%' } });
      append(track, fillLeased, fillResident);

      // Breakdown label when something is allocated.
      let labelText = (used / 1024).toFixed(1) + ' / ' + (total / 1024).toFixed(1) + ' GB';
      if (leased > 0 || resident > 0) {
        const parts = [];
        if (leased > 0) parts.push((leased / 1024).toFixed(1) + 'G leased');
        if (resident > 0) parts.push((resident / 1024).toFixed(1) + 'G resident');
        labelText += ' (' + parts.join(' · ') + ')';
      }

      const vramLbl = el('div', { cls: 'vram-label', text: labelText });
      const statusEl = el('div', { cls: 'gpu-status ' + statusCls, text: statusText });
      const sparkTrack = el('div', { cls: 'spark-track' });
      sparkTrack.appendChild(buildSparkline(sparkHistory[key], total));

      append(card, nodeLabel, nameLine, track, vramLbl, statusEl, sparkTrack);
      cards.push(card);
    }
  }

  setChildren(grid, ...cards);
}
|
||||
|
||||
// ── render: warm models table ────────────────────────────────────
/** Rebuild the warm-models table, showing how long each model has been warm. */
function renderResidents(residents) {
  const tbody = document.getElementById('resident-body');
  if (!residents || residents.length === 0) {
    const tr = document.createElement('tr');
    const td = el('td', { cls: 'td-none', text: 'No warm models detected.' });
    td.setAttribute('colspan', '4');
    tr.appendChild(td);
    setChildren(tbody, tr);
    return;
  }

  const now = Date.now() / 1000;
  const rows = residents.map(r => {
    // Age since first_seen (epoch seconds; missing first_seen reads as 0s),
    // formatted as Ns / Nm SSs / Nh MMm depending on magnitude.
    const warmSecs = now - (r.first_seen || now);
    const warmText = warmSecs < 60
      ? Math.floor(warmSecs) + 's'
      : warmSecs < 3600
        ? Math.floor(warmSecs / 60) + 'm ' + String(Math.floor(warmSecs % 60)).padStart(2, '0') + 's'
        : Math.floor(warmSecs / 3600) + 'h ' + String(Math.floor((warmSecs % 3600) / 60)).padStart(2, '0') + 'm';

    const tr = document.createElement('tr');
    append(tr,
      el('td', { cls: 'td-service', text: r.service }),
      el('td', { cls: 'td-node', text: r.node_id }),
      el('td', { cls: 'td-model', text: r.model_name || '—' }),
      el('td', { cls: 'td-warm', text: warmText }),
    );
    return tr;
  });

  setChildren(tbody, ...rows);
}
|
||||
|
||||
// ── render: leases table ─────────────────────────────────────────
/** Rebuild the active-leases table, including a TTL countdown bar per lease. */
function renderLeases(leases) {
  const tbody = document.getElementById('leases-body');
  if (!leases || leases.length === 0) {
    const tr = document.createElement('tr');
    const td = el('td', { cls: 'td-none', text: 'No active leases.' });
    td.setAttribute('colspan', '5');
    tr.appendChild(td);
    setChildren(tbody, tr);
    return;
  }

  const now = Date.now() / 1000;
  const rows = leases.map(lease => {
    // Human-readable grant size: one-decimal GB from 1024 MB upwards.
    const mbGb = lease.mb_granted >= 1024
      ? (lease.mb_granted / 1024).toFixed(1) + ' GB'
      : lease.mb_granted + ' MB';

    const tr = document.createElement('tr');

    const tdService = el('td', { cls: 'td-service', text: lease.holder_service });
    const tdNode = el('td', { cls: 'td-node', text: lease.node_id + ' / GPU ' + lease.gpu_id });
    const tdMb = el('td', { cls: 'td-mb', text: mbGb });
    const tdPriority = el('td', { cls: 'td-priority', text: 'p' + lease.priority });

    const tdTtl = document.createElement('td');
    if (!lease.expires_at) {
      // Falsy expires_at = no TTL; render the infinity glyph, no bar.
      tdTtl.appendChild(el('span', { cls: 'ttl-label', text: '∞' }));
    } else {
      // expires_at compared against Date.now()/1000, so it is epoch seconds.
      const remaining = Math.max(0, lease.expires_at - now);
      // Bar reads full at 300 s remaining — presumably matches the usual
      // 5-minute TTL; TODO confirm against coordinator defaults.
      const pct = Math.min(100, (remaining / 300) * 100);
      const mins = Math.floor(remaining / 60);
      const secs = Math.floor(remaining % 60);
      const label = remaining > 60
        ? mins + 'm ' + String(secs).padStart(2, '0') + 's'
        : Math.floor(remaining) + 's';

      const wrap = el('div', { cls: 'ttl-wrap' });
      const lbl = el('span', { cls: 'ttl-label', text: label });
      const track = el('div', { cls: 'ttl-track' });
      const fill = el('div', { cls: 'ttl-fill', style: { width: pct.toFixed(1) + '%' } });
      track.appendChild(fill);
      append(wrap, lbl, track);
      tdTtl.appendChild(wrap);
    }

    append(tr, tdService, tdNode, tdMb, tdPriority, tdTtl);
    return tr;
  });

  setChildren(tbody, ...rows);
}
|
||||
|
||||
// ── error banner ─────────────────────────────────────────────────
/** Show the red error banner with the given message. */
// NOTE(review): local 'el' shadows the el() helper — harmless here since the
// helper is not used in this function, but rename if it ever is.
function showError(msg) {
  const el = document.getElementById('error-banner');
  el.textContent = msg; // textContent — safe
  el.style.display = 'block';
}
/** Hide the error banner. */
function clearError() { document.getElementById('error-banner').style.display = 'none'; }
|
||||
|
||||
// ── poll ─────────────────────────────────────────────────────────
/** Fetch all dashboard endpoints in parallel and re-render every section.
 *  nodes + leases are required (failure shows the error banner); resident
 *  and services degrade to empty lists when their endpoints fail. */
async function poll() {
  try {
    const [nodesRes, leasesRes, residentRes, healthRes, servicesRes] = await Promise.all([
      fetch('/api/nodes'),
      fetch('/api/leases'),
      fetch('/api/resident'),
      fetch('/api/health'),
      fetch('/api/services'),
    ]);
    // BUGFIX: report the status of whichever required fetch actually failed —
    // the message previously always showed nodesRes.status, even when only
    // /api/leases had errored.
    const failed = [nodesRes, leasesRes].find(r => !r.ok);
    if (failed) throw new Error('API error: ' + failed.status);
    const [nodesData, leasesData, residentData, servicesData] = await Promise.all([
      nodesRes.json(), leasesRes.json(),
      residentRes.ok ? residentRes.json() : Promise.resolve({ residents: [] }),
      servicesRes.ok ? servicesRes.json() : Promise.resolve({ services: [] }),
    ]);

    // Build per-GPU leased-MB index for the stacked bar.
    const leasedByGpu = {};
    for (const lease of (leasesData.leases || [])) {
      const key = lease.node_id + ':' + lease.gpu_id;
      leasedByGpu[key] = (leasedByGpu[key] || 0) + lease.mb_granted;
    }

    clearError();
    renderHealth(healthRes.ok);
    renderNodes(nodesData.nodes || [], leasedByGpu);
    renderServices(servicesData.services || []);
    renderLeases(leasesData.leases || []);
    renderResidents(residentData.residents || []);
  } catch (err) {
    showError('Failed to reach coordinator: ' + err.message);
    renderHealth(false);
  }
}
|
||||
|
||||
poll();
|
||||
setInterval(poll, 5000);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -1,81 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.models import VRAMLease
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_DEFAULT_EVICTION_TIMEOUT_S = 10.0
|
||||
|
||||
|
||||
class EvictionEngine:
    """Grants VRAM leases, evicting lower-priority holders when a GPU is full."""

    def __init__(
        self,
        lease_manager: LeaseManager,
        eviction_timeout_s: float = _DEFAULT_EVICTION_TIMEOUT_S,
    ) -> None:
        self.lease_manager = lease_manager
        self._timeout = eviction_timeout_s

    async def request_lease(
        self,
        node_id: str,
        gpu_id: int,
        mb: int,
        service: str,
        priority: int,
        agent_url: str,
        ttl_s: float = 0.0,
    ) -> VRAMLease | None:
        """
        Try to lease *mb* MB on node/GPU for *service*, evicting if necessary.

        Returns the granted lease, or None when no candidates exist or the
        freed VRAM did not become grantable within the timeout.
        """
        # Fast path: enough free VRAM already.
        grant = await self.lease_manager.try_grant(
            node_id, gpu_id, mb, service, priority, ttl_s
        )
        if grant is not None:
            return grant

        victims = self.lease_manager.get_eviction_candidates(
            node_id=node_id, gpu_id=gpu_id,
            needed_mb=mb, requester_priority=priority,
        )
        if not victims:
            logger.info(
                "No eviction candidates for %s on %s:GPU%d (%dMB needed)",
                service, node_id, gpu_id, mb,
            )
            return None

        # NOTE(review): candidates may collectively free less than `mb`
        # (get_eviction_candidates returns what it has); we evict anyway and
        # rely on the poll below — confirm this partial eviction is intended.
        reclaimable = sum(victim.mb_granted for victim in victims)
        logger.info(
            "Evicting %d lease(s) to free %dMB for %s",
            len(victims), reclaimable, service,
        )
        for victim in victims:
            await self._evict_lease(victim, agent_url)

        # Poll until the freed VRAM becomes grantable or the deadline passes.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + self._timeout
        while loop.time() < deadline:
            grant = await self.lease_manager.try_grant(
                node_id, gpu_id, mb, service, priority, ttl_s
            )
            if grant is not None:
                return grant
            await asyncio.sleep(0.1)

        logger.warning("Eviction timed out for %s after %.1fs", service, self._timeout)
        return None

    async def _evict_lease(self, lease: VRAMLease, agent_url: str) -> None:
        """Release lease accounting. Process-level eviction deferred to Plan B."""
        await self.lease_manager.release(lease.lease_id)

    async def _call_agent_evict(self, agent_url: str, lease: VRAMLease) -> bool:
        """POST /evict to the agent. Stub for v1 — real process lookup in Plan B."""
        return True
|
||||
|
|
@ -1,130 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from collections import defaultdict
|
||||
|
||||
from circuitforge_core.resources.models import ResidentAllocation, VRAMLease
|
||||
|
||||
|
||||
class LeaseManager:
|
||||
    def __init__(self) -> None:
        """Initialise empty lease, GPU-capacity, and resident-tracking state."""
        # lease_id → lease: single source of truth for active grants.
        self._leases: dict[str, VRAMLease] = {}
        # (node_id, gpu_id) → total VRAM MB, filled by register_gpu().
        self._gpu_total: dict[tuple[str, int], int] = {}
        # (node_id, gpu_id) → MB currently granted; missing keys read as 0.
        self._gpu_used: dict[tuple[str, int], int] = defaultdict(int)
        # Serialises grant/release so capacity accounting stays consistent.
        self._lock = asyncio.Lock()
        # Resident allocations — keyed "node_id:service", updated by heartbeat.
        # No lock needed: only the single heartbeat task writes this dict.
        self._residents: dict[str, ResidentAllocation] = {}
|
||||
|
||||
def register_gpu(self, node_id: str, gpu_id: int, total_mb: int) -> None:
|
||||
self._gpu_total[(node_id, gpu_id)] = total_mb
|
||||
|
||||
def gpu_total_mb(self, node_id: str, gpu_id: int) -> int:
|
||||
return self._gpu_total.get((node_id, gpu_id), 0)
|
||||
|
||||
def used_mb(self, node_id: str, gpu_id: int) -> int:
|
||||
return self._gpu_used[(node_id, gpu_id)]
|
||||
|
||||
    async def try_grant(
        self,
        node_id: str,
        gpu_id: int,
        mb: int,
        service: str,
        priority: int,
        ttl_s: float = 0.0,
    ) -> VRAMLease | None:
        """Atomically grant *mb* MB on a GPU, or return None when it won't fit.

        The capacity check and the accounting update happen under the lock so
        two concurrent grants cannot both count the same free VRAM.
        Unregistered GPUs have total 0, so any positive request is refused.
        """
        async with self._lock:
            total = self._gpu_total.get((node_id, gpu_id), 0)
            used = self._gpu_used[(node_id, gpu_id)]
            if total - used < mb:
                return None
            lease = VRAMLease.create(
                gpu_id=gpu_id, node_id=node_id, mb=mb,
                service=service, priority=priority, ttl_s=ttl_s,
            )
            self._leases[lease.lease_id] = lease
            self._gpu_used[(node_id, gpu_id)] += mb
            return lease
|
||||
|
||||
async def release(self, lease_id: str) -> bool:
|
||||
async with self._lock:
|
||||
lease = self._leases.pop(lease_id, None)
|
||||
if lease is None:
|
||||
return False
|
||||
self._gpu_used[(lease.node_id, lease.gpu_id)] -= lease.mb_granted
|
||||
return True
|
||||
|
||||
def get_eviction_candidates(
|
||||
self,
|
||||
node_id: str,
|
||||
gpu_id: int,
|
||||
needed_mb: int,
|
||||
requester_priority: int,
|
||||
) -> list[VRAMLease]:
|
||||
candidates = [
|
||||
lease for lease in self._leases.values()
|
||||
if lease.node_id == node_id
|
||||
and lease.gpu_id == gpu_id
|
||||
and lease.priority > requester_priority
|
||||
]
|
||||
candidates.sort(key=lambda lease: lease.priority, reverse=True)
|
||||
selected: list[VRAMLease] = []
|
||||
freed = 0
|
||||
for candidate in candidates:
|
||||
selected.append(candidate)
|
||||
freed += candidate.mb_granted
|
||||
if freed >= needed_mb:
|
||||
break
|
||||
return selected
|
||||
|
||||
def list_leases(
|
||||
self, node_id: str | None = None, gpu_id: int | None = None
|
||||
) -> list[VRAMLease]:
|
||||
return [
|
||||
lease for lease in self._leases.values()
|
||||
if (node_id is None or lease.node_id == node_id)
|
||||
and (gpu_id is None or lease.gpu_id == gpu_id)
|
||||
]
|
||||
|
||||
def all_leases(self) -> list[VRAMLease]:
|
||||
return list(self._leases.values())
|
||||
|
||||
# ── resident tracking ────────────────────────────────────────────
|
||||
|
||||
def set_residents_for_node(
|
||||
self,
|
||||
node_id: str,
|
||||
residents: list[tuple[str, str | None]], # (service, model_name)
|
||||
) -> None:
|
||||
"""
|
||||
Replace the resident snapshot for a node.
|
||||
|
||||
Preserves first_seen for entries whose service+model_name are unchanged,
|
||||
so the dashboard can show how long a model has been warm.
|
||||
"""
|
||||
new_keys = {f"{node_id}:{service}" for service, _ in residents}
|
||||
|
||||
# Remove stale entries (service no longer running on this node).
|
||||
for key in list(self._residents):
|
||||
if key.startswith(f"{node_id}:") and key not in new_keys:
|
||||
del self._residents[key]
|
||||
|
||||
# Upsert: preserve first_seen when model is unchanged, reset otherwise.
|
||||
for service, model_name in residents:
|
||||
key = f"{node_id}:{service}"
|
||||
existing = self._residents.get(key)
|
||||
if existing is not None and existing.model_name == model_name:
|
||||
continue # same model still loaded — keep original first_seen
|
||||
self._residents[key] = ResidentAllocation(
|
||||
service=service,
|
||||
node_id=node_id,
|
||||
model_name=model_name,
|
||||
)
|
||||
|
||||
def all_residents(self) -> list[ResidentAllocation]:
|
||||
return list(self._residents.values())
|
||||
|
||||
def resident_keys(self) -> set[str]:
|
||||
"""Return set of 'node_id:service' strings for currently-warm services."""
|
||||
return set(self._residents.keys())
|
||||
|
|
@ -1,74 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
|
||||
_WARM_BONUS_MB = 1000
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class _Scored:
|
||||
node_id: str
|
||||
gpu_id: int
|
||||
vram_free_mb: int
|
||||
effective_free_mb: int
|
||||
can_fit: bool
|
||||
warm: bool
|
||||
|
||||
|
||||
def select_node(
|
||||
agents: "dict[str, AgentRecord]",
|
||||
service: str,
|
||||
profile_registry: "ProfileRegistry",
|
||||
resident_keys: set[str],
|
||||
) -> tuple[str, int] | None:
|
||||
"""
|
||||
Pick the best (node_id, gpu_id) for the requested service.
|
||||
Warm nodes (service already running) get priority, then sorted by free VRAM.
|
||||
Returns None if no suitable node exists.
|
||||
"""
|
||||
service_max_mb = _find_service_max_mb(service, profile_registry)
|
||||
if service_max_mb is None:
|
||||
return None # service not in any profile
|
||||
|
||||
candidates: list[_Scored] = []
|
||||
for node_id, record in agents.items():
|
||||
if not record.online:
|
||||
continue
|
||||
for gpu in record.gpus:
|
||||
warm = f"{node_id}:{service}" in resident_keys
|
||||
effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
|
||||
can_fit = gpu.vram_free_mb >= service_max_mb
|
||||
candidates.append(_Scored(
|
||||
node_id=node_id,
|
||||
gpu_id=gpu.gpu_id,
|
||||
vram_free_mb=gpu.vram_free_mb,
|
||||
effective_free_mb=effective,
|
||||
can_fit=can_fit,
|
||||
warm=warm,
|
||||
))
|
||||
if not candidates:
|
||||
return None
|
||||
# Prefer: (1) warm nodes (model already resident — no cold start)
|
||||
# (2) cold nodes that can fit the service (free >= half of max_mb)
|
||||
# Fallback: best-effort node when nothing fits and nothing is warm
|
||||
# (coordinator will attempt to start the service anyway; it may evict or fail)
|
||||
# Note: resident_keys are per-node, not per-GPU. On multi-GPU nodes, the warm
|
||||
# bonus applies to all GPUs on the node. This is a known coarseness —
|
||||
# per-GPU resident tracking requires a resident_key format change.
|
||||
preferred = [c for c in candidates if c.warm or c.can_fit]
|
||||
pool = preferred if preferred else candidates
|
||||
best = max(pool, key=lambda c: (c.warm, c.effective_free_mb))
|
||||
return best.node_id, best.gpu_id
|
||||
|
||||
|
||||
def _find_service_max_mb(service: str, profile_registry: "ProfileRegistry") -> int | None:
|
||||
for profile in profile_registry.list_public():
|
||||
svc = profile.services.get(service)
|
||||
if svc is not None:
|
||||
return svc.max_mb
|
||||
return None
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
"""
|
||||
circuitforge_core.resources.coordinator.node_store — SQLite persistence for known agent nodes.
|
||||
|
||||
Gives the coordinator restart-safe memory of which nodes have ever registered.
|
||||
On startup the coordinator reloads all known nodes and immediately probes them;
|
||||
nodes that respond come back online within one heartbeat cycle (~10 s) without
|
||||
any manual intervention on the agent hosts.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)

# Default on-disk location (XDG-style per-user data directory).
_DEFAULT_DB_PATH = Path.home() / ".local" / "share" / "circuitforge" / "cf-orch-nodes.db"
_STALE_AGE_DAYS = 30  # nodes unseen for this long are pruned automatically


class NodeStore:
    """
    Thin SQLite wrapper persisting known agent nodes across coordinator restarts.

    Thread-safe for single-writer use (the coordinator runs in one asyncio
    thread); reads go through the same connection.
    """

    def __init__(self, db_path: Path = _DEFAULT_DB_PATH) -> None:
        self.db_path = db_path
        db_path.parent.mkdir(parents=True, exist_ok=True)
        # check_same_thread=False: single logical writer, but the event
        # loop may touch the connection from helper threads.
        self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self._conn.row_factory = sqlite3.Row
        self._migrate()
        logger.debug("NodeStore initialised at %s", db_path)

    def _migrate(self) -> None:
        """Create the schema if missing (idempotent)."""
        self._conn.executescript("""
            CREATE TABLE IF NOT EXISTS known_nodes (
                node_id TEXT PRIMARY KEY,
                agent_url TEXT NOT NULL,
                last_seen REAL NOT NULL
            );
        """)
        self._conn.commit()

    def upsert(self, node_id: str, agent_url: str) -> None:
        """Record or update a node. Called on every successful registration."""
        params = (node_id, agent_url, time.time())
        self._conn.execute(
            """
            INSERT INTO known_nodes (node_id, agent_url, last_seen)
            VALUES (?, ?, ?)
            ON CONFLICT(node_id) DO UPDATE SET
                agent_url = excluded.agent_url,
                last_seen = excluded.last_seen
            """,
            params,
        )
        self._conn.commit()

    def all(self) -> list[tuple[str, str]]:
        """Return all known (node_id, agent_url) pairs, most recent first."""
        cursor = self._conn.execute(
            "SELECT node_id, agent_url FROM known_nodes ORDER BY last_seen DESC"
        )
        return [(row["node_id"], row["agent_url"]) for row in cursor.fetchall()]

    def remove(self, node_id: str) -> None:
        """Forget one node permanently."""
        self._conn.execute("DELETE FROM known_nodes WHERE node_id = ?", (node_id,))
        self._conn.commit()

    def prune_stale(self, max_age_days: int = _STALE_AGE_DAYS) -> int:
        """Delete nodes not seen within max_age_days. Returns count removed."""
        cutoff = time.time() - max_age_days * 86400
        deleted = self._conn.execute(
            "DELETE FROM known_nodes WHERE last_seen < ?", (cutoff,)
        ).rowcount
        self._conn.commit()
        if deleted:
            logger.info("NodeStore: pruned %d stale node(s) (>%d days old)", deleted, max_age_days)
        return deleted

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()
|
||||
|
|
@ -1,65 +0,0 @@
|
|||
# circuitforge_core/resources/coordinator/profile_registry.py
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from circuitforge_core.resources.models import GpuInfo
|
||||
from circuitforge_core.resources.profiles.schema import GpuProfile, load_profile
|
||||
|
||||
_PUBLIC_DIR = Path(__file__).parent.parent / "profiles" / "public"

# VRAM thresholds for public profile selection (MB)
_PROFILE_THRESHOLDS = [
    (22000, "single-gpu-24gb"),
    (14000, "single-gpu-16gb"),
    (8000, "single-gpu-8gb"),
    (5500, "single-gpu-6gb"),
    (3500, "single-gpu-4gb"),
    (0, "single-gpu-2gb"),
]

_log = logging.getLogger(__name__)


class ProfileRegistry:
    """Registry of GPU profiles loaded from bundled and user-supplied YAML dirs."""

    def __init__(self, extra_dirs: list[Path] | None = None) -> None:
        self._profiles: dict[str, GpuProfile] = {}
        self._load_dir(_PUBLIC_DIR)
        for extra in extra_dirs or []:
            if extra.exists():
                self._load_dir(extra)

    def _load_dir(self, directory: Path) -> None:
        """Best-effort bulk load: a malformed profile is skipped, not fatal."""
        for candidate in directory.glob("*.yaml"):
            try:
                loaded = load_profile(candidate)
                self._profiles[loaded.name] = loaded
            except Exception as exc:
                _log.warning("Skipping %s: %s", candidate, exc)

    def load(self, path: Path) -> GpuProfile:
        """Load one profile file, register it under its declared name, return it."""
        loaded = load_profile(path)
        self._profiles[loaded.name] = loaded
        return loaded

    def list_public(self) -> list[GpuProfile]:
        # CPU profiles (cpu-*) are intentionally excluded — this endpoint
        # is used to match GPU hardware. CPU inference nodes self-select
        # their profile via the CLI and are not listed for lease matching.
        return [
            profile
            for profile in self._profiles.values()
            if profile.name.startswith("single-gpu-")
        ]

    def get(self, name: str) -> GpuProfile | None:
        """Look up a profile by name, or None."""
        return self._profiles.get(name)

    def auto_detect(self, gpus: list[GpuInfo]) -> GpuProfile:
        """Pick the best-matching public profile for the primary GPU's VRAM."""
        primary_vram = gpus[0].vram_total_mb if gpus else 0
        for floor_mb, profile_name in _PROFILE_THRESHOLDS:
            if primary_vram < floor_mb:
                continue
            match = self._profiles.get(profile_name)
            if match:
                return match
        # Smallest profile is the last resort for any hardware.
        return self._profiles["single-gpu-2gb"]
|
||||
|
|
@ -1,173 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal
|
||||
|
||||
|
||||
@dataclass
class ServiceAllocation:
    """One caller's claim on a running service instance."""
    allocation_id: str
    service: str
    node_id: str
    gpu_id: int
    model: str | None
    caller: str
    url: str
    created_at: float
    expires_at: float  # 0 = no expiry


@dataclass
class ServiceInstance:
    """State of one running container, keyed by (service, node_id, gpu_id)."""
    service: str
    node_id: str
    gpu_id: int
    state: Literal["starting", "running", "idle", "stopped"]
    model: str | None
    url: str | None
    idle_since: float | None = None
    health_path: str = "/health"


class ServiceRegistry:
    """
    In-memory registry of service allocations and instance state.

    Allocations: per-caller request — many per service instance.
    Instances: per (service, node_id, gpu_id) — one per running container.
    """

    def __init__(self) -> None:
        self._allocations: dict[str, ServiceAllocation] = {}
        self._instances: dict[str, ServiceInstance] = {}  # key: "service:node_id:gpu_id"

    # ── allocation API ────────────────────────────────────────────────

    def allocate(
        self,
        service: str,
        node_id: str,
        gpu_id: int,
        model: str | None,
        url: str,
        caller: str,
        ttl_s: float,
    ) -> ServiceAllocation:
        """Record a caller's claim; revives an idle/stopped instance to 'running'."""
        now = time.time()
        record = ServiceAllocation(
            allocation_id=str(uuid.uuid4()),
            service=service,
            node_id=node_id,
            gpu_id=gpu_id,
            model=model,
            caller=caller,
            url=url,
            created_at=now,
            expires_at=now + ttl_s if ttl_s > 0 else 0.0,
        )
        self._allocations[record.allocation_id] = record

        # A fresh claim wakes a dormant instance back to 'running'.
        key = f"{service}:{node_id}:{gpu_id}"
        instance = self._instances.get(key)
        if instance is not None and instance.state in ("idle", "stopped"):
            self._instances[key] = dataclasses.replace(
                instance, state="running", idle_since=None
            )
        return record

    def release(self, allocation_id: str) -> bool:
        """Drop a claim; the instance goes 'idle' once its last claim is gone.

        Returns False for an unknown allocation_id.
        """
        record = self._allocations.pop(allocation_id, None)
        if record is None:
            return False
        key = f"{record.service}:{record.node_id}:{record.gpu_id}"
        still_claimed = self.active_allocations(record.service, record.node_id, record.gpu_id)
        if still_claimed == 0 and key in self._instances:
            self._instances[key] = dataclasses.replace(
                self._instances[key], state="idle", idle_since=time.time()
            )
        return True

    def active_allocations(self, service: str, node_id: str, gpu_id: int) -> int:
        """Count live claims against one (service, node, gpu) instance."""
        target = (service, node_id, gpu_id)
        return sum(
            (a.service, a.node_id, a.gpu_id) == target
            for a in self._allocations.values()
        )

    # ── instance API ─────────────────────────────────────────────────

    def upsert_instance(
        self,
        service: str,
        node_id: str,
        gpu_id: int,
        state: Literal["starting", "running", "idle", "stopped"],
        model: str | None,
        url: str | None,
        health_path: str = "/health",
    ) -> ServiceInstance:
        """Create or replace an instance record, tracking when it went idle."""
        key = f"{service}:{node_id}:{gpu_id}"
        previous = self._instances.get(key)
        if state != "idle":
            went_idle: float | None = None
        elif previous is not None and previous.state == "idle":
            went_idle = previous.idle_since  # already idle — keep the original mark
        else:
            went_idle = time.time()  # transitioning into idle now
        instance = ServiceInstance(
            service=service, node_id=node_id, gpu_id=gpu_id,
            state=state, model=model, url=url, idle_since=went_idle,
            health_path=health_path,
        )
        self._instances[key] = instance
        return instance

    def get_allocation(self, allocation_id: str) -> ServiceAllocation | None:
        """Look up one claim by id, or None."""
        return self._allocations.get(allocation_id)

    def sweep_expired_allocations(self) -> list[str]:
        """
        Remove all allocations whose TTL has elapsed and transition the
        corresponding instance to 'idle' if no active allocations remain.
        Returns the list of expired allocation_ids.
        """
        now = time.time()
        stale = [
            alloc_id
            for alloc_id, record in self._allocations.items()
            if 0 < record.expires_at < now
        ]
        for alloc_id in stale:
            self.release(alloc_id)
        return stale

    def all_allocations(self) -> list[ServiceAllocation]:
        """Snapshot of every live claim."""
        return list(self._allocations.values())

    def all_instances(self) -> list[ServiceInstance]:
        """Snapshot of every known instance record."""
        return list(self._instances.values())

    def mark_stopped(self, service: str, node_id: str, gpu_id: int) -> None:
        """Transition an instance to 'stopped' state and clear idle_since."""
        key = f"{service}:{node_id}:{gpu_id}"
        instance = self._instances.get(key)
        if instance is not None:
            self._instances[key] = dataclasses.replace(
                instance, state="stopped", idle_since=None
            )

    def idle_past_timeout(self, idle_stop_config: dict[str, int]) -> list[ServiceInstance]:
        """
        Return instances in 'idle' state whose idle time exceeds their configured timeout.
        idle_stop_config: {service_name: seconds} — 0 means never stop automatically.
        """
        now = time.time()
        overdue: list[ServiceInstance] = []
        for instance in self._instances.values():
            if instance.state != "idle" or instance.idle_since is None:
                continue
            limit = idle_stop_config.get(instance.service, 0)
            if limit > 0 and now - instance.idle_since >= limit:
                overdue.append(instance)
        return overdue
|
||||
|
|
@ -1,250 +0,0 @@
|
|||
"""
|
||||
cf-docuvision — managed document understanding service.
|
||||
|
||||
Wraps ByteDance/Dolphin-v2 (Qwen2.5-VL backbone) behind a simple HTTP API.
|
||||
Managed by cf-orch; started/stopped as a ProcessSpec service.
|
||||
|
||||
API
|
||||
---
|
||||
GET /health → {"status": "ok", "model": "<path>"}
|
||||
POST /extract → ExtractResponse
|
||||
|
||||
Usage (standalone)::
|
||||
|
||||
python -m circuitforge_core.resources.docuvision.app \\
|
||||
--model /Library/Assets/LLM/docuvision/models/dolphin-v2 \\
|
||||
--port 8003 --gpu-id 0
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any
|
||||
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Module-level state — populated by _load_model() on first /extract call
|
||||
_model: Any = None
|
||||
_processor: Any = None
|
||||
_model_path: str = ""
|
||||
_device: str = "cpu"
|
||||
|
||||
|
||||
# ── lazy loader ───────────────────────────────────────────────────────────────
|
||||
|
||||
def _load_model() -> None:
    """Lazy-load Dolphin-v2. Called once on first /extract request."""
    global _model, _processor, _device

    # Idempotent: later calls are no-ops once the model is resident.
    if _model is not None:
        return

    # Deferred imports keep server startup cheap — /health can respond
    # before the heavy ML stack is ever imported.
    import torch
    from transformers import AutoProcessor, AutoModelForCausalLM

    logger.info("Loading Dolphin-v2 from %s ...", _model_path)
    _device = "cuda" if torch.cuda.is_available() else "cpu"

    # trust_remote_code=True is required for Dolphin's custom model classes;
    # only use with a vetted local model directory.
    _processor = AutoProcessor.from_pretrained(
        _model_path,
        trust_remote_code=True,
    )
    # fp16 on GPU halves VRAM use; CPU inference requires fp32.
    _model = AutoModelForCausalLM.from_pretrained(
        _model_path,
        trust_remote_code=True,
        torch_dtype=torch.float16 if _device == "cuda" else torch.float32,
        device_map=_device,
    )
    _model.eval()
    logger.info("Dolphin-v2 loaded on %s", _device)
|
||||
|
||||
|
||||
# ── FastAPI app ───────────────────────────────────────────────────────────────
|
||||
|
||||
@asynccontextmanager
async def _lifespan(app: FastAPI):
    # No startup/shutdown work: the model is lazy-loaded on the first
    # /extract request (see _load_model), not at server start.
    yield


app = FastAPI(title="cf-docuvision", lifespan=_lifespan)
|
||||
|
||||
|
||||
# ── request / response models ─────────────────────────────────────────────────
|
||||
|
||||
class ExtractRequest(BaseModel):
    """
    Either image_b64 (base64-encoded bytes) or image_path (absolute path) must
    be provided. hint guides the extraction mode:
    - "auto" - Dolphin-v2 detects layout and element types automatically
    - "table" - optimise for tabular data (receipts, invoices, forms)
    - "text" - optimise for dense prose (contracts, letters)
    - "form" - optimise for form field extraction
    """
    image_b64: str | None = None   # base64-encoded image bytes; checked first by _image_from_request
    image_path: str | None = None  # path readable by the server process; 404 if missing
    hint: str = "auto"             # unknown hints fall back to "auto" in extract()
|
||||
|
||||
|
||||
class ElementOut(BaseModel):
    """One layout element detected in the document."""
    type: str  # heading | paragraph | list | table | figure | formula | code
    text: str
    bbox: list[float] | None = None  # [x0, y0, x1, y1] normalised 0-1 if available


class TableOut(BaseModel):
    """One extracted table rendered as HTML markup."""
    html: str
    bbox: list[float] | None = None


class ExtractResponse(BaseModel):
    """Full extraction result for a single image."""
    elements: list[ElementOut]   # all elements in model output order (tables included)
    raw_text: str                # newline-joined text of all elements
    tables: list[TableOut]       # table elements additionally rendered as HTML
    metadata: dict[str, Any]     # hint, image width/height, model path, device
|
||||
|
||||
|
||||
# ── helpers ───────────────────────────────────────────────────────────────────
|
||||
|
||||
# Maps ExtractRequest.hint values to Dolphin-v2 task prompts. Unknown hints
# fall back to the "auto" entry at the call site (see extract()).
_HINT_PROMPTS: dict[str, str] = {
    "auto": "Parse this document. Extract all elements with their types and text content.",
    "table": "Extract all tables from this document as structured HTML. Also extract any line-item text.",
    "text": "Extract all text from this document preserving paragraph and heading structure.",
    "form": "Extract all form fields from this document. Return field labels and their values.",
}
|
||||
|
||||
|
||||
def _image_from_request(req: ExtractRequest):
    """Return a PIL Image from either image_b64 or image_path.

    image_b64 wins when both are supplied; raises 404 for a missing path
    and 422 when neither field is present.
    """
    from PIL import Image

    if req.image_b64:
        decoded = base64.b64decode(req.image_b64)
        return Image.open(io.BytesIO(decoded)).convert("RGB")

    if req.image_path:
        from pathlib import Path

        source = Path(req.image_path)
        if not source.exists():
            raise HTTPException(status_code=404, detail=f"image_path not found: {req.image_path}")
        return Image.open(source).convert("RGB")

    raise HTTPException(status_code=422, detail="Either image_b64 or image_path must be provided")
|
||||
|
||||
|
||||
def _parse_dolphin_output(raw: str) -> tuple[list[ElementOut], list[TableOut], str]:
    """
    Parse Dolphin-v2's structured output into elements and tables.

    Dolphin-v2 returns a JSON array of element dicts with keys:
        type, text, [html], [bbox]

    Falls back gracefully if the model returns plain text instead.
    Returns (elements, tables, raw_text) where raw_text is the
    newline-joined text of all elements.
    """
    elements: list[ElementOut] = []
    tables: list[TableOut] = []

    # Try JSON parse first
    try:
        parsed = json.loads(raw)
        if isinstance(parsed, list):
            for item in parsed:
                # Robustness fix: the array may contain bare strings or other
                # non-dict entries; previously `item.get(...)` raised
                # AttributeError and crashed the endpoint. Treat such
                # entries as plain paragraphs instead.
                if not isinstance(item, dict):
                    elements.append(ElementOut(type="paragraph", text=str(item)))
                    continue
                etype = item.get("type", "paragraph")
                text = item.get("text", "")
                bbox = item.get("bbox")
                if etype == "table":
                    # Tables appear both in `tables` (as HTML) and in the
                    # element stream (as text) so reading order is preserved.
                    tables.append(TableOut(html=item.get("html", text), bbox=bbox))
                elements.append(ElementOut(type=etype, text=text, bbox=bbox))
            raw_text = "\n".join(e.text for e in elements)
            return elements, tables, raw_text
    except (json.JSONDecodeError, TypeError):
        pass

    # Plain-text fallback: treat entire output as a single paragraph
    elements = [ElementOut(type="paragraph", text=raw.strip())]
    return elements, tables, raw.strip()
|
||||
|
||||
|
||||
# ── routes ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@app.get("/health")
|
||||
async def health() -> dict[str, str]:
|
||||
return {"status": "ok", "model": _model_path}
|
||||
|
||||
|
||||
@app.post("/extract", response_model=ExtractResponse)
|
||||
async def extract(req: ExtractRequest) -> ExtractResponse:
|
||||
_load_model()
|
||||
|
||||
image = _image_from_request(req)
|
||||
prompt = _HINT_PROMPTS.get(req.hint, _HINT_PROMPTS["auto"])
|
||||
|
||||
import torch
|
||||
|
||||
inputs = _processor(
|
||||
text=prompt,
|
||||
images=image,
|
||||
return_tensors="pt",
|
||||
).to(_device)
|
||||
|
||||
with torch.no_grad():
|
||||
output_ids = _model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=2048,
|
||||
do_sample=False,
|
||||
)
|
||||
|
||||
# Decode only the newly generated tokens
|
||||
input_len = inputs["input_ids"].shape[1]
|
||||
raw_output = _processor.decode(
|
||||
output_ids[0][input_len:],
|
||||
skip_special_tokens=True,
|
||||
)
|
||||
|
||||
elements, tables, raw_text = _parse_dolphin_output(raw_output)
|
||||
|
||||
w, h = image.size
|
||||
|
||||
return ExtractResponse(
|
||||
elements=elements,
|
||||
raw_text=raw_text,
|
||||
tables=tables,
|
||||
metadata={
|
||||
"hint": req.hint,
|
||||
"width": w,
|
||||
"height": h,
|
||||
"model": _model_path,
|
||||
"device": _device,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
# ── CLI entry point ───────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """CLI entry point for running cf-docuvision standalone."""
    parser = argparse.ArgumentParser(description="cf-docuvision service")
    parser.add_argument("--model", required=True, help="Path to Dolphin-v2 model directory")
    parser.add_argument("--port", type=int, default=8003)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--gpu-id", type=int, default=0)
    args = parser.parse_args()

    global _model_path
    _model_path = args.model

    import os
    # setdefault: an externally supplied CUDA_VISIBLE_DEVICES wins over --gpu-id.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(args.gpu_id))

    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
    uvicorn.run(app, host=args.host, port=args.port)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,137 +0,0 @@
|
|||
"""Generic OpenAI-compatible inference server for HuggingFace causal LMs."""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import time
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import Any
|
||||
|
||||
import torch
|
||||
import uvicorn
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
_model: Any = None
|
||||
_tokenizer: Any = None
|
||||
_model_id: str = ""
|
||||
_device: str = "cpu"
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    # No startup/shutdown hooks: the model is loaded synchronously in
    # main() before uvicorn starts serving.
    yield


app = FastAPI(lifespan=lifespan)
|
||||
|
||||
|
||||
class Message(BaseModel):
    """One chat turn in OpenAI wire format."""
    role: str     # e.g. "system" / "user" / "assistant" — passed through to the chat template
    content: str


class ChatRequest(BaseModel):
    """Subset of the OpenAI /v1/chat/completions request schema."""
    model: str | None = None        # accepted but never read — this server hosts a single model
    messages: list[Message]
    max_tokens: int | None = 512
    temperature: float | None = 0.7  # 0 selects greedy decoding downstream
    stream: bool | None = False      # streaming requests are rejected with 501
|
||||
|
||||
|
||||
@app.get("/health")
|
||||
def health() -> dict[str, str]:
|
||||
return {"status": "ok", "model": _model_id}
|
||||
|
||||
|
||||
@app.get("/v1/models")
|
||||
def list_models() -> dict[str, Any]:
|
||||
return {
|
||||
"object": "list",
|
||||
"data": [{"id": _model_id, "object": "model", "owned_by": "cf-orch"}],
|
||||
}
|
||||
|
||||
|
||||
@app.post("/v1/chat/completions")
|
||||
def chat_completions(req: ChatRequest) -> dict[str, Any]:
|
||||
if _model is None:
|
||||
raise HTTPException(503, detail="Model not loaded")
|
||||
if req.stream:
|
||||
raise HTTPException(501, detail="Streaming not supported")
|
||||
|
||||
conversation = [{"role": m.role, "content": m.content} for m in req.messages]
|
||||
try:
|
||||
encoded = _tokenizer.apply_chat_template(
|
||||
conversation,
|
||||
return_tensors="pt",
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
# transformers 5.x returns BatchEncoding; 4.x returned a bare tensor
|
||||
input_ids = (encoded.input_ids if hasattr(encoded, "input_ids") else encoded).to(_device)
|
||||
except Exception as exc:
|
||||
raise HTTPException(500, detail=f"Tokenisation failed: {exc}")
|
||||
|
||||
max_new = req.max_tokens or 512
|
||||
temp = req.temperature if req.temperature is not None else 0.7
|
||||
gen_kwargs: dict[str, Any] = {
|
||||
"max_new_tokens": max_new,
|
||||
"do_sample": temp > 0,
|
||||
"pad_token_id": _tokenizer.eos_token_id,
|
||||
}
|
||||
if temp > 0:
|
||||
gen_kwargs["temperature"] = temp
|
||||
|
||||
with torch.inference_mode():
|
||||
output_ids = _model.generate(input_ids, **gen_kwargs)
|
||||
|
||||
new_tokens = output_ids[0][input_ids.shape[-1]:]
|
||||
reply = _tokenizer.decode(new_tokens, skip_special_tokens=True)
|
||||
|
||||
return {
|
||||
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
|
||||
"object": "chat.completion",
|
||||
"created": int(time.time()),
|
||||
"model": _model_id,
|
||||
"choices": [
|
||||
{
|
||||
"index": 0,
|
||||
"message": {"role": "assistant", "content": reply},
|
||||
"finish_reason": "stop",
|
||||
}
|
||||
],
|
||||
"usage": {
|
||||
"prompt_tokens": input_ids.shape[-1],
|
||||
"completion_tokens": len(new_tokens),
|
||||
"total_tokens": input_ids.shape[-1] + len(new_tokens),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _load_model(model_path: str, gpu_id: int) -> None:
    """Load tokenizer + model into module globals before serving.

    Called once from main(). Pins the model to cuda:<gpu_id> when CUDA is
    available, otherwise falls back to CPU with fp32 weights.
    """
    global _model, _tokenizer, _model_id, _device
    _device = f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu"
    _model_id = model_path
    _tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    # fp16 halves memory on GPU; CPU kernels need fp32.
    _model = AutoModelForCausalLM.from_pretrained(
        model_path,
        dtype=torch.float16 if "cuda" in _device else torch.float32,
        device_map={"": _device},
        trust_remote_code=True,
    )
    _model.eval()
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: load the model, then serve the OpenAI-compatible API."""
    parser = argparse.ArgumentParser(description="cf-orch generic LLM inference server")
    parser.add_argument("--model", required=True)
    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--host", default="0.0.0.0")
    parser.add_argument("--gpu-id", type=int, default=0)
    args = parser.parse_args()
    # Load synchronously before binding the port, so a passing /health
    # implies the model is ready.
    _load_model(args.model, args.gpu_id)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,66 +0,0 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
import uuid
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class VRAMLease:
    """One granted, immutable slice of a GPU's VRAM held by a service."""
    lease_id: str
    gpu_id: int
    node_id: str
    mb_granted: int
    holder_service: str
    priority: int
    expires_at: float  # unix timestamp; 0.0 = no expiry

    @classmethod
    def create(
        cls,
        gpu_id: int,
        node_id: str,
        mb: int,
        service: str,
        priority: int,
        ttl_s: float = 0.0,
    ) -> VRAMLease:
        """Build a lease with a fresh UUID; ttl_s <= 0 means no expiry."""
        deadline = 0.0
        if ttl_s > 0.0:
            deadline = time.time() + ttl_s
        return cls(
            lease_id=str(uuid.uuid4()),
            gpu_id=gpu_id,
            node_id=node_id,
            mb_granted=mb,
            holder_service=service,
            priority=priority,
            expires_at=deadline,
        )

    def is_expired(self) -> bool:
        """True once a finite expiry deadline has passed."""
        if self.expires_at <= 0.0:
            return False
        return time.time() > self.expires_at
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class GpuInfo:
    """Point-in-time VRAM snapshot for one physical GPU on a node."""
    gpu_id: int        # GPU index local to its node
    name: str          # device name string
    vram_total_mb: int
    vram_used_mb: int
    # Reported as its own field rather than derived — presumably
    # total == used + free, but the producer is not visible here; confirm.
    vram_free_mb: int
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ResidentAllocation:
    """A model that is loaded and warm in VRAM but not actively serving a request."""
    service: str
    node_id: str
    model_name: Optional[str]  # None if service is running but model probe failed
    # When this (service, model) pair was first observed warm; the lease
    # manager preserves it across heartbeats while the model is unchanged.
    first_seen: float = field(default_factory=time.time)
|
||||
|
||||
|
||||
@dataclass
class NodeInfo:
    """Mutable record of one registered agent node and its GPUs."""
    node_id: str
    agent_url: str   # base URL used to reach the node's agent
    gpus: list[GpuInfo]
    last_heartbeat: float = field(default_factory=time.time)
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
schema_version: 1
|
||||
name: cpu-16gb
|
||||
eviction_timeout_s: 30.0
|
||||
services:
|
||||
ollama:
|
||||
max_mb: 0
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-stt:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
backend: moonshine
|
||||
cf-tts:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
cf-embed:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
always_on: true
|
||||
cf-classify:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
always_on: true
|
||||
model_size_hints:
|
||||
llm_max_params: 3b-q4
|
||||
image_gen_max: none
|
||||
|
|
@ -1,41 +0,0 @@
|
|||
schema_version: 1
|
||||
name: cpu-32gb
|
||||
eviction_timeout_s: 30.0
|
||||
services:
|
||||
ollama:
|
||||
max_mb: 0
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-stt:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
backend: faster-whisper
|
||||
cf-tts:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
cf-embed:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
always_on: true
|
||||
cf-classify:
|
||||
max_mb: 0
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
always_on: true
|
||||
model_size_hints:
|
||||
llm_max_params: 7b-q4
|
||||
image_gen_max: none
|
||||
|
|
@ -1,73 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-16gb
|
||||
vram_total_mb: 16384
|
||||
eviction_timeout_s: 10.0
|
||||
services:
|
||||
vllm:
|
||||
max_mb: 9000
|
||||
priority: 1
|
||||
idle_stop_after_s: 600
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||
port: 8000
|
||||
host_port: 8000
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
ollama:
|
||||
max_mb: 12288
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 3072
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
cf-docuvision:
|
||||
max_mb: 6144
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 3
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
||||
port: 8003
|
||||
host_port: 8003
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
cf-stt:
|
||||
max_mb: 1200
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 3
|
||||
backend: parakeet-tdt
|
||||
cf-tts:
|
||||
max_mb: 1024
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 3
|
||||
cf-embed:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 6
|
||||
always_on: true
|
||||
cf-classify:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 6
|
||||
always_on: true
|
||||
comfyui:
|
||||
max_mb: 14336
|
||||
priority: 4
|
||||
model_size_hints:
|
||||
llm_max_params: 34b
|
||||
image_gen_max: flux-dev-fp8
|
||||
|
|
@ -1,73 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-24gb
|
||||
vram_total_mb: 24576
|
||||
eviction_timeout_s: 10.0
|
||||
services:
|
||||
vllm:
|
||||
max_mb: 9000
|
||||
priority: 1
|
||||
idle_stop_after_s: 600
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||
port: 8000
|
||||
host_port: 8000
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
ollama:
|
||||
max_mb: 18432
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 4096
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 6
|
||||
cf-docuvision:
|
||||
max_mb: 8192
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
||||
port: 8003
|
||||
host_port: 8003
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
cf-stt:
|
||||
max_mb: 1200
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
backend: parakeet-tdt
|
||||
cf-tts:
|
||||
max_mb: 1024
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 4
|
||||
cf-embed:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 8
|
||||
always_on: true
|
||||
cf-classify:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 8
|
||||
always_on: true
|
||||
comfyui:
|
||||
max_mb: 20480
|
||||
priority: 4
|
||||
model_size_hints:
|
||||
llm_max_params: 70b
|
||||
image_gen_max: flux-dev-fp16
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-2gb
|
||||
vram_total_mb: 2048
|
||||
eviction_timeout_s: 15.0
|
||||
services:
|
||||
ollama:
|
||||
max_mb: 1536
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
cf-stt:
|
||||
max_mb: 200
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
backend: moonshine
|
||||
model_size_hints:
|
||||
llm_max_params: 3b
|
||||
image_gen_max: none
|
||||
|
|
@ -1,38 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-4gb
|
||||
vram_total_mb: 4096
|
||||
eviction_timeout_s: 15.0
|
||||
services:
|
||||
ollama:
|
||||
max_mb: 3072
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 1024
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
cf-stt:
|
||||
max_mb: 600
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
backend: faster-whisper
|
||||
cf-tts:
|
||||
max_mb: 512
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
comfyui:
|
||||
max_mb: 3584
|
||||
priority: 4
|
||||
model_size_hints:
|
||||
llm_max_params: 3b
|
||||
image_gen_max: sd15-fp8
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-6gb
|
||||
vram_total_mb: 6144
|
||||
eviction_timeout_s: 10.0
|
||||
services:
|
||||
vllm:
|
||||
max_mb: 5500
|
||||
priority: 1
|
||||
idle_stop_after_s: 600
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||
port: 8000
|
||||
host_port: 8000
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
ollama:
|
||||
max_mb: 3584
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 1536
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
cf-docuvision:
|
||||
max_mb: 3072
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
||||
port: 8003
|
||||
host_port: 8003
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
cf-stt:
|
||||
max_mb: 600
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
backend: faster-whisper
|
||||
cf-tts:
|
||||
max_mb: 768
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 1
|
||||
comfyui:
|
||||
max_mb: 5120
|
||||
priority: 4
|
||||
model_size_hints:
|
||||
llm_max_params: 7b
|
||||
image_gen_max: sd15
|
||||
|
|
@ -1,68 +0,0 @@
|
|||
schema_version: 1
|
||||
name: single-gpu-8gb
|
||||
vram_total_mb: 8192
|
||||
eviction_timeout_s: 10.0
|
||||
services:
|
||||
vllm:
|
||||
max_mb: 6500
|
||||
priority: 1
|
||||
idle_stop_after_s: 600
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
|
||||
port: 8000
|
||||
host_port: 8000
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
ollama:
|
||||
max_mb: 4096
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: "/usr/local/bin/ollama"
|
||||
args_template: "serve"
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
cf-vision:
|
||||
max_mb: 2048
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 3
|
||||
cf-docuvision:
|
||||
max_mb: 4096
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/devl/miniconda3/envs/cf/bin/python"
|
||||
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
|
||||
port: 8003
|
||||
host_port: 8003
|
||||
cwd: "/Library/Development/CircuitForge/circuitforge-core"
|
||||
cf-stt:
|
||||
max_mb: 1200
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
backend: parakeet-tdt
|
||||
cf-tts:
|
||||
max_mb: 1024
|
||||
priority: 2
|
||||
shared: true
|
||||
max_concurrent: 2
|
||||
comfyui:
|
||||
max_mb: 6144
|
||||
priority: 4
|
||||
managed:
|
||||
type: process
|
||||
exec_path: "/opt/miniconda3/envs/comfyui/bin/python"
|
||||
args_template: "/opt/ComfyUI/main.py --listen 0.0.0.0 --port {port} --cuda-device {gpu_id}"
|
||||
cwd: "/opt/ComfyUI"
|
||||
port: 8188
|
||||
host_port: 8188
|
||||
model_size_hints:
|
||||
llm_max_params: 8b
|
||||
image_gen_max: sdxl-fp8
|
||||
|
|
@ -1,121 +0,0 @@
|
|||
# circuitforge_core/resources/profiles/schema.py
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
SUPPORTED_SCHEMA_VERSION = 1
|
||||
|
||||
|
||||
class DockerSpec(BaseModel):
    """Spec for a Docker-managed service."""

    image: str                  # container image reference
    port: int                   # port the service listens on inside the container
    host_port: int              # port published on the host
    command_template: str = ""  # optional container command; may contain placeholders
    volumes: list[str] = Field(default_factory=list)  # volume mount specs
    env: dict[str, str] = Field(default_factory=dict)  # extra environment variables
    runtime: str = "nvidia"     # container runtime (GPU passthrough by default)
    ipc: str = "host"           # IPC namespace mode

    model_config = {"frozen": True}  # instances are immutable
|
||||
|
||||
|
||||
class ProcessSpec(BaseModel):
    """Spec for a process-managed service (non-Docker, e.g. conda env)."""

    exec_path: str           # path to the executable to launch
    args_template: str = ""  # argument string; may contain placeholders like {port}, {gpu_id}
    cwd: str = ""            # working directory ("" = inherit)
    env: dict[str, str] = Field(default_factory=dict)  # extra environment variables
    port: int = 0            # service port (0 = none)
    host_port: int = 0       # host-visible port (0 = none)
    # adopt=True: if the service is already listening on host_port, claim it rather
    # than spawning a new process (useful for system daemons like Ollama).
    adopt: bool = False
    # Override the health probe path; defaults to /health (Ollama uses /api/tags).
    health_path: str = "/health"

    model_config = {"frozen": True}  # instances are immutable
|
||||
|
||||
|
||||
class ServiceProfile(BaseModel):
    """Per-service resource limits plus (optionally) how the service is launched."""

    max_mb: int                 # VRAM budget in MB (profiles use 0 for CPU-only entries)
    priority: int               # scheduling/eviction priority
    shared: bool = False        # True if one instance serves multiple callers
    max_concurrent: int = 1     # concurrent requests allowed on a shared instance
    always_on: bool = False     # service is kept running permanently
    idle_stop_after_s: int = 0  # stop after this many idle seconds (0 = never)
    backend: str | None = None  # backend implementation hint (e.g. STT engine name)
    consumers: list[str] = Field(default_factory=list)  # downstream service names
    managed: DockerSpec | ProcessSpec | None = None  # launch spec; None = externally managed

    model_config = {"frozen": True}

    @model_validator(mode="before")
    @classmethod
    def _parse_managed(cls, values: Any) -> Any:
        """Coerce a raw ``managed`` mapping into DockerSpec/ProcessSpec via its ``type`` key.

        Non-dict inputs and already-parsed specs pass through untouched.
        Raises ValueError for an unknown ``type``. Returns a shallow copy of
        ``values`` so the caller's raw mapping is never mutated as a side
        effect of validation (the original implementation wrote into it).
        """
        if not isinstance(values, dict):
            return values
        raw = values.get("managed")
        if raw is None:
            return values
        if not isinstance(raw, dict):
            return values
        spec_type = raw.get("type")
        # The "type" discriminator is consumed here; the rest are model fields.
        managed_fields = {k: v for k, v in raw.items() if k != "type"}
        if spec_type == "docker":
            spec: DockerSpec | ProcessSpec = DockerSpec(**managed_fields)
        elif spec_type == "process":
            spec = ProcessSpec(**managed_fields)
        else:
            raise ValueError(f"Unknown managed service type: {spec_type!r}")
        # Copy-on-write: leave the input dict untouched for the caller.
        return {**values, "managed": spec}
|
||||
|
||||
|
||||
class GpuNodeEntry(BaseModel):
    """One GPU within a node's hardware description."""

    id: int                  # GPU index on the node
    vram_mb: int             # VRAM capacity in MB
    role: str                # role label — semantics defined by the profile; TODO confirm allowed values
    card: str = "unknown"    # card model name
    always_on: bool = False  # presumably marks a GPU hosting always-on services — verify against coordinator
    services: list[str] = Field(default_factory=list)  # service names assigned to this GPU

    model_config = {"frozen": True}  # instances are immutable
|
||||
|
||||
|
||||
class NodeProfile(BaseModel):
    """Static description of one node in a multi-node profile."""

    gpus: list[GpuNodeEntry]      # GPUs present on the node
    agent_url: str | None = None  # base URL of the node's agent, if known
    nas_mount: str | None = None  # shared storage mount point, if any

    model_config = {"frozen": True}  # instances are immutable
|
||||
|
||||
|
||||
class GpuProfile(BaseModel):
    """Top-level hardware/service profile loaded from a YAML file."""

    schema_version: int  # must equal SUPPORTED_SCHEMA_VERSION (checked in load_profile)
    name: str            # profile name, e.g. "single-gpu-8gb"
    vram_total_mb: int | None = None  # total VRAM in MB; None for CPU-only profiles
    eviction_timeout_s: float = 10.0  # seconds related to eviction — presumably a grace period; confirm with eviction engine
    services: dict[str, ServiceProfile] = Field(default_factory=dict)  # per-service limits keyed by service name
    model_size_hints: dict[str, str] = Field(default_factory=dict)  # e.g. llm_max_params, image_gen_max
    nodes: dict[str, NodeProfile] = Field(default_factory=dict)  # multi-node topology (empty for single-node)

    model_config = {"frozen": True}  # instances are immutable
|
||||
|
||||
|
||||
def load_profile(path: Path) -> GpuProfile:
    """Parse and validate a profile YAML file into a GpuProfile.

    Raises ValueError when the file is not a YAML mapping or declares a
    schema_version other than SUPPORTED_SCHEMA_VERSION.
    """
    parsed: dict[str, Any] = yaml.safe_load(path.read_text())
    if not isinstance(parsed, dict):
        raise ValueError(f"Profile file {path} must be a YAML mapping, got {type(parsed).__name__}")
    found = parsed.get("schema_version")
    if found != SUPPORTED_SCHEMA_VERSION:
        msg = (
            f"Unsupported schema_version {found!r} in {path}. "
            f"Expected {SUPPORTED_SCHEMA_VERSION}."
        )
        raise ValueError(msg)
    return GpuProfile.model_validate(parsed)
|
||||
|
|
@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
|
|||
|
||||
[project]
|
||||
name = "circuitforge-core"
|
||||
version = "0.7.0"
|
||||
description = "Shared scaffold for CircuitForge products"
|
||||
version = "0.8.0"
|
||||
description = "Shared scaffold for CircuitForge products (MIT)"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"pyyaml>=6.0",
|
||||
|
|
@ -14,32 +14,17 @@ dependencies = [
|
|||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
orch = [
|
||||
"fastapi>=0.110",
|
||||
"uvicorn[standard]>=0.29",
|
||||
"httpx>=0.27",
|
||||
"pydantic>=2.0",
|
||||
"typer[all]>=0.12",
|
||||
"psutil>=5.9",
|
||||
]
|
||||
tasks = [
|
||||
"httpx>=0.27",
|
||||
]
|
||||
manage = [
|
||||
"platformdirs>=4.0",
|
||||
"typer[all]>=0.12",
|
||||
]
|
||||
dev = [
|
||||
"circuitforge-core[orch]",
|
||||
"circuitforge-core[tasks]",
|
||||
"circuitforge-core[manage]",
|
||||
"pytest>=8.0",
|
||||
"pytest-asyncio>=0.23",
|
||||
"httpx>=0.27",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
cf-orch = "circuitforge_core.resources.cli:app"
|
||||
cf-manage = "circuitforge_core.manage.cli:app"
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
|
|
|
|||
|
|
@ -1,68 +0,0 @@
|
|||
from __future__ import annotations

import pytest
from unittest.mock import MagicMock
from fastapi.testclient import TestClient

from circuitforge_core.resources.agent.app import create_agent_app
from circuitforge_core.resources.models import GpuInfo
from circuitforge_core.resources.agent.eviction_executor import EvictionResult

# Canned single-GPU snapshot returned by the mocked monitor in every test.
MOCK_GPUS = [
    GpuInfo(
        gpu_id=0,
        name="RTX 4000",
        vram_total_mb=8192,
        vram_used_mb=1024,
        vram_free_mb=7168,
    ),
]


@pytest.fixture
def agent_client():
    """Agent app with mocked monitor/executor; returns (client, monitor, executor)."""
    mock_monitor = MagicMock()
    mock_monitor.poll.return_value = MOCK_GPUS
    mock_executor = MagicMock()
    app = create_agent_app(
        node_id="heimdall",
        monitor=mock_monitor,
        executor=mock_executor,
    )
    return TestClient(app), mock_monitor, mock_executor


def test_health_returns_ok(agent_client):
    """GET /health reports ok plus the configured node_id."""
    client, _, _ = agent_client
    resp = client.get("/health")
    assert resp.status_code == 200
    assert resp.json()["status"] == "ok"
    assert resp.json()["node_id"] == "heimdall"


def test_gpu_info_returns_gpu_list(agent_client):
    """GET /gpu-info serialises the monitor's GpuInfo snapshot."""
    client, _, _ = agent_client
    resp = client.get("/gpu-info")
    assert resp.status_code == 200
    data = resp.json()
    assert len(data["gpus"]) == 1
    assert data["gpus"][0]["gpu_id"] == 0
    assert data["gpus"][0]["name"] == "RTX 4000"
    assert data["gpus"][0]["vram_free_mb"] == 7168


def test_evict_calls_executor(agent_client):
    """POST /evict forwards pid and grace period to the eviction executor."""
    client, _, mock_executor = agent_client
    mock_executor.evict_pid.return_value = EvictionResult(
        success=True, method="sigterm", message="done"
    )
    resp = client.post("/evict", json={"pid": 1234, "grace_period_s": 5.0})
    assert resp.status_code == 200
    assert resp.json()["success"] is True
    mock_executor.evict_pid.assert_called_once_with(pid=1234, grace_period_s=5.0)


def test_evict_requires_pid(agent_client):
    """POST /evict without a pid is rejected with a 422 validation error."""
    client, _, _ = agent_client
    resp = client.post("/evict", json={"grace_period_s": 5.0})
    assert resp.status_code == 422
|
||||
|
|
@ -1,93 +0,0 @@
|
|||
# Tests for AgentSupervisor idle-stop behaviour: building the per-service
# idle-timeout config from profiles, and the sweep that posts /stop to agents.
import asyncio
import time
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry, ServiceInstance


def test_build_idle_stop_config_empty_without_registry():
    """Without a profile registry there are no idle-stop timeouts."""
    lm = LeaseManager()
    supervisor = AgentSupervisor(lease_manager=lm)
    assert supervisor._build_idle_stop_config() == {}


def test_build_idle_stop_config_from_profiles():
    """idle_stop_after_s values from profile services end up in the config map."""
    lm = LeaseManager()
    mock_svc = MagicMock()
    mock_svc.idle_stop_after_s = 600
    mock_profile = MagicMock()
    mock_profile.services = {"vllm": mock_svc}
    mock_profile_registry = MagicMock()
    mock_profile_registry.list_public.return_value = [mock_profile]

    supervisor = AgentSupervisor(lease_manager=lm, profile_registry=mock_profile_registry)
    config = supervisor._build_idle_stop_config()
    assert config == {"vllm": 600}


@pytest.mark.asyncio
async def test_run_idle_sweep_posts_stop():
    """A sweep posts /services/<name>/stop for an instance idle past its timeout."""
    lm = LeaseManager()
    service_registry = ServiceRegistry()

    # Upsert instance as running, then allocate + release to transition it to idle
    service_registry.upsert_instance(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        state="running",
        model="test-model",
        url="http://heimdall:8000",
    )
    alloc = service_registry.allocate(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        model="test-model",
        url="http://heimdall:8000",
        caller="test",
        ttl_s=300.0,
    )
    service_registry.release(alloc.allocation_id)

    # Backdate idle_since so it exceeds the timeout
    import dataclasses
    key = "vllm:heimdall:0"
    inst = service_registry._instances[key]
    service_registry._instances[key] = dataclasses.replace(inst, idle_since=time.time() - 700)

    # Profile registry advertises a 600 s idle-stop for vllm.
    mock_profile_registry = MagicMock()
    mock_svc = MagicMock()
    mock_svc.idle_stop_after_s = 600
    mock_profile = MagicMock()
    mock_profile.services = {"vllm": mock_svc}
    mock_profile_registry.list_public.return_value = [mock_profile]

    supervisor = AgentSupervisor(
        lease_manager=lm,
        service_registry=service_registry,
        profile_registry=mock_profile_registry,
    )
    supervisor.register("heimdall", "http://heimdall:7701")

    # Capture outgoing POSTs instead of performing HTTP.
    posted_urls = []

    async def fake_http_post(url: str) -> bool:
        posted_urls.append(url)
        return True

    supervisor._http_post = fake_http_post
    await supervisor._run_idle_sweep()

    assert len(posted_urls) == 1
    assert posted_urls[0] == "http://heimdall:7701/services/vllm/stop"


@pytest.mark.asyncio
async def test_run_idle_sweep_skips_without_registry():
    """Sweep is a no-op when no service registry is configured."""
    lm = LeaseManager()
    supervisor = AgentSupervisor(lease_manager=lm)
    # Should return immediately without error
    await supervisor._run_idle_sweep()
|
||||
|
|
@ -1,151 +0,0 @@
|
|||
# tests/test_resources/test_agent_watchdog.py
"""
Tests for AgentSupervisor watchdog behaviour:
- restore_from_store() reloads known nodes from NodeStore on startup
- register() persists to NodeStore
- restored nodes start offline and come online after a successful poll
- NodeStore=None path is a no-op (backwards compatibility)
"""
from __future__ import annotations

from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch

import pytest

from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.node_store import NodeStore


# ── fixtures ──────────────────────────────────────────────────────────────────

@pytest.fixture
def store(tmp_path: Path) -> NodeStore:
    """Fresh NodeStore backed by a database file in a temp directory."""
    return NodeStore(db_path=tmp_path / "nodes.db")


@pytest.fixture
def supervisor(store: NodeStore) -> AgentSupervisor:
    """Supervisor wired to the temp NodeStore."""
    return AgentSupervisor(lease_manager=LeaseManager(), node_store=store)


@pytest.fixture
def supervisor_no_store() -> AgentSupervisor:
    """Supervisor with persistence disabled (node_store=None)."""
    return AgentSupervisor(lease_manager=LeaseManager(), node_store=None)


# ── register() persists ───────────────────────────────────────────────────────

def test_register_persists_to_store(supervisor: AgentSupervisor, store: NodeStore) -> None:
    """register() writes the (node_id, url) row through to the store."""
    supervisor.register("heimdall", "http://127.0.0.1:7701")
    rows = store.all()
    assert len(rows) == 1
    assert rows[0] == ("heimdall", "http://127.0.0.1:7701")


def test_register_updates_url_in_store(supervisor: AgentSupervisor, store: NodeStore) -> None:
    """Re-registering the same node replaces its URL rather than adding a row."""
    supervisor.register("navi", "http://10.1.10.10:7701")
    supervisor.register("navi", "http://10.1.10.10:9999")
    rows = store.all()
    assert len(rows) == 1
    assert rows[0][1] == "http://10.1.10.10:9999"


def test_register_without_store_does_not_crash(supervisor_no_store: AgentSupervisor) -> None:
    """With node_store=None, register() still tracks the node in memory."""
    supervisor_no_store.register("heimdall", "http://127.0.0.1:7701")
    assert supervisor_no_store.get_node_info("heimdall") is not None


# ── restore_from_store() ──────────────────────────────────────────────────────

def test_restore_loads_known_nodes(tmp_path: Path) -> None:
    """Nodes written by a previous supervisor session are restored into a fresh one."""
    db = tmp_path / "nodes.db"

    # Session 1: register two nodes
    s1 = NodeStore(db_path=db)
    sup1 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s1)
    sup1.register("navi", "http://10.1.10.10:7701")
    sup1.register("strahl", "http://10.1.10.20:7701")

    # Session 2: fresh supervisor, same DB
    s2 = NodeStore(db_path=db)
    sup2 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s2)
    restored = sup2.restore_from_store()

    assert restored == 2
    assert sup2.get_node_info("navi") is not None
    assert sup2.get_node_info("strahl") is not None


def test_restore_marks_nodes_offline(tmp_path: Path) -> None:
    """Restored nodes start offline — they haven't been polled yet."""
    db = tmp_path / "nodes.db"

    s1 = NodeStore(db_path=db)
    AgentSupervisor(lease_manager=LeaseManager(), node_store=s1).register(
        "navi", "http://10.1.10.10:7701"
    )

    s2 = NodeStore(db_path=db)
    sup2 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s2)
    sup2.restore_from_store()

    assert sup2.online_agents() == {}


def test_restore_returns_zero_without_store() -> None:
    """restore_from_store() is a counted no-op when persistence is disabled."""
    sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=None)
    assert sup.restore_from_store() == 0


def test_restore_skips_already_registered(tmp_path: Path) -> None:
    """Nodes manually registered before restore_from_store() are not duplicated."""
    db = tmp_path / "nodes.db"
    store = NodeStore(db_path=db)
    store.upsert("heimdall", "http://127.0.0.1:7701")

    sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=store)
    sup.register("heimdall", "http://127.0.0.1:7701")  # already in memory
    restored = sup.restore_from_store()

    assert restored == 0  # already present, not double-counted


# ── restored node comes online after poll ─────────────────────────────────────

@pytest.mark.asyncio
async def test_restored_node_comes_online_after_poll(tmp_path: Path) -> None:
    """After restore, a successful poll_agent() brings the node online."""
    db = tmp_path / "nodes.db"
    store = NodeStore(db_path=db)
    store.upsert("navi", "http://10.1.10.10:7701")

    sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=store)
    sup.restore_from_store()

    # Stub poll_agent to succeed
    gpu_payload = {"gpus": [{"gpu_id": 0, "name": "RTX 4000",
                             "vram_total_mb": 8192, "vram_used_mb": 512, "vram_free_mb": 7680}]}
    resident_payload = {"residents": []}

    # First GET returns GPU info, second returns the resident-model list.
    mock_resp_gpu = MagicMock()
    mock_resp_gpu.raise_for_status = MagicMock()
    mock_resp_gpu.json.return_value = gpu_payload

    mock_resp_res = MagicMock()
    mock_resp_res.is_success = True
    mock_resp_res.json.return_value = resident_payload

    mock_client = AsyncMock()
    mock_client.get = AsyncMock(side_effect=[mock_resp_gpu, mock_resp_res])
    mock_client.__aenter__ = AsyncMock(return_value=mock_client)
    mock_client.__aexit__ = AsyncMock(return_value=False)

    with patch("circuitforge_core.resources.coordinator.agent_supervisor.httpx.AsyncClient",
               return_value=mock_client):
        result = await sup.poll_agent("navi")

    assert result is True
    assert "navi" in sup.online_agents()
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
from __future__ import annotations

from pathlib import Path
from unittest.mock import patch

from typer.testing import CliRunner

from circuitforge_core.resources.cli import app

# Shared Typer CLI runner for all tests in this module.
runner = CliRunner()


def test_cli_help():
    """--help exits cleanly and prints usage text."""
    result = runner.invoke(app, ["--help"])
    assert result.exit_code == 0
    assert "cf-orch" in result.output.lower() or "Usage" in result.output


def test_status_command_shows_no_coordinator_message():
    """status degrades gracefully when the coordinator is unreachable."""
    with patch("httpx.get", side_effect=ConnectionRefusedError("refused")):
        result = runner.invoke(app, ["status"])
        assert result.exit_code != 0 or "unreachable" in result.output.lower() \
            or "coordinator" in result.output.lower()


def test_install_service_creates_systemd_unit(tmp_path: Path):
    """install-service --dry-run mentions the systemd unit without writing it."""
    unit_path = tmp_path / "cf-orch.service"
    with patch(
        "circuitforge_core.resources.cli._SYSTEMD_UNIT_PATH", unit_path
    ):
        result = runner.invoke(app, ["install-service", "--dry-run"])
        assert result.exit_code == 0
        assert "cf-orch.service" in result.output or "systemd" in result.output.lower()
|
||||
|
|
@ -1,94 +0,0 @@
|
|||
import json
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import httpretty
|
||||
from circuitforge_core.resources.client import CFOrchClient, Allocation
|
||||
|
||||
_ALLOC_BODY = (
|
||||
'{"allocation_id":"abc123","service":"vllm","node_id":"heimdall",'
|
||||
'"gpu_id":0,"model":"Ouro-1.4B","url":"http://heimdall:8000","started":false,"warm":true}'
|
||||
)
|
||||
|
||||
|
||||
@httpretty.activate
|
||||
def test_sync_allocate_returns_allocation():
|
||||
httpretty.register_uri(
|
||||
httpretty.POST, "http://orch:7700/api/services/vllm/allocate",
|
||||
body=_ALLOC_BODY, content_type="application/json",
|
||||
)
|
||||
httpretty.register_uri(
|
||||
httpretty.DELETE, "http://orch:7700/api/services/vllm/allocations/abc123",
|
||||
body='{"released":true}', content_type="application/json",
|
||||
)
|
||||
client = CFOrchClient("http://orch:7700")
|
||||
with client.allocate("vllm", model_candidates=["Ouro-1.4B"], caller="test") as alloc:
|
||||
assert isinstance(alloc, Allocation)
|
||||
assert alloc.url == "http://heimdall:8000"
|
||||
assert alloc.model == "Ouro-1.4B"
|
||||
assert alloc.allocation_id == "abc123"
|
||||
assert httpretty.last_request().method == "DELETE"
|
||||
|
||||
|
||||
@httpretty.activate
|
||||
def test_sync_allocate_ignores_404_on_release():
|
||||
httpretty.register_uri(
|
||||
httpretty.POST, "http://orch:7700/api/services/vllm/allocate",
|
||||
body='{"allocation_id":"xyz","service":"vllm","node_id":"a","gpu_id":0,'
|
||||
'"model":"m","url":"http://a:8000","started":false,"warm":false}',
|
||||
content_type="application/json",
|
||||
)
|
||||
httpretty.register_uri(
|
||||
httpretty.DELETE, "http://orch:7700/api/services/vllm/allocations/xyz",
|
||||
status=404, body='{"detail":"not found"}', content_type="application/json",
|
||||
)
|
||||
client = CFOrchClient("http://orch:7700")
|
||||
with client.allocate("vllm", model_candidates=["m"]) as alloc:
|
||||
assert alloc.url == "http://a:8000"
|
||||
# No exception raised — 404 on release is silently ignored
|
||||
|
||||
|
||||
@httpretty.activate
def test_sync_allocate_raises_on_503():
    """A 503 from the allocate endpoint surfaces as RuntimeError."""
    httpretty.register_uri(
        httpretty.POST,
        "http://orch:7700/api/services/vllm/allocate",
        status=503,
        body='{"detail":"no capacity"}',
        content_type="application/json",
    )
    orch = CFOrchClient("http://orch:7700")
    with pytest.raises(RuntimeError, match="cf-orch allocation failed"):
        with orch.allocate("vllm", model_candidates=["m"]):
            pass
|
||||
|
||||
|
||||
async def test_async_allocate_works():
    """allocate_async round-trips POST/DELETE through a mocked httpx.AsyncClient."""
    # httpretty only patches stdlib sockets; httpx async uses anyio sockets so
    # we mock httpx.AsyncClient directly instead.
    allocation_payload = {
        "allocation_id": "a1",
        "service": "vllm",
        "node_id": "n",
        "gpu_id": 0,
        "model": "m",
        "url": "http://n:8000",
        "started": False,
        "warm": False,
    }

    def _response(payload, status_code=200):
        # Minimal stand-in for an httpx.Response.
        fake = MagicMock()
        fake.status_code = status_code
        fake.is_success = status_code < 400
        fake.json.return_value = payload
        return fake

    post_mock = AsyncMock(return_value=_response(allocation_payload))
    delete_mock = AsyncMock(return_value=_response({"released": True}))

    fake_http = MagicMock()
    fake_http.post = post_mock
    fake_http.delete = delete_mock
    fake_http.__aenter__ = AsyncMock(return_value=fake_http)
    fake_http.__aexit__ = AsyncMock(return_value=False)

    with patch("httpx.AsyncClient", return_value=fake_http):
        orch = CFOrchClient("http://orch:7700")
        async with orch.allocate_async("vllm", model_candidates=["m"]) as lease:
            assert (lease.allocation_id, lease.url) == ("a1", "http://n:8000")
    # Exiting the async context must have released the allocation exactly once.
    delete_mock.assert_called_once()
|
||||
|
|
@ -1,132 +0,0 @@
|
|||
import pytest
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
from fastapi.testclient import TestClient
|
||||
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
|
||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
||||
|
||||
|
||||
def _make_supervisor_mock(online: bool = True):
    """Supervisor mock exposing one 8 GB 'heimdall' agent (online unless told otherwise)."""
    agent = AgentRecord(node_id="heimdall", agent_url="http://heimdall:7701")
    agent.gpus = [GpuInfo(0, "RTX 4000", 8192, 0, 8192)]
    agent.online = online

    supervisor = MagicMock()
    supervisor.online_agents.return_value = {"heimdall": agent} if online else {}
    supervisor.get_node_info.return_value = NodeInfo(
        node_id="heimdall",
        agent_url="http://heimdall:7701",
        gpus=agent.gpus,
        last_heartbeat=0.0,
    )
    return supervisor
|
||||
|
||||
|
||||
@pytest.fixture
def alloc_client():
    """(TestClient, supervisor mock, ServiceRegistry) wired into a coordinator app."""
    supervisor = _make_supervisor_mock()
    services = ServiceRegistry()
    app = create_coordinator_app(
        lease_manager=LeaseManager(),
        profile_registry=ProfileRegistry(),
        agent_supervisor=supervisor,
        service_registry=services,
    )
    return TestClient(app), supervisor, services
|
||||
|
||||
|
||||
def test_allocate_returns_allocation_id_and_url(alloc_client):
    """POST /allocate returns an allocation id plus the running instance URL."""
    client, _supervisor, _services = alloc_client
    with patch("httpx.AsyncClient") as http_cls:
        agent_reply = MagicMock()
        agent_reply.is_success = True
        agent_reply.json.return_value = {"running": True, "url": "http://heimdall:8000"}
        http_cls.return_value.__aenter__.return_value.post = AsyncMock(return_value=agent_reply)

        resp = client.post(
            "/api/services/vllm/allocate",
            json={"model_candidates": ["Ouro-1.4B"], "ttl_s": 300.0, "caller": "test"},
        )

    assert resp.status_code == 200
    body = resp.json()
    assert "allocation_id" in body
    assert (body["service"], body["node_id"], body["url"]) == (
        "vllm", "heimdall", "http://heimdall:8000"
    )
|
||||
|
||||
|
||||
def test_allocate_returns_503_when_no_online_nodes(alloc_client):
    """With zero online agents the coordinator reports no capacity."""
    client, supervisor, _services = alloc_client
    supervisor.online_agents.return_value = {}
    resp = client.post(
        "/api/services/vllm/allocate", json={"model_candidates": ["Ouro-1.4B"]}
    )
    assert resp.status_code == 503
|
||||
|
||||
|
||||
def test_allocate_returns_422_for_empty_candidates(alloc_client):
    """An empty model_candidates list is a validation error."""
    client, _supervisor, _services = alloc_client
    resp = client.post("/api/services/vllm/allocate", json={"model_candidates": []})
    assert resp.status_code == 422
|
||||
|
||||
|
||||
def test_allocate_returns_422_for_unknown_service(alloc_client):
    """Allocating against a service name the coordinator does not know → 422."""
    client, _supervisor, _services = alloc_client
    resp = client.post("/api/services/cf-made-up/allocate", json={"model_candidates": ["x"]})
    assert resp.status_code == 422
|
||||
|
||||
|
||||
def test_allocate_records_in_registry(alloc_client):
    """A successful allocation shows up in GET /status for the service."""
    client, _supervisor, _services = alloc_client
    with patch("httpx.AsyncClient") as http_cls:
        agent_reply = MagicMock()
        agent_reply.is_success = True
        agent_reply.json.return_value = {"running": True, "url": "http://heimdall:8000"}
        http_cls.return_value.__aenter__.return_value.post = AsyncMock(return_value=agent_reply)

        resp = client.post(
            "/api/services/vllm/allocate",
            json={"model_candidates": ["Ouro-1.4B"], "ttl_s": 300.0, "caller": "test"},
        )

    assert resp.status_code == 200
    new_id = resp.json()["allocation_id"]

    status_resp = client.get("/api/services/vllm/status")
    assert status_resp.status_code == 200
    status = status_resp.json()
    assert status["service"] == "vllm"
    assert new_id in [a["allocation_id"] for a in status["allocations"]]
|
||||
|
||||
|
||||
def test_release_allocation(alloc_client):
    """DELETE on an allocation releases it and removes it from /status."""
    client, _supervisor, _services = alloc_client
    with patch("httpx.AsyncClient") as http_cls:
        agent_reply = MagicMock()
        agent_reply.is_success = True
        agent_reply.json.return_value = {"running": True, "url": "http://heimdall:8000"}
        http_cls.return_value.__aenter__.return_value.post = AsyncMock(return_value=agent_reply)

        resp = client.post(
            "/api/services/vllm/allocate",
            json={"model_candidates": ["Ouro-1.4B"], "ttl_s": 300.0, "caller": "test"},
        )

    assert resp.status_code == 200
    alloc_id = resp.json()["allocation_id"]

    released = client.delete(f"/api/services/vllm/allocations/{alloc_id}")
    assert released.status_code == 200
    assert released.json() == {"released": True, "allocation_id": alloc_id}

    remaining = client.get("/api/services/vllm/status").json()["allocations"]
    assert alloc_id not in [a["allocation_id"] for a in remaining]
|
||||
|
||||
|
||||
def test_release_allocation_not_found(alloc_client):
    """Releasing an unknown allocation id yields 404."""
    client, _supervisor, _services = alloc_client
    resp = client.delete("/api/services/vllm/allocations/bad-id")
    assert resp.status_code == 404
|
||||
|
|
@ -1,183 +0,0 @@
|
|||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
from pathlib import Path
|
||||
from fastapi.testclient import TestClient
|
||||
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
||||
from circuitforge_core.resources.profiles.schema import load_profile
|
||||
|
||||
|
||||
@pytest.fixture
def coordinator_client():
    """(TestClient, LeaseManager) for a coordinator app with one mocked node."""
    leases = LeaseManager()
    leases.register_gpu("heimdall", 0, 8192)

    heimdall = NodeInfo(
        node_id="heimdall",
        agent_url="http://localhost:7701",
        gpus=[
            GpuInfo(
                gpu_id=0,
                name="RTX 4000",
                vram_total_mb=8192,
                vram_used_mb=0,
                vram_free_mb=8192,
            )
        ],
        last_heartbeat=0.0,
    )
    supervisor = MagicMock()
    supervisor.all_nodes.return_value = [heimdall]
    supervisor.get_node_info.return_value = NodeInfo(
        node_id="heimdall",
        agent_url="http://localhost:7701",
        gpus=[],
        last_heartbeat=0.0,
    )

    app = create_coordinator_app(
        lease_manager=leases,
        profile_registry=ProfileRegistry(),
        agent_supervisor=supervisor,
        service_registry=ServiceRegistry(),
    )
    return TestClient(app), leases
|
||||
|
||||
|
||||
def test_health_returns_ok(coordinator_client):
    """GET /api/health reports status 'ok'."""
    client, _leases = coordinator_client
    resp = client.get("/api/health")
    assert resp.status_code == 200
    assert resp.json()["status"] == "ok"
|
||||
|
||||
|
||||
def test_get_nodes_returns_list(coordinator_client):
    """GET /api/nodes lists the single registered node."""
    client, _leases = coordinator_client
    resp = client.get("/api/nodes")
    assert resp.status_code == 200
    assert [n["node_id"] for n in resp.json()["nodes"]] == ["heimdall"]
|
||||
|
||||
|
||||
def test_get_profiles_returns_public_profiles(coordinator_client):
    """The bundled public profile set includes single-gpu-8gb."""
    client, _leases = coordinator_client
    resp = client.get("/api/profiles")
    assert resp.status_code == 200
    assert any(p["name"] == "single-gpu-8gb" for p in resp.json()["profiles"])
|
||||
|
||||
|
||||
def test_post_lease_grants_lease(coordinator_client):
    """POST /api/leases grants the requested VRAM to the named service."""
    client, _leases = coordinator_client
    resp = client.post(
        "/api/leases",
        json={
            "node_id": "heimdall",
            "gpu_id": 0,
            "mb": 2048,
            "service": "peregrine",
            "priority": 1,
        },
    )
    assert resp.status_code == 200
    lease = resp.json()["lease"]
    assert lease["mb_granted"] == 2048
    assert lease["holder_service"] == "peregrine"
    assert "lease_id" in lease
|
||||
|
||||
|
||||
def test_delete_lease_releases_it(coordinator_client):
    """A granted lease can be released via DELETE /api/leases/{id}."""
    client, _leases = coordinator_client
    granted = client.post(
        "/api/leases",
        json={
            "node_id": "heimdall",
            "gpu_id": 0,
            "mb": 2048,
            "service": "peregrine",
            "priority": 1,
        },
    )
    lease_id = granted.json()["lease"]["lease_id"]

    resp = client.delete(f"/api/leases/{lease_id}")
    assert resp.status_code == 200
    assert resp.json()["released"] is True
|
||||
|
||||
|
||||
def test_delete_unknown_lease_returns_404(coordinator_client):
    """Releasing a lease id that was never granted yields 404."""
    client, _leases = coordinator_client
    resp = client.delete("/api/leases/nonexistent-id")
    assert resp.status_code == 404
|
||||
|
||||
|
||||
def test_get_leases_returns_active_leases(coordinator_client):
    """GET /api/leases lists leases that are currently held."""
    client, _leases = coordinator_client
    client.post(
        "/api/leases",
        json={
            "node_id": "heimdall",
            "gpu_id": 0,
            "mb": 1024,
            "service": "kiwi",
            "priority": 2,
        },
    )
    resp = client.get("/api/leases")
    assert resp.status_code == 200
    assert len(resp.json()["leases"]) == 1
|
||||
|
||||
|
||||
def test_dashboard_serves_html(coordinator_client):
|
||||
"""GET / returns the dashboard HTML page."""
|
||||
client, _ = coordinator_client
|
||||
resp = client.get("/")
|
||||
assert resp.status_code == 200
|
||||
assert "text/html" in resp.headers["content-type"]
|
||||
# Verify key structural markers are present (without asserting exact markup)
|
||||
assert "cf-orch" in resp.text
|
||||
assert "/api/nodes" in resp.text
|
||||
assert "/api/leases" in resp.text
|
||||
|
||||
|
||||
def test_online_agents_excludes_offline():
    """online_agents() filters out agents whose online flag is False."""
    supervisor = AgentSupervisor(LeaseManager())
    supervisor.register("online_node", "http://a:7701")
    supervisor.register("offline_node", "http://b:7701")
    supervisor._agents["online_node"].online = True
    supervisor._agents["offline_node"].online = False

    visible = supervisor.online_agents()
    assert "online_node" in visible
    assert "offline_node" not in visible
|
||||
|
||||
|
||||
def test_resident_keys_returns_set_of_node_service():
    """resident_keys() yields 'node:service' strings for every resident."""
    manager = LeaseManager()
    manager.set_residents_for_node("heimdall", [("vllm", "Ouro-1.4B"), ("ollama", None)])
    assert manager.resident_keys() == {"heimdall:vllm", "heimdall:ollama"}
|
||||
|
||||
|
||||
def test_single_gpu_8gb_profile_has_idle_stop_after_s():
    """The shipped single-gpu-8gb profile configures vllm idle stop at 600 s."""
    profile = load_profile(
        Path("circuitforge_core/resources/profiles/public/single-gpu-8gb.yaml")
    )
    vllm_cfg = profile.services.get("vllm")
    assert vllm_cfg is not None
    assert hasattr(vllm_cfg, "idle_stop_after_s")
    assert vllm_cfg.idle_stop_after_s == 600
|
||||
|
||||
|
||||
def test_ensure_service_returns_503_when_vram_too_low():
    """VRAM pre-flight guard fires before any HTTP request when free VRAM < service max_mb."""
    # Threshold = full max_mb (not half); 100 MB free on any profile triggers 503.
    leases = LeaseManager()
    leases.register_gpu("low-vram-node", 0, 512)

    supervisor = MagicMock()
    supervisor.all_nodes.return_value = []
    supervisor.get_node_info.return_value = NodeInfo(
        node_id="low-vram-node",
        agent_url="http://localhost:7701",
        gpus=[
            GpuInfo(
                gpu_id=0,
                name="GTX 1050",
                vram_total_mb=512,
                vram_used_mb=412,
                vram_free_mb=100,
            )
        ],
        last_heartbeat=0.0,
    )

    app = create_coordinator_app(
        lease_manager=leases,
        profile_registry=ProfileRegistry(),
        agent_supervisor=supervisor,
        service_registry=ServiceRegistry(),
    )
    client = TestClient(app)

    resp = client.post(
        "/api/services/vllm/ensure",
        json={"node_id": "low-vram-node", "gpu_id": 0, "params": {"model": "some-model"}},
    )

    assert resp.status_code == 503
    assert "Insufficient VRAM" in resp.json()["detail"]
    # Guard must fire before any agent HTTP call is attempted.
    supervisor.get_node_info.assert_called_once_with("low-vram-node")
|
||||
|
|
@ -1,148 +0,0 @@
|
|||
"""Tests for HeimdallAuthMiddleware — TTL cache and request gating."""
|
||||
import time
|
||||
import pytest
|
||||
from unittest.mock import patch, MagicMock
|
||||
from fastapi import FastAPI
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from circuitforge_core.resources.coordinator.auth import (
|
||||
HeimdallAuthMiddleware,
|
||||
_ValidationCache,
|
||||
CACHE_TTL_S,
|
||||
)
|
||||
|
||||
|
||||
# ── Cache unit tests ──────────────────────────────────────────────────────────
|
||||
|
||||
def test_cache_miss_returns_none():
    """Looking up a key that was never set yields None."""
    assert _ValidationCache().get("nonexistent") is None
|
||||
|
||||
|
||||
def test_cache_stores_and_retrieves():
    """A stored entry comes back with its validity and tier intact."""
    cache = _ValidationCache()
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    hit = cache.get("key1")
    assert hit is not None
    assert (hit.valid, hit.tier) == (True, "paid")
|
||||
|
||||
def test_cache_entry_expires():
    """Entries older than the TTL are treated as misses."""
    cache = _ValidationCache(ttl_s=0.05)
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    time.sleep(0.1)  # outlive the 50 ms TTL
    assert cache.get("key1") is None
|
||||
|
||||
|
||||
def test_cache_evict_removes_key():
    """evict() drops the entry immediately."""
    cache = _ValidationCache()
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    cache.evict("key1")
    assert cache.get("key1") is None
|
||||
|
||||
|
||||
def test_cache_prune_removes_expired():
    """prune() reports how many expired entries it discarded."""
    cache = _ValidationCache(ttl_s=0.05)
    for key in ("k1", "k2"):
        cache.set(key, valid=True, tier="paid", user_id="")
    time.sleep(0.1)  # let both entries expire
    assert cache.prune() == 2
|
||||
|
||||
|
||||
# ── Middleware integration tests ──────────────────────────────────────────────
|
||||
|
||||
def _make_app_with_auth(middleware: HeimdallAuthMiddleware) -> TestClient:
    """Build a minimal FastAPI app guarded by *middleware*, wrapped in a TestClient."""
    application = FastAPI()
    application.middleware("http")(middleware)

    @application.get("/api/health")
    def health():
        return {"status": "ok"}

    @application.post("/api/services/vllm/allocate")
    def allocate():
        return {"allocation_id": "abc", "url": "http://gpu:8000"}

    # raise_server_exceptions=False so middleware errors surface as responses.
    return TestClient(application, raise_server_exceptions=False)
|
||||
|
||||
|
||||
def _patched_middleware(valid: bool, tier: str = "paid") -> HeimdallAuthMiddleware:
|
||||
"""Return a middleware whose Heimdall call is pre-mocked."""
|
||||
mw = HeimdallAuthMiddleware(
|
||||
heimdall_url="http://heimdall.test",
|
||||
min_tier="paid",
|
||||
)
|
||||
mw._validate_against_heimdall = MagicMock( # type: ignore[method-assign]
|
||||
return_value=(valid, tier, "user-1" if valid else "")
|
||||
)
|
||||
return mw
|
||||
|
||||
|
||||
def test_health_exempt_no_auth_required():
    """/api/health is reachable with no Authorization header at all."""
    client = _make_app_with_auth(_patched_middleware(valid=True))
    assert client.get("/api/health").status_code == 200
|
||||
|
||||
|
||||
def test_missing_auth_header_returns_401():
    """A guarded route with no Authorization header → 401."""
    client = _make_app_with_auth(_patched_middleware(valid=True))
    assert client.post("/api/services/vllm/allocate").status_code == 401
|
||||
|
||||
|
||||
def test_invalid_key_returns_403():
    """A key the (mocked) Heimdall validation rejects → 403."""
    client = _make_app_with_auth(_patched_middleware(valid=False))
    resp = client.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer BAD-KEY"},
    )
    assert resp.status_code == 403
|
||||
|
||||
|
||||
def test_valid_paid_key_passes():
    """A valid paid-tier key reaches the route handler."""
    client = _make_app_with_auth(_patched_middleware(valid=True, tier="paid"))
    resp = client.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer CFG-KIWI-GOOD-GOOD-GOOD"},
    )
    assert resp.status_code == 200
|
||||
|
||||
|
||||
def test_free_tier_key_rejected_when_min_is_paid():
    """A valid but free-tier key is rejected when min_tier='paid'."""
    client = _make_app_with_auth(_patched_middleware(valid=True, tier="free"))
    resp = client.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer CFG-KIWI-FREE-FREE-FREE"},
    )
    assert resp.status_code == 403
    assert "paid" in resp.json()["detail"]
|
||||
|
||||
|
||||
def test_cache_prevents_second_heimdall_call():
    """The TTL cache answers the second request for the same key."""
    middleware = _patched_middleware(valid=True, tier="paid")
    client = _make_app_with_auth(middleware)
    headers = {"Authorization": "Bearer CFG-KIWI-CACHED-KEY-1"}
    for _ in range(2):
        client.post("/api/services/vllm/allocate", headers=headers)
    # Heimdall should only have been called once — second hit is from cache
    assert middleware._validate_against_heimdall.call_count == 1  # type: ignore[attr-defined]
|
||||
|
||||
|
||||
def test_from_env_returns_none_without_heimdall_url(monkeypatch):
    """Without HEIMDALL_URL in the environment, from_env() returns None."""
    monkeypatch.delenv("HEIMDALL_URL", raising=False)
    assert HeimdallAuthMiddleware.from_env() is None
|
||||
|
||||
|
||||
def test_from_env_returns_middleware_when_set(monkeypatch):
    """With HEIMDALL_URL set, from_env() builds a middleware pointed at it."""
    monkeypatch.setenv("HEIMDALL_URL", "http://heimdall.test")
    middleware = HeimdallAuthMiddleware.from_env()
    assert middleware is not None
    assert middleware._heimdall == "http://heimdall.test"
|
||||
|
|
@ -1,215 +0,0 @@
|
|||
# tests/test_resources/test_coordinator_probe.py
|
||||
"""
|
||||
Unit tests for _run_instance_probe_loop in coordinator/app.py.
|
||||
|
||||
Covers:
|
||||
- healthy path: /health → 200 → state transitions starting → running
|
||||
- timeout path: no healthy response within _PROBE_TIMEOUT_S → starting → stopped
|
||||
- cleanup path: non-starting instance cleans up its start_times entry
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from circuitforge_core.resources.coordinator.app import (
|
||||
_PROBE_TIMEOUT_S,
|
||||
_run_instance_probe_loop,
|
||||
)
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceInstance, ServiceRegistry
|
||||
|
||||
|
||||
# ── helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _inst(**kwargs) -> ServiceInstance:
    """ServiceInstance with sensible defaults, overridable via kwargs."""
    params = {
        "service": "vllm",
        "node_id": "node1",
        "gpu_id": 0,
        "state": "starting",
        "model": "qwen",
        "url": "http://localhost:8000",
        **kwargs,
    }
    return ServiceInstance(**params)
|
||||
|
||||
|
||||
def _registry(*instances: ServiceInstance) -> MagicMock:
    """ServiceRegistry mock whose all_instances() returns the given instances."""
    registry = MagicMock(spec=ServiceRegistry)
    registry.all_instances.return_value = list(instances)
    return registry
|
||||
|
||||
|
||||
def _health_resp(status: int = 200) -> MagicMock:
|
||||
"""Context-manager mock that simulates an HTTP response."""
|
||||
resp = MagicMock()
|
||||
resp.status = status
|
||||
resp.__enter__ = lambda s: resp
|
||||
resp.__exit__ = MagicMock(return_value=False)
|
||||
return resp
|
||||
|
||||
|
||||
async def _one_tick(coro_fn, registry, *, time_val: float = 1000.0, **url_patch):
|
||||
"""
|
||||
Run the probe loop for exactly one iteration then cancel it.
|
||||
|
||||
asyncio.sleep is patched to return immediately on the first call
|
||||
and raise CancelledError on the second (ending the loop cleanly).
|
||||
"""
|
||||
calls = 0
|
||||
|
||||
async def _fake_sleep(_delay):
|
||||
nonlocal calls
|
||||
calls += 1
|
||||
if calls > 1:
|
||||
raise asyncio.CancelledError()
|
||||
|
||||
patches = [
|
||||
patch("asyncio.sleep", new=_fake_sleep),
|
||||
patch("time.time", return_value=time_val),
|
||||
]
|
||||
if url_patch:
|
||||
patches.append(patch("urllib.request.urlopen", **url_patch))
|
||||
|
||||
ctx = [p.__enter__() for p in patches]
|
||||
try:
|
||||
await coro_fn(registry)
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
finally:
|
||||
for p in reversed(patches):
|
||||
p.__exit__(None, None, None)
|
||||
|
||||
|
||||
# ── tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@pytest.mark.asyncio
async def test_probe_transitions_starting_to_running():
    """GET /health → 200 while in starting state → upsert_instance(state='running')."""
    registry = _registry(_inst(state="starting", url="http://localhost:8000"))

    ticks = 0

    async def fake_sleep(_delay):
        nonlocal ticks
        ticks += 1
        if ticks > 1:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", return_value=1000.0), \
            patch("urllib.request.urlopen", return_value=_health_resp(200)):
        try:
            await _run_instance_probe_loop(registry)
        except asyncio.CancelledError:
            pass

    registry.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="running", model="qwen", url="http://localhost:8000",
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_probe_transitions_starting_to_stopped_on_timeout():
    """No healthy response + time past _PROBE_TIMEOUT_S → upsert_instance(state='stopped').

    Tick 1: seeds start_times[key] = 1000.0
    Tick 2: time has advanced past _PROBE_TIMEOUT_S → timeout fires → stopped
    Tick 3: CancelledError exits the loop
    """
    registry = _registry(_inst(state="starting", url="http://localhost:8000"))

    ticks = 0
    # Tick 1: t=1000 (seed); Tick 2: t=far_future (timeout fires)
    clock = [1000.0, 1000.0 + _PROBE_TIMEOUT_S + 1.0]

    async def fake_sleep(_delay):
        nonlocal ticks
        ticks += 1
        if ticks > 2:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", side_effect=clock * 10), \
            patch("urllib.request.urlopen", side_effect=OSError("connection refused")):
        try:
            await _run_instance_probe_loop(registry)
        except asyncio.CancelledError:
            pass

    registry.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="stopped", model="qwen", url="http://localhost:8000",
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_probe_cleans_up_start_times_for_non_starting():
    """
    An instance that is no longer in 'starting' state should not cause
    upsert_instance to be called, and its key should be removed from start_times.

    Verified indirectly over three ticks: tick 1 sees state='starting' (seeds
    the key and transitions to running); tick 2 sees the registry reporting
    state='running' (no further upsert); tick 3 cancels the loop.
    """
    starting = _inst(state="starting", url="http://localhost:8000")
    running = _inst(state="running", url="http://localhost:8000")

    tick = 0

    def current_instances():
        # Tick 1 serves the starting instance, later ticks the running one.
        return [starting] if tick <= 1 else [running]

    registry = MagicMock(spec=ServiceRegistry)
    registry.all_instances.side_effect = current_instances

    async def fake_sleep(_delay):
        nonlocal tick
        tick += 1
        if tick > 2:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", return_value=1000.0), \
            patch("urllib.request.urlopen", return_value=_health_resp(200)):
        try:
            await _run_instance_probe_loop(registry)
        except asyncio.CancelledError:
            pass

    # upsert should have been called exactly once (the starting→running transition)
    assert registry.upsert_instance.call_count == 1
    registry.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="running", model="qwen", url="http://localhost:8000",
    )
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_probe_no_url_does_not_attempt_health_check():
    """Instance with no URL stays in starting state (no health check, no timeout yet)."""
    registry = _registry(_inst(state="starting", url=None))

    ticks = 0

    async def fake_sleep(_delay):
        nonlocal ticks
        ticks += 1
        if ticks > 1:
            raise asyncio.CancelledError()

    with patch("asyncio.sleep", new=fake_sleep), \
            patch("time.time", return_value=1000.0), \
            patch("urllib.request.urlopen") as urlopen_mock:
        try:
            await _run_instance_probe_loop(registry)
        except asyncio.CancelledError:
            pass

    urlopen_mock.assert_not_called()
    registry.upsert_instance.assert_not_called()
|
||||
|
|
@ -1,215 +0,0 @@
|
|||
# tests/test_resources/test_docuvision.py
|
||||
"""
|
||||
Unit tests for cf-docuvision FastAPI service (circuitforge_core/resources/docuvision/app.py).
|
||||
|
||||
Covers:
|
||||
- GET /health → status + model path
|
||||
- POST /extract → image_b64, image_path, hint routing, metadata fields
|
||||
- _parse_dolphin_output → JSON list path, table detection, plain-text fallback
|
||||
- _image_from_request → missing both fields → 422; bad image_path → 404
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from fastapi.testclient import TestClient
|
||||
from PIL import Image
|
||||
|
||||
import circuitforge_core.resources.docuvision.app as docuvision_module
|
||||
from circuitforge_core.resources.docuvision.app import (
|
||||
_parse_dolphin_output,
|
||||
app,
|
||||
)
|
||||
|
||||
|
||||
# ── fixtures ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _make_jpeg_b64(width: int = 10, height: int = 10) -> str:
    """Return a base64-encoded width×height white JPEG."""
    canvas = Image.new("RGB", (width, height), color=(255, 255, 255))
    encoded = io.BytesIO()
    canvas.save(encoded, format="JPEG")
    return base64.b64encode(encoded.getvalue()).decode()
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
def _reset_module_state():
    """Reset the docuvision module's cached model state around every test."""
    # Pre-test: known-clean state with a fake model path on CPU.
    docuvision_module._model = None
    docuvision_module._processor = None
    docuvision_module._model_path = "/fake/model"
    docuvision_module._device = "cpu"
    yield
    # Post-test: drop any model/processor a test injected.
    docuvision_module._model = None
    docuvision_module._processor = None
|
||||
|
||||
|
||||
@pytest.fixture
def mock_model():
    """
    Inject fake model + processor into the module so _load_model() is skipped.

    The processor returns a dict-like with 'input_ids'; the model generate()
    returns a tensor-like whose decode produces a JSON string.
    """
    token_ids = MagicMock()
    token_ids.shape = [1, 5]  # input_len = 5

    tensors = {"input_ids": token_ids}
    processor_output = MagicMock()
    processor_output.__getitem__ = lambda self, key: tensors[key]
    processor_output.to = lambda device: processor_output

    generated = MagicMock()
    generated.__getitem__ = lambda self, idx: MagicMock()  # output_ids[0]

    fake_model = MagicMock()
    fake_model.generate.return_value = generated

    fake_processor = MagicMock()
    fake_processor.return_value = processor_output
    fake_processor.decode.return_value = json.dumps([
        {"type": "heading", "text": "Invoice", "bbox": [0.0, 0.0, 1.0, 0.1]},
        {"type": "table", "text": "row1", "html": "<table><tr><td>row1</td></tr></table>",
         "bbox": [0.0, 0.1, 1.0, 0.5]},
    ])

    docuvision_module._model = fake_model
    docuvision_module._processor = fake_processor
    return fake_model, fake_processor
|
||||
|
||||
|
||||
@pytest.fixture
def client():
    """TestClient bound to the docuvision FastAPI app."""
    return TestClient(app)
|
||||
|
||||
|
||||
# ── health ────────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_health_returns_ok(client):
    """GET /health reports ok plus the configured model path."""
    resp = client.get("/health")
    assert resp.status_code == 200
    payload = resp.json()
    assert (payload["status"], payload["model"]) == ("ok", "/fake/model")
|
||||
|
||||
|
||||
# ── _parse_dolphin_output ────────────────────────────────────────────────────
|
||||
|
||||
def test_parse_json_list_elements():
    """A JSON list of typed elements parses into elements plus joined raw text."""
    raw = json.dumps([
        {"type": "heading", "text": "Title"},
        {"type": "paragraph", "text": "Body text"},
    ])
    elements, tables, raw_text = _parse_dolphin_output(raw)
    assert [(e.type, e.text) for e in elements] == [
        ("heading", "Title"),
        ("paragraph", "Body text"),
    ]
    assert raw_text == "Title\nBody text"
    assert tables == []
|
||||
|
||||
|
||||
def test_parse_json_table_extracted():
    """Table elements are surfaced both in elements and in the tables list."""
    raw = json.dumps([
        {"type": "table", "text": "row", "html": "<table><tr><td>A</td></tr></table>",
         "bbox": [0.0, 0.0, 1.0, 0.5]},
    ])
    elements, tables, _raw_text = _parse_dolphin_output(raw)
    assert len(tables) == 1
    assert tables[0].html == "<table><tr><td>A</td></tr></table>"
    assert tables[0].bbox == [0.0, 0.0, 1.0, 0.5]
    assert [e.type for e in elements] == ["table"]
|
||||
|
||||
|
||||
def test_parse_plain_text_fallback():
|
||||
raw = "This is not JSON at all."
|
||||
elements, tables, raw_text = _parse_dolphin_output(raw)
|
||||
assert len(elements) == 1
|
||||
assert elements[0].type == "paragraph"
|
||||
assert elements[0].text == raw
|
||||
assert tables == []
|
||||
assert raw_text == raw
|
||||
|
||||
|
||||
def test_parse_empty_string_fallback():
|
||||
elements, tables, raw_text = _parse_dolphin_output("")
|
||||
assert len(elements) == 1
|
||||
assert elements[0].type == "paragraph"
|
||||
assert elements[0].text == ""
|
||||
|
||||
|
||||
def test_parse_json_missing_type_defaults_to_paragraph():
|
||||
raw = json.dumps([{"text": "no type field"}])
|
||||
elements, tables, _ = _parse_dolphin_output(raw)
|
||||
assert elements[0].type == "paragraph"
|
||||
|
||||
|
||||
# ── POST /extract ─────────────────────────────────────────────────────────────
|
||||
|
||||
def test_extract_image_b64(client, mock_model):
|
||||
resp = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "auto"})
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
assert "elements" in data
|
||||
assert "raw_text" in data
|
||||
assert "tables" in data
|
||||
assert data["metadata"]["hint"] == "auto"
|
||||
assert data["metadata"]["model"] == "/fake/model"
|
||||
assert data["metadata"]["width"] == 10
|
||||
assert data["metadata"]["height"] == 10
|
||||
|
||||
|
||||
def test_extract_hint_table_routes_correct_prompt(client, mock_model):
|
||||
_, fake_processor = mock_model
|
||||
resp = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "table"})
|
||||
assert resp.status_code == 200
|
||||
# Verify processor was called with the table-specific prompt
|
||||
call_kwargs = fake_processor.call_args
|
||||
assert "table" in call_kwargs.kwargs.get("text", "") or \
|
||||
"table" in str(call_kwargs)
|
||||
|
||||
|
||||
def test_extract_hint_unknown_falls_back_to_auto(client, mock_model):
|
||||
"""An unrecognised hint silently falls back to the 'auto' prompt."""
|
||||
resp = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "nonsense"})
|
||||
assert resp.status_code == 200
|
||||
|
||||
|
||||
def test_extract_image_path(tmp_path, client, mock_model):
|
||||
img_file = tmp_path / "doc.png"
|
||||
Image.new("RGB", (8, 8), color=(0, 0, 0)).save(img_file)
|
||||
resp = client.post("/extract", json={"image_path": str(img_file)})
|
||||
assert resp.status_code == 200
|
||||
assert resp.json()["metadata"]["width"] == 8
|
||||
|
||||
|
||||
def test_extract_image_path_not_found(client, mock_model):
|
||||
resp = client.post("/extract", json={"image_path": "/nonexistent/path/img.png"})
|
||||
assert resp.status_code == 404
|
||||
|
||||
|
||||
def test_extract_no_image_raises_422(client, mock_model):
|
||||
resp = client.post("/extract", json={"hint": "auto"})
|
||||
assert resp.status_code == 422
|
||||
|
||||
|
||||
def test_extract_response_includes_tables(client, mock_model):
|
||||
"""Verify table objects surface in response when model returns table elements."""
|
||||
resp = client.post("/extract", json={"image_b64": _make_jpeg_b64()})
|
||||
assert resp.status_code == 200
|
||||
data = resp.json()
|
||||
assert len(data["tables"]) == 1
|
||||
assert "<table>" in data["tables"][0]["html"]
|
||||
|
||||
|
||||
def test_extract_device_in_metadata(client, mock_model):
|
||||
resp = client.post("/extract", json={"image_b64": _make_jpeg_b64()})
|
||||
assert resp.status_code == 200
|
||||
assert "device" in resp.json()["metadata"]
|
||||
|
|
@ -1,67 +0,0 @@
|
|||
import asyncio
import pytest
from unittest.mock import AsyncMock, patch
from circuitforge_core.resources.coordinator.eviction_engine import EvictionEngine
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager


@pytest.fixture
def lease_manager():
    """A LeaseManager with a single 8192 MB GPU registered on node 'heimdall'."""
    mgr = LeaseManager()
    mgr.register_gpu("heimdall", 0, 8192)
    return mgr


@pytest.fixture
def engine(lease_manager):
    """An EvictionEngine with a short eviction timeout so tests fail fast."""
    return EvictionEngine(lease_manager=lease_manager, eviction_timeout_s=0.1)


@pytest.mark.asyncio
async def test_request_lease_grants_when_vram_available(engine, lease_manager):
    """With enough free VRAM, request_lease grants immediately — no eviction."""
    lease = await engine.request_lease(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="peregrine", priority=1,
        agent_url="http://localhost:7701",
    )
    assert lease is not None
    assert lease.mb_granted == 4096


@pytest.mark.asyncio
async def test_request_lease_evicts_and_grants(engine, lease_manager):
    """A higher-priority request triggers eviction of a lower-priority lease
    and is granted once the evicted VRAM is released."""
    # Pre-fill with a low-priority lease
    big_lease = await lease_manager.try_grant(
        "heimdall", 0, 7000, "comfyui", priority=4
    )
    assert big_lease is not None

    # Mock the agent eviction call
    with patch(
        "circuitforge_core.resources.coordinator.eviction_engine.EvictionEngine._call_agent_evict",
        new_callable=AsyncMock,
    ) as mock_evict:
        mock_evict.return_value = True
        # Simulate the comfyui lease being released (as if the agent evicted it).
        # Use get_running_loop()/create_task: calling asyncio.get_event_loop()
        # inside a coroutine is deprecated since Python 3.10 and slated to
        # raise; ensure_future is the legacy spelling of create_task.
        loop = asyncio.get_running_loop()
        loop.call_later(
            0.05, lambda: loop.create_task(lease_manager.release(big_lease.lease_id))
        )
        lease = await engine.request_lease(
            node_id="heimdall", gpu_id=0, mb=4096,
            service="peregrine", priority=1,
            agent_url="http://localhost:7701",
        )
        assert lease is not None
        assert lease.holder_service == "peregrine"


@pytest.mark.asyncio
async def test_request_lease_returns_none_when_no_eviction_candidates(engine):
    """When VRAM is held at equal/higher urgency, request_lease yields None."""
    await engine.lease_manager.try_grant("heimdall", 0, 6000, "vllm", priority=1)
    # Requesting 4GB but no lower-priority leases exist
    lease = await engine.request_lease(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="kiwi", priority=2,
        agent_url="http://localhost:7701",
    )
    assert lease is None
|
||||
|
|
@ -1,43 +0,0 @@
|
|||
import signal
from unittest.mock import patch, call
import pytest
from circuitforge_core.resources.agent.eviction_executor import EvictionExecutor, EvictionResult


def test_evict_by_pid_sends_sigterm_then_sigkill():
    """If the process survives the grace period, SIGKILL follows SIGTERM."""
    executor = EvictionExecutor(grace_period_s=0.01)
    # pid_exists always True → grace period expires → SIGKILL fires
    with patch("os.kill") as mock_kill, \
            patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        mock_psutil.pid_exists.return_value = True
        result = executor.evict_pid(pid=1234, grace_period_s=0.01)

    assert result.success is True
    calls = mock_kill.call_args_list
    # Both signals must have been delivered to the target pid, in some order.
    assert call(1234, signal.SIGTERM) in calls
    assert call(1234, signal.SIGKILL) in calls


def test_evict_pid_succeeds_on_sigterm_alone():
    """If the process exits during the grace period, SIGTERM alone suffices."""
    executor = EvictionExecutor(grace_period_s=0.1)
    with patch("os.kill"), \
            patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        mock_psutil.pid_exists.side_effect = [True, False]  # gone after SIGTERM
        result = executor.evict_pid(pid=5678, grace_period_s=0.01)
    assert result.success is True
    assert result.method == "sigterm"


def test_evict_pid_not_found_returns_failure():
    """Evicting a pid that does not exist fails with a 'not found' message."""
    executor = EvictionExecutor()
    with patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        mock_psutil.pid_exists.return_value = False
        result = executor.evict_pid(pid=9999)
    assert result.success is False
    assert "not found" in result.message.lower()


def test_eviction_result_is_immutable():
    """EvictionResult rejects attribute assignment — presumably a frozen
    dataclass or NamedTuple; the test accepts either exception type."""
    result = EvictionResult(success=True, method="sigterm", message="ok")
    with pytest.raises((AttributeError, TypeError)):
        result.success = False  # type: ignore
|
||||
|
|
@ -1,60 +0,0 @@
|
|||
from unittest.mock import patch
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor


# Two GPUs worth of sample output; per the parsing assertions below the
# columns are: index, name, vram_total_mb, vram_used_mb, vram_free_mb.
SAMPLE_NVIDIA_SMI_OUTPUT = (
    "0, Quadro RTX 4000, 8192, 6843, 1349\n"
    "1, Quadro RTX 4000, 8192, 721, 7471\n"
)


def test_parse_returns_list_of_gpu_info():
    """poll() parses each CSV line into a GpuInfo with id/name/VRAM fields."""
    monitor = GpuMonitor()
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
        mock_run.return_value.returncode = 0
        mock_run.return_value.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        gpus = monitor.poll()
        assert len(gpus) == 2
        assert gpus[0].gpu_id == 0
        assert gpus[0].name == "Quadro RTX 4000"
        assert gpus[0].vram_total_mb == 8192
        assert gpus[0].vram_used_mb == 6843
        assert gpus[0].vram_free_mb == 1349


def test_parse_second_gpu():
    """The second line of output maps to a second, independent GpuInfo."""
    monitor = GpuMonitor()
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
        mock_run.return_value.returncode = 0
        mock_run.return_value.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        gpus = monitor.poll()
        assert gpus[1].gpu_id == 1
        assert gpus[1].vram_used_mb == 721
        assert gpus[1].vram_free_mb == 7471


def test_poll_returns_empty_list_when_nvidia_smi_unavailable():
    """A missing nvidia-smi binary (FileNotFoundError) yields [] rather than raising."""
    monitor = GpuMonitor()
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run", side_effect=FileNotFoundError):
        gpus = monitor.poll()
        assert gpus == []


def test_poll_returns_empty_list_on_nonzero_exit():
    """A non-zero exit code from nvidia-smi yields [] rather than raising."""
    monitor = GpuMonitor()
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
        mock_run.return_value.returncode = 1
        mock_run.return_value.stdout = ""
        gpus = monitor.poll()
        assert gpus == []


def test_poll_skips_malformed_lines():
    """Lines with non-numeric fields are skipped; valid lines still parse."""
    monitor = GpuMonitor()
    malformed = "0, RTX 4000, 8192, not_a_number, 1024\n1, RTX 4000, 8192, 512, 7680\n"
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
        mock_run.return_value.returncode = 0
        mock_run.return_value.stdout = malformed
        gpus = monitor.poll()
        # Only the well-formed second line survives.
        assert len(gpus) == 1
        assert gpus[0].gpu_id == 1
|
||||
|
|
@ -1,221 +0,0 @@
|
|||
"""Integration test: full lease → eviction → re-grant cycle.
|
||||
|
||||
Runs coordinator in-process (no subprocesses, no real nvidia-smi).
|
||||
Uses TestClient for HTTP, mocks AgentSupervisor to return fixed node state.
|
||||
"""
|
||||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
from fastapi.testclient import TestClient
|
||||
|
||||
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
|
||||
from circuitforge_core.resources.coordinator.app import create_coordinator_app
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.models import GpuInfo, NodeInfo
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def system():
|
||||
"""Create an in-process coordinator system with 8GB GPU and mock supervisor."""
|
||||
lease_manager = LeaseManager()
|
||||
lease_manager.register_gpu("local", 0, 8192)
|
||||
|
||||
mock_supervisor = MagicMock(spec=AgentSupervisor)
|
||||
mock_supervisor.all_nodes.return_value = [
|
||||
NodeInfo(
|
||||
node_id="local",
|
||||
agent_url="http://localhost:7701",
|
||||
gpus=[GpuInfo(
|
||||
gpu_id=0,
|
||||
name="RTX 4000",
|
||||
vram_total_mb=8192,
|
||||
vram_used_mb=0,
|
||||
vram_free_mb=8192,
|
||||
)],
|
||||
last_heartbeat=0.0,
|
||||
)
|
||||
]
|
||||
mock_supervisor.get_node_info.return_value = NodeInfo(
|
||||
node_id="local",
|
||||
agent_url="http://localhost:7701",
|
||||
gpus=[],
|
||||
last_heartbeat=0.0,
|
||||
)
|
||||
|
||||
profile_registry = ProfileRegistry()
|
||||
app = create_coordinator_app(
|
||||
lease_manager=lease_manager,
|
||||
profile_registry=profile_registry,
|
||||
agent_supervisor=mock_supervisor,
|
||||
service_registry=ServiceRegistry(),
|
||||
)
|
||||
client = TestClient(app)
|
||||
return client, lease_manager
|
||||
|
||||
|
||||
def test_full_lease_cycle(system):
|
||||
"""Test: grant, verify, release, verify gone."""
|
||||
client, _ = system
|
||||
|
||||
# Grant a lease
|
||||
resp = client.post("/api/leases", json={
|
||||
"node_id": "local",
|
||||
"gpu_id": 0,
|
||||
"mb": 4096,
|
||||
"service": "peregrine",
|
||||
"priority": 1,
|
||||
})
|
||||
assert resp.status_code == 200
|
||||
lease_data = resp.json()["lease"]
|
||||
lease_id = lease_data["lease_id"]
|
||||
assert lease_data["mb_granted"] == 4096
|
||||
assert lease_data["holder_service"] == "peregrine"
|
||||
|
||||
# Verify it appears in active leases
|
||||
resp = client.get("/api/leases")
|
||||
assert resp.status_code == 200
|
||||
leases = resp.json()["leases"]
|
||||
assert any(l["lease_id"] == lease_id for l in leases)
|
||||
|
||||
# Release it
|
||||
resp = client.delete(f"/api/leases/{lease_id}")
|
||||
assert resp.status_code == 200
|
||||
assert resp.json()["released"] is True
|
||||
|
||||
# Verify it's gone
|
||||
resp = client.get("/api/leases")
|
||||
assert resp.status_code == 200
|
||||
leases = resp.json()["leases"]
|
||||
assert not any(l["lease_id"] == lease_id for l in leases)
|
||||
|
||||
|
||||
def test_vram_exhaustion_returns_503(system):
|
||||
"""Test: fill GPU, then request with no eviction candidates returns 503."""
|
||||
client, _ = system
|
||||
|
||||
# Fill GPU 0 with high-priority lease
|
||||
resp = client.post("/api/leases", json={
|
||||
"node_id": "local",
|
||||
"gpu_id": 0,
|
||||
"mb": 8000,
|
||||
"service": "vllm",
|
||||
"priority": 1,
|
||||
})
|
||||
assert resp.status_code == 200
|
||||
|
||||
# Try to get more VRAM with same priority (no eviction candidates)
|
||||
resp = client.post("/api/leases", json={
|
||||
"node_id": "local",
|
||||
"gpu_id": 0,
|
||||
"mb": 2000,
|
||||
"service": "kiwi",
|
||||
"priority": 1,
|
||||
})
|
||||
assert resp.status_code == 503
|
||||
assert "Insufficient VRAM" in resp.json()["detail"]
|
||||
|
||||
|
||||
def test_auto_detect_profile_for_8gb():
|
||||
"""Test: ProfileRegistry auto-detects single-gpu-8gb for 8GB GPU."""
|
||||
registry = ProfileRegistry()
|
||||
gpu = GpuInfo(
|
||||
gpu_id=0,
|
||||
name="RTX 4000",
|
||||
vram_total_mb=8192,
|
||||
vram_used_mb=0,
|
||||
vram_free_mb=8192,
|
||||
)
|
||||
profile = registry.auto_detect([gpu])
|
||||
assert profile.name == "single-gpu-8gb"
|
||||
# Verify profile has services configured
|
||||
assert hasattr(profile, "services")
|
||||
|
||||
|
||||
def test_node_endpoint_shows_nodes(system):
|
||||
"""Test: GET /api/nodes returns the mocked node."""
|
||||
client, _ = system
|
||||
resp = client.get("/api/nodes")
|
||||
assert resp.status_code == 200
|
||||
nodes = resp.json()["nodes"]
|
||||
assert len(nodes) == 1
|
||||
assert nodes[0]["node_id"] == "local"
|
||||
assert nodes[0]["agent_url"] == "http://localhost:7701"
|
||||
assert len(nodes[0]["gpus"]) == 1
|
||||
assert nodes[0]["gpus"][0]["name"] == "RTX 4000"
|
||||
|
||||
|
||||
def test_profiles_endpoint_returns_public_profiles(system):
|
||||
"""Test: GET /api/profiles returns standard public profiles."""
|
||||
client, _ = system
|
||||
resp = client.get("/api/profiles")
|
||||
assert resp.status_code == 200
|
||||
profiles = resp.json()["profiles"]
|
||||
names = [p["name"] for p in profiles]
|
||||
# Verify common public profiles are present
|
||||
assert "single-gpu-8gb" in names
|
||||
assert "single-gpu-6gb" in names
|
||||
assert "single-gpu-2gb" in names
|
||||
|
||||
|
||||
def test_multiple_leases_tracked_independently(system):
|
||||
"""Test: multiple active leases are tracked correctly."""
|
||||
client, _ = system
|
||||
|
||||
# Grant lease 1
|
||||
resp1 = client.post("/api/leases", json={
|
||||
"node_id": "local",
|
||||
"gpu_id": 0,
|
||||
"mb": 2048,
|
||||
"service": "peregrine",
|
||||
"priority": 2,
|
||||
})
|
||||
assert resp1.status_code == 200
|
||||
lease1_id = resp1.json()["lease"]["lease_id"]
|
||||
|
||||
# Grant lease 2
|
||||
resp2 = client.post("/api/leases", json={
|
||||
"node_id": "local",
|
||||
"gpu_id": 0,
|
||||
"mb": 2048,
|
||||
"service": "kiwi",
|
||||
"priority": 2,
|
||||
})
|
||||
assert resp2.status_code == 200
|
||||
lease2_id = resp2.json()["lease"]["lease_id"]
|
||||
|
||||
# Both should be in active leases
|
||||
resp = client.get("/api/leases")
|
||||
leases = resp.json()["leases"]
|
||||
lease_ids = [l["lease_id"] for l in leases]
|
||||
assert lease1_id in lease_ids
|
||||
assert lease2_id in lease_ids
|
||||
assert len(leases) == 2
|
||||
|
||||
# Release lease 1
|
||||
resp = client.delete(f"/api/leases/{lease1_id}")
|
||||
assert resp.status_code == 200
|
||||
|
||||
# Only lease 2 should remain
|
||||
resp = client.get("/api/leases")
|
||||
leases = resp.json()["leases"]
|
||||
lease_ids = [l["lease_id"] for l in leases]
|
||||
assert lease1_id not in lease_ids
|
||||
assert lease2_id in lease_ids
|
||||
assert len(leases) == 1
|
||||
|
||||
|
||||
def test_delete_nonexistent_lease_returns_404(system):
|
||||
"""Test: deleting a nonexistent lease returns 404."""
|
||||
client, _ = system
|
||||
resp = client.delete("/api/leases/nonexistent-lease-id")
|
||||
assert resp.status_code == 404
|
||||
assert "not found" in resp.json()["detail"]
|
||||
|
||||
|
||||
def test_health_endpoint_returns_ok(system):
|
||||
"""Test: GET /api/health returns status ok."""
|
||||
client, _ = system
|
||||
resp = client.get("/api/health")
|
||||
assert resp.status_code == 200
|
||||
assert resp.json()["status"] == "ok"
|
||||
|
|
@ -1,85 +0,0 @@
|
|||
import pytest
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager


@pytest.fixture
def mgr():
    """A LeaseManager with one 8192 MB GPU registered on node 'heimdall'."""
    m = LeaseManager()
    m.register_gpu(node_id="heimdall", gpu_id=0, total_mb=8192)
    return m


@pytest.mark.asyncio
async def test_grant_succeeds_when_vram_available(mgr):
    """try_grant succeeds and records node/gpu/size when VRAM is free."""
    lease = await mgr.try_grant(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="peregrine", priority=1
    )
    assert lease is not None
    assert lease.mb_granted == 4096
    assert lease.node_id == "heimdall"
    assert lease.gpu_id == 0


@pytest.mark.asyncio
async def test_grant_fails_when_vram_insufficient(mgr):
    """try_grant returns None when the request would exceed total VRAM."""
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=7000,
                        service="vllm", priority=1)
    # 7000 + 2000 > 8192, so the second grant must fail.
    lease = await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=2000,
                                service="kiwi", priority=2)
    assert lease is None


@pytest.mark.asyncio
async def test_release_frees_vram(mgr):
    """Releasing a lease makes its VRAM grantable again."""
    lease = await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=7000,
                                service="vllm", priority=1)
    assert lease is not None
    released = await mgr.release(lease.lease_id)
    assert released is True
    # The same 7000 MB now fits again.
    lease2 = await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=7000,
                                 service="comfyui", priority=4)
    assert lease2 is not None


@pytest.mark.asyncio
async def test_release_unknown_lease_returns_false(mgr):
    """Releasing an id that was never granted reports False, not an error."""
    result = await mgr.release("nonexistent-id")
    assert result is False


@pytest.mark.asyncio
async def test_get_eviction_candidates_returns_lower_priority_leases(mgr):
    """Only leases with a larger priority number (less urgent) than the
    requester's are eviction candidates: the priority-4 comfyui lease
    qualifies against a priority-2 request, the priority-1 ollama lease
    does not."""
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=3000,
                        service="comfyui", priority=4)
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=2000,
                        service="ollama", priority=1)
    candidates = mgr.get_eviction_candidates(
        node_id="heimdall", gpu_id=0,
        needed_mb=3000, requester_priority=2
    )
    assert len(candidates) == 1
    assert candidates[0].holder_service == "comfyui"


@pytest.mark.asyncio
async def test_list_leases_for_gpu(mgr):
    """list_leases returns every active lease on the given node/GPU."""
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=1024,
                        service="peregrine", priority=1)
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=512,
                        service="kiwi", priority=2)
    leases = mgr.list_leases(node_id="heimdall", gpu_id=0)
    assert len(leases) == 2


def test_register_gpu_sets_total(mgr):
    """register_gpu records the GPU's total VRAM capacity."""
    assert mgr.gpu_total_mb("heimdall", 0) == 8192


@pytest.mark.asyncio
async def test_used_mb_tracks_grants():
    """used_mb reports the sum of all granted lease sizes."""
    mgr = LeaseManager()
    mgr.register_gpu("heimdall", 0, 8192)
    await mgr.try_grant("heimdall", 0, 3000, "a", 1)
    await mgr.try_grant("heimdall", 0, 2000, "b", 2)
    assert mgr.used_mb("heimdall", 0) == 5000
|
||||
|
|
@ -1,47 +0,0 @@
|
|||
import time
import pytest
from circuitforge_core.resources.models import VRAMLease, GpuInfo, NodeInfo


def _make_lease(mb, service, priority, **extra):
    """Shorthand for VRAMLease.create on node 'heimdall', GPU 0."""
    return VRAMLease.create(gpu_id=0, node_id="heimdall", mb=mb,
                            service=service, priority=priority, **extra)


def test_vram_lease_create_assigns_unique_ids():
    """Two leases created with identical parameters still get distinct ids."""
    first = _make_lease(4096, "peregrine", 1)
    second = _make_lease(4096, "peregrine", 1)
    assert first.lease_id != second.lease_id


def test_vram_lease_create_with_ttl_sets_expiry():
    """Passing ttl_s yields expires_at bracketed by now + ttl at call time."""
    start = time.time()
    lease = _make_lease(2048, "kiwi", 2, ttl_s=60.0)
    end = time.time()
    assert start + 60.0 <= lease.expires_at <= end + 60.0


def test_vram_lease_create_no_ttl_has_zero_expiry():
    """Without a TTL, expires_at is the 0.0 sentinel."""
    lease = _make_lease(1024, "snipe", 2)
    assert lease.expires_at == 0.0


def test_vram_lease_is_immutable():
    """Assigning a lease field raises (frozen/read-only model; either
    AttributeError or TypeError is acceptable)."""
    lease = _make_lease(1024, "snipe", 2)
    with pytest.raises((AttributeError, TypeError)):
        lease.mb_granted = 999  # type: ignore


def test_gpu_info_fields():
    """GpuInfo exposes the VRAM accounting values it was constructed with."""
    info = GpuInfo(gpu_id=0, name="RTX 4000", vram_total_mb=8192,
                   vram_used_mb=2048, vram_free_mb=6144)
    assert info.vram_free_mb == 6144


def test_node_info_fields():
    """NodeInfo carries its node id and the list of attached GPUs."""
    gpu = GpuInfo(gpu_id=0, name="RTX 4000", vram_total_mb=8192,
                  vram_used_mb=0, vram_free_mb=8192)
    node = NodeInfo(node_id="heimdall", agent_url="http://localhost:7701",
                    gpus=[gpu], last_heartbeat=time.time())
    assert node.node_id == "heimdall"
    assert len(node.gpus) == 1
|
||||
|
|
@ -1,82 +0,0 @@
|
|||
import pytest
from circuitforge_core.resources.coordinator.node_selector import select_node
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
from circuitforge_core.resources.models import GpuInfo
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry


def _make_agent(node_id: str, free_mb: int, online: bool = True) -> AgentRecord:
    """Build an AgentRecord with one 8192 MB GPU having `free_mb` free VRAM."""
    r = AgentRecord(node_id=node_id, agent_url=f"http://{node_id}:7701")
    r.gpus = [GpuInfo(gpu_id=0, name="RTX", vram_total_mb=8192,
                      vram_used_mb=8192 - free_mb, vram_free_mb=free_mb)]
    r.online = online
    return r


def test_selects_node_with_most_free_vram():
    """With no warm residency, the node with the most free VRAM wins."""
    agents = {
        "a": _make_agent("a", free_mb=2000),
        "b": _make_agent("b", free_mb=6000),
    }
    registry = ProfileRegistry()
    result = select_node(agents, "vllm", registry, resident_keys=set())
    assert result == ("b", 0)


def test_prefers_warm_node_even_with_less_free_vram():
    """A node where the service is already resident ('warm') beats a colder
    node with more free VRAM."""
    agents = {
        "a": _make_agent("a", free_mb=2000),
        "b": _make_agent("b", free_mb=6000),
    }
    registry = ProfileRegistry()
    result = select_node(agents, "vllm", registry, resident_keys={"a:vllm"})
    assert result == ("a", 0)


def test_excludes_offline_nodes():
    """Offline nodes are never selected, regardless of free VRAM."""
    agents = {
        "a": _make_agent("a", free_mb=8000, online=False),
        "b": _make_agent("b", free_mb=2000, online=True),
    }
    registry = ProfileRegistry()
    result = select_node(agents, "vllm", registry, resident_keys=set())
    assert result == ("b", 0)


def test_returns_none_when_no_node_has_profile_for_service():
    """An unknown service name yields None (no node can host it)."""
    agents = {"a": _make_agent("a", free_mb=8000)}
    registry = ProfileRegistry()
    result = select_node(agents, "cf-nonexistent-service", registry, resident_keys=set())
    assert result is None


def test_returns_none_when_no_agents():
    """With an empty agent map there is nothing to select."""
    registry = ProfileRegistry()
    result = select_node({}, "vllm", registry, resident_keys=set())
    assert result is None


def test_prefers_node_that_fully_fits_service_over_one_that_does_not():
    """can_fit requires free_mb >= service max_mb (full ceiling, not half).

    9500 MB guarantees above all profile ceilings (max is 9000); 1000 MB is below all.
    """
    agents = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=9500),
    }
    registry = ProfileRegistry()
    result = select_node(agents, "vllm", registry, resident_keys=set())
    # "b" is the only node in the preferred (can_fit) pool
    assert result == ("b", 0)


def test_falls_back_to_best_effort_when_no_node_fully_fits():
    """When nothing can_fit, select_node returns the best-VRAM node as fallback."""
    agents = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=2000),
    }
    registry = ProfileRegistry()
    # Neither has enough free VRAM; fallback picks highest effective_free_mb
    result = select_node(agents, "vllm", registry, resident_keys=set())
    assert result == ("b", 0)
|
||||
|
|
@ -1,87 +0,0 @@
|
|||
# tests/test_resources/test_node_store.py
|
||||
"""Unit tests for NodeStore — SQLite persistence layer for known agent nodes."""
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from circuitforge_core.resources.coordinator.node_store import NodeStore
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def store(tmp_path: Path) -> NodeStore:
|
||||
return NodeStore(db_path=tmp_path / "test-nodes.db")
|
||||
|
||||
|
||||
def test_upsert_and_all(store: NodeStore) -> None:
|
||||
store.upsert("heimdall", "http://127.0.0.1:7701")
|
||||
rows = store.all()
|
||||
assert len(rows) == 1
|
||||
assert rows[0] == ("heimdall", "http://127.0.0.1:7701")
|
||||
|
||||
|
||||
def test_upsert_updates_url(store: NodeStore) -> None:
|
||||
store.upsert("navi", "http://10.1.10.10:7701")
|
||||
store.upsert("navi", "http://10.1.10.10:7702")
|
||||
rows = store.all()
|
||||
assert len(rows) == 1
|
||||
assert rows[0][1] == "http://10.1.10.10:7702"
|
||||
|
||||
|
||||
def test_multiple_nodes(store: NodeStore) -> None:
|
||||
store.upsert("heimdall", "http://127.0.0.1:7701")
|
||||
store.upsert("navi", "http://10.1.10.10:7701")
|
||||
store.upsert("strahl", "http://10.1.10.20:7701")
|
||||
assert len(store.all()) == 3
|
||||
|
||||
|
||||
def test_remove(store: NodeStore) -> None:
|
||||
store.upsert("heimdall", "http://127.0.0.1:7701")
|
||||
store.upsert("navi", "http://10.1.10.10:7701")
|
||||
store.remove("navi")
|
||||
ids = [r[0] for r in store.all()]
|
||||
assert "navi" not in ids
|
||||
assert "heimdall" in ids
|
||||
|
||||
|
||||
def test_prune_stale_removes_old_entries(store: NodeStore) -> None:
|
||||
# Insert a node with a last_seen in the distant past
|
||||
store._conn.execute(
|
||||
"INSERT INTO known_nodes (node_id, agent_url, last_seen) VALUES (?, ?, ?)",
|
||||
("ghost", "http://dead:7701", time.time() - 40 * 86400),
|
||||
)
|
||||
store._conn.commit()
|
||||
store.upsert("live", "http://live:7701")
|
||||
|
||||
removed = store.prune_stale(max_age_days=30)
|
||||
assert removed == 1
|
||||
ids = [r[0] for r in store.all()]
|
||||
assert "ghost" not in ids
|
||||
assert "live" in ids
|
||||
|
||||
|
||||
def test_prune_stale_keeps_recent(store: NodeStore) -> None:
|
||||
store.upsert("recent", "http://recent:7701")
|
||||
removed = store.prune_stale(max_age_days=30)
|
||||
assert removed == 0
|
||||
assert len(store.all()) == 1
|
||||
|
||||
|
||||
def test_all_empty(store: NodeStore) -> None:
|
||||
assert store.all() == []
|
||||
|
||||
|
||||
def test_db_persists_across_instances(tmp_path: Path) -> None:
|
||||
"""Data written by one NodeStore instance is visible to a new one on the same file."""
|
||||
db = tmp_path / "shared.db"
|
||||
s1 = NodeStore(db_path=db)
|
||||
s1.upsert("navi", "http://10.1.10.10:7701")
|
||||
s1.close()
|
||||
|
||||
s2 = NodeStore(db_path=db)
|
||||
rows = s2.all()
|
||||
assert len(rows) == 1
|
||||
assert rows[0][0] == "navi"
|
||||
s2.close()
|
||||
|
|
@ -1,176 +0,0 @@
|
|||
# tests/test_resources/test_ollama_adopt.py
|
||||
"""
|
||||
Tests for the Ollama adopt-if-running path:
|
||||
- ProcessSpec: adopt and health_path fields parsed from YAML
|
||||
- ServiceManager.start(): adopt path claims running service; falls through if not running
|
||||
- ServiceManager.is_running(): adopt path uses health probe, not proc table
|
||||
- ServiceInstance.health_path persists through upsert_instance
|
||||
- Probe loop uses inst.health_path instead of hardcoded /health
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
||||
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
|
||||
from circuitforge_core.resources.profiles.schema import GpuProfile, ProcessSpec, ServiceProfile, load_profile
|
||||
|
||||
|
||||
# ── ProcessSpec schema ────────────────────────────────────────────────────────
|
||||
|
||||
def test_process_spec_defaults():
|
||||
spec = ProcessSpec(exec_path="/usr/local/bin/ollama")
|
||||
assert spec.adopt is False
|
||||
assert spec.health_path == "/health"
|
||||
|
||||
|
||||
def test_process_spec_adopt_fields():
|
||||
spec = ProcessSpec(
|
||||
exec_path="/usr/local/bin/ollama",
|
||||
adopt=True,
|
||||
health_path="/api/tags",
|
||||
port=11434,
|
||||
host_port=11434,
|
||||
)
|
||||
assert spec.adopt is True
|
||||
assert spec.health_path == "/api/tags"
|
||||
|
||||
|
||||
def test_profile_yaml_parses_adopt(tmp_path: Path):
|
||||
yaml_text = """\
|
||||
schema_version: 1
|
||||
name: test
|
||||
services:
|
||||
ollama:
|
||||
max_mb: 4096
|
||||
priority: 1
|
||||
managed:
|
||||
type: process
|
||||
adopt: true
|
||||
exec_path: /usr/local/bin/ollama
|
||||
args_template: serve
|
||||
port: 11434
|
||||
host_port: 11434
|
||||
health_path: /api/tags
|
||||
"""
|
||||
p = tmp_path / "profile.yaml"
|
||||
p.write_text(yaml_text)
|
||||
profile = load_profile(p)
|
||||
spec = profile.services["ollama"].managed
|
||||
assert isinstance(spec, ProcessSpec)
|
||||
assert spec.adopt is True
|
||||
assert spec.health_path == "/api/tags"
|
||||
assert spec.host_port == 11434
|
||||
|
||||
|
||||
# ── ServiceManager adopt path ─────────────────────────────────────────────────
|
||||
|
||||
def _make_manager_with_ollama(advertise_host: str = "127.0.0.1") -> ServiceManager:
    """Build a ServiceManager whose only service is an adoptable Ollama."""
    ollama_spec = ProcessSpec(
        exec_path="/usr/local/bin/ollama",
        args_template="serve",
        port=11434,
        host_port=11434,
        adopt=True,
        health_path="/api/tags",
    )
    profile = GpuProfile(
        schema_version=1,
        name="test",
        services={
            "ollama": ServiceProfile(max_mb=4096, priority=1, managed=ollama_spec),
        },
    )
    return ServiceManager(node_id="heimdall", profile=profile, advertise_host=advertise_host)
|
||||
|
||||
|
||||
def test_start_adopt_claims_running_service():
    """When Ollama is already healthy, start() returns its URL without spawning a process."""
    mgr = _make_manager_with_ollama()
    with patch.object(mgr, "_probe_health", return_value=True) as probe:
        result = mgr.start("ollama", gpu_id=0, params={})
        assert result == "http://127.0.0.1:11434"
        probe.assert_called_once_with(11434, "/api/tags")
        # Adoption must not create a subprocess entry.
        assert "ollama" not in mgr._procs


def test_start_adopt_spawns_when_not_running():
    """When the health probe fails, start() falls back to spawning Ollama."""
    mgr = _make_manager_with_ollama()
    fake_proc = MagicMock()
    fake_proc.poll.return_value = None  # process stays alive

    with patch.object(mgr, "_probe_health", return_value=False), \
            patch("subprocess.Popen", return_value=fake_proc) as popen:
        result = mgr.start("ollama", gpu_id=0, params={})

        assert result == "http://127.0.0.1:11434"
        popen.assert_called_once()
        assert "ollama" in mgr._procs
|
||||
|
||||
|
||||
def test_is_running_adopt_uses_health_probe():
    """is_running() for adopt=True services mirrors the health endpoint, not the proc table."""
    mgr = _make_manager_with_ollama()
    for healthy in (True, False):
        with patch.object(mgr, "_probe_health", return_value=healthy):
            assert mgr.is_running("ollama") is healthy
|
||||
|
||||
|
||||
def test_probe_health_returns_true_on_200():
    """An HTTP 200 from the health endpoint means the service is up."""
    mgr = _make_manager_with_ollama()
    response = MagicMock()
    response.status = 200
    # Make the mock usable as a context manager, like a real urlopen result.
    response.__enter__ = lambda s: response
    response.__exit__ = MagicMock(return_value=False)

    with patch("urllib.request.urlopen", return_value=response):
        assert mgr._probe_health(11434, "/api/tags") is True


def test_probe_health_returns_false_on_connection_error():
    """A refused connection is reported as unhealthy rather than raised."""
    mgr = _make_manager_with_ollama()
    with patch("urllib.request.urlopen", side_effect=OSError("refused")):
        assert mgr._probe_health(11434, "/api/tags") is False
|
||||
|
||||
|
||||
# ── ServiceRegistry health_path ───────────────────────────────────────────────
|
||||
|
||||
def test_upsert_instance_stores_health_path():
    """A health_path supplied to upsert_instance() persists on the instance."""
    registry = ServiceRegistry()
    instance = registry.upsert_instance(
        service="ollama",
        node_id="heimdall",
        gpu_id=0,
        state="running",
        model=None,
        url="http://127.0.0.1:11434",
        health_path="/api/tags",
    )
    assert instance.health_path == "/api/tags"


def test_upsert_instance_default_health_path():
    """When omitted, health_path falls back to /health."""
    registry = ServiceRegistry()
    instance = registry.upsert_instance(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        state="starting",
        model="qwen",
        url="http://127.0.0.1:8000",
    )
    assert instance.health_path == "/health"
|
||||
|
||||
|
||||
def test_all_gpu_profiles_have_ollama_managed_block():
    """Every public GPU profile that defines ollama must adopt it via /api/tags."""
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry

    for profile in ProfileRegistry().list_public():
        service = profile.services.get("ollama")
        if service is None:
            # Not every profile ships ollama; only validate those that do.
            continue
        managed = service.managed
        assert managed is not None, f"{profile.name}: ollama missing managed block"
        assert isinstance(managed, ProcessSpec)
        assert managed.adopt is True, f"{profile.name}: ollama adopt should be True"
        assert managed.health_path == "/api/tags", f"{profile.name}: wrong health_path"
|
||||
|
|
@ -1,101 +0,0 @@
|
|||
# tests/test_resources/test_profile_registry.py
|
||||
import pytest
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
from circuitforge_core.resources.profiles.schema import (
|
||||
GpuProfile, ServiceProfile, load_profile
|
||||
)
|
||||
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
|
||||
|
||||
|
||||
def test_load_8gb_profile(tmp_path):
    """load_profile() parses a full 8 GB profile, including shared services."""
    profile_file = tmp_path / "test.yaml"
    profile_file.write_text("""
schema_version: 1
name: single-gpu-8gb
vram_total_mb: 8192
eviction_timeout_s: 10.0
services:
  vllm:
    max_mb: 5120
    priority: 1
  cf-vision:
    max_mb: 2048
    priority: 2
    shared: true
    max_concurrent: 3
""")

    profile = load_profile(profile_file)

    assert profile.name == "single-gpu-8gb"
    assert profile.schema_version == 1
    assert profile.vram_total_mb == 8192
    assert profile.eviction_timeout_s == 10.0

    assert "vllm" in profile.services
    vllm = profile.services["vllm"]
    assert vllm.max_mb == 5120
    assert vllm.priority == 1

    vision = profile.services["cf-vision"]
    assert vision.shared is True
    assert vision.max_concurrent == 3
|
||||
|
||||
|
||||
def test_load_profile_rejects_wrong_schema_version(tmp_path):
    """Unknown schema versions are rejected with a ValueError naming the field."""
    bad_file = tmp_path / "future.yaml"
    bad_file.write_text("schema_version: 99\nname: future\n")
    with pytest.raises(ValueError, match="schema_version"):
        load_profile(bad_file)


def test_service_profile_defaults():
    """ServiceProfile defaults: exclusive, single-slot, on-demand, no backend/consumers."""
    service = ServiceProfile(max_mb=1024, priority=2)
    assert service.shared is False
    assert service.max_concurrent == 1
    assert service.always_on is False
    assert service.backend is None
    assert service.consumers == []
|
||||
|
||||
|
||||
def test_profile_registry_loads_public_profiles():
    """The bundled public profiles include the 2/6/8 GB single-GPU tiers."""
    public_names = {p.name for p in ProfileRegistry().list_public()}
    assert "single-gpu-8gb" in public_names
    assert "single-gpu-6gb" in public_names
    assert "single-gpu-2gb" in public_names
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("vram_total_mb", "expected_profile"),
    [
        (8192, "single-gpu-8gb"),
        (6144, "single-gpu-6gb"),
        (2048, "single-gpu-2gb"),
    ],
)
def test_profile_registry_auto_detect_selects_tier(vram_total_mb, expected_profile):
    """auto_detect() maps a single GPU's VRAM size to the matching public profile.

    Consolidates three copy-pasted per-tier tests (8 GB / 6 GB / 2 GB) into one
    parametrized case so new tiers need only a new parameter row.
    """
    registry = ProfileRegistry()
    gpus = [MagicMock(vram_total_mb=vram_total_mb)]
    assert registry.auto_detect(gpus).name == expected_profile
|
||||
|
||||
|
||||
def test_profile_registry_load_from_path(tmp_path):
    """Registry.load() reads an arbitrary profile file, not just bundled ones."""
    custom_file = tmp_path / "custom.yaml"
    custom_file.write_text(
        "schema_version: 1\nname: custom\n"
        "vram_total_mb: 12288\neviction_timeout_s: 5.0\n"
    )

    loaded = ProfileRegistry().load(custom_file)

    assert loaded.name == "custom"
    assert loaded.vram_total_mb == 12288
|
||||
|
|
@ -1,194 +0,0 @@
|
|||
"""Tests for ServiceManager ProcessSpec support."""
|
||||
from __future__ import annotations
|
||||
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from circuitforge_core.resources.agent.service_manager import ServiceManager
|
||||
from circuitforge_core.resources.profiles.schema import (
|
||||
GpuProfile,
|
||||
ProcessSpec,
|
||||
ServiceProfile,
|
||||
)
|
||||
|
||||
|
||||
def _make_profile(args_template: str = "--port {port} --gpu-id {gpu_id}") -> GpuProfile:
    """Two-service test profile: a managed vllm process plus a spec-less service."""
    vllm_spec = ProcessSpec(
        exec_path="/usr/bin/python",
        args_template=args_template,
        port=8000,
        host_port=8000,
        cwd="/tmp",
    )
    return GpuProfile(
        schema_version=1,
        name="test",
        vram_total_mb=8192,
        services={
            "vllm": ServiceProfile(max_mb=5120, priority=1, managed=vllm_spec),
            "no_managed": ServiceProfile(max_mb=1024, priority=2),
        },
    )
|
||||
|
||||
|
||||
@pytest.fixture
def manager():
    """Fresh ServiceManager bound to loopback for each test."""
    return ServiceManager(node_id="test-node", profile=_make_profile(), advertise_host="127.0.0.1")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# is_running
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_is_running_returns_false_when_no_proc(manager):
    """Nothing tracked for the service -> not running."""
    assert manager.is_running("vllm") is False


def test_is_running_returns_false_when_proc_exited(manager):
    """A tracked process that has exited (poll() returns a code) is not running."""
    dead_proc = MagicMock()
    dead_proc.poll.return_value = 1  # non-None: process exited
    manager._procs["vllm"] = dead_proc
    assert manager.is_running("vllm") is False


def test_is_running_returns_false_when_port_not_listening(manager):
    """A live process whose port refuses connections is still not running."""
    live_proc = MagicMock()
    live_proc.poll.return_value = None  # still alive
    manager._procs["vllm"] = live_proc

    with patch("socket.create_connection", side_effect=OSError("refused")):
        assert manager.is_running("vllm") is False
|
||||
|
||||
|
||||
def test_is_running_returns_true_when_proc_alive_and_port_open(manager):
    """Running means: tracked process alive AND its port accepts connections."""
    live_proc = MagicMock()
    live_proc.poll.return_value = None  # still alive
    manager._procs["vllm"] = live_proc

    conn = MagicMock()
    conn.__enter__ = MagicMock(return_value=conn)
    conn.__exit__ = MagicMock(return_value=False)
    with patch("socket.create_connection", return_value=conn):
        assert manager.is_running("vllm") is True


def test_is_running_unknown_service_returns_false(manager):
    """Unknown service names are simply reported as not running."""
    assert manager.is_running("nonexistent") is False


def test_is_running_no_managed_spec_returns_false(manager):
    """Services without a managed ProcessSpec are never reported as running."""
    assert manager.is_running("no_managed") is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# start
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_start_launches_process_and_returns_url(manager):
    """start() renders the args template and spawns the executable."""
    with patch("subprocess.Popen") as popen, \
            patch.object(manager, "is_running", return_value=False):
        popen.return_value = MagicMock()
        url = manager.start("vllm", gpu_id=0, params={"model": "mymodel"})

    assert url == "http://127.0.0.1:8000"
    popen.assert_called_once()
    argv = popen.call_args[0][0]
    assert argv[0] == "/usr/bin/python"
    # Both template placeholders must have been substituted into the argv.
    assert "--port" in argv
    assert "8000" in argv
    assert "--gpu-id" in argv
    assert "0" in argv
|
||||
|
||||
|
||||
def test_start_returns_url_immediately_when_already_running(manager):
    """start() is idempotent: a running service is not spawned again."""
    with patch.object(manager, "is_running", return_value=True), \
            patch("subprocess.Popen") as popen:
        url = manager.start("vllm", gpu_id=0, params={})

    assert url == "http://127.0.0.1:8000"
    popen.assert_not_called()


def test_start_raises_for_unknown_service(manager):
    """Starting a service absent from the profile is a caller error."""
    with pytest.raises(ValueError, match="not in profile"):
        manager.start("nonexistent", gpu_id=0, params={})


def test_start_stores_proc_in_procs(manager):
    """The spawned Popen handle is tracked for later stop()/is_running()."""
    spawned = MagicMock()
    with patch("subprocess.Popen", return_value=spawned), \
            patch.object(manager, "is_running", return_value=False):
        manager.start("vllm", gpu_id=0, params={})

    assert manager._procs["vllm"] is spawned
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# stop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_stop_terminates_running_process(manager):
    """stop() terminates, waits for exit, and drops the tracked handle."""
    tracked = MagicMock()
    manager._procs["vllm"] = tracked

    assert manager.stop("vllm") is True
    tracked.terminate.assert_called_once()
    tracked.wait.assert_called_once()
    assert "vllm" not in manager._procs


def test_stop_kills_process_that_wont_terminate(manager):
    """If wait() fails (e.g. a timeout), stop() escalates to kill()."""
    stubborn = MagicMock()
    stubborn.wait.side_effect = Exception("timeout")
    manager._procs["vllm"] = stubborn

    assert manager.stop("vllm") is True
    stubborn.kill.assert_called_once()
|
||||
|
||||
|
||||
def test_stop_returns_true_when_no_proc_tracked(manager):
    """Stopping a known service with no tracked proc is an idempotent no-op."""
    assert manager.stop("vllm") is True


def test_stop_returns_false_for_unknown_service(manager):
    """Stopping a service not in the profile reports failure."""
    assert manager.stop("nonexistent") is False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# list_running / get_url
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_list_running_returns_running_services(manager):
    """list_running() filters the profile's services through is_running()."""
    with patch.object(manager, "is_running", side_effect=lambda svc: svc == "vllm"):
        assert manager.list_running() == ["vllm"]


def test_get_url_returns_none_when_not_running(manager):
    """No URL is advertised for a stopped service."""
    with patch.object(manager, "is_running", return_value=False):
        assert manager.get_url("vllm") is None


def test_get_url_returns_url_when_running(manager):
    """A running service advertises host:port from its ProcessSpec."""
    with patch.object(manager, "is_running", return_value=True):
        assert manager.get_url("vllm") == "http://127.0.0.1:8000"
|
||||
|
|
@ -1,86 +0,0 @@
|
|||
import time
|
||||
import dataclasses
|
||||
import pytest
|
||||
from circuitforge_core.resources.coordinator.service_registry import (
|
||||
ServiceRegistry, ServiceAllocation, ServiceInstance,
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
def registry():
    """Fresh, empty ServiceRegistry per test."""
    return ServiceRegistry()
|
||||
|
||||
|
||||
def test_allocate_creates_allocation(registry):
    """allocate() returns a populated allocation carrying a fresh id."""
    allocation = registry.allocate(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        model="Ouro-1.4B",
        url="http://heimdall:8000",
        caller="test",
        ttl_s=300.0,
    )
    assert allocation.service == "vllm"
    assert allocation.node_id == "heimdall"
    assert allocation.allocation_id  # non-empty UUID string
|
||||
|
||||
|
||||
def test_active_allocations_count(registry):
    """Each allocate() call adds one to the (service, node, gpu) count."""
    for caller in ("a", "b"):
        registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", caller, 300.0)
    assert registry.active_allocations("vllm", "heimdall", 0) == 2


def test_release_decrements_count(registry):
    """Releasing the only allocation brings the count back to zero."""
    allocation = registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "a", 300.0)
    registry.release(allocation.allocation_id)
    assert registry.active_allocations("vllm", "heimdall", 0) == 0


def test_release_nonexistent_returns_false(registry):
    """Releasing an unknown allocation id is reported, not raised."""
    assert registry.release("nonexistent-id") is False
|
||||
|
||||
|
||||
def test_upsert_instance_sets_running_state(registry):
    """upsert_instance() registers exactly one instance in the given state."""
    registry.upsert_instance("vllm", "heimdall", 0, state="running",
                             model="Ouro-1.4B", url="http://heimdall:8000")
    instances = registry.all_instances()
    assert len(instances) == 1
    assert instances[0].state == "running"


def test_release_last_alloc_marks_instance_idle(registry):
    """When the final allocation is released, the instance transitions to idle."""
    registry.upsert_instance("vllm", "heimdall", 0, state="running",
                             model="Ouro-1.4B", url="http://heimdall:8000")
    only_alloc = registry.allocate("vllm", "heimdall", 0, "Ouro-1.4B",
                                   "http://heimdall:8000", "a", 300.0)
    registry.release(only_alloc.allocation_id)

    instance = registry.all_instances()[0]
    assert instance.state == "idle"
    assert instance.idle_since is not None


def test_new_alloc_on_idle_instance_marks_it_running(registry):
    """An allocation against an idle instance wakes it back to 'running'."""
    registry.upsert_instance("vllm", "heimdall", 0, state="idle",
                             model="M", url="http://h:8000")
    registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "x", 300.0)
    assert registry.all_instances()[0].state == "running"
|
||||
|
||||
|
||||
def test_sweep_expired_allocations(registry):
    """sweep_expired_allocations() drops TTL-expired allocations and idles the instance.

    Uses a sub-second TTL (0.05 s) so the test does not stall the suite;
    the original slept 1.1 s of real wall-clock time against a 1 s TTL.
    """
    # Register a running instance so idle-transition logic has something to act on.
    registry.upsert_instance("vllm", "heimdall", 0, state="running",
                             model="M", url="http://h:8000")
    # Create an allocation with a very short TTL.
    alloc = registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000",
                              "caller", ttl_s=0.05)
    assert registry.active_allocations("vllm", "heimdall", 0) == 1

    # Wait (briefly) for the TTL to elapse.
    time.sleep(0.1)

    expired = registry.sweep_expired_allocations()

    # The allocation should have been swept.
    assert alloc.allocation_id in expired
    assert registry.active_allocations("vllm", "heimdall", 0) == 0

    # With no allocations remaining, the instance transitions to idle.
    instance = registry.all_instances()[0]
    assert instance.state == "idle"
    assert instance.idle_since is not None
|
||||
Loading…
Reference in a new issue