feat!: strip resources/ from MIT core — moves to circuitforge-orch (v0.8.0)

BREAKING CHANGE: circuitforge_core.resources is no longer available.
Import CFOrchClient from circuitforge_orch.client instead.
cf-orch CLI entry point is now in the circuitforge-orch package.
This commit is contained in:
pyr0ball 2026-04-04 22:34:27 -07:00
parent 2259382d0b
commit c244260d1c
63 changed files with 34 additions and 6571 deletions

View file

@ -2,15 +2,29 @@
Shared scaffold for CircuitForge products.
**Current version: 0.7.0**
## Modules
### Implemented
- `circuitforge_core.db` — SQLite connection factory and migration runner
- `circuitforge_core.llm` — LLM router with fallback chain
- `circuitforge_core.llm` — LLM router with fallback chain (Ollama, vLLM, Anthropic, OpenAI-compatible)
- `circuitforge_core.tiers` — Tier system with BYOK and local vision unlocks
- `circuitforge_core.config` — Env validation and .env loader
- `circuitforge_core.vision` — Vision router stub (v0.2+)
- `circuitforge_core.wizard` — First-run wizard base class stub
- `circuitforge_core.pipeline` — Staging queue stub (v0.2+)
- `circuitforge_core.hardware` — Hardware detection and LLM backend profile generation (VRAM tiers, GPU/CPU auto-select)
- `circuitforge_core.documents` — Document ingestion pipeline: PDF, DOCX, and image OCR → `StructuredDocument`
- `circuitforge_core.affiliates` — Affiliate URL wrapping with opt-out, BYOK user IDs, and CF env-var fallback (`wrap_url`)
- `circuitforge_core.preferences` — User preference store (local YAML file, pluggable backend); dot-path get/set API
- `circuitforge_core.tasks` — VRAM-aware LLM task scheduler; shared slot manager across services (`TaskScheduler`)
- `circuitforge_core.manage` — Cross-platform product process manager (Docker and native modes)
- `circuitforge_core.resources` — Resource coordinator and agent: VRAM allocation, eviction engine, GPU profile registry
### Stubs (in-tree, not yet implemented)
- `circuitforge_core.vision` — Vision router base class (planned: moondream2 / Claude vision dispatch)
- `circuitforge_core.wizard` — First-run wizard base class (products subclass `BaseWizard`)
- `circuitforge_core.pipeline` — Staging queue base (`StagingDB`; products provide concrete schema)
## Install

View file

@ -1 +1 @@
__version__ = "0.7.0"
__version__ = "0.8.0"

View file

@ -56,6 +56,12 @@ def _build_ebay_url(url: str, affiliate_id: str) -> str:
return f"{url}{sep}{params}"
def _build_instacart_url(url: str, affiliate_id: str) -> str:
"""Append Instacart affiliate parameter to a search URL."""
sep = "&" if "?" in url else "?"
return f"{url}{sep}aff={affiliate_id}"
def _build_amazon_url(url: str, affiliate_id: str) -> str:
"""Merge an Amazon Associates tag into a product URL's query string."""
parsed = urlparse(url)
@ -101,3 +107,10 @@ register_program(AffiliateProgram(
env_var="AMAZON_ASSOCIATES_TAG",
build_url=_build_amazon_url,
))
# Register Instacart so affiliate wrapping can route "instacart" retailer URLs;
# INSTACART_AFFILIATE_ID is the env-var source for the affiliate ID.
register_program(AffiliateProgram(
    name="Instacart",
    retailer_key="instacart",
    env_var="INSTACART_AFFILIATE_ID",
    build_url=_build_instacart_url,
))

View file

@ -1 +0,0 @@
from circuitforge_core.resources.client import CFOrchClient, Allocation # noqa: F401

View file

@ -1,105 +0,0 @@
from __future__ import annotations
import logging
from typing import Any
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from circuitforge_core.resources.agent.eviction_executor import EvictionExecutor
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
from circuitforge_core.resources.agent.service_manager import ServiceManager
logger = logging.getLogger(__name__)
class EvictRequest(BaseModel):
    """Body for POST /evict: which process to evict and how long to wait."""
    # PID of the target process.
    pid: int
    # Seconds allowed for a clean exit after SIGTERM before escalation.
    grace_period_s: float = 5.0
class ServiceStartRequest(BaseModel):
    """Body for POST /services/{service}/start."""
    # GPU index the service should run on.
    gpu_id: int = 0
    # Placeholder values substituted into the service's command template.
    params: dict[str, str] = {}
def create_agent_app(
    node_id: str,
    monitor: GpuMonitor | None = None,
    executor: EvictionExecutor | None = None,
    service_manager: ServiceManager | None = None,
) -> FastAPI:
    """Build the FastAPI app served by a cf-orch node agent.

    Args:
        node_id: Identifier this node is known by on the coordinator.
        monitor: GPU poller; a default GpuMonitor is created when omitted.
        executor: Eviction backend; a default EvictionExecutor when omitted.
        service_manager: Optional local-service manager. The /services*
            routes are registered only when one is supplied.

    Returns:
        The configured FastAPI application.
    """
    _monitor = monitor or GpuMonitor()
    _executor = executor or EvictionExecutor()
    app = FastAPI(title=f"cf-orch-agent [{node_id}]")

    @app.get("/health")
    def health() -> dict[str, Any]:
        # Simple liveness probe; also echoes the node id.
        return {"status": "ok", "node_id": node_id}

    @app.get("/gpu-info")
    def gpu_info() -> dict[str, Any]:
        # Per-GPU VRAM snapshot from the monitor (nvidia-smi backed).
        gpus = _monitor.poll()
        return {
            "node_id": node_id,
            "gpus": [
                {
                    "gpu_id": g.gpu_id,
                    "name": g.name,
                    "vram_total_mb": g.vram_total_mb,
                    "vram_used_mb": g.vram_used_mb,
                    "vram_free_mb": g.vram_free_mb,
                }
                for g in gpus
            ],
        }

    @app.post("/evict")
    def evict(req: EvictRequest) -> dict[str, Any]:
        # Terminate a GPU-resident process via the eviction executor.
        result = _executor.evict_pid(pid=req.pid, grace_period_s=req.grace_period_s)
        return {
            "success": result.success,
            "method": result.method,
            "message": result.message,
        }

    @app.get("/resident-info")
    def resident_info() -> dict[str, Any]:
        """Return which models are currently loaded in each running managed service."""
        if service_manager is None:
            return {"residents": []}
        # Local import; NOTE(review): presumably deferred to avoid an import
        # cycle or startup cost — confirm.
        from circuitforge_core.resources.agent.service_probe import probe_all
        return {"residents": probe_all(service_manager)}

    # Service lifecycle routes exist only when a ServiceManager was provided.
    if service_manager is not None:

        @app.get("/services")
        def list_services() -> dict:
            return {"running": service_manager.list_running()}

        @app.get("/services/{service}")
        def service_status(service: str) -> dict:
            running = service_manager.is_running(service)
            url = service_manager.get_url(service) if running else None
            return {"service": service, "running": running, "url": url}

        @app.post("/services/{service}/start")
        def start_service(service: str, req: ServiceStartRequest) -> dict:
            try:
                already_running = service_manager.is_running(service)
                url = service_manager.start(service, req.gpu_id, req.params)
                # adopted=True signals the coordinator to treat this instance as
                # immediately running rather than waiting for the probe loop.
                adopted = already_running and service_manager.is_running(service)
                return {"service": service, "url": url, "running": True, "adopted": adopted}
            except (ValueError, NotImplementedError) as exc:
                # Unknown service / unsupported spec type — caller error.
                raise HTTPException(status_code=422, detail=str(exc))
            except Exception as exc:
                raise HTTPException(status_code=500, detail=f"Failed to start {service}: {exc}")

        @app.post("/services/{service}/stop")
        def stop_service(service: str) -> dict:
            stopped = service_manager.stop(service)
            return {"service": service, "stopped": stopped}

    return app

View file

@ -1,85 +0,0 @@
from __future__ import annotations
import logging
import os
import signal
import time
from dataclasses import dataclass
import psutil
logger = logging.getLogger(__name__)
_DEFAULT_GRACE_S = 5.0
@dataclass(frozen=True)
class EvictionResult:
    """Outcome of one eviction attempt."""
    # True when the target process is (or was already) gone.
    success: bool
    method: str  # "sigterm", "sigkill", "already_gone", "not_found", "error"
    # Human-readable detail suitable for logs and API responses.
    message: str
class EvictionExecutor:
    """Evicts processes from the GPU: SIGTERM first, SIGKILL after a grace period."""

    def __init__(self, grace_period_s: float = _DEFAULT_GRACE_S) -> None:
        # Fallback grace period when evict_pid() is called without one.
        self._default_grace = grace_period_s

    def evict_pid(
        self,
        pid: int,
        grace_period_s: float | None = None,
    ) -> EvictionResult:
        """Terminate *pid*, escalating to SIGKILL if it outlives the grace period."""
        grace = self._default_grace if grace_period_s is None else grace_period_s

        # Guard clauses: invalid and unknown PIDs are failures, not signals.
        if pid <= 0:
            return EvictionResult(
                success=False, method="error",
                message=f"Refusing to signal invalid PID {pid}"
            )
        if not psutil.pid_exists(pid):
            return EvictionResult(
                success=False, method="not_found",
                message=f"PID {pid} not found"
            )

        try:
            os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            # Raced with the process exiting on its own.
            return EvictionResult(
                success=True, method="already_gone",
                message=f"PID {pid} vanished before SIGTERM"
            )
        except PermissionError as exc:
            return EvictionResult(
                success=False, method="error",
                message=f"Permission denied terminating PID {pid}: {exc}"
            )

        if self._wait_for_exit(pid, grace):
            logger.info("PID %d exited cleanly after SIGTERM", pid)
            return EvictionResult(
                success=True, method="sigterm",
                message=f"PID {pid} exited after SIGTERM"
            )

        # Grace period elapsed — escalate to SIGKILL.
        if psutil.pid_exists(pid):
            try:
                os.kill(pid, signal.SIGKILL)
                logger.warning("PID %d required SIGKILL", pid)
                return EvictionResult(
                    success=True, method="sigkill",
                    message=f"PID {pid} killed with SIGKILL"
                )
            except ProcessLookupError:
                pass
        # Process disappeared between the last existence check and the signal.
        return EvictionResult(
            success=True, method="sigkill",
            message=f"PID {pid} is gone"
        )

    @staticmethod
    def _wait_for_exit(pid: int, grace: float) -> bool:
        """Poll every 50 ms until *pid* disappears; False once *grace* seconds pass."""
        deadline = time.monotonic() + grace
        while time.monotonic() < deadline:
            if not psutil.pid_exists(pid):
                return True
            time.sleep(0.05)
        return False

View file

@ -1,52 +0,0 @@
from __future__ import annotations
import logging
import subprocess
from circuitforge_core.resources.models import GpuInfo
logger = logging.getLogger(__name__)
# nvidia-smi invocation emitting one CSV row per GPU:
# index, name, memory.total, memory.used, memory.free (no header, no unit suffix).
_NVIDIA_SMI_CMD = [
    "nvidia-smi",
    "--query-gpu=index,name,memory.total,memory.used,memory.free",
    "--format=csv,noheader,nounits",
]
class GpuMonitor:
    """Samples per-GPU VRAM usage by shelling out to nvidia-smi."""

    def poll(self) -> list[GpuInfo]:
        """Return the current GPU list; empty when nvidia-smi is missing or fails."""
        try:
            proc = subprocess.run(
                _NVIDIA_SMI_CMD,
                capture_output=True,
                text=True,
                timeout=5,
            )
        except (FileNotFoundError, subprocess.TimeoutExpired) as exc:
            logger.warning("nvidia-smi unavailable: %s", exc)
            return []
        if proc.returncode != 0:
            logger.warning("nvidia-smi exited %d", proc.returncode)
            return []
        return self._parse(proc.stdout)

    def _parse(self, output: str) -> list[GpuInfo]:
        """Parse nvidia-smi CSV rows into GpuInfo records, skipping malformed rows."""
        parsed: list[GpuInfo] = []
        for row in output.strip().splitlines():
            fields = [field.strip() for field in row.split(",")]
            if len(fields) != 5:
                continue
            try:
                record = GpuInfo(
                    gpu_id=int(fields[0]),
                    name=fields[1],
                    vram_total_mb=int(fields[2]),
                    vram_used_mb=int(fields[3]),
                    vram_free_mb=int(fields[4]),
                )
            except ValueError:
                # Non-numeric field — log and move on.
                logger.debug("Skipping malformed nvidia-smi line: %r", row)
                continue
            parsed.append(record)
        return parsed

View file

@ -1,186 +0,0 @@
"""
ServiceManager start/stop Docker containers and processes for cf-orch managed services.
Container naming convention: cf-orch-{service}-{node_id}
"""
from __future__ import annotations
import os
import re
import subprocess
from collections import defaultdict
from typing import Any
from circuitforge_core.resources.profiles.schema import DockerSpec, GpuProfile, ProcessSpec
def _expand_volume(v: str) -> str:
"""Expand bash-style volume strings including ${VAR:-default} and $VAR."""
def _sub(m: re.Match) -> str: # type: ignore[type-arg]
var, default = m.group(1), m.group(2) or ""
return os.environ.get(var) or default
v = re.sub(r"\$\{(\w+)(?::-(.*?))?\}", _sub, v)
v = re.sub(r"\$(\w+)", lambda m: os.environ.get(m.group(1), m.group(0)), v)
return v
class ServiceManager:
    """Start/stop Docker containers and native processes for cf-orch services.

    Containers are named ``cf-orch-{service}-{node_id}`` so several nodes on
    one Docker host never collide.
    """

    def __init__(
        self,
        node_id: str,
        profile: GpuProfile,
        advertise_host: str = "127.0.0.1",
    ) -> None:
        self.node_id = node_id
        self.profile = profile
        # Host other machines should use in URLs for services we start.
        self.advertise_host = advertise_host
        # Popen handles for native (ProcessSpec) services we spawned ourselves.
        self._procs: dict[str, Any] = {}

    def container_name(self, service: str) -> str:
        """Return the Docker container name used for *service* on this node."""
        return f"cf-orch-{service}-{self.node_id}"

    def _get_spec(self, service: str) -> DockerSpec | ProcessSpec | None:
        """Look up the managed-launch spec for *service* in the active profile."""
        svc = self.profile.services.get(service)
        if svc is None:
            return None
        return svc.managed

    def is_running(self, service: str) -> bool:
        """True when *service* is up: Docker state, adopt health, or live PID + port."""
        spec = self._get_spec(service)
        if spec is None:
            return False
        if isinstance(spec, DockerSpec):
            try:
                result = subprocess.run(
                    [
                        "docker",
                        "inspect",
                        "--format",
                        "{{.State.Running}}",
                        self.container_name(service),
                    ],
                    capture_output=True,
                    text=True,
                    check=True,
                )
                return result.stdout.strip() == "true"
            except subprocess.CalledProcessError:
                # Container missing (or docker error) — treat as not running.
                return False
        if isinstance(spec, ProcessSpec):
            # For adopt=True services, check the health endpoint regardless of whether
            # we spawned the process (it may be a system daemon we didn't start).
            if spec.adopt:
                return self._probe_health(spec.host_port, spec.health_path)
            proc = self._procs.get(service)
            if proc is None or proc.poll() is not None:
                return False
            # Process is alive; confirm it is actually listening on its port.
            import socket
            try:
                with socket.create_connection(("127.0.0.1", spec.host_port), timeout=1):
                    return True
            except OSError:
                return False
        return False

    def _probe_health(self, port: int, health_path: str = "/health") -> bool:
        """Return True if the service at localhost:port responds 200 on health_path."""
        import urllib.request
        try:
            url = f"http://127.0.0.1:{port}{health_path}"
            with urllib.request.urlopen(url, timeout=2.0) as resp:
                return resp.status == 200
        except Exception:
            # Connection refused, timeout, non-HTTP response — all mean "not healthy".
            return False

    def start(self, service: str, gpu_id: int, params: dict[str, str]) -> str:
        """Start *service* pinned to *gpu_id* and return its base URL.

        Idempotent: an already-running service just returns its URL.

        Raises:
            ValueError: *service* is not in the profile or has no managed spec.
            NotImplementedError: the spec is neither DockerSpec nor ProcessSpec.
            subprocess.CalledProcessError: ``docker run`` failed.
        """
        spec = self._get_spec(service)
        if spec is None:
            raise ValueError(f"Service {service!r} not in profile or has no managed spec")
        if self.is_running(service):
            return f"http://{self.advertise_host}:{spec.host_port}"
        if isinstance(spec, DockerSpec):
            expanded_volumes = [_expand_volume(v) for v in spec.volumes]
            # defaultdict(str): template placeholders the caller omitted expand to "".
            filler: dict[str, str] = defaultdict(str, params)
            expanded_command = spec.command_template.format_map(filler).split()
            cmd = [
                "docker", "run", "-d", "--rm",
                "--name", self.container_name(service),
                "--runtime", spec.runtime,
                "--gpus", f"device={gpu_id}",
                "--ipc", spec.ipc,
                "-p", f"{spec.host_port}:{spec.port}",
            ]
            for vol in expanded_volumes:
                cmd += ["-v", vol]
            for key, val in spec.env.items():
                cmd += ["-e", f"{key}={val}"]
            cmd.append(spec.image)
            cmd.extend(expanded_command)
            subprocess.run(cmd, check=True, capture_output=True, text=True)
            return f"http://{self.advertise_host}:{spec.host_port}"
        if isinstance(spec, ProcessSpec):
            # adopt=True: if the service is already healthy, claim it without spawning.
            if spec.adopt and self._probe_health(spec.host_port, spec.health_path):
                return f"http://{self.advertise_host}:{spec.host_port}"
            filler = defaultdict(str, params)
            filler.setdefault("port", str(spec.port))
            filler.setdefault("gpu_id", str(gpu_id))
            args_expanded = spec.args_template.format_map(filler).split()
            cmd = [spec.exec_path] + args_expanded
            # Inherit the parent environment (was `{**__import__("os").environ}`;
            # `os` is already imported at module level).
            env = os.environ.copy()
            proc = subprocess.Popen(
                cmd,
                cwd=spec.cwd or None,
                env=env,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
            self._procs[service] = proc
            return f"http://{self.advertise_host}:{spec.host_port}"
        raise NotImplementedError(f"Unknown spec type: {type(spec)}")

    def stop(self, service: str) -> bool:
        """Stop *service*; True when something was actually stopped."""
        spec = self._get_spec(service)
        if spec is None:
            return False
        if isinstance(spec, DockerSpec):
            try:
                subprocess.run(
                    ["docker", "stop", self.container_name(service)],
                    check=True,
                    capture_output=True,
                    text=True,
                )
                return True
            except subprocess.CalledProcessError:
                return False
        if isinstance(spec, ProcessSpec):
            proc = self._procs.pop(service, None)
            if proc is not None:
                proc.terminate()
                try:
                    proc.wait(timeout=10)
                except Exception:
                    # Did not exit in time — force kill.
                    proc.kill()
                return True
        return False

    def list_running(self) -> list[str]:
        """Names of profile services that are currently running."""
        return [svc for svc in self.profile.services if self.is_running(svc)]

    def get_url(self, service: str) -> str | None:
        """Base URL for *service*, or None when unknown or not running."""
        spec = self._get_spec(service)
        if spec is None or not self.is_running(service):
            return None
        return f"http://{self.advertise_host}:{spec.host_port}"

View file

@ -1,123 +0,0 @@
"""
Probe running services to detect which models are currently loaded in VRAM.
Two probe strategies run together:
1. Well-known ports always checked, regardless of who started the service.
Catches ollama, vLLM, etc. running outside cf-orch management.
2. Managed services — services cf-orch started via ServiceManager.
Checked on their configured host_port, deduplicates with well-known results.
Each service exposes a different introspection API:
- vllm: GET /v1/models {"data": [{"id": "<model-name>"}]}
- ollama: GET /api/ps {"models": [{"name": "<model>", "size_vram": <bytes>}]}
ollama can have multiple models loaded simultaneously; each is reported as a
separate entry so the dashboard shows per-model residency.
The probe is best-effort: a timeout or connection refusal means model_name=None
but the service is still reported as resident.
"""
from __future__ import annotations
import json
import logging
import urllib.request
from typing import Any
from circuitforge_core.resources.profiles.schema import DockerSpec
logger = logging.getLogger(__name__)
# Per-request probe timeout; probes are best-effort and must stay fast.
_PROBE_TIMEOUT_S = 2.0
# Well-known service ports probed on every heartbeat.
# Maps port -> service name (the lookup key into _PROBERS).
_WELL_KNOWN_PORTS: dict[int, str] = {
    11434: "ollama",
    8000: "vllm",
    8080: "vllm",  # common alt vLLM port
}
def _fetch_json(url: str) -> dict[str, Any] | None:
    """GET *url* and decode its JSON body; any failure whatsoever yields None.

    Best-effort by design — a probe must never take the caller down.
    """
    try:
        with urllib.request.urlopen(url, timeout=_PROBE_TIMEOUT_S) as resp:
            payload = resp.read()
            return json.loads(payload)
    except Exception as exc:
        logger.debug("Probe %s: %s", url, exc)
        return None
def _probe_vllm(port: int) -> list[str]:
    """Return model IDs served by a vLLM instance on *port* (empty on failure)."""
    payload = _fetch_json(f"http://127.0.0.1:{port}/v1/models")
    if not payload:
        return []
    return [entry["id"] for entry in payload.get("data", []) if entry.get("id")]
def _probe_ollama(port: int) -> list[str]:
    """Return models loaded in an ollama instance on *port* (empty on failure).

    /api/ps lists models currently *loaded in memory*, not just downloaded.
    """
    payload = _fetch_json(f"http://127.0.0.1:{port}/api/ps")
    if not payload:
        return []
    return [entry["name"] for entry in payload.get("models", []) if entry.get("name")]
# Dispatch table: service name -> probe function returning loaded model names.
_PROBERS: dict[str, Any] = {
    "vllm": _probe_vllm,
    "ollama": _probe_ollama,
}
def probe_all(service_manager: Any) -> list[dict[str, Any]]:
    """
    Probe all services both well-known ports and cf-orch managed services.
    Returns a list of dicts: [{"service": str, "model_name": str | None}].
    Multiple loaded models in one service (e.g. two ollama models) each get
    their own entry, disambiguated as "ollama/0", "ollama/1", etc.
    """
    results: list[dict[str, Any]] = []
    seen_ports: set[int] = set()

    def _emit(service: str, models: list[str]) -> None:
        # One result row per loaded model; a single None row when nothing is
        # loaded (or the service has no prober). Factored out — the same
        # formatting was previously duplicated in both probe phases.
        if not models:
            results.append({"service": service, "model_name": None})
        elif len(models) == 1:
            results.append({"service": service, "model_name": models[0]})
        else:
            for i, model in enumerate(models):
                results.append({"service": f"{service}/{i}", "model_name": model})

    # ── 1. Well-known ports ──────────────────────────────────────────
    for port, service in _WELL_KNOWN_PORTS.items():
        prober = _PROBERS.get(service)
        if prober is None:
            continue
        models = prober(port)
        if not models:
            continue  # nothing on this port right now
        seen_ports.add(port)
        _emit(service, models)

    # ── 2. Managed services (cf-orch started) ───────────────────────
    if service_manager is not None:
        for service in service_manager.list_running():
            spec = service_manager._get_spec(service)
            if not isinstance(spec, DockerSpec):
                continue
            if spec.host_port in seen_ports:
                continue  # already captured by well-known probe
            prober = _PROBERS.get(service)
            if prober is None:
                # Running but un-probeable: report residency without a model.
                _emit(service, [])
                continue
            models = prober(spec.host_port)
            seen_ports.add(spec.host_port)
            _emit(service, models)
    return results

View file

@ -1,234 +0,0 @@
from __future__ import annotations
import logging
import sys
from pathlib import Path
from typing import Annotated, Optional
import typer
import uvicorn
logger = logging.getLogger(__name__)
app = typer.Typer(name="cf-orch", help="CircuitForge GPU resource orchestrator")
# Destination for `cf-orch install-service` (writing it requires root).
_SYSTEMD_UNIT_PATH = Path("/etc/systemd/system/cf-orch.service")
# {python} is filled with sys.executable so the unit runs in the same interpreter/venv.
_SYSTEMD_UNIT_TEMPLATE = """\
[Unit]
Description=CircuitForge GPU Resource Orchestrator
After=network.target
[Service]
Type=simple
ExecStart={python} -m circuitforge_core.resources.cli start
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target
"""
@app.command()
def start(
    profile: Annotated[Optional[Path], typer.Option(help="Profile YAML path")] = None,
    host: str = "0.0.0.0",
    port: int = 7700,
    node_id: str = "local",
    agent_port: int = 7701,
) -> None:
    """Start the cf-orch coordinator (auto-detects GPU profile if not specified).
    Automatically pre-registers the local agent so its GPUs appear on the
    dashboard immediately. Remote nodes self-register via POST /api/nodes.
    """
    # Coordinator pieces are imported inside the command; NOTE(review):
    # presumably to keep `cf-orch --help` fast and optional deps lazy — confirm.
    from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
    from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
    from circuitforge_core.resources.coordinator.app import create_coordinator_app
    from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
    from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
    from circuitforge_core.resources.coordinator.node_store import NodeStore
    # Core coordinator state: leases, profiles, services, persisted node list.
    lease_manager = LeaseManager()
    profile_registry = ProfileRegistry()
    service_registry = ServiceRegistry()
    node_store = NodeStore()
    supervisor = AgentSupervisor(
        lease_manager=lease_manager,
        service_registry=service_registry,
        profile_registry=profile_registry,
        node_store=node_store,
    )
    # Re-load nodes persisted by a previous coordinator run.
    restored = supervisor.restore_from_store()
    if restored:
        typer.echo(f"Restored {restored} known node(s) from previous session")
    monitor = GpuMonitor()
    gpus = monitor.poll()
    if not gpus:
        typer.echo(
            "Warning: no GPUs detected via nvidia-smi — coordinator running with 0 VRAM"
        )
    else:
        typer.echo(f"Detected {len(gpus)} GPU(s)")
    if profile:
        active_profile = profile_registry.load(profile)
        typer.echo(f"Using profile: {active_profile.name} (from {profile})")
    else:
        # No GPUs -> fall back to the last public profile. NOTE(review):
        # presumably the most conservative entry — confirm list ordering.
        active_profile = (
            profile_registry.auto_detect(gpus)
            if gpus
            else profile_registry.list_public()[-1]
        )
        typer.echo(f"Auto-selected profile: {active_profile.name}")
    # Pre-register the local agent — the heartbeat loop will poll it for live GPU data.
    local_agent_url = f"http://127.0.0.1:{agent_port}"
    supervisor.register(node_id, local_agent_url)
    typer.echo(f"Registered local node '{node_id}'{local_agent_url}")
    coordinator_app = create_coordinator_app(
        lease_manager=lease_manager,
        profile_registry=profile_registry,
        agent_supervisor=supervisor,
        service_registry=service_registry,
    )
    typer.echo(f"Starting cf-orch coordinator on {host}:{port}")
    # Blocks here serving HTTP until interrupted.
    uvicorn.run(coordinator_app, host=host, port=port)
@app.command()
def agent(
    coordinator: str = "http://localhost:7700",
    node_id: str = "local",
    host: str = "0.0.0.0",
    port: int = 7701,
    advertise_host: Optional[str] = None,
    profile: Annotated[Optional[Path], typer.Option(help="Profile YAML path")] = None,
) -> None:
    """Start a cf-orch node agent and self-register with the coordinator.
    The agent starts its HTTP server, then POSTs its URL to the coordinator
    so it appears on the dashboard without manual configuration.
    Use --advertise-host to override the IP the coordinator should use to
    reach this agent (e.g. on a multi-homed or NATted host).
    """
    import threading
    import httpx
    from circuitforge_core.resources.agent.app import create_agent_app
    from circuitforge_core.resources.agent.service_manager import ServiceManager
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
    # The URL the coordinator should use to reach this agent.
    reach_host = advertise_host or ("127.0.0.1" if host in ("0.0.0.0", "::") else host)
    agent_url = f"http://{reach_host}:{port}"
    _RECONNECT_INTERVAL_S = 30.0
    def _reconnect_loop() -> None:
        """
        Persistently re-register this agent with the coordinator.
        Runs as a daemon thread for the lifetime of the agent process:
        - Waits 2 s on first run (uvicorn needs time to bind)
        - Re-registers every 30 s thereafter
        - If the coordinator is down, silently retries no crashing
        - When the coordinator restarts, the agent re-appears within one cycle
        This means coordinator restarts require no manual intervention on agent hosts.
        """
        import time
        first = True
        while True:
            time.sleep(2.0 if first else _RECONNECT_INTERVAL_S)
            first = False
            try:
                resp = httpx.post(
                    f"{coordinator}/api/nodes",
                    json={"node_id": node_id, "agent_url": agent_url},
                    timeout=5.0,
                )
                if resp.is_success:
                    logger.debug("Registered with coordinator at %s as '%s'", coordinator, node_id)
                else:
                    logger.warning(
                        "Coordinator registration returned %s", resp.status_code
                    )
            except Exception as exc:
                # Coordinator down — stay quiet and retry next cycle.
                logger.debug("Coordinator at %s unreachable, will retry: %s", coordinator, exc)
    # Fire reconnect loop in a daemon thread so uvicorn.run() can start blocking immediately.
    threading.Thread(target=_reconnect_loop, daemon=True, name="cf-orch-reconnect").start()
    typer.echo(f"Reconnect loop started — will register with {coordinator} every {int(_RECONNECT_INTERVAL_S)}s")
    service_manager = None
    try:
        from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
        pr = ProfileRegistry()
        gpus = GpuMonitor().poll()
        p = pr.load(Path(profile)) if profile else pr.auto_detect(gpus)
        service_manager = ServiceManager(node_id=node_id, profile=p, advertise_host=reach_host)
        typer.echo(f"ServiceManager ready with profile: {p.name}")
    except Exception as exc:
        # Best-effort: without a ServiceManager the agent still serves
        # /health, /gpu-info and /evict — only /services* routes are skipped.
        typer.echo(f"Warning: ServiceManager unavailable ({exc})", err=True)
    agent_app = create_agent_app(node_id=node_id, service_manager=service_manager)
    typer.echo(f"Starting cf-orch agent [{node_id}] on {host}:{port}")
    uvicorn.run(agent_app, host=host, port=port)
@app.command()
def status(coordinator: str = "http://localhost:7700") -> None:
    """Show GPU and lease status from the coordinator."""
    import httpx
    try:
        resp = httpx.get(f"{coordinator}/api/nodes", timeout=5.0)
        resp.raise_for_status()
        nodes = resp.json().get("nodes", [])
        for node in nodes:
            typer.echo(f"\nNode: {node['node_id']}")
            for gpu in node.get("gpus", []):
                # NOTE(review): used/total prints directly after the GPU name —
                # a separator may have been lost; confirm intended format.
                typer.echo(
                    f" GPU {gpu['gpu_id']}: {gpu['name']}"
                    f"{gpu['vram_used_mb']}/{gpu['vram_total_mb']} MB used"
                )
    except Exception as exc:
        # Any failure (network, HTTP status, bad JSON) reads as "unreachable".
        typer.echo(f"Coordinator unreachable at {coordinator}: {exc}", err=True)
        raise typer.Exit(1)
@app.command("install-service")
def install_service(
dry_run: bool = typer.Option(
False, "--dry-run", help="Print unit file without writing"
),
) -> None:
"""Write a systemd unit file for cf-orch (requires root)."""
python = sys.executable
unit_content = _SYSTEMD_UNIT_TEMPLATE.format(python=python)
if dry_run:
typer.echo(f"Would write to {_SYSTEMD_UNIT_PATH}:\n")
typer.echo(unit_content)
return
try:
_SYSTEMD_UNIT_PATH.write_text(unit_content)
typer.echo(f"Written: {_SYSTEMD_UNIT_PATH}")
typer.echo(
"Run: sudo systemctl daemon-reload && sudo systemctl enable --now cf-orch"
)
except PermissionError:
typer.echo(
f"Permission denied writing to {_SYSTEMD_UNIT_PATH}. Run as root.", err=True
)
raise typer.Exit(1)
if __name__ == "__main__":
app()

View file

@ -1,143 +0,0 @@
from __future__ import annotations
import logging
import os
from contextlib import contextmanager, asynccontextmanager
from dataclasses import dataclass
import httpx
logger = logging.getLogger(__name__)
@dataclass
class Allocation:
    """A granted cf-orch lease: where a service runs and how to reach it."""
    # Coordinator-issued ID; used to DELETE (release) the lease on exit.
    allocation_id: str
    # Service kind that was requested (e.g. "vllm").
    service: str
    # Placement chosen by the coordinator.
    node_id: str
    gpu_id: int
    # Model selected from model_candidates, when the response includes one.
    model: str | None
    # Base URL of the inference endpoint.
    url: str
    # NOTE(review): presumably True when the coordinator had to start the
    # service for this lease — confirm against coordinator response schema.
    started: bool
    # NOTE(review): presumably True when the model was already resident — confirm.
    warm: bool
class CFOrchClient:
    """
    Client for cf-orch coordinator allocation.

    Sync usage (in LLMRouter or other sync code):
        client = CFOrchClient(os.environ["CF_ORCH_URL"])
        with client.allocate("vllm", model_candidates=["Ouro-1.4B"]) as alloc:
            # alloc.url is the inference endpoint

    Async usage (in FastAPI apps):
        async with client.allocate_async("vllm", model_candidates=["Ouro-1.4B"]) as alloc:
            ...

    Authentication:
        Pass api_key explicitly, or set CF_LICENSE_KEY env var. When set, every
        request carries Authorization: Bearer <key>. Required for the hosted
        CircuitForge coordinator (orch.circuitforge.tech); optional for local
        self-hosted coordinators.

    Raises ValueError immediately if coordinator_url is empty.
    """

    def __init__(self, coordinator_url: str, api_key: str | None = None) -> None:
        """Store the coordinator base URL and resolve the API key.

        Raises:
            ValueError: when coordinator_url is empty/falsy.
        """
        if not coordinator_url:
            raise ValueError("coordinator_url is empty — cf-orch not configured")
        self._url = coordinator_url.rstrip("/")
        # Explicit key wins; CF_LICENSE_KEY is the environment fallback.
        self._api_key = api_key or os.environ.get("CF_LICENSE_KEY", "")

    def _headers(self) -> dict[str, str]:
        # Bearer auth only when a key is configured (self-hosted needs none).
        if self._api_key:
            return {"Authorization": f"Bearer {self._api_key}"}
        return {}

    def _build_body(self, model_candidates: list[str] | None, ttl_s: float, caller: str) -> dict:
        """Assemble the JSON body for an allocation request."""
        return {
            "model_candidates": model_candidates or [],
            "ttl_s": ttl_s,
            "caller": caller,
        }

    def _parse_allocation(self, data: dict, service: str) -> Allocation:
        """Convert a coordinator allocation response payload into an Allocation."""
        return Allocation(
            allocation_id=data["allocation_id"],
            service=service,
            node_id=data["node_id"],
            gpu_id=data["gpu_id"],
            model=data.get("model"),
            url=data["url"],
            started=data.get("started", False),
            warm=data.get("warm", False),
        )

    @contextmanager
    def allocate(
        self,
        service: str,
        *,
        model_candidates: list[str] | None = None,
        ttl_s: float = 3600.0,
        caller: str = "",
    ):
        """Sync context manager. Allocates on enter, releases on exit.

        Raises:
            RuntimeError: when the coordinator rejects the allocation.
        """
        # 120 s timeout; NOTE(review): presumably sized to cover a service
        # cold-start on the coordinator side — confirm.
        resp = httpx.post(
            f"{self._url}/api/services/{service}/allocate",
            json=self._build_body(model_candidates, ttl_s, caller),
            headers=self._headers(),
            timeout=120.0,
        )
        if not resp.is_success:
            raise RuntimeError(
                f"cf-orch allocation failed for {service!r}: "
                f"HTTP {resp.status_code}{resp.text[:200]}"
            )
        alloc = self._parse_allocation(resp.json(), service)
        try:
            yield alloc
        finally:
            # Release is best-effort: a failed DELETE is only logged, never raised,
            # so it cannot mask an exception from the caller's body.
            try:
                httpx.delete(
                    f"{self._url}/api/services/{service}/allocations/{alloc.allocation_id}",
                    headers=self._headers(),
                    timeout=10.0,
                )
            except Exception as exc:
                logger.debug("cf-orch release failed (non-fatal): %s", exc)

    @asynccontextmanager
    async def allocate_async(
        self,
        service: str,
        *,
        model_candidates: list[str] | None = None,
        ttl_s: float = 3600.0,
        caller: str = "",
    ):
        """Async context manager. Allocates on enter, releases on exit.

        Raises:
            RuntimeError: when the coordinator rejects the allocation.
        """
        # One AsyncClient spans allocate + caller's body + release.
        async with httpx.AsyncClient(timeout=120.0) as client:
            resp = await client.post(
                f"{self._url}/api/services/{service}/allocate",
                json=self._build_body(model_candidates, ttl_s, caller),
                headers=self._headers(),
            )
            if not resp.is_success:
                raise RuntimeError(
                    f"cf-orch allocation failed for {service!r}: "
                    f"HTTP {resp.status_code}{resp.text[:200]}"
                )
            alloc = self._parse_allocation(resp.json(), service)
            try:
                yield alloc
            finally:
                # Best-effort release, mirroring the sync path.
                try:
                    await client.delete(
                        f"{self._url}/api/services/{service}/allocations/{alloc.allocation_id}",
                        headers=self._headers(),
                        timeout=10.0,
                    )
                except Exception as exc:
                    logger.debug("cf-orch async release failed (non-fatal): %s", exc)

View file

@ -1,44 +0,0 @@
# circuitforge_core/resources/compose.yml
# One-command cf-orch deployment for Docker self-hosters:
# docker compose -f path/to/compose.yml up cf-orch-coordinator
services:
cf-orch-coordinator:
image: python:3.12-slim
command: >
sh -c "pip install 'circuitforge-core[orch]' &&
cf-orch start --host 0.0.0.0 --port 7700"
ports:
- "7700:7700"
volumes:
- /run/docker.sock:/var/run/docker.sock:ro
- cf-orch-data:/data
environment:
- CFORCH_PROFILE=${CFORCH_PROFILE:-}
restart: unless-stopped
devices:
- /dev/nvidia0:/dev/nvidia0
- /dev/nvidiactl:/dev/nvidiactl
runtime: nvidia
cf-orch-agent:
image: python:3.12-slim
command: >
sh -c "pip install 'circuitforge-core[orch]' &&
cf-orch agent --coordinator http://cf-orch-coordinator:7700
--node-id ${CFORCH_NODE_ID:-local}
--host 0.0.0.0 --port 7701"
ports:
- "7701:7701"
depends_on:
- cf-orch-coordinator
environment:
- CFORCH_NODE_ID=${CFORCH_NODE_ID:-local}
restart: unless-stopped
devices:
- /dev/nvidia0:/dev/nvidia0
- /dev/nvidiactl:/dev/nvidiactl
runtime: nvidia
volumes:
cf-orch-data:

View file

@ -1,209 +0,0 @@
from __future__ import annotations
import asyncio
import logging
import time
from dataclasses import dataclass, field
import httpx
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.node_store import NodeStore
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
from circuitforge_core.resources.models import GpuInfo, NodeInfo, ResidentAllocation
logger = logging.getLogger(__name__)
_HEARTBEAT_INTERVAL_S = 10.0
_AGENT_TIMEOUT_S = 5.0
@dataclass
class AgentRecord:
    """Coordinator-side view of one registered node agent."""
    node_id: str
    agent_url: str
    # Wall-clock time of the last successful poll.
    last_seen: float = field(default_factory=time.time)
    # GPU snapshot from the most recent /gpu-info poll.
    gpus: list[GpuInfo] = field(default_factory=list)
    # False until the heartbeat loop successfully reaches the agent.
    online: bool = False
class AgentSupervisor:
    """Registry and health poller for agent nodes.

    Keeps an in-memory map of known agents, polls each agent's HTTP API for
    GPU inventory and resident-model state, feeds the results into the
    LeaseManager, and periodically stops managed services that have sat idle
    past their profile-configured timeout.
    """

    def __init__(
        self,
        lease_manager: LeaseManager,
        service_registry: ServiceRegistry | None = None,
        profile_registry: ProfileRegistry | None = None,
        node_store: NodeStore | None = None,
    ) -> None:
        # node_id -> AgentRecord for every node we have ever been told about.
        self._agents: dict[str, AgentRecord] = {}
        self._lease_manager = lease_manager
        self._running = False
        self._service_registry = service_registry
        self._profile_registry = profile_registry
        self._node_store = node_store
        # Incremented once per heartbeat; the idle sweep runs every 3rd tick.
        self._heartbeat_tick = 0

    def restore_from_store(self) -> int:
        """
        Load previously-known nodes from NodeStore into the in-memory registry.
        All restored nodes start as offline=False. The heartbeat loop will poll
        them on its first tick and promote any that respond to online=True.
        Returns the number of nodes restored.
        """
        if self._node_store is None:
            return 0
        restored = 0
        for node_id, agent_url in self._node_store.all():
            if node_id not in self._agents:
                self._agents[node_id] = AgentRecord(
                    node_id=node_id, agent_url=agent_url, online=False
                )
                restored += 1
        if restored:
            logger.info("NodeStore: restored %d known node(s) from previous session", restored)
        return restored

    def register(self, node_id: str, agent_url: str) -> None:
        """Add a new agent, or update an existing agent's URL.

        Persists the (node_id, agent_url) pair to NodeStore when one is
        configured, so the node survives coordinator restarts.
        """
        if node_id not in self._agents:
            self._agents[node_id] = AgentRecord(node_id=node_id, agent_url=agent_url)
            logger.info("Registered agent node: %s @ %s", node_id, agent_url)
        else:
            if self._agents[node_id].agent_url != agent_url:
                self._agents[node_id].agent_url = agent_url
                # BUGFIX: format string was "Updated agent URL for %s%s", which
                # ran node_id and the URL together with no separator.
                logger.info("Updated agent URL for %s: %s", node_id, agent_url)
        if self._node_store is not None:
            self._node_store.upsert(node_id, agent_url)

    def get_node_info(self, node_id: str) -> NodeInfo | None:
        """Return a NodeInfo snapshot for *node_id*, or None if unknown."""
        record = self._agents.get(node_id)
        if record is None:
            return None
        return NodeInfo(
            node_id=record.node_id,
            agent_url=record.agent_url,
            gpus=record.gpus,
            last_heartbeat=record.last_seen,
        )

    def all_nodes(self) -> list[NodeInfo]:
        """Return NodeInfo snapshots for every known node (online or not)."""
        return [
            NodeInfo(
                node_id=r.node_id,
                agent_url=r.agent_url,
                gpus=r.gpus,
                last_heartbeat=r.last_seen,
            )
            for r in self._agents.values()
        ]

    def online_agents(self) -> "dict[str, AgentRecord]":
        """Return only currently-online agents, keyed by node_id."""
        return {nid: rec for nid, rec in self._agents.items() if rec.online}

    async def poll_agent(self, node_id: str) -> bool:
        """Poll one agent for GPU and resident-model state.

        On success: refreshes the record's GPU inventory and last_seen,
        marks it online, and syncs GPU capacity + residents into the
        LeaseManager. On any failure the node is marked offline.
        Returns True on a successful poll.
        """
        record = self._agents.get(node_id)
        if record is None:
            return False
        try:
            async with httpx.AsyncClient(timeout=_AGENT_TIMEOUT_S) as client:
                gpu_resp = await client.get(f"{record.agent_url}/gpu-info")
                gpu_resp.raise_for_status()
                # Resident-info is best-effort — older agents may not have the endpoint.
                try:
                    res_resp = await client.get(f"{record.agent_url}/resident-info")
                    resident_data = res_resp.json() if res_resp.is_success else {}
                except Exception:
                    resident_data = {}
            data = gpu_resp.json()
            gpus = [
                GpuInfo(
                    gpu_id=g["gpu_id"],
                    name=g["name"],
                    vram_total_mb=g["vram_total_mb"],
                    vram_used_mb=g["vram_used_mb"],
                    vram_free_mb=g["vram_free_mb"],
                )
                for g in data.get("gpus", [])
            ]
            record.gpus = gpus
            record.last_seen = time.time()
            record.online = True
            for gpu in gpus:
                self._lease_manager.register_gpu(node_id, gpu.gpu_id, gpu.vram_total_mb)
            residents = [
                (r["service"], r.get("model_name"))
                for r in resident_data.get("residents", [])
            ]
            self._lease_manager.set_residents_for_node(node_id, residents)
            return True
        except Exception as exc:
            logger.warning("Agent %s unreachable: %s", node_id, exc)
            record.online = False
            return False

    async def poll_all(self) -> None:
        """Poll every known agent concurrently."""
        await asyncio.gather(*[self.poll_agent(nid) for nid in self._agents])

    def _build_idle_stop_config(self) -> dict[str, int]:
        """Collect the effective idle-stop timeout per service.

        When a service appears in multiple profiles, the smallest positive
        idle_stop_after_s wins (most conservative stop).
        """
        if self._profile_registry is None:
            return {}
        config: dict[str, int] = {}
        for profile in self._profile_registry.list_public():
            for svc_name, svc in profile.services.items():
                if svc.idle_stop_after_s > 0:
                    existing = config.get(svc_name, 0)
                    config[svc_name] = min(existing, svc.idle_stop_after_s) if existing > 0 else svc.idle_stop_after_s
        return config

    async def _http_post(self, url: str) -> bool:
        """Fire a POST at *url*; True on 2xx, False on any error (logged)."""
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.post(url)
                return resp.is_success
        except Exception as exc:
            logger.warning("HTTP POST %s failed: %s", url, exc)
            return False

    async def _run_idle_sweep(self) -> None:
        """Expire stale allocations, then stop service instances idle past timeout."""
        if self._service_registry is None:
            return
        expired = self._service_registry.sweep_expired_allocations()
        if expired:
            logger.info("TTL sweep: expired %d allocation(s): %s", len(expired), expired)
        idle_stop_config = self._build_idle_stop_config()
        if not idle_stop_config:
            return
        timed_out = self._service_registry.idle_past_timeout(idle_stop_config)
        for instance in timed_out:
            node_info = self.get_node_info(instance.node_id)
            if node_info is None:
                continue
            stop_url = f"{node_info.agent_url}/services/{instance.service}/stop"
            logger.info(
                "Idle sweep: stopping %s on %s gpu%s (idle timeout)",
                instance.service, instance.node_id, instance.gpu_id,
            )
            success = await self._http_post(stop_url)
            if success:
                self._service_registry.mark_stopped(
                    instance.service, instance.node_id, instance.gpu_id
                )

    async def run_heartbeat_loop(self) -> None:
        """Poll all agents every _HEARTBEAT_INTERVAL_S; idle-sweep every 3rd tick."""
        self._running = True
        while self._running:
            await self.poll_all()
            self._heartbeat_tick += 1
            if self._heartbeat_tick % 3 == 0:
                await self._run_idle_sweep()
            await asyncio.sleep(_HEARTBEAT_INTERVAL_S)

    def stop(self) -> None:
        """Signal run_heartbeat_loop() to exit after its current iteration."""
        self._running = False

View file

@ -1,509 +0,0 @@
from __future__ import annotations
import logging
import time
import urllib.request
from contextlib import asynccontextmanager
from pathlib import Path
from typing import Any
logger = logging.getLogger(__name__)
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse
from pydantic import BaseModel
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
from circuitforge_core.resources.coordinator.eviction_engine import EvictionEngine
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.node_selector import select_node
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
from circuitforge_core.resources.profiles.schema import ProcessSpec
_DASHBOARD_HTML = (Path(__file__).parent / "dashboard.html").read_text()
def _get_health_path(profile_registry: ProfileRegistry, service: str) -> str:
    """Return the health_path for a service from the first matching profile spec.

    Scans public profiles in order and uses the first process-managed spec
    for *service*; falls back to "/health" when no profile defines one.
    """
    candidates = (
        profile.services.get(service)
        for profile in profile_registry.list_public()
    )
    for spec in candidates:
        if spec is None or not isinstance(spec.managed, ProcessSpec):
            continue
        return spec.managed.health_path
    return "/health"
_PROBE_INTERVAL_S = 5.0 # how often to poll starting instances
_PROBE_TIMEOUT_S = 300.0 # give up and mark stopped after this many seconds
async def _run_instance_probe_loop(service_registry: ServiceRegistry) -> None:
    """
    Background loop: transition 'starting' instances to 'running' once their
    /health endpoint responds, or to 'stopped' after PROBE_TIMEOUT_S.
    """
    import asyncio
    start_times: dict[str, float] = {}  # instance key → time first seen as starting
    while True:
        await asyncio.sleep(_PROBE_INTERVAL_S)
        now = time.time()
        for inst in service_registry.all_instances():
            if inst.state != "starting":
                # Instance left the 'starting' state on its own — forget its timer.
                start_times.pop(f"{inst.service}:{inst.node_id}:{inst.gpu_id}", None)
                continue
            key = f"{inst.service}:{inst.node_id}:{inst.gpu_id}"
            # Record when we first observed this instance as 'starting';
            # the timeout below is measured from that moment.
            start_times.setdefault(key, now)
            healthy = False
            if inst.url:
                try:
                    # Synchronous probe with a short timeout; a failed or slow
                    # health check simply leaves healthy=False for this tick.
                    with urllib.request.urlopen(
                        inst.url.rstrip("/") + inst.health_path, timeout=2.0
                    ) as resp:
                        healthy = resp.status == 200
                except Exception:
                    pass
            if healthy:
                service_registry.upsert_instance(
                    service=inst.service, node_id=inst.node_id, gpu_id=inst.gpu_id,
                    state="running", model=inst.model, url=inst.url,
                )
                start_times.pop(key, None)
                logger.info("Instance %s/%s gpu=%s transitioned to running", inst.service, inst.node_id, inst.gpu_id)
            elif now - start_times[key] > _PROBE_TIMEOUT_S:
                # Never became healthy within the budget — give up and mark stopped.
                service_registry.upsert_instance(
                    service=inst.service, node_id=inst.node_id, gpu_id=inst.gpu_id,
                    state="stopped", model=inst.model, url=inst.url,
                )
                start_times.pop(key, None)
                logger.warning("Instance %s/%s gpu=%s timed out in starting state — marked stopped", inst.service, inst.node_id, inst.gpu_id)
class LeaseRequest(BaseModel):
    """Body for POST /api/leases — request a VRAM lease on a specific node/GPU."""
    node_id: str  # target node; must already be registered
    gpu_id: int  # GPU index on that node
    mb: int  # VRAM to reserve, in megabytes
    service: str  # service name holding the lease
    priority: int = 2  # eviction priority (lower values evicted first — TODO confirm ordering against EvictionEngine)
    ttl_s: float = 0.0  # lease lifetime in seconds; 0 presumably means no expiry — verify against LeaseManager
class NodeRegisterRequest(BaseModel):
    """Body for POST /api/nodes — an agent self-registering with the coordinator."""
    node_id: str  # stable identifier chosen by the agent
    agent_url: str  # e.g. "http://10.1.10.71:7701"
class ServiceEnsureRequest(BaseModel):
    """Body for POST /api/services/{service}/ensure — start a service on a given node."""
    node_id: str  # explicit target node (no automatic placement)
    gpu_id: int = 0  # GPU index on the target node
    params: dict[str, str] = {}  # extra start parameters forwarded to the agent (pydantic copies mutable defaults per instance)
    ttl_s: float = 3600.0  # allocation lifetime in seconds
    # Ordered list of model names to try; falls back down the list if VRAM is tight.
    # The "model" key in params is used if this list is empty.
    model_candidates: list[str] = []
class ServiceAllocateRequest(BaseModel):
    """Body for POST /api/services/{service}/allocate — coordinator-placed allocation."""
    model_candidates: list[str] = []  # ordered models to try; must be non-empty
    gpu_id: int | None = None  # pin to a specific GPU; None lets the coordinator pick
    params: dict[str, str] = {}  # extra start parameters forwarded to the agent
    ttl_s: float = 3600.0  # allocation lifetime in seconds
    caller: str = ""  # identifier of the requesting product/service (for bookkeeping)
def create_coordinator_app(
    lease_manager: LeaseManager,
    profile_registry: ProfileRegistry,
    agent_supervisor: AgentSupervisor,
    service_registry: ServiceRegistry,
) -> FastAPI:
    """Assemble the coordinator FastAPI application.

    Wires node registration/inventory, VRAM leases (with eviction), and
    managed-service ensure/allocate/stop endpoints to the supplied
    subsystems. The agent heartbeat loop and the instance health-probe loop
    run as background tasks for the app's lifespan.
    """
    eviction_engine = EvictionEngine(lease_manager=lease_manager)
    @asynccontextmanager
    async def _lifespan(app: FastAPI):  # type: ignore[type-arg]
        import asyncio
        # Startup: launch the background loops. Shutdown (after yield):
        # flag the heartbeat loop to stop, then cancel both tasks.
        heartbeat_task = asyncio.create_task(agent_supervisor.run_heartbeat_loop())
        probe_task = asyncio.create_task(_run_instance_probe_loop(service_registry))
        yield
        agent_supervisor.stop()
        heartbeat_task.cancel()
        probe_task.cancel()
    app = FastAPI(title="cf-orch-coordinator", lifespan=_lifespan)
    # Optional Heimdall auth — enabled when HEIMDALL_URL env var is set.
    # Self-hosted coordinators skip this entirely; the CF-hosted public endpoint
    # (orch.circuitforge.tech) sets HEIMDALL_URL to gate paid+ access.
    from circuitforge_core.resources.coordinator.auth import HeimdallAuthMiddleware
    _auth = HeimdallAuthMiddleware.from_env()
    if _auth is not None:
        app.middleware("http")(_auth)
    @app.get("/", response_class=HTMLResponse, include_in_schema=False)
    def dashboard() -> HTMLResponse:
        # Static single-page dashboard; HTML was read from disk at import time.
        return HTMLResponse(content=_DASHBOARD_HTML)
    @app.get("/api/health")
    def health() -> dict[str, Any]:
        # Liveness probe — exempt from Heimdall auth.
        return {"status": "ok"}
    @app.get("/api/nodes")
    def get_nodes() -> dict[str, Any]:
        # Snapshot of every known node (online or not) with GPU inventory.
        nodes = agent_supervisor.all_nodes()
        return {
            "nodes": [
                {
                    "node_id": n.node_id,
                    "agent_url": n.agent_url,
                    "last_heartbeat": n.last_heartbeat,
                    "gpus": [
                        {
                            "gpu_id": g.gpu_id,
                            "name": g.name,
                            "vram_total_mb": g.vram_total_mb,
                            "vram_used_mb": g.vram_used_mb,
                            "vram_free_mb": g.vram_free_mb,
                        }
                        for g in n.gpus
                    ],
                }
                for n in nodes
            ]
        }
    @app.post("/api/nodes")
    async def register_node(req: NodeRegisterRequest) -> dict[str, Any]:
        """Agents call this to self-register. Coordinator immediately polls for GPU info."""
        agent_supervisor.register(req.node_id, req.agent_url)
        await agent_supervisor.poll_agent(req.node_id)
        return {"registered": True, "node_id": req.node_id}
    @app.get("/api/profiles")
    def get_profiles() -> dict[str, Any]:
        # Public hardware profiles known to the coordinator.
        return {
            "profiles": [
                {"name": p.name, "vram_total_mb": p.vram_total_mb}
                for p in profile_registry.list_public()
            ]
        }
    @app.get("/api/resident")
    def get_residents() -> dict[str, Any]:
        # Models currently warm (loaded) on some node, as reported by agents.
        return {
            "residents": [
                {
                    "service": r.service,
                    "node_id": r.node_id,
                    "model_name": r.model_name,
                    "first_seen": r.first_seen,
                }
                for r in lease_manager.all_residents()
            ]
        }
    @app.get("/api/leases")
    def get_leases() -> dict[str, Any]:
        # All currently-held VRAM leases.
        return {
            "leases": [
                {
                    "lease_id": lease.lease_id,
                    "node_id": lease.node_id,
                    "gpu_id": lease.gpu_id,
                    "mb_granted": lease.mb_granted,
                    "holder_service": lease.holder_service,
                    "priority": lease.priority,
                    "expires_at": lease.expires_at,
                }
                for lease in lease_manager.all_leases()
            ]
        }
    @app.post("/api/leases")
    async def request_lease(req: LeaseRequest) -> dict[str, Any]:
        # Grant a VRAM lease, evicting lower-priority holders if needed.
        # 422: unknown node. 503: not enough VRAM even after eviction.
        node_info = agent_supervisor.get_node_info(req.node_id)
        if node_info is None:
            raise HTTPException(
                status_code=422,
                detail=f"Unknown node_id {req.node_id!r} — node not registered",
            )
        agent_url = node_info.agent_url
        lease = await eviction_engine.request_lease(
            node_id=req.node_id,
            gpu_id=req.gpu_id,
            mb=req.mb,
            service=req.service,
            priority=req.priority,
            agent_url=agent_url,
            ttl_s=req.ttl_s,
        )
        if lease is None:
            raise HTTPException(
                status_code=503,
                detail="Insufficient VRAM — no eviction candidates available",
            )
        return {
            "lease": {
                "lease_id": lease.lease_id,
                "node_id": lease.node_id,
                "gpu_id": lease.gpu_id,
                "mb_granted": lease.mb_granted,
                "holder_service": lease.holder_service,
                "priority": lease.priority,
                "expires_at": lease.expires_at,
            }
        }
    @app.delete("/api/leases/{lease_id}")
    async def release_lease(lease_id: str) -> dict[str, Any]:
        # Release a held lease; 404 when the id is unknown.
        released = await lease_manager.release(lease_id)
        if not released:
            raise HTTPException(status_code=404, detail=f"Lease {lease_id!r} not found")
        return {"released": True, "lease_id": lease_id}
    @app.post("/api/services/{service}/ensure")
    async def ensure_service(service: str, req: ServiceEnsureRequest) -> dict[str, Any]:
        """
        Ensure a managed service is running on the given node.
        If model_candidates is provided, tries each model in order, skipping any
        that exceed the live free VRAM on the target GPU. Falls back down the list
        until one succeeds. The selected model is returned in the response.
        """
        import httpx
        node_info = agent_supervisor.get_node_info(req.node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Unknown node_id {req.node_id!r}")
        # Resolve candidate list — fall back to params["model"] if not specified.
        candidates: list[str] = req.model_candidates or (
            [req.params["model"]] if "model" in req.params else []
        )
        if not candidates:
            raise HTTPException(422, detail="No model specified: set params.model or model_candidates")
        # Live free VRAM on the target GPU (used for pre-flight filtering).
        gpu = next((g for g in node_info.gpus if g.gpu_id == req.gpu_id), None)
        free_mb = gpu.vram_free_mb if gpu else 0
        # Profile max_mb for the service gives us the VRAM ceiling for this slot.
        # NOTE(review): no per-model size filtering actually happens here — the
        # check below rejects the whole request when free VRAM is under the
        # service's profile ceiling, regardless of candidate model sizes.
        service_max_mb = 0
        for p in profile_registry.list_public():
            svc = p.services.get(service)
            if svc:
                service_max_mb = svc.max_mb
                break
        # Pre-flight VRAM gate — require free VRAM >= service ceiling
        # so the model can actually load without competing for VRAM with other processes.
        if service_max_mb > 0 and free_mb < service_max_mb:
            raise HTTPException(
                503,
                detail=f"Insufficient VRAM on gpu {req.gpu_id}: {free_mb}MB free, need {service_max_mb}MB",
            )
        last_error: str = ""
        async with httpx.AsyncClient(timeout=120.0) as client:
            for model in candidates:
                params_with_model = {**req.params, "model": model}
                try:
                    start_resp = await client.post(
                        f"{node_info.agent_url}/services/{service}/start",
                        json={"gpu_id": req.gpu_id, "params": params_with_model},
                    )
                    if start_resp.is_success:
                        data = start_resp.json()
                        return {
                            "service": service,
                            "node_id": req.node_id,
                            "gpu_id": req.gpu_id,
                            "model": model,
                            "url": data.get("url"),
                            "running": data.get("running", False),
                        }
                    # Agent responded but refused — remember why and try the next model.
                    last_error = start_resp.text
                except httpx.HTTPError as exc:
                    # Transport failure aborts the whole fallback chain: if the
                    # agent is unreachable, later candidates would fail the same way.
                    raise HTTPException(502, detail=f"Agent unreachable: {exc}")
        raise HTTPException(
            503,
            detail=f"All model candidates exhausted for {service!r}. Last error: {last_error}",
        )
    @app.post("/api/services/{service}/allocate")
    async def allocate_service(service: str, req: ServiceAllocateRequest) -> dict[str, Any]:
        """
        Allocate a managed service — the coordinator picks the best node automatically.
        Returns a URL + allocation_id. (Allocation not tracked server-side until Phase 2.)
        """
        import httpx
        if not req.model_candidates:
            raise HTTPException(422, detail="model_candidates must be non-empty")
        # Validate service is known in at least one profile, regardless of gpu_id
        if not any(service in p.services for p in profile_registry.list_public()):
            raise HTTPException(422, detail=f"Unknown service {service!r} — not in any profile")
        residents = lease_manager.resident_keys()
        if req.gpu_id is None:
            # Automatic placement across all online nodes.
            online = agent_supervisor.online_agents()
            placement = select_node(online, service, profile_registry, residents)
            if placement is None:
                raise HTTPException(
                    503,
                    detail=f"No online node has capacity for service {service!r}",
                )
            node_id, gpu_id = placement
        else:
            # Pinned GPU: find any online node exposing that gpu_id.
            online = agent_supervisor.online_agents()
            node_id = next(
                (nid for nid, rec in online.items()
                 if any(g.gpu_id == req.gpu_id for g in rec.gpus)),
                None,
            )
            if node_id is None:
                raise HTTPException(422, detail=f"No online node has gpu_id={req.gpu_id}")
            gpu_id = req.gpu_id
        node_info = agent_supervisor.get_node_info(node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Node {node_id!r} not found")
        # "Warm" = a resident model for this service already lives on the node,
        # so the agent's start call should be a cheap no-op.
        warm = f"{node_id}:{service}" in residents
        async with httpx.AsyncClient(timeout=120.0) as client:
            last_error = ""
            for model in req.model_candidates:
                try:
                    resp = await client.post(
                        f"{node_info.agent_url}/services/{service}/start",
                        json={"gpu_id": gpu_id, "params": {**req.params, "model": model}},
                    )
                    if resp.is_success:
                        data = resp.json()
                        svc_url = data.get("url", "")
                        alloc = service_registry.allocate(
                            service=service,
                            node_id=node_id,
                            gpu_id=gpu_id,
                            model=model,
                            caller=req.caller,
                            url=svc_url,
                            ttl_s=req.ttl_s,
                        )
                        # Seed the instance state for first-time starts.
                        # adopted=True means the agent found it already running.
                        adopted = data.get("adopted", False)
                        instance_state = "running" if (warm or adopted) else "starting"
                        health_path = _get_health_path(profile_registry, service)
                        service_registry.upsert_instance(
                            service=service,
                            node_id=node_id,
                            gpu_id=gpu_id,
                            state=instance_state,
                            model=model,
                            url=svc_url,
                            health_path=health_path,
                        )
                        return {
                            "allocation_id": alloc.allocation_id,
                            "service": service,
                            "node_id": node_id,
                            "gpu_id": gpu_id,
                            "model": model,
                            "url": data.get("url"),
                            "started": not warm,
                            "warm": warm,
                        }
                    # Agent refused this model — record the error and fall through.
                    last_error = resp.text
                except httpx.HTTPError as exc:
                    # Transport failure aborts the fallback chain (agent is down).
                    raise HTTPException(502, detail=f"Agent unreachable: {exc}")
        raise HTTPException(
            503,
            detail=f"All model candidates exhausted for {service!r}. Last error: {last_error}",
        )
    @app.delete("/api/services/{service}/allocations/{allocation_id}")
    async def release_allocation(service: str, allocation_id: str) -> dict[str, Any]:
        # Release one allocation; the {service} path segment must match the record.
        existing = service_registry.get_allocation(allocation_id)
        if existing is None or existing.service != service:
            raise HTTPException(404, detail=f"Allocation {allocation_id!r} not found for service {service!r}")
        released = service_registry.release(allocation_id)
        if not released:
            raise HTTPException(404, detail=f"Allocation {allocation_id!r} not found")
        return {"released": True, "allocation_id": allocation_id}
    @app.get("/api/services/{service}/status")
    def get_service_status(service: str) -> dict[str, Any]:
        # Instances (running processes) and allocations (client holds) for one service.
        instances = [i for i in service_registry.all_instances() if i.service == service]
        allocations = [a for a in service_registry.all_allocations() if a.service == service]
        return {
            "service": service,
            "instances": [
                {
                    "node_id": i.node_id,
                    "gpu_id": i.gpu_id,
                    "state": i.state,
                    "model": i.model,
                    "url": i.url,
                    "idle_since": i.idle_since,
                }
                for i in instances
            ],
            "allocations": [
                {
                    "allocation_id": a.allocation_id,
                    "node_id": a.node_id,
                    "gpu_id": a.gpu_id,
                    "model": a.model,
                    "caller": a.caller,
                    "url": a.url,
                    "expires_at": a.expires_at,
                }
                for a in allocations
            ],
        }
    @app.get("/api/services")
    def list_services() -> dict[str, Any]:
        # Flat list of every known service instance across all nodes.
        instances = service_registry.all_instances()
        return {
            "services": [
                {
                    "service": i.service,
                    "node_id": i.node_id,
                    "gpu_id": i.gpu_id,
                    "state": i.state,
                    "model": i.model,
                    "url": i.url,
                }
                for i in instances
            ]
        }
    @app.delete("/api/services/{service}")
    async def stop_service(service: str, node_id: str) -> dict[str, Any]:
        """Stop a managed service on the given node."""
        node_info = agent_supervisor.get_node_info(node_id)
        if node_info is None:
            raise HTTPException(422, detail=f"Unknown node_id {node_id!r}")
        import httpx
        async with httpx.AsyncClient(timeout=30.0) as client:
            try:
                resp = await client.post(f"{node_info.agent_url}/services/{service}/stop")
                resp.raise_for_status()
                return {"service": service, "node_id": node_id, "stopped": resp.json().get("stopped", False)}
            except httpx.HTTPError as exc:
                raise HTTPException(502, detail=f"Agent unreachable: {exc}")
    return app

View file

@ -1,197 +0,0 @@
"""
cf-orch coordinator auth middleware.
When HEIMDALL_URL is set, all /api/* requests (except /api/health) must carry:
Authorization: Bearer <CF license key>
The key is validated against Heimdall and the result cached for
CACHE_TTL_S seconds (default 300 / 5 min). This keeps Heimdall out of the
per-allocation hot path while keeping revocation latency bounded.
When HEIMDALL_URL is not set, auth is disabled — self-hosted deployments work
with no configuration change.
Environment variables
---------------------
HEIMDALL_URL Heimdall base URL, e.g. https://license.circuitforge.tech
When absent, auth is skipped entirely.
HEIMDALL_MIN_TIER Minimum tier required (default: "paid").
Accepted values: free, paid, premium, ultra.
CF_ORCH_AUTH_SECRET Shared secret sent to Heimdall so it can distinguish
coordinator service calls from end-user requests.
Must match the COORDINATOR_SECRET env var on Heimdall.
"""
from __future__ import annotations
import logging
import os
import time
from dataclasses import dataclass, field
from threading import Lock
import httpx
from fastapi import Request
from fastapi.responses import JSONResponse
logger = logging.getLogger(__name__)
# Unauthenticated paths — health check must always be accessible for monitoring.
_EXEMPT_PATHS: frozenset[str] = frozenset({"/api/health", "/", "/openapi.json", "/docs", "/redoc"})
_TIER_ORDER: dict[str, int] = {"free": 0, "paid": 1, "premium": 2, "ultra": 3}
CACHE_TTL_S: float = 300.0 # 5 minutes — matches Kiwi cloud session TTL
@dataclass
class _CacheEntry:
    """One cached Heimdall validation result."""
    valid: bool  # whether the license key validated successfully
    tier: str  # tier reported by Heimdall ("" when invalid/unreachable)
    user_id: str  # user id reported by Heimdall ("" when invalid/unreachable)
    expires_at: float  # time.monotonic() deadline after which the entry is stale
class _ValidationCache:
    """Thread-safe TTL cache for Heimdall validation results."""

    def __init__(self, ttl_s: float = CACHE_TTL_S) -> None:
        self._ttl = ttl_s
        self._store: dict[str, _CacheEntry] = {}
        self._lock = Lock()

    def get(self, key: str) -> _CacheEntry | None:
        """Return the live entry for *key*, or None if absent or expired."""
        with self._lock:
            hit = self._store.get(key)
        if hit is not None and time.monotonic() <= hit.expires_at:
            return hit
        return None

    def set(self, key: str, valid: bool, tier: str, user_id: str) -> None:
        """Store a validation result, stamped to expire TTL seconds from now."""
        fresh = _CacheEntry(
            valid=valid,
            tier=tier,
            user_id=user_id,
            expires_at=time.monotonic() + self._ttl,
        )
        with self._lock:
            self._store[key] = fresh

    def evict(self, key: str) -> None:
        """Drop *key* from the cache; no-op when absent."""
        with self._lock:
            self._store.pop(key, None)

    def prune(self) -> int:
        """Remove expired entries. Returns count removed."""
        cutoff = time.monotonic()
        with self._lock:
            stale = [k for k, entry in self._store.items() if cutoff > entry.expires_at]
            for k in stale:
                self._store.pop(k, None)
        return len(stale)
class HeimdallAuthMiddleware:
    """
    ASGI middleware that validates CF license keys against Heimdall.
    Attach to a FastAPI app via app.middleware("http"):
        middleware = HeimdallAuthMiddleware.from_env()
        if middleware:
            app.middleware("http")(middleware)
    """
    def __init__(
        self,
        heimdall_url: str,
        min_tier: str = "paid",
        auth_secret: str = "",
        cache_ttl_s: float = CACHE_TTL_S,
    ) -> None:
        self._heimdall = heimdall_url.rstrip("/")
        # Unknown tier names rank as 1 ("paid") — the conservative default.
        self._min_tier_rank = _TIER_ORDER.get(min_tier, 1)
        self._min_tier = min_tier
        self._auth_secret = auth_secret
        self._cache = _ValidationCache(ttl_s=cache_ttl_s)
        logger.info(
            "[cf-orch auth] Heimdall auth enabled — url=%s min_tier=%s ttl=%ss",
            self._heimdall, min_tier, cache_ttl_s,
        )
    @classmethod
    def from_env(cls) -> "HeimdallAuthMiddleware | None":
        """Return a configured middleware instance, or None if HEIMDALL_URL is not set."""
        url = os.environ.get("HEIMDALL_URL", "")
        if not url:
            logger.info("[cf-orch auth] HEIMDALL_URL not set — auth disabled (self-hosted mode)")
            return None
        return cls(
            heimdall_url=url,
            min_tier=os.environ.get("HEIMDALL_MIN_TIER", "paid"),
            auth_secret=os.environ.get("CF_ORCH_AUTH_SECRET", ""),
        )
    def _validate_against_heimdall(self, license_key: str) -> tuple[bool, str, str]:
        """
        Call Heimdall's /licenses/verify endpoint.
        Returns (valid, tier, user_id).
        On any network or parse error, returns (False, "", "") — fail closed.
        """
        try:
            headers: dict[str, str] = {"Content-Type": "application/json"}
            if self._auth_secret:
                # Lets Heimdall distinguish coordinator calls from end users.
                headers["X-Coordinator-Secret"] = self._auth_secret
            resp = httpx.post(
                f"{self._heimdall}/licenses/verify",
                json={"key": license_key, "min_tier": self._min_tier},
                headers=headers,
                timeout=5.0,
            )
            if resp.status_code == 200:
                data = resp.json()
                return data.get("valid", False), data.get("tier", ""), data.get("user_id", "")
            # 401/403 from Heimdall = key invalid/insufficient tier
            logger.debug("[cf-orch auth] Heimdall returned %s for key ...%s", resp.status_code, license_key[-6:])
            return False, "", ""
        except Exception as exc:
            logger.warning("[cf-orch auth] Heimdall unreachable — failing closed: %s", exc)
            return False, "", ""
    def _check_key(self, license_key: str) -> tuple[bool, str]:
        """
        Validate key (cache-first). Returns (authorized, reason_if_denied).
        """
        cached = self._cache.get(license_key)
        if cached is not None:
            if not cached.valid:
                return False, "license key invalid or expired"
            if _TIER_ORDER.get(cached.tier, -1) < self._min_tier_rank:
                return False, f"feature requires {self._min_tier} tier (have: {cached.tier})"
            return True, ""
        # Cache miss: hit Heimdall, then cache the outcome (valid or not) so
        # repeated bad keys don't hammer the license server either.
        valid, tier, user_id = self._validate_against_heimdall(license_key)
        self._cache.set(license_key, valid=valid, tier=tier, user_id=user_id)
        if not valid:
            return False, "license key invalid or expired"
        if _TIER_ORDER.get(tier, -1) < self._min_tier_rank:
            return False, f"feature requires {self._min_tier} tier (have: {tier})"
        return True, ""
    async def __call__(self, request: Request, call_next):  # type: ignore[no-untyped-def]
        # Health/docs endpoints stay open so monitoring never needs a key.
        if request.url.path in _EXEMPT_PATHS:
            return await call_next(request)
        auth_header = request.headers.get("Authorization", "")
        if not auth_header.startswith("Bearer "):
            return JSONResponse(
                status_code=401,
                content={"detail": "Authorization: Bearer <license_key> required"},
            )
        license_key = auth_header.removeprefix("Bearer ").strip()
        authorized, reason = self._check_key(license_key)
        if not authorized:
            return JSONResponse(status_code=403, content={"detail": reason})
        return await call_next(request)

View file

@ -1,473 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>cf-orch · dashboard</title>
<style>
*, *::before, *::after { box-sizing: border-box; margin: 0; padding: 0; }
:root {
--bg: #0d1117;
--bg2: #161b22;
--bg3: #1c2129;
--border: #30363d;
--border-dim: #21262d;
--text: #e6edf3;
--muted: #8b949e;
--dim: #4d5763;
--indigo: #818cf8;
--cyan: #22d3ee;
--green: #4ade80;
--amber: #fbbf24;
--red: #f85149;
--orange: #fb923c;
--radius: 6px;
--radius-sm: 3px;
--font: 'JetBrains Mono', 'Fira Code', ui-monospace, monospace;
}
body { background: var(--bg); color: var(--text); font-family: var(--font); font-size: 13px; line-height: 1.5; padding: 1rem; }
/* header */
header { display: flex; align-items: center; gap: 1rem; margin-bottom: 1rem; padding-bottom: 0.75rem; border-bottom: 1px solid var(--border); }
.logo { color: var(--indigo); font-size: 1.1em; font-weight: 700; }
#refresh-badge { margin-left: auto; font-size: 0.75em; color: var(--dim); }
#refresh-badge span { color: var(--green); }
/* section labels */
.section-label { font-size: 0.72em; font-weight: 600; text-transform: uppercase; letter-spacing: 0.07em; color: var(--dim); margin-bottom: 0.5rem; }
/* health strip */
#health-strip { display: flex; flex-wrap: wrap; gap: 0.4rem; margin-bottom: 1rem; padding: 0.6rem 0.75rem; background: var(--bg2); border: 1px solid var(--border); border-radius: var(--radius); min-height: 36px; }
.pill { display: inline-flex; align-items: center; gap: 0.3rem; padding: 2px 10px; border-radius: 99px; font-size: 0.8em; font-weight: 600; }
.pill.ok { background: rgba(74,222,128,.12); color: var(--green); }
.pill.err { background: rgba(248,81,73,.12); color: var(--red); }
.pill.off { background: rgba(139,148,158,.1); color: var(--dim); }
/* GPU grid */
#gpu-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(180px, 1fr)); gap: 0.6rem; margin-bottom: 1rem; }
.gpu-card { background: var(--bg3); border: 1px solid var(--border); border-radius: var(--radius); padding: 0.7rem 0.8rem; }
.gpu-card.offline { border-color: #7c2d12; opacity: 0.7; }
.gpu-node { font-size: 0.75em; font-weight: 700; color: var(--indigo); margin-bottom: 1px; }
.gpu-offline .gpu-node { color: var(--orange); }
.gpu-name { font-size: 0.78em; color: var(--text); margin-bottom: 0.4rem; }
.vram-track { position: relative; background: var(--bg); border-radius: var(--radius-sm); height: 6px; margin-bottom: 0.3rem; overflow: hidden; }
.vram-leased { position: absolute; left: 0; top: 0; height: 100%; background: var(--cyan); transition: width 0.4s; }
.vram-resident { position: absolute; top: 0; height: 100%; background: var(--amber); transition: left 0.4s, width 0.4s; }
.vram-label { font-size: 0.72em; color: var(--muted); margin-bottom: 0.25rem; }
.gpu-status { font-size: 0.72em; }
.gpu-status.idle { color: var(--green); }
.gpu-status.busy { color: var(--amber); }
.gpu-status.full { color: var(--red); }
.gpu-status.offline { color: var(--orange); }
.spark-track { height: 24px; background: var(--bg); border-radius: var(--radius-sm); margin-top: 0.4rem; overflow: hidden; }
/* shared table base */
.cf-table { width: 100%; border-collapse: collapse; background: var(--bg2); border: 1px solid var(--border); border-radius: var(--radius); overflow: hidden; margin-bottom: 1rem; }
.cf-table th { background: var(--bg3); color: var(--dim); font-size: 0.72em; font-weight: 600; text-transform: uppercase; letter-spacing: 0.05em; padding: 0.4rem 0.6rem; text-align: left; border-bottom: 1px solid var(--border); }
.cf-table td { padding: 0.35rem 0.6rem; border-bottom: 1px solid var(--border-dim); font-size: 0.8em; vertical-align: middle; }
.cf-table tr:last-child td { border-bottom: none; }
.td-service { color: var(--indigo); font-weight: 600; }
.td-node { color: var(--muted); }
.td-mb { color: var(--text); }
.td-priority { color: var(--amber); }
.td-model { color: var(--cyan); font-size: 0.75em; }
.td-warm { color: var(--amber); }
.td-none { color: var(--dim); font-style: italic; }
.ttl-wrap { display: flex; align-items: center; gap: 0.5rem; }
.ttl-label { color: var(--cyan); font-variant-numeric: tabular-nums; white-space: nowrap; }
.ttl-track { flex: 1; background: var(--bg); border-radius: var(--radius-sm); height: 4px; }
.ttl-fill { height: 100%; border-radius: var(--radius-sm); background: var(--cyan); transition: width 0.4s; }
/* service state classes */
.state-running { color: #2ecc40; }
.state-idle { color: #ff851b; }
.state-stopped { color: #aaa; }
.state-starting { color: #0074d9; }
.state-unknown { color: #ff4136; }
/* error */
#error-banner { display: none; background: rgba(248,81,73,.1); border: 1px solid var(--red); border-radius: var(--radius); color: var(--red); padding: 0.5rem 0.75rem; font-size: 0.82em; margin-bottom: 1rem; }
/* footer */
footer { border-top: 1px solid var(--border); padding-top: 0.5rem; color: var(--dim); font-size: 0.72em; display: flex; gap: 1.5rem; }
footer a { color: var(--indigo); text-decoration: none; }
footer a:hover { text-decoration: underline; }
</style>
</head>
<body>
<header>
<span class="logo">cf-orch</span>
<span id="cluster-label" style="color:var(--muted)">coordinator</span>
<div id="refresh-badge">auto-refresh <span id="countdown">5</span>s</div>
</header>
<div id="error-banner"></div>
<div class="section-label">Services</div>
<div id="health-strip"></div>
<div class="section-label">GPU Nodes</div>
<div id="gpu-grid"></div>
<div id="services-section">
<div class="section-label">Service Instances</div>
<table class="cf-table" id="services-table">
<thead>
<tr>
<th>Service</th><th>Node</th><th>GPU</th><th>State</th><th>Model</th><th>URL</th>
</tr>
</thead>
<tbody id="services-body"></tbody>
</table>
</div>
<div class="section-label">Active Leases</div>
<table class="cf-table" id="leases-table">
<thead>
<tr>
<th>Service</th><th>Node / GPU</th><th>VRAM</th><th>Priority</th><th>TTL / Expires</th>
</tr>
</thead>
<tbody id="leases-body"></tbody>
</table>
<div class="section-label">Warm Models</div>
<table class="cf-table" id="resident-table">
<thead>
<tr>
<th>Service</th><th>Node</th><th>Model</th><th>Warm Since</th>
</tr>
</thead>
<tbody id="resident-body"></tbody>
</table>
<footer>
<span>cf-orch · circuitforge-core</span>
<a href="/api/nodes" target="_blank">/api/nodes</a>
<a href="/api/leases" target="_blank">/api/leases</a>
<a href="/api/resident" target="_blank">/api/resident</a>
<a href="/api/services" target="_blank">/api/services</a>
<a href="/api/health" target="_blank">/api/health</a>
</footer>
<script>
"use strict";
// ── helpers ──────────────────────────────────────────────────────
/** Create a DOM element; opts may supply cls (space-separated), text, style, attr. */
function el(tag, opts) {
  const node = document.createElement(tag);
  const o = opts || {};
  if (o.cls) {
    for (const name of o.cls.split(' ')) {
      if (name) node.classList.add(name);
    }
  }
  if (o.text != null) node.textContent = o.text;
  if (o.style) Object.assign(node.style, o.style);
  if (o.attr) {
    for (const [key, value] of Object.entries(o.attr)) node.setAttribute(key, value);
  }
  return node;
}
/** Append children to a parent element. Returns parent. */
/** Append each truthy child to parent; returns parent for chaining. */
function append(parent, ...children) {
  for (const child of children) {
    if (child) parent.appendChild(child);
  }
  return parent;
}
/** Replace all children of a DOM node. */
/** Remove every existing child of parent, then append the given ones. */
function setChildren(parent, ...children) {
  let head;
  while ((head = parent.firstChild)) parent.removeChild(head);
  append(parent, ...children);
}
/** Build a sparkline SVG element (no innerHTML). */
/** Render VRAM history as a small polyline SVG; flat baseline when <2 samples. */
function buildSparkline(history, totalMb) {
  const SVG_NS = 'http://www.w3.org/2000/svg';
  const svg = document.createElementNS(SVG_NS, 'svg');
  svg.setAttribute('width', '100%');
  svg.setAttribute('height', '16');
  svg.setAttribute('viewBox', '0 0 100 16');
  if (!history || history.length < 2) {
    // Not enough samples yet: draw a flat placeholder baseline.
    const baseline = document.createElementNS(SVG_NS, 'line');
    const attrs = { x1: '0', y1: '14', x2: '100', y2: '14', stroke: '#30363d', 'stroke-width': '1' };
    for (const [key, value] of Object.entries(attrs)) baseline.setAttribute(key, value);
    svg.appendChild(baseline);
    return svg;
  }
  // Scale x across the 100-unit viewBox, y into the 2..14 band (inverted).
  const ceiling = Math.max(totalMb, 1);
  const points = history
    .map((sample, idx) => {
      const x = (idx / (history.length - 1)) * 100;
      const y = 14 - ((sample / ceiling) * 12);
      return x.toFixed(1) + ',' + y.toFixed(1);
    })
    .join(' ');
  const trace = document.createElementNS(SVG_NS, 'polyline');
  trace.setAttribute('points', points);
  trace.setAttribute('fill', 'none');
  trace.setAttribute('stroke', '#818cf8');
  trace.setAttribute('stroke-width', '1.5');
  trace.setAttribute('stroke-linejoin', 'round');
  svg.appendChild(trace);
  return svg;
}
/** VRAM fill colour based on utilisation fraction. */
/** Map a VRAM utilisation fraction to a traffic-light colour (red/amber/cyan). */
function vramColor(pct) {
  return pct >= 0.9 ? '#f85149'
       : pct >= 0.7 ? '#fbbf24'
       : '#22d3ee';
}
// ── sparkline history ────────────────────────────────────────────
// Per-GPU rolling window of VRAM samples, fed by renderNodes().
// keyed "nodeId:gpuId" → array of vram_used_mb, max 20 samples
const sparkHistory = {};
// ── countdown ────────────────────────────────────────────────────
// Cosmetic 5→1 ticker beside the auto-refresh badge; the actual data
// refresh is the separate setInterval(poll, 5000) at the end of the script.
let countdown = 5;
setInterval(() => {
countdown = countdown <= 1 ? 5 : countdown - 1;
document.getElementById('countdown').textContent = countdown;
}, 1000);
// ── state class helper ───────────────────────────────────────────
/** CSS class for a service-instance state; anything unrecognised is 'state-unknown'. */
function stateClass(state) {
  switch (state) {
    case 'running': return 'state-running';
    case 'idle': return 'state-idle';
    case 'stopped': return 'state-stopped';
    case 'starting': return 'state-starting';
    default: return 'state-unknown';
  }
}
// ── render: services table ───────────────────────────────────────
/** Fill the service-instances table; shows a single placeholder row when empty. */
function renderServices(services) {
  const tbody = document.getElementById('services-body');
  if (!services || services.length === 0) {
    const placeholder = document.createElement('tr');
    const cell = el('td', { cls: 'td-none', text: 'No service instances registered.' });
    cell.setAttribute('colspan', '6');
    placeholder.appendChild(cell);
    setChildren(tbody, placeholder);
    return;
  }
  const rows = [];
  for (const svc of services) {
    const row = document.createElement('tr');
    append(row,
      el('td', { cls: 'td-service', text: svc.service }),
      el('td', { cls: 'td-node', text: svc.node_id }),
      el('td', { cls: 'td-mb', text: String(svc.gpu_id) }),
      el('td', { cls: stateClass(svc.state), text: svc.state }),
      el('td', { cls: 'td-model', text: svc.model || '\u2014' }),
      el('td', { cls: 'td-node', text: svc.url || '\u2014' }),
    );
    rows.push(row);
  }
  setChildren(tbody, ...rows);
}
// ── render: health strip ─────────────────────────────────────────
/** Show a single ok/err pill for coordinator reachability. */
function renderHealth(ok) {
  const cls = ok ? 'pill ok' : 'pill err';
  const mark = ok ? '● ' : '✕ ';
  const strip = document.getElementById('health-strip');
  setChildren(strip, el('span', { cls: cls, text: mark + 'coordinator' }));
}
// ── render: GPU grid ─────────────────────────────────────────────
// leasedByGpu: "nodeId:gpuId" → total MB currently leased (from active leases)
/**
 * Render one card per GPU across all nodes.
 * leasedByGpu: "nodeId:gpuId" → total MB currently leased (from active leases).
 * Also pushes each GPU's used-MB sample into sparkHistory (capped at 20).
 */
function renderNodes(nodes, leasedByGpu) {
  const grid = document.getElementById('gpu-grid');
  if (!nodes || nodes.length === 0) {
    setChildren(grid, el('div', { text: 'No nodes registered.', style: { color: 'var(--dim)', fontSize: '0.8em', padding: '0.5rem' } }));
    return;
  }
  const cards = [];
  for (const node of nodes) {
    for (const gpu of node.gpus) {
      const key = node.node_id + ':' + gpu.gpu_id;
      const total = gpu.vram_total_mb || 1;
      const used = gpu.vram_used_mb;
      const leased = leasedByGpu[key] || 0;
      // Resident = nvidia-smi used minus actively leased; clamped to [0, used].
      const resident = Math.max(0, Math.min(used - leased, used));
      const pct = used / total;

      // Rolling per-GPU history feeds the sparkline (max 20 samples).
      const history = sparkHistory[key] || (sparkHistory[key] = []);
      history.push(used);
      if (history.length > 20) history.shift();

      let statusCls = 'idle';
      let statusText = 'idle';
      if (pct >= 0.9) {
        statusCls = 'full';
        statusText = 'saturated';
      } else if (pct >= 0.1) {
        statusCls = 'busy';
        statusText = Math.round(pct * 100) + '% used';
      }

      // Stacked bar: cyan (leased) → amber (resident) → dark bg (free).
      const leasedPct = (leased / total * 100).toFixed(1);
      const residentPct = (resident / total * 100).toFixed(1);
      const bar = el('div', { cls: 'vram-track' });
      append(bar,
        el('div', { cls: 'vram-leased', style: { width: leasedPct + '%' } }),
        el('div', { cls: 'vram-resident', style: { left: leasedPct + '%', width: residentPct + '%' } }),
      );

      // Breakdown suffix only when something is allocated.
      let label = (used / 1024).toFixed(1) + ' / ' + (total / 1024).toFixed(1) + ' GB';
      const breakdown = [];
      if (leased > 0) breakdown.push((leased / 1024).toFixed(1) + 'G leased');
      if (resident > 0) breakdown.push((resident / 1024).toFixed(1) + 'G resident');
      if (breakdown.length) label += ' (' + breakdown.join(' · ') + ')';

      const spark = el('div', { cls: 'spark-track' });
      spark.appendChild(buildSparkline(history, total));

      const card = el('div', { cls: 'gpu-card' });
      append(card,
        el('div', { cls: 'gpu-node', text: node.node_id.toUpperCase() + ' · GPU ' + gpu.gpu_id }),
        el('div', { cls: 'gpu-name', text: gpu.name || 'Unknown GPU' }),
        bar,
        el('div', { cls: 'vram-label', text: label }),
        el('div', { cls: 'gpu-status ' + statusCls, text: statusText }),
        spark,
      );
      cards.push(card);
    }
  }
  setChildren(grid, ...cards);
}
// ── render: warm models table ────────────────────────────────────
/** Fill the warm-models table: service, node, model and how long it has been warm. */
function renderResidents(residents) {
  const tbody = document.getElementById('resident-body');
  if (!residents || residents.length === 0) {
    const placeholder = document.createElement('tr');
    const cell = el('td', { cls: 'td-none', text: 'No warm models detected.' });
    cell.setAttribute('colspan', '4');
    placeholder.appendChild(cell);
    setChildren(tbody, placeholder);
    return;
  }
  const nowSecs = Date.now() / 1000;
  const rows = residents.map(r => {
    const warmSecs = nowSecs - (r.first_seen || nowSecs);
    // Format as "Ns", "Nm SSs" or "Nh MMm" depending on magnitude.
    let warmText;
    if (warmSecs < 60) {
      warmText = Math.floor(warmSecs) + 's';
    } else if (warmSecs < 3600) {
      warmText = Math.floor(warmSecs / 60) + 'm ' + String(Math.floor(warmSecs % 60)).padStart(2, '0') + 's';
    } else {
      warmText = Math.floor(warmSecs / 3600) + 'h ' + String(Math.floor((warmSecs % 3600) / 60)).padStart(2, '0') + 'm';
    }
    const row = document.createElement('tr');
    append(row,
      el('td', { cls: 'td-service', text: r.service }),
      el('td', { cls: 'td-node', text: r.node_id }),
      el('td', { cls: 'td-model', text: r.model_name || '—' }),
      el('td', { cls: 'td-warm', text: warmText }),
    );
    return row;
  });
  setChildren(tbody, ...rows);
}
// ── render: leases table ─────────────────────────────────────────
/** Fill the active-leases table, including a TTL countdown label + bar per lease. */
function renderLeases(leases) {
  const tbody = document.getElementById('leases-body');
  if (!leases || leases.length === 0) {
    const placeholder = document.createElement('tr');
    const cell = el('td', { cls: 'td-none', text: 'No active leases.' });
    cell.setAttribute('colspan', '5');
    placeholder.appendChild(cell);
    setChildren(tbody, placeholder);
    return;
  }
  const nowSecs = Date.now() / 1000;
  const rows = leases.map(lease => {
    const sizeText = lease.mb_granted >= 1024
      ? (lease.mb_granted / 1024).toFixed(1) + ' GB'
      : lease.mb_granted + ' MB';
    const ttlCell = document.createElement('td');
    if (!lease.expires_at) {
      // No expiry: infinity glyph instead of a countdown.
      ttlCell.appendChild(el('span', { cls: 'ttl-label', text: '∞' }));
    } else {
      const remaining = Math.max(0, lease.expires_at - nowSecs);
      // Bar is scaled against a 5-minute window, capped at 100%.
      const fillPct = Math.min(100, (remaining / 300) * 100);
      let label;
      if (remaining > 60) {
        const mins = Math.floor(remaining / 60);
        const secs = Math.floor(remaining % 60);
        label = mins + 'm ' + String(secs).padStart(2, '0') + 's';
      } else {
        label = Math.floor(remaining) + 's';
      }
      const wrap = el('div', { cls: 'ttl-wrap' });
      const track = el('div', { cls: 'ttl-track' });
      track.appendChild(el('div', { cls: 'ttl-fill', style: { width: fillPct.toFixed(1) + '%' } }));
      append(wrap, el('span', { cls: 'ttl-label', text: label }), track);
      ttlCell.appendChild(wrap);
    }
    const row = document.createElement('tr');
    append(row,
      el('td', { cls: 'td-service', text: lease.holder_service }),
      el('td', { cls: 'td-node', text: lease.node_id + ' / GPU ' + lease.gpu_id }),
      el('td', { cls: 'td-mb', text: sizeText }),
      el('td', { cls: 'td-priority', text: 'p' + lease.priority }),
      ttlCell,
    );
    return row;
  });
  setChildren(tbody, ...rows);
}
// ── error banner ─────────────────────────────────────────────────
/** Show msg in the error banner. Uses textContent so msg is never parsed as HTML. */
function showError(msg) {
  // Renamed from `el`: the old local shadowed the global el() element factory.
  const banner = document.getElementById('error-banner');
  banner.textContent = msg; // textContent — safe
  banner.style.display = 'block';
}
/** Hide the error banner. */
function clearError() {
  document.getElementById('error-banner').style.display = 'none';
}
// ── poll ─────────────────────────────────────────────────────────
/** Fetch all coordinator endpoints and re-render every dashboard section. */
async function poll() {
  try {
    const [nodesRes, leasesRes, residentRes, healthRes, servicesRes] = await Promise.all([
      fetch('/api/nodes'),
      fetch('/api/leases'),
      fetch('/api/resident'),
      fetch('/api/health'),
      fetch('/api/services'),
    ]);
    // Nodes and leases are required. Report the status of whichever actually
    // failed (previously nodesRes.status was reported even when only the
    // leases request had failed).
    if (!nodesRes.ok || !leasesRes.ok) {
      const failed = !nodesRes.ok ? nodesRes : leasesRes;
      throw new Error('API error: ' + failed.status);
    }
    const [nodesData, leasesData, residentData, servicesData] = await Promise.all([
      nodesRes.json(), leasesRes.json(),
      // resident/services are optional — degrade to empty lists if unavailable.
      residentRes.ok ? residentRes.json() : Promise.resolve({ residents: [] }),
      servicesRes.ok ? servicesRes.json() : Promise.resolve({ services: [] }),
    ]);
    // Build per-GPU leased-MB index for the stacked bar.
    const leasedByGpu = {};
    for (const lease of (leasesData.leases || [])) {
      const key = lease.node_id + ':' + lease.gpu_id;
      leasedByGpu[key] = (leasedByGpu[key] || 0) + lease.mb_granted;
    }
    clearError();
    renderHealth(healthRes.ok);
    renderNodes(nodesData.nodes || [], leasedByGpu);
    renderServices(servicesData.services || []);
    renderLeases(leasesData.leases || []);
    renderResidents(residentData.residents || []);
  } catch (err) {
    showError('Failed to reach coordinator: ' + err.message);
    renderHealth(false);
  }
}
poll();
setInterval(poll, 5000);
</script>
</body>
</html>

View file

@ -1,81 +0,0 @@
from __future__ import annotations
import asyncio
import logging
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.models import VRAMLease
logger = logging.getLogger(__name__)

_DEFAULT_EVICTION_TIMEOUT_S = 10.0


class EvictionEngine:
    """Grants VRAM leases, evicting lower-priority holders when a GPU is full."""

    def __init__(
        self,
        lease_manager: LeaseManager,
        eviction_timeout_s: float = _DEFAULT_EVICTION_TIMEOUT_S,
    ) -> None:
        self.lease_manager = lease_manager
        self._timeout = eviction_timeout_s

    async def request_lease(
        self,
        node_id: str,
        gpu_id: int,
        mb: int,
        service: str,
        priority: int,
        agent_url: str,
        ttl_s: float = 0.0,
    ) -> VRAMLease | None:
        """Try to grant `mb` on node/gpu, evicting weaker leases if needed.

        Returns the granted lease, or None when no candidates exist or the
        freed VRAM does not show up within the eviction timeout.
        """
        # Fast path: the GPU already has enough free VRAM.
        granted = await self.lease_manager.try_grant(
            node_id, gpu_id, mb, service, priority, ttl_s
        )
        if granted is not None:
            return granted

        victims = self.lease_manager.get_eviction_candidates(
            node_id=node_id, gpu_id=gpu_id,
            needed_mb=mb, requester_priority=priority,
        )
        if not victims:
            logger.info(
                "No eviction candidates for %s on %s:GPU%d (%dMB needed)",
                service, node_id, gpu_id, mb,
            )
            return None

        logger.info(
            "Evicting %d lease(s) to free %dMB for %s",
            len(victims), sum(v.mb_granted for v in victims), service,
        )
        for victim in victims:
            await self._evict_lease(victim, agent_url)

        # Poll until the freed VRAM appears in accounting, or give up.
        loop = asyncio.get_running_loop()
        deadline = loop.time() + self._timeout
        while loop.time() < deadline:
            granted = await self.lease_manager.try_grant(
                node_id, gpu_id, mb, service, priority, ttl_s
            )
            if granted is not None:
                return granted
            await asyncio.sleep(0.1)
        logger.warning("Eviction timed out for %s after %.1fs", service, self._timeout)
        return None

    async def _evict_lease(self, lease: VRAMLease, agent_url: str) -> None:
        """Release lease accounting. Process-level eviction deferred to Plan B."""
        await self.lease_manager.release(lease.lease_id)

    async def _call_agent_evict(self, agent_url: str, lease: VRAMLease) -> bool:
        """POST /evict to the agent. Stub for v1 — real process lookup in Plan B."""
        return True

View file

@ -1,130 +0,0 @@
from __future__ import annotations
import asyncio
from collections import defaultdict
from circuitforge_core.resources.models import ResidentAllocation, VRAMLease
class LeaseManager:
    """In-memory VRAM lease accounting for the coordinator.

    Tracks per-(node_id, gpu_id) capacity and usage, the set of active
    leases, and a heartbeat-fed snapshot of warm ("resident") models.
    Grant/release are serialized through one asyncio.Lock so used-MB
    bookkeeping cannot race within the event loop.
    """

    def __init__(self) -> None:
        # lease_id → lease record.
        self._leases: dict[str, VRAMLease] = {}
        # (node_id, gpu_id) → total VRAM capacity in MB.
        self._gpu_total: dict[tuple[str, int], int] = {}
        # (node_id, gpu_id) → MB currently granted out in leases.
        self._gpu_used: dict[tuple[str, int], int] = defaultdict(int)
        self._lock = asyncio.Lock()
        # Resident allocations — keyed "node_id:service", updated by heartbeat.
        # No lock needed: only the single heartbeat task writes this dict.
        self._residents: dict[str, ResidentAllocation] = {}

    def register_gpu(self, node_id: str, gpu_id: int, total_mb: int) -> None:
        """Record (or overwrite) the total VRAM capacity of one GPU."""
        self._gpu_total[(node_id, gpu_id)] = total_mb

    def gpu_total_mb(self, node_id: str, gpu_id: int) -> int:
        """Total registered VRAM for a GPU; 0 if it was never registered."""
        return self._gpu_total.get((node_id, gpu_id), 0)

    def used_mb(self, node_id: str, gpu_id: int) -> int:
        """MB currently held by active leases on a GPU (0 if none)."""
        return self._gpu_used[(node_id, gpu_id)]

    async def try_grant(
        self,
        node_id: str,
        gpu_id: int,
        mb: int,
        service: str,
        priority: int,
        ttl_s: float = 0.0,
    ) -> VRAMLease | None:
        """Grant a lease of `mb` on the GPU, or return None if it will not fit.

        The free-space check and the usage update happen under the lock, so
        concurrent grants cannot oversubscribe the GPU.
        """
        async with self._lock:
            total = self._gpu_total.get((node_id, gpu_id), 0)
            used = self._gpu_used[(node_id, gpu_id)]
            if total - used < mb:
                return None
            lease = VRAMLease.create(
                gpu_id=gpu_id, node_id=node_id, mb=mb,
                service=service, priority=priority, ttl_s=ttl_s,
            )
            self._leases[lease.lease_id] = lease
            self._gpu_used[(node_id, gpu_id)] += mb
            return lease

    async def release(self, lease_id: str) -> bool:
        """Release a lease and return its MB to the pool.

        Returns False when the lease_id is unknown (e.g. already released).
        """
        async with self._lock:
            lease = self._leases.pop(lease_id, None)
            if lease is None:
                return False
            self._gpu_used[(lease.node_id, lease.gpu_id)] -= lease.mb_granted
            return True

    def get_eviction_candidates(
        self,
        node_id: str,
        gpu_id: int,
        needed_mb: int,
        requester_priority: int,
    ) -> list[VRAMLease]:
        """Pick leases whose eviction would free `needed_mb` on the GPU.

        Only leases with a numerically higher priority value than the
        requester are eligible; highest value first. Selection stops once the
        accumulated MB covers the request — the result may still fall short
        when not enough evictable MB exists.
        """
        candidates = [
            lease for lease in self._leases.values()
            if lease.node_id == node_id
            and lease.gpu_id == gpu_id
            and lease.priority > requester_priority
        ]
        candidates.sort(key=lambda lease: lease.priority, reverse=True)
        selected: list[VRAMLease] = []
        freed = 0
        for candidate in candidates:
            selected.append(candidate)
            freed += candidate.mb_granted
            if freed >= needed_mb:
                break
        return selected

    def list_leases(
        self, node_id: str | None = None, gpu_id: int | None = None
    ) -> list[VRAMLease]:
        """Active leases, optionally filtered by node and/or GPU."""
        return [
            lease for lease in self._leases.values()
            if (node_id is None or lease.node_id == node_id)
            and (gpu_id is None or lease.gpu_id == gpu_id)
        ]

    def all_leases(self) -> list[VRAMLease]:
        """Snapshot list of every active lease."""
        return list(self._leases.values())

    # ── resident tracking ────────────────────────────────────────────
    def set_residents_for_node(
        self,
        node_id: str,
        residents: list[tuple[str, str | None]],  # (service, model_name)
    ) -> None:
        """
        Replace the resident snapshot for a node.
        Preserves first_seen for entries whose service+model_name are unchanged,
        so the dashboard can show how long a model has been warm.
        """
        new_keys = {f"{node_id}:{service}" for service, _ in residents}
        # Remove stale entries (service no longer running on this node).
        for key in list(self._residents):
            if key.startswith(f"{node_id}:") and key not in new_keys:
                del self._residents[key]
        # Upsert: preserve first_seen when model is unchanged, reset otherwise.
        for service, model_name in residents:
            key = f"{node_id}:{service}"
            existing = self._residents.get(key)
            if existing is not None and existing.model_name == model_name:
                continue  # same model still loaded — keep original first_seen
            self._residents[key] = ResidentAllocation(
                service=service,
                node_id=node_id,
                model_name=model_name,
            )

    def all_residents(self) -> list[ResidentAllocation]:
        """Snapshot list of resident (warm-model) records across all nodes."""
        return list(self._residents.values())

    def resident_keys(self) -> set[str]:
        """Return set of 'node_id:service' strings for currently-warm services."""
        return set(self._residents.keys())

View file

@ -1,74 +0,0 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
_WARM_BONUS_MB = 1000
@dataclass(frozen=True)
class _Scored:
node_id: str
gpu_id: int
vram_free_mb: int
effective_free_mb: int
can_fit: bool
warm: bool
def select_node(
agents: "dict[str, AgentRecord]",
service: str,
profile_registry: "ProfileRegistry",
resident_keys: set[str],
) -> tuple[str, int] | None:
"""
Pick the best (node_id, gpu_id) for the requested service.
Warm nodes (service already running) get priority, then sorted by free VRAM.
Returns None if no suitable node exists.
"""
service_max_mb = _find_service_max_mb(service, profile_registry)
if service_max_mb is None:
return None # service not in any profile
candidates: list[_Scored] = []
for node_id, record in agents.items():
if not record.online:
continue
for gpu in record.gpus:
warm = f"{node_id}:{service}" in resident_keys
effective = gpu.vram_free_mb + (_WARM_BONUS_MB if warm else 0)
can_fit = gpu.vram_free_mb >= service_max_mb
candidates.append(_Scored(
node_id=node_id,
gpu_id=gpu.gpu_id,
vram_free_mb=gpu.vram_free_mb,
effective_free_mb=effective,
can_fit=can_fit,
warm=warm,
))
if not candidates:
return None
# Prefer: (1) warm nodes (model already resident — no cold start)
# (2) cold nodes that can fit the service (free >= half of max_mb)
# Fallback: best-effort node when nothing fits and nothing is warm
# (coordinator will attempt to start the service anyway; it may evict or fail)
# Note: resident_keys are per-node, not per-GPU. On multi-GPU nodes, the warm
# bonus applies to all GPUs on the node. This is a known coarseness —
# per-GPU resident tracking requires a resident_key format change.
preferred = [c for c in candidates if c.warm or c.can_fit]
pool = preferred if preferred else candidates
best = max(pool, key=lambda c: (c.warm, c.effective_free_mb))
return best.node_id, best.gpu_id
def _find_service_max_mb(service: str, profile_registry: "ProfileRegistry") -> int | None:
for profile in profile_registry.list_public():
svc = profile.services.get(service)
if svc is not None:
return svc.max_mb
return None

View file

@ -1,85 +0,0 @@
"""
circuitforge_core.resources.coordinator.node_store SQLite persistence for known agent nodes.
Gives the coordinator restart-safe memory of which nodes have ever registered.
On startup the coordinator reloads all known nodes and immediately probes them;
nodes that respond come back online within one heartbeat cycle (~10 s) without
any manual intervention on the agent hosts.
"""
from __future__ import annotations
import logging
import sqlite3
import time
from pathlib import Path
logger = logging.getLogger(__name__)

_DEFAULT_DB_PATH = Path.home() / ".local" / "share" / "circuitforge" / "cf-orch-nodes.db"
_STALE_AGE_DAYS = 30  # nodes unseen for this long are pruned automatically


class NodeStore:
    """
    Thin SQLite wrapper for persisting known agent nodes across coordinator restarts.
    Thread-safe for single-writer use (coordinator runs in one asyncio thread).
    """

    def __init__(self, db_path: Path = _DEFAULT_DB_PATH) -> None:
        """Open (creating directories and schema as needed) the DB at db_path."""
        self.db_path = db_path
        db_path.parent.mkdir(parents=True, exist_ok=True)
        self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self._conn.row_factory = sqlite3.Row
        self._migrate()
        logger.debug("NodeStore initialised at %s", db_path)

    def _migrate(self) -> None:
        """Create the known_nodes table on first use; no-op afterwards."""
        self._conn.executescript("""
CREATE TABLE IF NOT EXISTS known_nodes (
node_id TEXT PRIMARY KEY,
agent_url TEXT NOT NULL,
last_seen REAL NOT NULL
);
""")
        self._conn.commit()

    def upsert(self, node_id: str, agent_url: str) -> None:
        """Record or update a node. Called on every successful registration."""
        sql = """
INSERT INTO known_nodes (node_id, agent_url, last_seen)
VALUES (?, ?, ?)
ON CONFLICT(node_id) DO UPDATE SET
agent_url = excluded.agent_url,
last_seen = excluded.last_seen
"""
        self._conn.execute(sql, (node_id, agent_url, time.time()))
        self._conn.commit()

    def all(self) -> list[tuple[str, str]]:
        """Return all known (node_id, agent_url) pairs, most recently seen first."""
        query = "SELECT node_id, agent_url FROM known_nodes ORDER BY last_seen DESC"
        return [(row["node_id"], row["agent_url"]) for row in self._conn.execute(query)]

    def remove(self, node_id: str) -> None:
        """Forget a node entirely."""
        self._conn.execute("DELETE FROM known_nodes WHERE node_id = ?", (node_id,))
        self._conn.commit()

    def prune_stale(self, max_age_days: int = _STALE_AGE_DAYS) -> int:
        """Delete nodes not seen within max_age_days. Returns count removed."""
        cutoff = time.time() - max_age_days * 86400
        cursor = self._conn.execute(
            "DELETE FROM known_nodes WHERE last_seen < ?", (cutoff,)
        )
        self._conn.commit()
        removed = cursor.rowcount
        if removed:
            logger.info("NodeStore: pruned %d stale node(s) (>%d days old)", removed, max_age_days)
        return removed

    def close(self) -> None:
        """Close the underlying SQLite connection."""
        self._conn.close()

View file

@ -1,65 +0,0 @@
# circuitforge_core/resources/coordinator/profile_registry.py
from __future__ import annotations
import logging
from pathlib import Path
from circuitforge_core.resources.models import GpuInfo
from circuitforge_core.resources.profiles.schema import GpuProfile, load_profile
# Bundled public profiles shipped inside the package.
_PUBLIC_DIR = Path(__file__).parent.parent / "profiles" / "public"

# VRAM thresholds for public profile selection (MB).
# Ordered largest-first: auto_detect() returns the first profile whose
# threshold the primary GPU's VRAM meets or exceeds.
_PROFILE_THRESHOLDS = [
    (22000, "single-gpu-24gb"),
    (14000, "single-gpu-16gb"),
    (8000, "single-gpu-8gb"),
    (5500, "single-gpu-6gb"),
    (3500, "single-gpu-4gb"),
    (0, "single-gpu-2gb"),
]

_log = logging.getLogger(__name__)


class ProfileRegistry:
    """Loads and indexes GPU profiles: bundled public ones plus optional extra dirs."""

    def __init__(self, extra_dirs: list[Path] | None = None) -> None:
        # profile name → parsed profile; later dirs can override earlier names.
        self._profiles: dict[str, GpuProfile] = {}
        self._load_dir(_PUBLIC_DIR)
        for d in (extra_dirs or []):
            if d.exists():
                self._load_dir(d)

    def _load_dir(self, directory: Path) -> None:
        """Load every *.yaml profile in directory; malformed files are skipped with a warning."""
        for yaml_file in directory.glob("*.yaml"):
            try:
                profile = load_profile(yaml_file)
                self._profiles[profile.name] = profile
            except Exception as exc:
                # Best-effort: one bad profile must not block the rest.
                _log.warning("Skipping %s: %s", yaml_file, exc)

    def load(self, path: Path) -> GpuProfile:
        """Load a single profile file, register it under its name, and return it."""
        profile = load_profile(path)
        self._profiles[profile.name] = profile
        return profile

    def list_public(self) -> list[GpuProfile]:
        # CPU profiles (cpu-*) are intentionally excluded — this endpoint
        # is used to match GPU hardware. CPU inference nodes self-select
        # their profile via the CLI and are not listed for lease matching.
        return [
            p for p in self._profiles.values()
            if p.name.startswith("single-gpu-")
        ]

    def get(self, name: str) -> GpuProfile | None:
        """Look up a profile by exact name; None if unknown."""
        return self._profiles.get(name)

    def auto_detect(self, gpus: list[GpuInfo]) -> GpuProfile:
        """Pick a profile for the primary GPU's VRAM via _PROFILE_THRESHOLDS.

        Skips thresholds whose named profile failed to load and keeps
        descending until one matches.
        """
        primary_vram = gpus[0].vram_total_mb if gpus else 0
        for threshold_mb, profile_name in _PROFILE_THRESHOLDS:
            if primary_vram >= threshold_mb:
                profile = self._profiles.get(profile_name)
                if profile:
                    return profile
        # NOTE(review): raises KeyError if "single-gpu-2gb" is missing from the
        # loaded profiles — assumed to always ship with the package; confirm.
        return self._profiles["single-gpu-2gb"]

View file

@ -1,173 +0,0 @@
from __future__ import annotations
import dataclasses
import time
import uuid
from dataclasses import dataclass
from typing import Literal
@dataclass
class ServiceAllocation:
    # One caller's claim on a running service instance.
    allocation_id: str  # uuid4 string
    service: str
    node_id: str
    gpu_id: int
    model: str | None
    caller: str  # identity of the requesting client
    url: str  # base URL of the serving instance
    created_at: float  # epoch seconds
    expires_at: float  # 0 = no expiry


@dataclass
class ServiceInstance:
    # One running (or recently running) service container on a (node, GPU).
    service: str
    node_id: str
    gpu_id: int
    state: Literal["starting", "running", "idle", "stopped"]
    model: str | None
    url: str | None
    idle_since: float | None = None  # epoch seconds when it entered 'idle'
    health_path: str = "/health"
class ServiceRegistry:
    """
    In-memory registry of service allocations and instance state.
    Allocations: per-caller request many per service instance.
    Instances: per (service, node_id, gpu_id) one per running container.
    """

    def __init__(self) -> None:
        # allocation_id → allocation record.
        self._allocations: dict[str, ServiceAllocation] = {}
        self._instances: dict[str, ServiceInstance] = {}  # key: "service:node_id:gpu_id"

    # ── allocation API ────────────────────────────────────────────────
    def allocate(
        self,
        service: str,
        node_id: str,
        gpu_id: int,
        model: str | None,
        url: str,
        caller: str,
        ttl_s: float,
    ) -> ServiceAllocation:
        """Create a new allocation for `caller` against a service instance.

        ttl_s <= 0 means the allocation never expires. Allocating against an
        instance in 'idle'/'stopped' state flips it back to 'running'.
        """
        alloc = ServiceAllocation(
            allocation_id=str(uuid.uuid4()),
            service=service,
            node_id=node_id,
            gpu_id=gpu_id,
            model=model,
            caller=caller,
            url=url,
            created_at=time.time(),
            expires_at=time.time() + ttl_s if ttl_s > 0 else 0.0,
        )
        self._allocations[alloc.allocation_id] = alloc
        # If an instance exists in idle/stopped state, mark it running again
        key = f"{service}:{node_id}:{gpu_id}"
        if key in self._instances:
            inst = self._instances[key]
            if inst.state in ("idle", "stopped"):
                self._instances[key] = dataclasses.replace(
                    inst, state="running", idle_since=None
                )
        return alloc

    def release(self, allocation_id: str) -> bool:
        """Drop an allocation; returns False when the id is unknown.

        When the last allocation for an instance is released, the instance is
        marked 'idle' with idle_since set to now.
        """
        alloc = self._allocations.pop(allocation_id, None)
        if alloc is None:
            return False
        # If no active allocations remain for this instance, mark it idle
        key = f"{alloc.service}:{alloc.node_id}:{alloc.gpu_id}"
        if self.active_allocations(alloc.service, alloc.node_id, alloc.gpu_id) == 0:
            if key in self._instances:
                self._instances[key] = dataclasses.replace(
                    self._instances[key], state="idle", idle_since=time.time()
                )
        return True

    def active_allocations(self, service: str, node_id: str, gpu_id: int) -> int:
        """Count live allocations pinned to one (service, node, gpu) instance."""
        return sum(
            1 for a in self._allocations.values()
            if a.service == service and a.node_id == node_id and a.gpu_id == gpu_id
        )

    # ── instance API ─────────────────────────────────────────────────
    def upsert_instance(
        self,
        service: str,
        node_id: str,
        gpu_id: int,
        state: Literal["starting", "running", "idle", "stopped"],
        model: str | None,
        url: str | None,
        health_path: str = "/health",
    ) -> ServiceInstance:
        """Create or replace the instance record for (service, node, gpu).

        idle_since is carried over across repeated 'idle' upserts so idle time
        keeps accumulating; any other state clears it.
        """
        key = f"{service}:{node_id}:{gpu_id}"
        existing = self._instances.get(key)
        idle_since: float | None = None
        if state == "idle":
            # Preserve idle_since if already idle; set now if transitioning into idle
            idle_since = existing.idle_since if (existing and existing.state == "idle") else time.time()
        inst = ServiceInstance(
            service=service, node_id=node_id, gpu_id=gpu_id,
            state=state, model=model, url=url, idle_since=idle_since,
            health_path=health_path,
        )
        self._instances[key] = inst
        return inst

    def get_allocation(self, allocation_id: str) -> ServiceAllocation | None:
        """Look up an allocation by id; None if unknown or already released."""
        return self._allocations.get(allocation_id)

    def sweep_expired_allocations(self) -> list[str]:
        """
        Remove all allocations whose TTL has elapsed and transition the
        corresponding instance to 'idle' if no active allocations remain.
        Returns the list of expired allocation_ids.
        """
        now = time.time()
        expired = [
            alloc_id
            for alloc_id, alloc in self._allocations.items()
            if alloc.expires_at > 0 and now > alloc.expires_at
        ]
        # release() handles the per-instance idle transition.
        for alloc_id in expired:
            self.release(alloc_id)
        return expired

    def all_allocations(self) -> list[ServiceAllocation]:
        """Snapshot list of every live allocation."""
        return list(self._allocations.values())

    def all_instances(self) -> list[ServiceInstance]:
        """Snapshot list of every known instance record."""
        return list(self._instances.values())

    def mark_stopped(self, service: str, node_id: str, gpu_id: int) -> None:
        """Transition an instance to 'stopped' state and clear idle_since."""
        key = f"{service}:{node_id}:{gpu_id}"
        if key in self._instances:
            self._instances[key] = dataclasses.replace(
                self._instances[key], state="stopped", idle_since=None
            )

    def idle_past_timeout(self, idle_stop_config: dict[str, int]) -> list[ServiceInstance]:
        """
        Return instances in 'idle' state whose idle time exceeds their configured timeout.
        idle_stop_config: {service_name: seconds} 0 means never stop automatically.
        """
        now = time.time()
        result = []
        for inst in self._instances.values():
            if inst.state != "idle" or inst.idle_since is None:
                continue
            timeout = idle_stop_config.get(inst.service, 0)
            if timeout > 0 and (now - inst.idle_since) >= timeout:
                result.append(inst)
        return result

View file

@ -1,250 +0,0 @@
"""
cf-docuvision managed document understanding service.
Wraps ByteDance/Dolphin-v2 (Qwen2.5-VL backbone) behind a simple HTTP API.
Managed by cf-orch; started/stopped as a ProcessSpec service.
API
---
GET /health {"status": "ok", "model": "<path>"}
POST /extract ExtractResponse
Usage (standalone)::
python -m circuitforge_core.resources.docuvision.app \\
--model /Library/Assets/LLM/docuvision/models/dolphin-v2 \\
--port 8003 --gpu-id 0
"""
from __future__ import annotations
import argparse
import base64
import io
import json
import logging
from contextlib import asynccontextmanager
from typing import Any
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
logger = logging.getLogger(__name__)

# Module-level state — populated by _load_model() on first /extract call
_model: Any = None
_processor: Any = None
_model_path: str = ""
_device: str = "cpu"


# ── lazy loader ───────────────────────────────────────────────────────────────
def _load_model() -> None:
    """Lazy-load Dolphin-v2. Called once on first /extract request."""
    global _model, _processor, _device
    if _model is not None:
        # Already loaded by an earlier request — nothing to do.
        return

    import torch
    from transformers import AutoProcessor, AutoModelForCausalLM

    logger.info("Loading Dolphin-v2 from %s ...", _model_path)
    _device = "cuda" if torch.cuda.is_available() else "cpu"
    # fp16 only makes sense on GPU; CPU inference runs in fp32.
    dtype = torch.float16 if _device == "cuda" else torch.float32
    _processor = AutoProcessor.from_pretrained(
        _model_path,
        trust_remote_code=True,
    )
    _model = AutoModelForCausalLM.from_pretrained(
        _model_path,
        trust_remote_code=True,
        torch_dtype=dtype,
        device_map=_device,
    )
    _model.eval()
    logger.info("Dolphin-v2 loaded on %s", _device)
# ── FastAPI app ───────────────────────────────────────────────────────────────
@asynccontextmanager
async def _lifespan(app: FastAPI):
    # No startup/shutdown work — the model is lazy-loaded on first /extract.
    yield


app = FastAPI(title="cf-docuvision", lifespan=_lifespan)
# ── request / response models ─────────────────────────────────────────────────
class ExtractRequest(BaseModel):
    """
    Either image_b64 (base64-encoded bytes) or image_path (absolute path) must
    be provided. hint guides the extraction mode:
    - "auto" - Dolphin-v2 detects layout and element types automatically
    - "table" - optimise for tabular data (receipts, invoices, forms)
    - "text" - optimise for dense prose (contracts, letters)
    - "form" - optimise for form field extraction
    """
    image_b64: str | None = None  # base64-encoded image bytes
    image_path: str | None = None  # absolute path readable by the serving host
    hint: str = "auto"  # one of the modes documented above
class ElementOut(BaseModel):
    # One layout element extracted from the page.
    type: str  # heading | paragraph | list | table | figure | formula | code
    text: str
    bbox: list[float] | None = None  # [x0, y0, x1, y1] normalised 0-1 if available


class TableOut(BaseModel):
    # A detected table rendered as HTML markup.
    html: str
    bbox: list[float] | None = None


class ExtractResponse(BaseModel):
    # Full extraction result for one image.
    elements: list[ElementOut]  # all extracted elements (tables included as text)
    raw_text: str  # newline-joined text of all elements
    tables: list[TableOut]  # tables only, as HTML
    metadata: dict[str, Any]  # hint, image width/height, model path
# ── helpers ───────────────────────────────────────────────────────────────────
# Prompt sent to the model for each ExtractRequest.hint value; unrecognised
# hints fall back to the "auto" prompt at the call site.
_HINT_PROMPTS: dict[str, str] = {
    "auto": "Parse this document. Extract all elements with their types and text content.",
    "table": "Extract all tables from this document as structured HTML. Also extract any line-item text.",
    "text": "Extract all text from this document preserving paragraph and heading structure.",
    "form": "Extract all form fields from this document. Return field labels and their values.",
}
def _image_from_request(req: ExtractRequest):
"""Return a PIL Image from either image_b64 or image_path."""
from PIL import Image
if req.image_b64:
img_bytes = base64.b64decode(req.image_b64)
return Image.open(io.BytesIO(img_bytes)).convert("RGB")
if req.image_path:
from pathlib import Path
p = Path(req.image_path)
if not p.exists():
raise HTTPException(status_code=404, detail=f"image_path not found: {req.image_path}")
return Image.open(p).convert("RGB")
raise HTTPException(status_code=422, detail="Either image_b64 or image_path must be provided")
def _parse_dolphin_output(raw: str) -> tuple[list[ElementOut], list[TableOut], str]:
"""
Parse Dolphin-v2's structured output into elements and tables.
Dolphin-v2 returns a JSON array of element dicts with keys:
type, text, [html], [bbox]
Falls back gracefully if the model returns plain text instead.
"""
elements: list[ElementOut] = []
tables: list[TableOut] = []
# Try JSON parse first
try:
parsed = json.loads(raw)
if isinstance(parsed, list):
for item in parsed:
etype = item.get("type", "paragraph")
text = item.get("text", "")
bbox = item.get("bbox")
if etype == "table":
tables.append(TableOut(html=item.get("html", text), bbox=bbox))
elements.append(ElementOut(type=etype, text=text, bbox=bbox))
raw_text = "\n".join(e.text for e in elements)
return elements, tables, raw_text
except (json.JSONDecodeError, TypeError):
pass
# Plain-text fallback: treat entire output as a single paragraph
elements = [ElementOut(type="paragraph", text=raw.strip())]
return elements, tables, raw.strip()
# ── routes ────────────────────────────────────────────────────────────────────
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok", "model": _model_path}
@app.post("/extract", response_model=ExtractResponse)
async def extract(req: ExtractRequest) -> ExtractResponse:
_load_model()
image = _image_from_request(req)
prompt = _HINT_PROMPTS.get(req.hint, _HINT_PROMPTS["auto"])
import torch
inputs = _processor(
text=prompt,
images=image,
return_tensors="pt",
).to(_device)
with torch.no_grad():
output_ids = _model.generate(
**inputs,
max_new_tokens=2048,
do_sample=False,
)
# Decode only the newly generated tokens
input_len = inputs["input_ids"].shape[1]
raw_output = _processor.decode(
output_ids[0][input_len:],
skip_special_tokens=True,
)
elements, tables, raw_text = _parse_dolphin_output(raw_output)
w, h = image.size
return ExtractResponse(
elements=elements,
raw_text=raw_text,
tables=tables,
metadata={
"hint": req.hint,
"width": w,
"height": h,
"model": _model_path,
"device": _device,
},
)
# ── CLI entry point ───────────────────────────────────────────────────────────
def main() -> None:
parser = argparse.ArgumentParser(description="cf-docuvision service")
parser.add_argument("--model", required=True, help="Path to Dolphin-v2 model directory")
parser.add_argument("--port", type=int, default=8003)
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--gpu-id", type=int, default=0)
args = parser.parse_args()
global _model_path
_model_path = args.model
import os
os.environ.setdefault("CUDA_VISIBLE_DEVICES", str(args.gpu_id))
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
uvicorn.run(app, host=args.host, port=args.port)
if __name__ == "__main__":
main()

View file

@ -1,137 +0,0 @@
"""Generic OpenAI-compatible inference server for HuggingFace causal LMs."""
from __future__ import annotations
import argparse
import time
import uuid
from contextlib import asynccontextmanager
from typing import Any
import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer
_model: Any = None
_tokenizer: Any = None
_model_id: str = ""
_device: str = "cpu"
@asynccontextmanager
async def lifespan(app: FastAPI):
yield
app = FastAPI(lifespan=lifespan)
class Message(BaseModel):
role: str
content: str
class ChatRequest(BaseModel):
model: str | None = None
messages: list[Message]
max_tokens: int | None = 512
temperature: float | None = 0.7
stream: bool | None = False
@app.get("/health")
def health() -> dict[str, str]:
return {"status": "ok", "model": _model_id}
@app.get("/v1/models")
def list_models() -> dict[str, Any]:
return {
"object": "list",
"data": [{"id": _model_id, "object": "model", "owned_by": "cf-orch"}],
}
@app.post("/v1/chat/completions")
def chat_completions(req: ChatRequest) -> dict[str, Any]:
if _model is None:
raise HTTPException(503, detail="Model not loaded")
if req.stream:
raise HTTPException(501, detail="Streaming not supported")
conversation = [{"role": m.role, "content": m.content} for m in req.messages]
try:
encoded = _tokenizer.apply_chat_template(
conversation,
return_tensors="pt",
add_generation_prompt=True,
)
# transformers 5.x returns BatchEncoding; 4.x returned a bare tensor
input_ids = (encoded.input_ids if hasattr(encoded, "input_ids") else encoded).to(_device)
except Exception as exc:
raise HTTPException(500, detail=f"Tokenisation failed: {exc}")
max_new = req.max_tokens or 512
temp = req.temperature if req.temperature is not None else 0.7
gen_kwargs: dict[str, Any] = {
"max_new_tokens": max_new,
"do_sample": temp > 0,
"pad_token_id": _tokenizer.eos_token_id,
}
if temp > 0:
gen_kwargs["temperature"] = temp
with torch.inference_mode():
output_ids = _model.generate(input_ids, **gen_kwargs)
new_tokens = output_ids[0][input_ids.shape[-1]:]
reply = _tokenizer.decode(new_tokens, skip_special_tokens=True)
return {
"id": f"chatcmpl-{uuid.uuid4().hex[:8]}",
"object": "chat.completion",
"created": int(time.time()),
"model": _model_id,
"choices": [
{
"index": 0,
"message": {"role": "assistant", "content": reply},
"finish_reason": "stop",
}
],
"usage": {
"prompt_tokens": input_ids.shape[-1],
"completion_tokens": len(new_tokens),
"total_tokens": input_ids.shape[-1] + len(new_tokens),
},
}
def _load_model(model_path: str, gpu_id: int) -> None:
global _model, _tokenizer, _model_id, _device
_device = f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu"
_model_id = model_path
_tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
_model = AutoModelForCausalLM.from_pretrained(
model_path,
dtype=torch.float16 if "cuda" in _device else torch.float32,
device_map={"": _device},
trust_remote_code=True,
)
_model.eval()
def main() -> None:
parser = argparse.ArgumentParser(description="cf-orch generic LLM inference server")
parser.add_argument("--model", required=True)
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--host", default="0.0.0.0")
parser.add_argument("--gpu-id", type=int, default=0)
args = parser.parse_args()
_load_model(args.model, args.gpu_id)
uvicorn.run(app, host=args.host, port=args.port, log_level="info")
if __name__ == "__main__":
main()

View file

@ -1,66 +0,0 @@
from __future__ import annotations
import time
import uuid
from dataclasses import dataclass, field
from typing import Optional
@dataclass(frozen=True)
class VRAMLease:
lease_id: str
gpu_id: int
node_id: str
mb_granted: int
holder_service: str
priority: int
expires_at: float # unix timestamp; 0.0 = no expiry
@classmethod
def create(
cls,
gpu_id: int,
node_id: str,
mb: int,
service: str,
priority: int,
ttl_s: float = 0.0,
) -> VRAMLease:
return cls(
lease_id=str(uuid.uuid4()),
gpu_id=gpu_id,
node_id=node_id,
mb_granted=mb,
holder_service=service,
priority=priority,
expires_at=time.time() + ttl_s if ttl_s > 0.0 else 0.0,
)
def is_expired(self) -> bool:
return self.expires_at > 0.0 and time.time() > self.expires_at
@dataclass(frozen=True)
class GpuInfo:
gpu_id: int
name: str
vram_total_mb: int
vram_used_mb: int
vram_free_mb: int
@dataclass(frozen=True)
class ResidentAllocation:
"""A model that is loaded and warm in VRAM but not actively serving a request."""
service: str
node_id: str
model_name: Optional[str] # None if service is running but model probe failed
first_seen: float = field(default_factory=time.time)
@dataclass
class NodeInfo:
node_id: str
agent_url: str
gpus: list[GpuInfo]
last_heartbeat: float = field(default_factory=time.time)

View file

@ -1,41 +0,0 @@
schema_version: 1
name: cpu-16gb
eviction_timeout_s: 30.0
services:
ollama:
max_mb: 0
priority: 1
managed:
type: process
adopt: true
exec_path: "/usr/local/bin/ollama"
args_template: "serve"
port: 11434
host_port: 11434
health_path: /api/tags
cf-stt:
max_mb: 0
priority: 2
shared: true
max_concurrent: 1
backend: moonshine
cf-tts:
max_mb: 0
priority: 2
shared: true
max_concurrent: 1
cf-embed:
max_mb: 0
priority: 2
shared: true
max_concurrent: 2
always_on: true
cf-classify:
max_mb: 0
priority: 2
shared: true
max_concurrent: 2
always_on: true
model_size_hints:
llm_max_params: 3b-q4
image_gen_max: none

View file

@ -1,41 +0,0 @@
schema_version: 1
name: cpu-32gb
eviction_timeout_s: 30.0
services:
ollama:
max_mb: 0
priority: 1
managed:
type: process
adopt: true
exec_path: "/usr/local/bin/ollama"
args_template: "serve"
port: 11434
host_port: 11434
health_path: /api/tags
cf-stt:
max_mb: 0
priority: 2
shared: true
max_concurrent: 2
backend: faster-whisper
cf-tts:
max_mb: 0
priority: 2
shared: true
max_concurrent: 2
cf-embed:
max_mb: 0
priority: 2
shared: true
max_concurrent: 4
always_on: true
cf-classify:
max_mb: 0
priority: 2
shared: true
max_concurrent: 4
always_on: true
model_size_hints:
llm_max_params: 7b-q4
image_gen_max: none

View file

@ -1,73 +0,0 @@
schema_version: 1
name: single-gpu-16gb
vram_total_mb: 16384
eviction_timeout_s: 10.0
services:
vllm:
max_mb: 9000
priority: 1
idle_stop_after_s: 600
managed:
type: process
exec_path: "/devl/miniconda3/envs/cf/bin/python"
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
port: 8000
host_port: 8000
cwd: "/Library/Development/CircuitForge/circuitforge-core"
ollama:
max_mb: 12288
priority: 1
managed:
type: process
adopt: true
exec_path: "/usr/local/bin/ollama"
args_template: "serve"
port: 11434
host_port: 11434
health_path: /api/tags
cf-vision:
max_mb: 3072
priority: 2
shared: true
max_concurrent: 4
cf-docuvision:
max_mb: 6144
priority: 2
shared: true
max_concurrent: 3
managed:
type: process
exec_path: "/devl/miniconda3/envs/cf/bin/python"
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
port: 8003
host_port: 8003
cwd: "/Library/Development/CircuitForge/circuitforge-core"
cf-stt:
max_mb: 1200
priority: 2
shared: true
max_concurrent: 3
backend: parakeet-tdt
cf-tts:
max_mb: 1024
priority: 2
shared: true
max_concurrent: 3
cf-embed:
max_mb: 512
priority: 2
shared: true
max_concurrent: 6
always_on: true
cf-classify:
max_mb: 512
priority: 2
shared: true
max_concurrent: 6
always_on: true
comfyui:
max_mb: 14336
priority: 4
model_size_hints:
llm_max_params: 34b
image_gen_max: flux-dev-fp8

View file

@ -1,73 +0,0 @@
schema_version: 1
name: single-gpu-24gb
vram_total_mb: 24576
eviction_timeout_s: 10.0
services:
vllm:
max_mb: 9000
priority: 1
idle_stop_after_s: 600
managed:
type: process
exec_path: "/devl/miniconda3/envs/cf/bin/python"
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
port: 8000
host_port: 8000
cwd: "/Library/Development/CircuitForge/circuitforge-core"
ollama:
max_mb: 18432
priority: 1
managed:
type: process
adopt: true
exec_path: "/usr/local/bin/ollama"
args_template: "serve"
port: 11434
host_port: 11434
health_path: /api/tags
cf-vision:
max_mb: 4096
priority: 2
shared: true
max_concurrent: 6
cf-docuvision:
max_mb: 8192
priority: 2
shared: true
max_concurrent: 4
managed:
type: process
exec_path: "/devl/miniconda3/envs/cf/bin/python"
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
port: 8003
host_port: 8003
cwd: "/Library/Development/CircuitForge/circuitforge-core"
cf-stt:
max_mb: 1200
priority: 2
shared: true
max_concurrent: 4
backend: parakeet-tdt
cf-tts:
max_mb: 1024
priority: 2
shared: true
max_concurrent: 4
cf-embed:
max_mb: 512
priority: 2
shared: true
max_concurrent: 8
always_on: true
cf-classify:
max_mb: 512
priority: 2
shared: true
max_concurrent: 8
always_on: true
comfyui:
max_mb: 20480
priority: 4
model_size_hints:
llm_max_params: 70b
image_gen_max: flux-dev-fp16

View file

@ -1,30 +0,0 @@
schema_version: 1
name: single-gpu-2gb
vram_total_mb: 2048
eviction_timeout_s: 15.0
services:
ollama:
max_mb: 1536
priority: 1
managed:
type: process
adopt: true
exec_path: "/usr/local/bin/ollama"
args_template: "serve"
port: 11434
host_port: 11434
health_path: /api/tags
cf-vision:
max_mb: 512
priority: 2
shared: true
max_concurrent: 1
cf-stt:
max_mb: 200
priority: 2
shared: true
max_concurrent: 1
backend: moonshine
model_size_hints:
llm_max_params: 3b
image_gen_max: none

View file

@ -1,38 +0,0 @@
schema_version: 1
name: single-gpu-4gb
vram_total_mb: 4096
eviction_timeout_s: 15.0
services:
ollama:
max_mb: 3072
priority: 1
managed:
type: process
adopt: true
exec_path: "/usr/local/bin/ollama"
args_template: "serve"
port: 11434
host_port: 11434
health_path: /api/tags
cf-vision:
max_mb: 1024
priority: 2
shared: true
max_concurrent: 1
cf-stt:
max_mb: 600
priority: 2
shared: true
max_concurrent: 1
backend: faster-whisper
cf-tts:
max_mb: 512
priority: 2
shared: true
max_concurrent: 1
comfyui:
max_mb: 3584
priority: 4
model_size_hints:
llm_max_params: 3b
image_gen_max: sd15-fp8

View file

@ -1,61 +0,0 @@
schema_version: 1
name: single-gpu-6gb
vram_total_mb: 6144
eviction_timeout_s: 10.0
services:
vllm:
max_mb: 5500
priority: 1
idle_stop_after_s: 600
managed:
type: process
exec_path: "/devl/miniconda3/envs/cf/bin/python"
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
port: 8000
host_port: 8000
cwd: "/Library/Development/CircuitForge/circuitforge-core"
ollama:
max_mb: 3584
priority: 1
managed:
type: process
adopt: true
exec_path: "/usr/local/bin/ollama"
args_template: "serve"
port: 11434
host_port: 11434
health_path: /api/tags
cf-vision:
max_mb: 1536
priority: 2
shared: true
max_concurrent: 2
cf-docuvision:
max_mb: 3072
priority: 2
shared: true
max_concurrent: 1
managed:
type: process
exec_path: "/devl/miniconda3/envs/cf/bin/python"
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
port: 8003
host_port: 8003
cwd: "/Library/Development/CircuitForge/circuitforge-core"
cf-stt:
max_mb: 600
priority: 2
shared: true
max_concurrent: 2
backend: faster-whisper
cf-tts:
max_mb: 768
priority: 2
shared: true
max_concurrent: 1
comfyui:
max_mb: 5120
priority: 4
model_size_hints:
llm_max_params: 7b
image_gen_max: sd15

View file

@ -1,68 +0,0 @@
schema_version: 1
name: single-gpu-8gb
vram_total_mb: 8192
eviction_timeout_s: 10.0
services:
vllm:
max_mb: 6500
priority: 1
idle_stop_after_s: 600
managed:
type: process
exec_path: "/devl/miniconda3/envs/cf/bin/python"
args_template: "-m circuitforge_core.resources.inference.llm_server --model /Library/Assets/LLM/vllm/models/{model} --port {port} --gpu-id {gpu_id}"
port: 8000
host_port: 8000
cwd: "/Library/Development/CircuitForge/circuitforge-core"
ollama:
max_mb: 4096
priority: 1
managed:
type: process
adopt: true
exec_path: "/usr/local/bin/ollama"
args_template: "serve"
port: 11434
host_port: 11434
health_path: /api/tags
cf-vision:
max_mb: 2048
priority: 2
shared: true
max_concurrent: 3
cf-docuvision:
max_mb: 4096
priority: 2
shared: true
max_concurrent: 2
managed:
type: process
exec_path: "/devl/miniconda3/envs/cf/bin/python"
args_template: "-m circuitforge_core.resources.docuvision.app --model /Library/Assets/LLM/docuvision/models/dolphin-v2 --port {port} --gpu-id {gpu_id}"
port: 8003
host_port: 8003
cwd: "/Library/Development/CircuitForge/circuitforge-core"
cf-stt:
max_mb: 1200
priority: 2
shared: true
max_concurrent: 2
backend: parakeet-tdt
cf-tts:
max_mb: 1024
priority: 2
shared: true
max_concurrent: 2
comfyui:
max_mb: 6144
priority: 4
managed:
type: process
exec_path: "/opt/miniconda3/envs/comfyui/bin/python"
args_template: "/opt/ComfyUI/main.py --listen 0.0.0.0 --port {port} --cuda-device {gpu_id}"
cwd: "/opt/ComfyUI"
port: 8188
host_port: 8188
model_size_hints:
llm_max_params: 8b
image_gen_max: sdxl-fp8

View file

@ -1,121 +0,0 @@
# circuitforge_core/resources/profiles/schema.py
from __future__ import annotations
from pathlib import Path
from typing import Any
import yaml
from pydantic import BaseModel, Field, model_validator
SUPPORTED_SCHEMA_VERSION = 1
class DockerSpec(BaseModel):
"""Spec for a Docker-managed service."""
image: str
port: int
host_port: int
command_template: str = ""
volumes: list[str] = Field(default_factory=list)
env: dict[str, str] = Field(default_factory=dict)
runtime: str = "nvidia"
ipc: str = "host"
model_config = {"frozen": True}
class ProcessSpec(BaseModel):
"""Spec for a process-managed service (non-Docker, e.g. conda env)."""
exec_path: str
args_template: str = ""
cwd: str = ""
env: dict[str, str] = Field(default_factory=dict)
port: int = 0
host_port: int = 0
# adopt=True: if the service is already listening on host_port, claim it rather
# than spawning a new process (useful for system daemons like Ollama).
adopt: bool = False
# Override the health probe path; defaults to /health (Ollama uses /api/tags).
health_path: str = "/health"
model_config = {"frozen": True}
class ServiceProfile(BaseModel):
max_mb: int
priority: int
shared: bool = False
max_concurrent: int = 1
always_on: bool = False
idle_stop_after_s: int = 0
backend: str | None = None
consumers: list[str] = Field(default_factory=list)
managed: DockerSpec | ProcessSpec | None = None
model_config = {"frozen": True}
@model_validator(mode="before")
@classmethod
def _parse_managed(cls, values: Any) -> Any:
if not isinstance(values, dict):
return values
raw = values.get("managed")
if raw is None:
return values
if not isinstance(raw, dict):
return values
spec_type = raw.get("type")
managed_fields = {k: v for k, v in raw.items() if k != "type"}
if spec_type == "docker":
values["managed"] = DockerSpec(**managed_fields)
elif spec_type == "process":
values["managed"] = ProcessSpec(**managed_fields)
else:
raise ValueError(f"Unknown managed service type: {spec_type!r}")
return values
class GpuNodeEntry(BaseModel):
id: int
vram_mb: int
role: str
card: str = "unknown"
always_on: bool = False
services: list[str] = Field(default_factory=list)
model_config = {"frozen": True}
class NodeProfile(BaseModel):
gpus: list[GpuNodeEntry]
agent_url: str | None = None
nas_mount: str | None = None
model_config = {"frozen": True}
class GpuProfile(BaseModel):
schema_version: int
name: str
vram_total_mb: int | None = None
eviction_timeout_s: float = 10.0
services: dict[str, ServiceProfile] = Field(default_factory=dict)
model_size_hints: dict[str, str] = Field(default_factory=dict)
nodes: dict[str, NodeProfile] = Field(default_factory=dict)
model_config = {"frozen": True}
def load_profile(path: Path) -> GpuProfile:
raw: dict[str, Any] = yaml.safe_load(path.read_text())
if not isinstance(raw, dict):
raise ValueError(f"Profile file {path} must be a YAML mapping, got {type(raw).__name__}")
version = raw.get("schema_version")
if version != SUPPORTED_SCHEMA_VERSION:
raise ValueError(
f"Unsupported schema_version {version!r} in {path}. "
f"Expected {SUPPORTED_SCHEMA_VERSION}."
)
return GpuProfile.model_validate(raw)

View file

@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
[project]
name = "circuitforge-core"
version = "0.7.0"
description = "Shared scaffold for CircuitForge products"
version = "0.8.0"
description = "Shared scaffold for CircuitForge products (MIT)"
requires-python = ">=3.11"
dependencies = [
"pyyaml>=6.0",
@ -14,32 +14,17 @@ dependencies = [
]
[project.optional-dependencies]
orch = [
"fastapi>=0.110",
"uvicorn[standard]>=0.29",
"httpx>=0.27",
"pydantic>=2.0",
"typer[all]>=0.12",
"psutil>=5.9",
]
tasks = [
"httpx>=0.27",
]
manage = [
"platformdirs>=4.0",
"typer[all]>=0.12",
]
dev = [
"circuitforge-core[orch]",
"circuitforge-core[tasks]",
"circuitforge-core[manage]",
"pytest>=8.0",
"pytest-asyncio>=0.23",
"httpx>=0.27",
]
[project.scripts]
cf-orch = "circuitforge_core.resources.cli:app"
cf-manage = "circuitforge_core.manage.cli:app"
[tool.setuptools.packages.find]

View file

@ -1,68 +0,0 @@
from __future__ import annotations
import pytest
from unittest.mock import MagicMock
from fastapi.testclient import TestClient
from circuitforge_core.resources.agent.app import create_agent_app
from circuitforge_core.resources.models import GpuInfo
from circuitforge_core.resources.agent.eviction_executor import EvictionResult
MOCK_GPUS = [
GpuInfo(
gpu_id=0,
name="RTX 4000",
vram_total_mb=8192,
vram_used_mb=1024,
vram_free_mb=7168,
),
]
@pytest.fixture
def agent_client():
mock_monitor = MagicMock()
mock_monitor.poll.return_value = MOCK_GPUS
mock_executor = MagicMock()
app = create_agent_app(
node_id="heimdall",
monitor=mock_monitor,
executor=mock_executor,
)
return TestClient(app), mock_monitor, mock_executor
def test_health_returns_ok(agent_client):
client, _, _ = agent_client
resp = client.get("/health")
assert resp.status_code == 200
assert resp.json()["status"] == "ok"
assert resp.json()["node_id"] == "heimdall"
def test_gpu_info_returns_gpu_list(agent_client):
client, _, _ = agent_client
resp = client.get("/gpu-info")
assert resp.status_code == 200
data = resp.json()
assert len(data["gpus"]) == 1
assert data["gpus"][0]["gpu_id"] == 0
assert data["gpus"][0]["name"] == "RTX 4000"
assert data["gpus"][0]["vram_free_mb"] == 7168
def test_evict_calls_executor(agent_client):
client, _, mock_executor = agent_client
mock_executor.evict_pid.return_value = EvictionResult(
success=True, method="sigterm", message="done"
)
resp = client.post("/evict", json={"pid": 1234, "grace_period_s": 5.0})
assert resp.status_code == 200
assert resp.json()["success"] is True
mock_executor.evict_pid.assert_called_once_with(pid=1234, grace_period_s=5.0)
def test_evict_requires_pid(agent_client):
client, _, _ = agent_client
resp = client.post("/evict", json={"grace_period_s": 5.0})
assert resp.status_code == 422

View file

@ -1,93 +0,0 @@
import asyncio
import time
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry, ServiceInstance
def test_build_idle_stop_config_empty_without_registry():
lm = LeaseManager()
supervisor = AgentSupervisor(lease_manager=lm)
assert supervisor._build_idle_stop_config() == {}
def test_build_idle_stop_config_from_profiles():
lm = LeaseManager()
mock_svc = MagicMock()
mock_svc.idle_stop_after_s = 600
mock_profile = MagicMock()
mock_profile.services = {"vllm": mock_svc}
mock_profile_registry = MagicMock()
mock_profile_registry.list_public.return_value = [mock_profile]
supervisor = AgentSupervisor(lease_manager=lm, profile_registry=mock_profile_registry)
config = supervisor._build_idle_stop_config()
assert config == {"vllm": 600}
@pytest.mark.asyncio
async def test_run_idle_sweep_posts_stop():
lm = LeaseManager()
service_registry = ServiceRegistry()
# Upsert instance as running, then allocate + release to transition it to idle
service_registry.upsert_instance(
service="vllm",
node_id="heimdall",
gpu_id=0,
state="running",
model="test-model",
url="http://heimdall:8000",
)
alloc = service_registry.allocate(
service="vllm",
node_id="heimdall",
gpu_id=0,
model="test-model",
url="http://heimdall:8000",
caller="test",
ttl_s=300.0,
)
service_registry.release(alloc.allocation_id)
# Backdate idle_since so it exceeds the timeout
import dataclasses
key = "vllm:heimdall:0"
inst = service_registry._instances[key]
service_registry._instances[key] = dataclasses.replace(inst, idle_since=time.time() - 700)
mock_profile_registry = MagicMock()
mock_svc = MagicMock()
mock_svc.idle_stop_after_s = 600
mock_profile = MagicMock()
mock_profile.services = {"vllm": mock_svc}
mock_profile_registry.list_public.return_value = [mock_profile]
supervisor = AgentSupervisor(
lease_manager=lm,
service_registry=service_registry,
profile_registry=mock_profile_registry,
)
supervisor.register("heimdall", "http://heimdall:7701")
posted_urls = []
async def fake_http_post(url: str) -> bool:
posted_urls.append(url)
return True
supervisor._http_post = fake_http_post
await supervisor._run_idle_sweep()
assert len(posted_urls) == 1
assert posted_urls[0] == "http://heimdall:7701/services/vllm/stop"
@pytest.mark.asyncio
async def test_run_idle_sweep_skips_without_registry():
lm = LeaseManager()
supervisor = AgentSupervisor(lease_manager=lm)
# Should return immediately without error
await supervisor._run_idle_sweep()

View file

@ -1,151 +0,0 @@
# tests/test_resources/test_agent_watchdog.py
"""
Tests for AgentSupervisor watchdog behaviour:
- restore_from_store() reloads known nodes from NodeStore on startup
- register() persists to NodeStore
- restored nodes start offline and come online after a successful poll
- NodeStore=None path is a no-op (backwards compatibility)
"""
from __future__ import annotations
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.node_store import NodeStore
# ── fixtures ──────────────────────────────────────────────────────────────────
@pytest.fixture
def store(tmp_path: Path) -> NodeStore:
return NodeStore(db_path=tmp_path / "nodes.db")
@pytest.fixture
def supervisor(store: NodeStore) -> AgentSupervisor:
return AgentSupervisor(lease_manager=LeaseManager(), node_store=store)
@pytest.fixture
def supervisor_no_store() -> AgentSupervisor:
return AgentSupervisor(lease_manager=LeaseManager(), node_store=None)
# ── register() persists ───────────────────────────────────────────────────────
def test_register_persists_to_store(supervisor: AgentSupervisor, store: NodeStore) -> None:
supervisor.register("heimdall", "http://127.0.0.1:7701")
rows = store.all()
assert len(rows) == 1
assert rows[0] == ("heimdall", "http://127.0.0.1:7701")
def test_register_updates_url_in_store(supervisor: AgentSupervisor, store: NodeStore) -> None:
supervisor.register("navi", "http://10.1.10.10:7701")
supervisor.register("navi", "http://10.1.10.10:9999")
rows = store.all()
assert len(rows) == 1
assert rows[0][1] == "http://10.1.10.10:9999"
def test_register_without_store_does_not_crash(supervisor_no_store: AgentSupervisor) -> None:
supervisor_no_store.register("heimdall", "http://127.0.0.1:7701")
assert supervisor_no_store.get_node_info("heimdall") is not None
# ── restore_from_store() ──────────────────────────────────────────────────────
def test_restore_loads_known_nodes(tmp_path: Path) -> None:
"""Nodes written by a previous supervisor session are restored into a fresh one."""
db = tmp_path / "nodes.db"
# Session 1: register two nodes
s1 = NodeStore(db_path=db)
sup1 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s1)
sup1.register("navi", "http://10.1.10.10:7701")
sup1.register("strahl", "http://10.1.10.20:7701")
# Session 2: fresh supervisor, same DB
s2 = NodeStore(db_path=db)
sup2 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s2)
restored = sup2.restore_from_store()
assert restored == 2
assert sup2.get_node_info("navi") is not None
assert sup2.get_node_info("strahl") is not None
def test_restore_marks_nodes_offline(tmp_path: Path) -> None:
"""Restored nodes start offline — they haven't been polled yet."""
db = tmp_path / "nodes.db"
s1 = NodeStore(db_path=db)
AgentSupervisor(lease_manager=LeaseManager(), node_store=s1).register(
"navi", "http://10.1.10.10:7701"
)
s2 = NodeStore(db_path=db)
sup2 = AgentSupervisor(lease_manager=LeaseManager(), node_store=s2)
sup2.restore_from_store()
assert sup2.online_agents() == {}
def test_restore_returns_zero_without_store() -> None:
sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=None)
assert sup.restore_from_store() == 0
def test_restore_skips_already_registered(tmp_path: Path) -> None:
"""Nodes manually registered before restore_from_store() are not duplicated."""
db = tmp_path / "nodes.db"
store = NodeStore(db_path=db)
store.upsert("heimdall", "http://127.0.0.1:7701")
sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=store)
sup.register("heimdall", "http://127.0.0.1:7701") # already in memory
restored = sup.restore_from_store()
assert restored == 0 # already present, not double-counted
# ── restored node comes online after poll ─────────────────────────────────────
@pytest.mark.asyncio
async def test_restored_node_comes_online_after_poll(tmp_path: Path) -> None:
"""After restore, a successful poll_agent() brings the node online."""
db = tmp_path / "nodes.db"
store = NodeStore(db_path=db)
store.upsert("navi", "http://10.1.10.10:7701")
sup = AgentSupervisor(lease_manager=LeaseManager(), node_store=store)
sup.restore_from_store()
# Stub poll_agent to succeed
gpu_payload = {"gpus": [{"gpu_id": 0, "name": "RTX 4000",
"vram_total_mb": 8192, "vram_used_mb": 512, "vram_free_mb": 7680}]}
resident_payload = {"residents": []}
mock_resp_gpu = MagicMock()
mock_resp_gpu.raise_for_status = MagicMock()
mock_resp_gpu.json.return_value = gpu_payload
mock_resp_res = MagicMock()
mock_resp_res.is_success = True
mock_resp_res.json.return_value = resident_payload
mock_client = AsyncMock()
mock_client.get = AsyncMock(side_effect=[mock_resp_gpu, mock_resp_res])
mock_client.__aenter__ = AsyncMock(return_value=mock_client)
mock_client.__aexit__ = AsyncMock(return_value=False)
with patch("circuitforge_core.resources.coordinator.agent_supervisor.httpx.AsyncClient",
return_value=mock_client):
result = await sup.poll_agent("navi")
assert result is True
assert "navi" in sup.online_agents()

View file

@ -1,33 +0,0 @@
from __future__ import annotations
from pathlib import Path
from unittest.mock import patch
from typer.testing import CliRunner
from circuitforge_core.resources.cli import app
runner = CliRunner()
def test_cli_help():
result = runner.invoke(app, ["--help"])
assert result.exit_code == 0
assert "cf-orch" in result.output.lower() or "Usage" in result.output
def test_status_command_shows_no_coordinator_message():
with patch("httpx.get", side_effect=ConnectionRefusedError("refused")):
result = runner.invoke(app, ["status"])
assert result.exit_code != 0 or "unreachable" in result.output.lower() \
or "coordinator" in result.output.lower()
def test_install_service_creates_systemd_unit(tmp_path: Path):
unit_path = tmp_path / "cf-orch.service"
with patch(
"circuitforge_core.resources.cli._SYSTEMD_UNIT_PATH", unit_path
):
result = runner.invoke(app, ["install-service", "--dry-run"])
assert result.exit_code == 0
assert "cf-orch.service" in result.output or "systemd" in result.output.lower()

View file

@ -1,94 +0,0 @@
import json
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
import httpretty
from circuitforge_core.resources.client import CFOrchClient, Allocation
_ALLOC_BODY = (
'{"allocation_id":"abc123","service":"vllm","node_id":"heimdall",'
'"gpu_id":0,"model":"Ouro-1.4B","url":"http://heimdall:8000","started":false,"warm":true}'
)
@httpretty.activate
def test_sync_allocate_returns_allocation():
    """Successful POST yields an Allocation; exiting the context DELETEs it."""
    httpretty.register_uri(
        httpretty.POST, "http://orch:7700/api/services/vllm/allocate",
        body=_ALLOC_BODY, content_type="application/json",
    )
    httpretty.register_uri(
        httpretty.DELETE, "http://orch:7700/api/services/vllm/allocations/abc123",
        body='{"released":true}', content_type="application/json",
    )
    client = CFOrchClient("http://orch:7700")
    with client.allocate("vllm", model_candidates=["Ouro-1.4B"], caller="test") as alloc:
        assert isinstance(alloc, Allocation)
        assert alloc.url == "http://heimdall:8000"
        assert alloc.model == "Ouro-1.4B"
        assert alloc.allocation_id == "abc123"
    # Context exit must have released the allocation via DELETE.
    assert httpretty.last_request().method == "DELETE"


@httpretty.activate
def test_sync_allocate_ignores_404_on_release():
    """A 404 on the release DELETE is swallowed (allocation already gone)."""
    httpretty.register_uri(
        httpretty.POST, "http://orch:7700/api/services/vllm/allocate",
        body='{"allocation_id":"xyz","service":"vllm","node_id":"a","gpu_id":0,'
        '"model":"m","url":"http://a:8000","started":false,"warm":false}',
        content_type="application/json",
    )
    httpretty.register_uri(
        httpretty.DELETE, "http://orch:7700/api/services/vllm/allocations/xyz",
        status=404, body='{"detail":"not found"}', content_type="application/json",
    )
    client = CFOrchClient("http://orch:7700")
    with client.allocate("vllm", model_candidates=["m"]) as alloc:
        assert alloc.url == "http://a:8000"
    # No exception raised — 404 on release is silently ignored
@httpretty.activate
def test_sync_allocate_raises_on_503():
    """A 503 from the coordinator surfaces as RuntimeError."""
    httpretty.register_uri(
        httpretty.POST,
        "http://orch:7700/api/services/vllm/allocate",
        status=503,
        body='{"detail":"no capacity"}',
        content_type="application/json",
    )
    orch = CFOrchClient("http://orch:7700")
    with pytest.raises(RuntimeError, match="cf-orch allocation failed"):
        with orch.allocate("vllm", model_candidates=["m"]):
            pass
@pytest.mark.asyncio
async def test_async_allocate_works():
    """allocate_async() yields an Allocation and releases it on exit.

    httpretty only patches stdlib sockets; httpx async uses anyio sockets so
    we mock httpx.AsyncClient directly instead.

    Fix: this async test was missing @pytest.mark.asyncio — without the
    marker, pytest-asyncio (strict mode) never awaits the coroutine and the
    test silently does not run. Every other async test in this suite carries
    the marker.
    """
    alloc_data = {
        "allocation_id": "a1", "service": "vllm", "node_id": "n",
        "gpu_id": 0, "model": "m", "url": "http://n:8000",
        "started": False, "warm": False,
    }
    release_data = {"released": True}

    def _make_response(data, status_code=200):
        # Minimal stand-in for an httpx.Response.
        resp = MagicMock()
        resp.is_success = status_code < 400
        resp.status_code = status_code
        resp.json.return_value = data
        return resp

    mock_post = AsyncMock(return_value=_make_response(alloc_data))
    mock_delete = AsyncMock(return_value=_make_response(release_data))
    mock_async_client = MagicMock()
    mock_async_client.post = mock_post
    mock_async_client.delete = mock_delete
    mock_async_client.__aenter__ = AsyncMock(return_value=mock_async_client)
    mock_async_client.__aexit__ = AsyncMock(return_value=False)
    with patch("httpx.AsyncClient", return_value=mock_async_client):
        client = CFOrchClient("http://orch:7700")
        async with client.allocate_async("vllm", model_candidates=["m"]) as alloc:
            assert alloc.url == "http://n:8000"
            assert alloc.allocation_id == "a1"
    # Exiting the async context must release the allocation exactly once.
    mock_delete.assert_called_once()

View file

@ -1,132 +0,0 @@
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from fastapi.testclient import TestClient
from circuitforge_core.resources.coordinator.app import create_coordinator_app
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
from circuitforge_core.resources.models import GpuInfo, NodeInfo
def _make_supervisor_mock(online: bool = True):
    """Build a MagicMock supervisor exposing one 'heimdall' agent record."""
    supervisor = MagicMock()
    rec = AgentRecord(node_id="heimdall", agent_url="http://heimdall:7701")
    rec.gpus = [GpuInfo(0, "RTX 4000", 8192, 0, 8192)]
    rec.online = online
    supervisor.online_agents.return_value = {"heimdall": rec} if online else {}
    supervisor.get_node_info.return_value = NodeInfo(
        node_id="heimdall",
        agent_url="http://heimdall:7701",
        gpus=rec.gpus,
        last_heartbeat=0.0,
    )
    return supervisor
@pytest.fixture
def alloc_client():
    """TestClient over a coordinator app with fresh registries and a mocked supervisor."""
    lm = LeaseManager()
    pr = ProfileRegistry()
    sup = _make_supervisor_mock()
    sr = ServiceRegistry()
    app = create_coordinator_app(lease_manager=lm, profile_registry=pr, agent_supervisor=sup, service_registry=sr)
    return TestClient(app), sup, sr


def test_allocate_returns_allocation_id_and_url(alloc_client):
    """A successful allocation returns an id plus the node's service URL."""
    client, sup, sr = alloc_client
    # The coordinator talks to the agent over httpx; fake a healthy agent reply.
    with patch("httpx.AsyncClient") as mock_http:
        mock_resp = MagicMock()
        mock_resp.is_success = True
        mock_resp.json.return_value = {"running": True, "url": "http://heimdall:8000"}
        mock_http.return_value.__aenter__.return_value.post = AsyncMock(return_value=mock_resp)
        resp = client.post("/api/services/vllm/allocate", json={
            "model_candidates": ["Ouro-1.4B"],
            "ttl_s": 300.0,
            "caller": "test",
        })
    assert resp.status_code == 200
    data = resp.json()
    assert "allocation_id" in data
    assert data["service"] == "vllm"
    assert data["node_id"] == "heimdall"
    assert data["url"] == "http://heimdall:8000"
def test_allocate_returns_503_when_no_online_nodes(alloc_client):
    """With zero online agents, allocation is refused with 503."""
    client, sup, _sr = alloc_client
    sup.online_agents.return_value = {}
    resp = client.post("/api/services/vllm/allocate", json={"model_candidates": ["Ouro-1.4B"]})
    assert resp.status_code == 503


def test_allocate_returns_422_for_empty_candidates(alloc_client):
    """An empty model_candidates list is rejected as unprocessable."""
    client, _sup, _sr = alloc_client
    payload = {"model_candidates": []}
    resp = client.post("/api/services/vllm/allocate", json=payload)
    assert resp.status_code == 422


def test_allocate_returns_422_for_unknown_service(alloc_client):
    """A service name outside the registry is rejected as unprocessable."""
    client, _sup, _sr = alloc_client
    payload = {"model_candidates": ["x"]}
    resp = client.post("/api/services/cf-made-up/allocate", json=payload)
    assert resp.status_code == 422
def test_allocate_records_in_registry(alloc_client):
    """An allocation shows up in the service's /status allocation list."""
    client, sup, sr = alloc_client
    with patch("httpx.AsyncClient") as mock_http:
        mock_resp = MagicMock()
        mock_resp.is_success = True
        mock_resp.json.return_value = {"running": True, "url": "http://heimdall:8000"}
        mock_http.return_value.__aenter__.return_value.post = AsyncMock(return_value=mock_resp)
        resp = client.post("/api/services/vllm/allocate", json={
            "model_candidates": ["Ouro-1.4B"],
            "ttl_s": 300.0,
            "caller": "test",
        })
    assert resp.status_code == 200
    allocation_id = resp.json()["allocation_id"]
    status_resp = client.get("/api/services/vllm/status")
    assert status_resp.status_code == 200
    status_data = status_resp.json()
    assert status_data["service"] == "vllm"
    alloc_ids = [a["allocation_id"] for a in status_data["allocations"]]
    assert allocation_id in alloc_ids


def test_release_allocation(alloc_client):
    """DELETE on a granted allocation releases it and removes it from /status."""
    client, sup, sr = alloc_client
    with patch("httpx.AsyncClient") as mock_http:
        mock_resp = MagicMock()
        mock_resp.is_success = True
        mock_resp.json.return_value = {"running": True, "url": "http://heimdall:8000"}
        mock_http.return_value.__aenter__.return_value.post = AsyncMock(return_value=mock_resp)
        resp = client.post("/api/services/vllm/allocate", json={
            "model_candidates": ["Ouro-1.4B"],
            "ttl_s": 300.0,
            "caller": "test",
        })
    assert resp.status_code == 200
    allocation_id = resp.json()["allocation_id"]
    del_resp = client.delete(f"/api/services/vllm/allocations/{allocation_id}")
    assert del_resp.status_code == 200
    assert del_resp.json() == {"released": True, "allocation_id": allocation_id}
    status_resp = client.get("/api/services/vllm/status")
    alloc_ids = [a["allocation_id"] for a in status_resp.json()["allocations"]]
    assert allocation_id not in alloc_ids
def test_release_allocation_not_found(alloc_client):
    """Releasing an unknown allocation id yields 404."""
    client, _sup, _sr = alloc_client
    assert client.delete("/api/services/vllm/allocations/bad-id").status_code == 404

View file

@ -1,183 +0,0 @@
import pytest
from unittest.mock import MagicMock
from pathlib import Path
from fastapi.testclient import TestClient
from circuitforge_core.resources.coordinator.app import create_coordinator_app
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
from circuitforge_core.resources.models import GpuInfo, NodeInfo
from circuitforge_core.resources.profiles.schema import load_profile
@pytest.fixture
def coordinator_client():
    """TestClient over a coordinator app with one registered 8 GiB GPU node."""
    lease_manager = LeaseManager()
    lease_manager.register_gpu("heimdall", 0, 8192)
    profile_registry = ProfileRegistry()
    supervisor = MagicMock()
    supervisor.all_nodes.return_value = [
        NodeInfo(
            node_id="heimdall",
            agent_url="http://localhost:7701",
            gpus=[GpuInfo(gpu_id=0, name="RTX 4000",
                          vram_total_mb=8192, vram_used_mb=0, vram_free_mb=8192)],
            last_heartbeat=0.0,
        )
    ]
    supervisor.get_node_info.return_value = NodeInfo(
        node_id="heimdall",
        agent_url="http://localhost:7701",
        gpus=[],
        last_heartbeat=0.0,
    )
    app = create_coordinator_app(
        lease_manager=lease_manager,
        profile_registry=profile_registry,
        agent_supervisor=supervisor,
        service_registry=ServiceRegistry(),
    )
    return TestClient(app), lease_manager
def test_health_returns_ok(coordinator_client):
    """GET /api/health reports status ok."""
    client, _lm = coordinator_client
    resp = client.get("/api/health")
    assert resp.status_code == 200
    assert resp.json()["status"] == "ok"


def test_get_nodes_returns_list(coordinator_client):
    """GET /api/nodes lists the single registered node."""
    client, _lm = coordinator_client
    resp = client.get("/api/nodes")
    assert resp.status_code == 200
    node_list = resp.json()["nodes"]
    assert len(node_list) == 1
    assert node_list[0]["node_id"] == "heimdall"


def test_get_profiles_returns_public_profiles(coordinator_client):
    """GET /api/profiles includes the bundled single-gpu-8gb profile."""
    client, _lm = coordinator_client
    resp = client.get("/api/profiles")
    assert resp.status_code == 200
    profile_names = [p["name"] for p in resp.json()["profiles"]]
    assert "single-gpu-8gb" in profile_names
def test_post_lease_grants_lease(coordinator_client):
    """POST /api/leases grants the requested MB to the requesting service."""
    client, _ = coordinator_client
    resp = client.post("/api/leases", json={
        "node_id": "heimdall", "gpu_id": 0,
        "mb": 2048, "service": "peregrine", "priority": 1,
    })
    assert resp.status_code == 200
    data = resp.json()
    assert data["lease"]["mb_granted"] == 2048
    assert data["lease"]["holder_service"] == "peregrine"
    assert "lease_id" in data["lease"]


def test_delete_lease_releases_it(coordinator_client):
    """DELETE on a granted lease id reports released=True."""
    client, _ = coordinator_client
    resp = client.post("/api/leases", json={
        "node_id": "heimdall", "gpu_id": 0,
        "mb": 2048, "service": "peregrine", "priority": 1,
    })
    lease_id = resp.json()["lease"]["lease_id"]
    del_resp = client.delete(f"/api/leases/{lease_id}")
    assert del_resp.status_code == 200
    assert del_resp.json()["released"] is True
def test_delete_unknown_lease_returns_404(coordinator_client):
    """Releasing a lease id that was never granted yields 404."""
    client, _lm = coordinator_client
    assert client.delete("/api/leases/nonexistent-id").status_code == 404


def test_get_leases_returns_active_leases(coordinator_client):
    """GET /api/leases reflects a lease granted via POST."""
    client, _lm = coordinator_client
    client.post("/api/leases", json={
        "node_id": "heimdall", "gpu_id": 0,
        "mb": 1024, "service": "kiwi", "priority": 2,
    })
    listing = client.get("/api/leases")
    assert listing.status_code == 200
    assert len(listing.json()["leases"]) == 1
def test_dashboard_serves_html(coordinator_client):
    """GET / returns the dashboard HTML page."""
    client, _ = coordinator_client
    resp = client.get("/")
    assert resp.status_code == 200
    assert "text/html" in resp.headers["content-type"]
    # Verify key structural markers are present (without asserting exact markup)
    assert "cf-orch" in resp.text
    assert "/api/nodes" in resp.text
    assert "/api/leases" in resp.text


def test_online_agents_excludes_offline():
    """online_agents() filters out agents whose online flag is False."""
    lm = LeaseManager()
    sup = AgentSupervisor(lm)
    sup.register("online_node", "http://a:7701")
    sup.register("offline_node", "http://b:7701")
    # Reach into the private map to force the online flags deterministically.
    sup._agents["online_node"].online = True
    sup._agents["offline_node"].online = False
    result = sup.online_agents()
    assert "online_node" in result
    assert "offline_node" not in result
def test_resident_keys_returns_set_of_node_service():
    """resident_keys() flattens residents into 'node:service' strings."""
    manager = LeaseManager()
    residents = [("vllm", "Ouro-1.4B"), ("ollama", None)]
    manager.set_residents_for_node("heimdall", residents)
    assert manager.resident_keys() == {"heimdall:vllm", "heimdall:ollama"}
def test_single_gpu_8gb_profile_has_idle_stop_after_s():
    """The bundled 8 GB profile configures a 600 s idle stop for vllm."""
    # NOTE(review): relative path — assumes pytest runs from the repo root;
    # confirm, or resolve against __file__ instead.
    profile = load_profile(
        Path("circuitforge_core/resources/profiles/public/single-gpu-8gb.yaml")
    )
    vllm_svc = profile.services.get("vllm")
    assert vllm_svc is not None
    assert hasattr(vllm_svc, "idle_stop_after_s")
    assert vllm_svc.idle_stop_after_s == 600


def test_ensure_service_returns_503_when_vram_too_low():
    """VRAM pre-flight guard fires before any HTTP request when free VRAM < service max_mb."""
    # Threshold = full max_mb (not half); 100 MB free on any profile triggers 503.
    lease_manager = LeaseManager()
    lease_manager.register_gpu("low-vram-node", 0, 512)
    profile_registry = ProfileRegistry()
    supervisor = MagicMock()
    supervisor.get_node_info.return_value = NodeInfo(
        node_id="low-vram-node",
        agent_url="http://localhost:7701",
        gpus=[GpuInfo(gpu_id=0, name="GTX 1050",
                      vram_total_mb=512, vram_used_mb=412, vram_free_mb=100)],
        last_heartbeat=0.0,
    )
    supervisor.all_nodes.return_value = []
    app = create_coordinator_app(
        lease_manager=lease_manager,
        profile_registry=profile_registry,
        agent_supervisor=supervisor,
        service_registry=ServiceRegistry(),
    )
    client = TestClient(app)
    resp = client.post("/api/services/vllm/ensure", json={
        "node_id": "low-vram-node",
        "gpu_id": 0,
        "params": {"model": "some-model"},
    })
    assert resp.status_code == 503
    assert "Insufficient VRAM" in resp.json()["detail"]
    # Guard must fire before any agent HTTP call is attempted.
    supervisor.get_node_info.assert_called_once_with("low-vram-node")

View file

@ -1,148 +0,0 @@
"""Tests for HeimdallAuthMiddleware — TTL cache and request gating."""
import time
import pytest
from unittest.mock import patch, MagicMock
from fastapi import FastAPI
from fastapi.testclient import TestClient
from circuitforge_core.resources.coordinator.auth import (
HeimdallAuthMiddleware,
_ValidationCache,
CACHE_TTL_S,
)
# ── Cache unit tests ──────────────────────────────────────────────────────────
def test_cache_miss_returns_none():
    """Unknown keys read back as None."""
    cache = _ValidationCache()
    assert cache.get("nonexistent") is None


def test_cache_stores_and_retrieves():
    """A stored entry is returned with its validity and tier intact."""
    cache = _ValidationCache()
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    hit = cache.get("key1")
    assert hit is not None
    assert hit.valid is True
    assert hit.tier == "paid"


def test_cache_entry_expires():
    """Entries older than the TTL read back as None."""
    cache = _ValidationCache(ttl_s=0.05)
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    time.sleep(0.1)
    assert cache.get("key1") is None


def test_cache_evict_removes_key():
    """evict() removes the entry immediately."""
    cache = _ValidationCache()
    cache.set("key1", valid=True, tier="paid", user_id="u1")
    cache.evict("key1")
    assert cache.get("key1") is None


def test_cache_prune_removes_expired():
    """prune() drops every expired entry and reports the count removed."""
    cache = _ValidationCache(ttl_s=0.05)
    cache.set("k1", valid=True, tier="paid", user_id="")
    cache.set("k2", valid=True, tier="paid", user_id="")
    time.sleep(0.1)
    assert cache.prune() == 2
# ── Middleware integration tests ──────────────────────────────────────────────
def _make_app_with_auth(middleware: HeimdallAuthMiddleware) -> TestClient:
    """Build a minimal app guarded by *middleware*: one exempt route, one gated."""
    app = FastAPI()
    app.middleware("http")(middleware)

    @app.get("/api/health")
    def health():
        return {"status": "ok"}

    @app.post("/api/services/vllm/allocate")
    def allocate():
        return {"allocation_id": "abc", "url": "http://gpu:8000"}

    return TestClient(app, raise_server_exceptions=False)


def _patched_middleware(valid: bool, tier: str = "paid") -> HeimdallAuthMiddleware:
    """Return a middleware whose Heimdall call is pre-mocked."""
    mw = HeimdallAuthMiddleware(
        heimdall_url="http://heimdall.test",
        min_tier="paid",
    )
    # Replace the network validation hook so no real Heimdall is contacted.
    mw._validate_against_heimdall = MagicMock(  # type: ignore[method-assign]
        return_value=(valid, tier, "user-1" if valid else "")
    )
    return mw
def test_health_exempt_no_auth_required():
    """/api/health bypasses authentication entirely."""
    client = _make_app_with_auth(_patched_middleware(valid=True))
    assert client.get("/api/health").status_code == 200


def test_missing_auth_header_returns_401():
    """No Authorization header on a gated route yields 401."""
    client = _make_app_with_auth(_patched_middleware(valid=True))
    assert client.post("/api/services/vllm/allocate").status_code == 401


def test_invalid_key_returns_403():
    """A key Heimdall rejects yields 403."""
    client = _make_app_with_auth(_patched_middleware(valid=False))
    resp = client.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer BAD-KEY"},
    )
    assert resp.status_code == 403


def test_valid_paid_key_passes():
    """A valid paid-tier key is allowed through."""
    client = _make_app_with_auth(_patched_middleware(valid=True, tier="paid"))
    resp = client.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer CFG-KIWI-GOOD-GOOD-GOOD"},
    )
    assert resp.status_code == 200


def test_free_tier_key_rejected_when_min_is_paid():
    """A valid but free-tier key fails the paid minimum-tier gate."""
    client = _make_app_with_auth(_patched_middleware(valid=True, tier="free"))
    resp = client.post(
        "/api/services/vllm/allocate",
        headers={"Authorization": "Bearer CFG-KIWI-FREE-FREE-FREE"},
    )
    assert resp.status_code == 403
    assert "paid" in resp.json()["detail"]
def test_cache_prevents_second_heimdall_call():
    """A repeated key is served from the TTL cache, not re-validated upstream."""
    mw = _patched_middleware(valid=True, tier="paid")
    client = _make_app_with_auth(mw)
    key = "CFG-KIWI-CACHED-KEY-1"
    headers = {"Authorization": f"Bearer {key}"}
    client.post("/api/services/vllm/allocate", headers=headers)
    client.post("/api/services/vllm/allocate", headers=headers)
    # Heimdall should only have been called once — second hit is from cache
    assert mw._validate_against_heimdall.call_count == 1  # type: ignore[attr-defined]
def test_from_env_returns_none_without_heimdall_url(monkeypatch):
    """Without HEIMDALL_URL set, from_env() returns None (auth disabled)."""
    monkeypatch.delenv("HEIMDALL_URL", raising=False)
    assert HeimdallAuthMiddleware.from_env() is None


def test_from_env_returns_middleware_when_set(monkeypatch):
    """With HEIMDALL_URL set, from_env() builds a middleware bound to it."""
    monkeypatch.setenv("HEIMDALL_URL", "http://heimdall.test")
    middleware = HeimdallAuthMiddleware.from_env()
    assert middleware is not None
    assert middleware._heimdall == "http://heimdall.test"

View file

@ -1,215 +0,0 @@
# tests/test_resources/test_coordinator_probe.py
"""
Unit tests for _run_instance_probe_loop in coordinator/app.py.
Covers:
- healthy path: /health 200 state transitions starting running
- timeout path: no healthy response within _PROBE_TIMEOUT_S starting stopped
- cleanup path: non-starting instance cleans up its start_times entry
"""
from __future__ import annotations
import asyncio
from unittest.mock import MagicMock, patch
import pytest
from circuitforge_core.resources.coordinator.app import (
_PROBE_TIMEOUT_S,
_run_instance_probe_loop,
)
from circuitforge_core.resources.coordinator.service_registry import ServiceInstance, ServiceRegistry
# ── helpers ──────────────────────────────────────────────────────────────────
def _inst(**kwargs) -> ServiceInstance:
    """Build a ServiceInstance with overridable defaults."""
    base = dict(
        service="vllm", node_id="node1", gpu_id=0,
        state="starting", model="qwen", url="http://localhost:8000",
    )
    return ServiceInstance(**{**base, **kwargs})
def _registry(*instances: ServiceInstance) -> MagicMock:
    """Mock ServiceRegistry whose all_instances() yields the given instances."""
    registry = MagicMock(spec=ServiceRegistry)
    registry.all_instances.return_value = [*instances]
    return registry
def _health_resp(status: int = 200) -> MagicMock:
"""Context-manager mock that simulates an HTTP response."""
resp = MagicMock()
resp.status = status
resp.__enter__ = lambda s: resp
resp.__exit__ = MagicMock(return_value=False)
return resp
async def _one_tick(coro_fn, registry, *, time_val: float = 1000.0, **url_patch):
    """
    Run the probe loop for exactly one iteration then cancel it.
    asyncio.sleep is patched to return immediately on the first call
    and raise CancelledError on the second (ending the loop cleanly).

    NOTE(review): this helper appears unused — the tests below re-implement
    the tick pattern inline. Confirm before deleting.
    """
    calls = 0
    async def _fake_sleep(_delay):
        nonlocal calls
        calls += 1
        if calls > 1:
            raise asyncio.CancelledError()
    # Patches are entered manually because the set varies per call.
    patches = [
        patch("asyncio.sleep", new=_fake_sleep),
        patch("time.time", return_value=time_val),
    ]
    if url_patch:
        patches.append(patch("urllib.request.urlopen", **url_patch))
    ctx = [p.__enter__() for p in patches]
    try:
        await coro_fn(registry)
    except asyncio.CancelledError:
        pass
    finally:
        # Exit in reverse order, mirroring nested `with` semantics.
        for p in reversed(patches):
            p.__exit__(None, None, None)
# ── tests ────────────────────────────────────────────────────────────────────
@pytest.mark.asyncio
async def test_probe_transitions_starting_to_running():
    """GET /health → 200 while in starting state → upsert_instance(state='running')."""
    reg = _registry(_inst(state="starting", url="http://localhost:8000"))
    calls = 0
    # First sleep lets one probe iteration run; second cancels the loop.
    async def fake_sleep(_delay):
        nonlocal calls
        calls += 1
        if calls > 1:
            raise asyncio.CancelledError()
    with patch("asyncio.sleep", new=fake_sleep), \
         patch("time.time", return_value=1000.0), \
         patch("urllib.request.urlopen", return_value=_health_resp(200)):
        try:
            await _run_instance_probe_loop(reg)
        except asyncio.CancelledError:
            pass
    reg.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="running", model="qwen", url="http://localhost:8000",
    )


@pytest.mark.asyncio
async def test_probe_transitions_starting_to_stopped_on_timeout():
    """No healthy response + time past _PROBE_TIMEOUT_S → upsert_instance(state='stopped').

    Tick 1: seeds start_times[key] = 1000.0
    Tick 2: time has advanced past _PROBE_TIMEOUT_S → timeout fires → stopped
    Tick 3: CancelledError exits the loop
    """
    reg = _registry(_inst(state="starting", url="http://localhost:8000"))
    tick = 0
    # Tick 1: t=1000 (seed); Tick 2: t=far_future (timeout fires)
    times = [1000.0, 1000.0 + _PROBE_TIMEOUT_S + 1.0]
    async def fake_sleep(_delay):
        nonlocal tick
        tick += 1
        if tick > 2:
            raise asyncio.CancelledError()
    with patch("asyncio.sleep", new=fake_sleep), \
         patch("time.time", side_effect=times * 10), \
         patch("urllib.request.urlopen", side_effect=OSError("connection refused")):
        try:
            await _run_instance_probe_loop(reg)
        except asyncio.CancelledError:
            pass
    reg.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="stopped", model="qwen", url="http://localhost:8000",
    )


@pytest.mark.asyncio
async def test_probe_cleans_up_start_times_for_non_starting():
    """
    An instance that is no longer in 'starting' state should not cause
    upsert_instance to be called, and its key should be removed from start_times.
    We verify this indirectly: run two ticks — first with state='starting' (seeds
    the key and transitions to running), second with the updated registry returning
    state='running' (should not call upsert again).
    """
    starting_inst = _inst(state="starting", url="http://localhost:8000")
    running_inst = _inst(state="running", url="http://localhost:8000")
    tick = 0
    # First tick: instance is starting → transitions to running
    # Second tick: registry now returns running → no upsert
    # Third tick: cancel
    def instances_side_effect():
        if tick <= 1:
            return [starting_inst]
        return [running_inst]
    reg = MagicMock(spec=ServiceRegistry)
    reg.all_instances.side_effect = instances_side_effect
    async def fake_sleep(_delay):
        nonlocal tick
        tick += 1
        if tick > 2:
            raise asyncio.CancelledError()
    with patch("asyncio.sleep", new=fake_sleep), \
         patch("time.time", return_value=1000.0), \
         patch("urllib.request.urlopen", return_value=_health_resp(200)):
        try:
            await _run_instance_probe_loop(reg)
        except asyncio.CancelledError:
            pass
    # upsert should have been called exactly once (the starting→running transition)
    assert reg.upsert_instance.call_count == 1
    reg.upsert_instance.assert_called_once_with(
        service="vllm", node_id="node1", gpu_id=0,
        state="running", model="qwen", url="http://localhost:8000",
    )


@pytest.mark.asyncio
async def test_probe_no_url_does_not_attempt_health_check():
    """Instance with no URL stays in starting state (no health check, no timeout yet)."""
    reg = _registry(_inst(state="starting", url=None))
    tick = 0
    async def fake_sleep(_delay):
        nonlocal tick
        tick += 1
        if tick > 1:
            raise asyncio.CancelledError()
    with patch("asyncio.sleep", new=fake_sleep), \
         patch("time.time", return_value=1000.0), \
         patch("urllib.request.urlopen") as mock_urlopen:
        try:
            await _run_instance_probe_loop(reg)
        except asyncio.CancelledError:
            pass
    mock_urlopen.assert_not_called()
    reg.upsert_instance.assert_not_called()

View file

@ -1,215 +0,0 @@
# tests/test_resources/test_docuvision.py
"""
Unit tests for cf-docuvision FastAPI service (circuitforge_core/resources/docuvision/app.py).
Covers:
- GET /health status + model path
- POST /extract image_b64, image_path, hint routing, metadata fields
- _parse_dolphin_output JSON list path, table detection, plain-text fallback
- _image_from_request missing both fields 422; bad image_path 404
"""
from __future__ import annotations
import base64
import io
import json
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from fastapi.testclient import TestClient
from PIL import Image
import circuitforge_core.resources.docuvision.app as docuvision_module
from circuitforge_core.resources.docuvision.app import (
_parse_dolphin_output,
app,
)
# ── fixtures ──────────────────────────────────────────────────────────────────
def _make_jpeg_b64(width: int = 10, height: int = 10) -> str:
    """Return a base64-encoded 10x10 white JPEG."""
    buffer = io.BytesIO()
    Image.new("RGB", (width, height), color=(255, 255, 255)).save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode()
@pytest.fixture(autouse=True)
def _reset_module_state():
    """Reset module-level model state between tests."""
    docuvision_module._model = None
    docuvision_module._processor = None
    docuvision_module._model_path = "/fake/model"
    docuvision_module._device = "cpu"
    yield
    # Tear down so a later test cannot see this test's fakes.
    docuvision_module._model = None
    docuvision_module._processor = None


@pytest.fixture
def mock_model():
    """
    Inject fake model + processor into the module so _load_model() is skipped.
    The processor returns a dict-like with 'input_ids'; the model generate()
    returns a tensor-like whose decode produces a JSON string.
    """
    fake_ids = MagicMock()
    fake_ids.shape = [1, 5]  # input_len = 5
    fake_inputs = {"input_ids": fake_ids}
    fake_inputs_obj = MagicMock()
    fake_inputs_obj.__getitem__ = lambda self, k: fake_inputs[k]
    fake_inputs_obj.to = lambda device: fake_inputs_obj
    fake_output = MagicMock()
    fake_output.__getitem__ = lambda self, idx: MagicMock()  # output_ids[0]
    fake_model = MagicMock()
    fake_model.generate.return_value = fake_output
    fake_processor = MagicMock()
    fake_processor.return_value = fake_inputs_obj
    # Decoded output: one heading plus one table element (exercises both paths).
    fake_processor.decode.return_value = json.dumps([
        {"type": "heading", "text": "Invoice", "bbox": [0.0, 0.0, 1.0, 0.1]},
        {"type": "table", "text": "row1", "html": "<table><tr><td>row1</td></tr></table>",
         "bbox": [0.0, 0.1, 1.0, 0.5]},
    ])
    docuvision_module._model = fake_model
    docuvision_module._processor = fake_processor
    return fake_model, fake_processor
@pytest.fixture
def client():
    """TestClient over the docuvision FastAPI app."""
    return TestClient(app)


# ── health ────────────────────────────────────────────────────────────────────
def test_health_returns_ok(client):
    """GET /health reports ok plus the configured model path."""
    resp = client.get("/health")
    assert resp.status_code == 200
    payload = resp.json()
    assert payload["status"] == "ok"
    assert payload["model"] == "/fake/model"
# ── _parse_dolphin_output ────────────────────────────────────────────────────
def test_parse_json_list_elements():
    """A JSON list of typed elements maps 1:1 onto parsed elements."""
    raw = json.dumps([
        {"type": "heading", "text": "Title"},
        {"type": "paragraph", "text": "Body text"},
    ])
    elements, tables, raw_text = _parse_dolphin_output(raw)
    assert len(elements) == 2
    assert elements[0].type == "heading"
    assert elements[0].text == "Title"
    assert elements[1].type == "paragraph"
    # raw_text is the newline-joined text of all elements.
    assert raw_text == "Title\nBody text"
    assert tables == []


def test_parse_json_table_extracted():
    """Table elements surface in both the table list and the element list."""
    raw = json.dumps([
        {"type": "table", "text": "row", "html": "<table><tr><td>A</td></tr></table>",
         "bbox": [0.0, 0.0, 1.0, 0.5]},
    ])
    elements, tables, raw_text = _parse_dolphin_output(raw)
    assert len(tables) == 1
    assert tables[0].html == "<table><tr><td>A</td></tr></table>"
    assert tables[0].bbox == [0.0, 0.0, 1.0, 0.5]
    assert len(elements) == 1
    assert elements[0].type == "table"
def test_parse_plain_text_fallback():
    """Non-JSON model output becomes a single paragraph element."""
    raw = "This is not JSON at all."
    elements, tables, raw_text = _parse_dolphin_output(raw)
    assert len(elements) == 1
    assert elements[0].type == "paragraph"
    assert elements[0].text == raw
    assert tables == []
    assert raw_text == raw


def test_parse_empty_string_fallback():
    """Empty output still yields one (empty) paragraph element."""
    elements, _tables, _raw = _parse_dolphin_output("")
    assert len(elements) == 1
    assert elements[0].type == "paragraph"
    assert elements[0].text == ""


def test_parse_json_missing_type_defaults_to_paragraph():
    """Elements lacking a 'type' field default to paragraph."""
    payload = json.dumps([{"text": "no type field"}])
    elements, _tables, _ = _parse_dolphin_output(payload)
    assert elements[0].type == "paragraph"
# ── POST /extract ─────────────────────────────────────────────────────────────
def test_extract_image_b64(client, mock_model):
    """POST /extract with a base64 image returns elements, tables and metadata."""
    resp = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "auto"})
    assert resp.status_code == 200
    data = resp.json()
    assert "elements" in data
    assert "raw_text" in data
    assert "tables" in data
    assert data["metadata"]["hint"] == "auto"
    assert data["metadata"]["model"] == "/fake/model"
    assert data["metadata"]["width"] == 10
    assert data["metadata"]["height"] == 10


def test_extract_hint_table_routes_correct_prompt(client, mock_model):
    """hint='table' routes extraction to the table-specific prompt."""
    _, fake_processor = mock_model
    resp = client.post("/extract", json={"image_b64": _make_jpeg_b64(), "hint": "table"})
    assert resp.status_code == 200
    # Verify processor was called with the table-specific prompt
    call_kwargs = fake_processor.call_args
    assert "table" in call_kwargs.kwargs.get("text", "") or \
        "table" in str(call_kwargs)
def test_extract_hint_unknown_falls_back_to_auto(client, mock_model):
    """An unrecognised hint silently falls back to the 'auto' prompt."""
    payload = {"image_b64": _make_jpeg_b64(), "hint": "nonsense"}
    assert client.post("/extract", json=payload).status_code == 200


def test_extract_image_path(tmp_path, client, mock_model):
    """Extraction also works from an on-disk image path."""
    img_file = tmp_path / "doc.png"
    Image.new("RGB", (8, 8), color=(0, 0, 0)).save(img_file)
    resp = client.post("/extract", json={"image_path": str(img_file)})
    assert resp.status_code == 200
    assert resp.json()["metadata"]["width"] == 8


def test_extract_image_path_not_found(client, mock_model):
    """A missing image_path yields 404."""
    payload = {"image_path": "/nonexistent/path/img.png"}
    assert client.post("/extract", json=payload).status_code == 404


def test_extract_no_image_raises_422(client, mock_model):
    """Omitting both image_b64 and image_path yields 422."""
    assert client.post("/extract", json={"hint": "auto"}).status_code == 422
def test_extract_response_includes_tables(client, mock_model):
    """Verify table objects surface in response when model returns table elements."""
    resp = client.post("/extract", json={"image_b64": _make_jpeg_b64()})
    assert resp.status_code == 200
    data = resp.json()
    assert len(data["tables"]) == 1
    assert "<table>" in data["tables"][0]["html"]


def test_extract_device_in_metadata(client, mock_model):
    """Response metadata reports which device ran inference."""
    resp = client.post("/extract", json={"image_b64": _make_jpeg_b64()})
    assert resp.status_code == 200
    assert "device" in resp.json()["metadata"]

View file

@ -1,67 +0,0 @@
import asyncio
import pytest
from unittest.mock import AsyncMock, patch
from circuitforge_core.resources.coordinator.eviction_engine import EvictionEngine
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
@pytest.fixture
def lease_manager():
    """LeaseManager with one 8 GiB GPU registered on 'heimdall'."""
    manager = LeaseManager()
    manager.register_gpu("heimdall", 0, 8192)
    return manager


@pytest.fixture
def engine(lease_manager):
    """EvictionEngine over the fixture LeaseManager with a short timeout."""
    return EvictionEngine(lease_manager=lease_manager, eviction_timeout_s=0.1)
@pytest.mark.asyncio
async def test_request_lease_grants_when_vram_available(engine, lease_manager):
    """With free VRAM available, request_lease grants immediately at full size."""
    lease = await engine.request_lease(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="peregrine", priority=1,
        agent_url="http://localhost:7701",
    )
    assert lease is not None
    assert lease.mb_granted == 4096
@pytest.mark.asyncio
async def test_request_lease_evicts_and_grants(engine, lease_manager):
    """A higher-priority request triggers eviction of a low-priority lease, then grants."""
    # Pre-fill with a low-priority lease occupying most of the GPU
    big_lease = await lease_manager.try_grant(
        "heimdall", 0, 7000, "comfyui", priority=4
    )
    assert big_lease is not None
    # Mock the agent eviction call
    with patch(
        "circuitforge_core.resources.coordinator.eviction_engine.EvictionEngine._call_agent_evict",
        new_callable=AsyncMock,
    ) as mock_evict:
        mock_evict.return_value = True
        # Simulate the comfyui lease being released (as if the agent evicted it).
        # Fix: use get_running_loop() — calling get_event_loop() from inside a
        # coroutine is deprecated since Python 3.10 and will warn/fail on
        # newer interpreters; inside a running loop both return the same loop.
        asyncio.get_running_loop().call_later(
            0.05, lambda: asyncio.ensure_future(lease_manager.release(big_lease.lease_id))
        )
        lease = await engine.request_lease(
            node_id="heimdall", gpu_id=0, mb=4096,
            service="peregrine", priority=1,
            agent_url="http://localhost:7701",
        )
    assert lease is not None
    assert lease.holder_service == "peregrine"
@pytest.mark.asyncio
async def test_request_lease_returns_none_when_no_eviction_candidates(engine):
    """A request that cannot be satisfied by evicting lower priority returns None."""
    await engine.lease_manager.try_grant("heimdall", 0, 6000, "vllm", priority=1)
    # Requesting 4GB but no lower-priority leases exist
    lease = await engine.request_lease(
        node_id="heimdall", gpu_id=0, mb=4096,
        service="kiwi", priority=2,
        agent_url="http://localhost:7701",
    )
    assert lease is None

View file

@ -1,43 +0,0 @@
import signal
from unittest.mock import patch, call
import pytest
from circuitforge_core.resources.agent.eviction_executor import EvictionExecutor, EvictionResult
def test_evict_by_pid_sends_sigterm_then_sigkill():
    """If the process survives the grace period, SIGTERM is escalated to SIGKILL."""
    executor = EvictionExecutor(grace_period_s=0.01)
    # pid_exists always True → grace period expires → SIGKILL fires
    with patch("os.kill") as mock_kill, \
         patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        mock_psutil.pid_exists.return_value = True
        result = executor.evict_pid(pid=1234, grace_period_s=0.01)
        assert result.success is True
        calls = mock_kill.call_args_list
        # Both signals must have been delivered to the target PID.
        assert call(1234, signal.SIGTERM) in calls
        assert call(1234, signal.SIGKILL) in calls
def test_evict_pid_succeeds_on_sigterm_alone():
    """If the process exits during the grace period, no SIGKILL is needed."""
    executor = EvictionExecutor(grace_period_s=0.1)
    with patch("os.kill"), \
         patch("circuitforge_core.resources.agent.eviction_executor.psutil") as mock_psutil:
        mock_psutil.pid_exists.side_effect = [True, False]  # gone after SIGTERM
        result = executor.evict_pid(pid=5678, grace_period_s=0.01)
        assert result.success is True
        assert result.method == "sigterm"
def test_evict_pid_not_found_returns_failure():
    """Evicting a PID that does not exist fails with a 'not found' message."""
    executor = EvictionExecutor()
    psutil_target = "circuitforge_core.resources.agent.eviction_executor.psutil"
    with patch(psutil_target) as fake_psutil:
        fake_psutil.pid_exists.return_value = False
        outcome = executor.evict_pid(pid=9999)
        assert outcome.success is False
        assert "not found" in outcome.message.lower()
def test_eviction_result_is_immutable():
    """EvictionResult is frozen: attribute assignment must raise."""
    res = EvictionResult(success=True, method="sigterm", message="ok")
    with pytest.raises((AttributeError, TypeError)):
        res.success = False  # type: ignore

View file

@ -1,60 +0,0 @@
from unittest.mock import patch
from circuitforge_core.resources.agent.gpu_monitor import GpuMonitor
SAMPLE_NVIDIA_SMI_OUTPUT = (
"0, Quadro RTX 4000, 8192, 6843, 1349\n"
"1, Quadro RTX 4000, 8192, 721, 7471\n"
)
def test_parse_returns_list_of_gpu_info():
    """poll() parses nvidia-smi CSV rows into one GpuInfo per GPU."""
    monitor = GpuMonitor()
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
        mock_run.return_value.returncode = 0
        mock_run.return_value.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        gpus = monitor.poll()
        # CSV columns: index, name, total MB, used MB, free MB.
        assert len(gpus) == 2
        assert gpus[0].gpu_id == 0
        assert gpus[0].name == "Quadro RTX 4000"
        assert gpus[0].vram_total_mb == 8192
        assert gpus[0].vram_used_mb == 6843
        assert gpus[0].vram_free_mb == 1349
def test_parse_second_gpu():
    """The second CSV row maps onto the second GpuInfo entry."""
    monitor = GpuMonitor()
    run_target = "circuitforge_core.resources.agent.gpu_monitor.subprocess.run"
    with patch(run_target) as fake_run:
        fake_run.return_value.returncode = 0
        fake_run.return_value.stdout = SAMPLE_NVIDIA_SMI_OUTPUT
        second = monitor.poll()[1]
        assert second.gpu_id == 1
        assert second.vram_used_mb == 721
        assert second.vram_free_mb == 7471
def test_poll_returns_empty_list_when_nvidia_smi_unavailable():
    """poll() degrades to an empty list when nvidia-smi is not installed."""
    monitor = GpuMonitor()
    run_target = "circuitforge_core.resources.agent.gpu_monitor.subprocess.run"
    with patch(run_target, side_effect=FileNotFoundError):
        assert monitor.poll() == []
def test_poll_returns_empty_list_on_nonzero_exit():
    """A failing nvidia-smi invocation yields no GPUs rather than an error."""
    monitor = GpuMonitor()
    run_target = "circuitforge_core.resources.agent.gpu_monitor.subprocess.run"
    with patch(run_target) as fake_run:
        fake_run.return_value.returncode = 1
        fake_run.return_value.stdout = ""
        assert monitor.poll() == []
def test_poll_skips_malformed_lines():
    """Rows with non-numeric fields are dropped; valid rows still parse."""
    monitor = GpuMonitor()
    # First row has a non-integer used-MB field and must be skipped.
    malformed = "0, RTX 4000, 8192, not_a_number, 1024\n1, RTX 4000, 8192, 512, 7680\n"
    with patch("circuitforge_core.resources.agent.gpu_monitor.subprocess.run") as mock_run:
        mock_run.return_value.returncode = 0
        mock_run.return_value.stdout = malformed
        gpus = monitor.poll()
        assert len(gpus) == 1
        assert gpus[0].gpu_id == 1

View file

@ -1,221 +0,0 @@
"""Integration test: full lease → eviction → re-grant cycle.
Runs coordinator in-process (no subprocesses, no real nvidia-smi).
Uses TestClient for HTTP, mocks AgentSupervisor to return fixed node state.
"""
import pytest
from unittest.mock import MagicMock
from fastapi.testclient import TestClient
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
from circuitforge_core.resources.coordinator.agent_supervisor import AgentSupervisor
from circuitforge_core.resources.coordinator.app import create_coordinator_app
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
from circuitforge_core.resources.models import GpuInfo, NodeInfo
@pytest.fixture
def system():
    """Create an in-process coordinator system with 8GB GPU and mock supervisor."""
    lease_manager = LeaseManager()
    lease_manager.register_gpu("local", 0, 8192)
    # Supervisor is mocked so no agent process or nvidia-smi is needed.
    mock_supervisor = MagicMock(spec=AgentSupervisor)
    mock_supervisor.all_nodes.return_value = [
        NodeInfo(
            node_id="local",
            agent_url="http://localhost:7701",
            gpus=[GpuInfo(
                gpu_id=0,
                name="RTX 4000",
                vram_total_mb=8192,
                vram_used_mb=0,
                vram_free_mb=8192,
            )],
            last_heartbeat=0.0,
        )
    ]
    mock_supervisor.get_node_info.return_value = NodeInfo(
        node_id="local",
        agent_url="http://localhost:7701",
        gpus=[],
        last_heartbeat=0.0,
    )
    profile_registry = ProfileRegistry()
    app = create_coordinator_app(
        lease_manager=lease_manager,
        profile_registry=profile_registry,
        agent_supervisor=mock_supervisor,
        service_registry=ServiceRegistry(),
    )
    client = TestClient(app)
    # Tests receive the HTTP client plus the lease manager for direct inspection.
    return client, lease_manager
def test_full_lease_cycle(system):
    """Full lifecycle: POST grants, GET lists, DELETE releases, GET confirms removal."""
    client, _ = system
    # Grant a lease
    resp = client.post("/api/leases", json={
        "node_id": "local",
        "gpu_id": 0,
        "mb": 4096,
        "service": "peregrine",
        "priority": 1,
    })
    assert resp.status_code == 200
    lease_data = resp.json()["lease"]
    lease_id = lease_data["lease_id"]
    assert lease_data["mb_granted"] == 4096
    assert lease_data["holder_service"] == "peregrine"
    # Verify it appears in active leases
    resp = client.get("/api/leases")
    assert resp.status_code == 200
    leases = resp.json()["leases"]
    assert any(l["lease_id"] == lease_id for l in leases)
    # Release it
    resp = client.delete(f"/api/leases/{lease_id}")
    assert resp.status_code == 200
    assert resp.json()["released"] is True
    # Verify it's gone
    resp = client.get("/api/leases")
    assert resp.status_code == 200
    leases = resp.json()["leases"]
    assert not any(l["lease_id"] == lease_id for l in leases)
def test_vram_exhaustion_returns_503(system):
    """Test: fill GPU, then request with no eviction candidates returns 503."""
    client, _ = system
    # Fill GPU 0 with high-priority lease
    resp = client.post("/api/leases", json={
        "node_id": "local",
        "gpu_id": 0,
        "mb": 8000,
        "service": "vllm",
        "priority": 1,
    })
    assert resp.status_code == 200
    # Try to get more VRAM with same priority (no eviction candidates)
    resp = client.post("/api/leases", json={
        "node_id": "local",
        "gpu_id": 0,
        "mb": 2000,
        "service": "kiwi",
        "priority": 1,
    })
    # Coordinator signals resource exhaustion as HTTP 503 Service Unavailable.
    assert resp.status_code == 503
    assert "Insufficient VRAM" in resp.json()["detail"]
def test_auto_detect_profile_for_8gb():
    """ProfileRegistry.auto_detect maps an 8 GiB GPU to single-gpu-8gb."""
    registry = ProfileRegistry()
    eight_gb_gpu = GpuInfo(
        gpu_id=0,
        name="RTX 4000",
        vram_total_mb=8192,
        vram_used_mb=0,
        vram_free_mb=8192,
    )
    detected = registry.auto_detect([eight_gb_gpu])
    assert detected.name == "single-gpu-8gb"
    # The detected profile must carry per-service configuration.
    assert hasattr(detected, "services")
def test_node_endpoint_shows_nodes(system):
    """GET /api/nodes reflects the mocked supervisor's single node."""
    client, _ = system
    response = client.get("/api/nodes")
    assert response.status_code == 200
    nodes = response.json()["nodes"]
    assert len(nodes) == 1
    node = nodes[0]
    assert node["node_id"] == "local"
    assert node["agent_url"] == "http://localhost:7701"
    gpus = node["gpus"]
    assert len(gpus) == 1
    assert gpus[0]["name"] == "RTX 4000"
def test_profiles_endpoint_returns_public_profiles(system):
    """GET /api/profiles lists the standard public single-GPU profiles."""
    client, _ = system
    response = client.get("/api/profiles")
    assert response.status_code == 200
    names = {p["name"] for p in response.json()["profiles"]}
    # The common public tiers must all be advertised.
    assert {"single-gpu-8gb", "single-gpu-6gb", "single-gpu-2gb"} <= names
def test_multiple_leases_tracked_independently(system):
    """Test: multiple active leases are tracked correctly."""
    client, _ = system
    # Grant lease 1
    resp1 = client.post("/api/leases", json={
        "node_id": "local",
        "gpu_id": 0,
        "mb": 2048,
        "service": "peregrine",
        "priority": 2,
    })
    assert resp1.status_code == 200
    lease1_id = resp1.json()["lease"]["lease_id"]
    # Grant lease 2
    resp2 = client.post("/api/leases", json={
        "node_id": "local",
        "gpu_id": 0,
        "mb": 2048,
        "service": "kiwi",
        "priority": 2,
    })
    assert resp2.status_code == 200
    lease2_id = resp2.json()["lease"]["lease_id"]
    # Both should be in active leases
    resp = client.get("/api/leases")
    leases = resp.json()["leases"]
    lease_ids = [l["lease_id"] for l in leases]
    assert lease1_id in lease_ids
    assert lease2_id in lease_ids
    assert len(leases) == 2
    # Release lease 1
    resp = client.delete(f"/api/leases/{lease1_id}")
    assert resp.status_code == 200
    # Only lease 2 should remain
    resp = client.get("/api/leases")
    leases = resp.json()["leases"]
    lease_ids = [l["lease_id"] for l in leases]
    assert lease1_id not in lease_ids
    assert lease2_id in lease_ids
    assert len(leases) == 1
def test_delete_nonexistent_lease_returns_404(system):
    """Deleting an unknown lease id yields 404 with a 'not found' detail."""
    client, _ = system
    response = client.delete("/api/leases/nonexistent-lease-id")
    assert response.status_code == 404
    assert "not found" in response.json()["detail"]
def test_health_endpoint_returns_ok(system):
    """GET /api/health reports status ok."""
    client, _ = system
    response = client.get("/api/health")
    assert response.status_code == 200
    assert response.json()["status"] == "ok"

View file

@ -1,85 +0,0 @@
import pytest
from circuitforge_core.resources.coordinator.lease_manager import LeaseManager
@pytest.fixture
def mgr():
    """LeaseManager with one 8 GiB GPU ("heimdall", gpu 0) registered."""
    m = LeaseManager()
    m.register_gpu(node_id="heimdall", gpu_id=0, total_mb=8192)
    return m
@pytest.mark.asyncio
async def test_grant_succeeds_when_vram_available(mgr):
    """A request that fits in free VRAM is granted with matching metadata."""
    granted = await mgr.try_grant(
        node_id="heimdall",
        gpu_id=0,
        mb=4096,
        service="peregrine",
        priority=1,
    )
    assert granted is not None
    assert granted.mb_granted == 4096
    assert (granted.node_id, granted.gpu_id) == ("heimdall", 0)
@pytest.mark.asyncio
async def test_grant_fails_when_vram_insufficient(mgr):
    """A request exceeding remaining free VRAM is denied (returns None)."""
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=7000,
                        service="vllm", priority=1)
    # Only ~1192 MB remain on the 8192 MB GPU, so 2000 MB cannot fit.
    lease = await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=2000,
                                service="kiwi", priority=2)
    assert lease is None
@pytest.mark.asyncio
async def test_release_frees_vram(mgr):
    """Releasing a lease returns its VRAM so an equal-sized grant succeeds again."""
    lease = await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=7000,
                                service="vllm", priority=1)
    assert lease is not None
    released = await mgr.release(lease.lease_id)
    assert released is True
    # The same 7000 MB now fits again.
    lease2 = await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=7000,
                                 service="comfyui", priority=4)
    assert lease2 is not None
@pytest.mark.asyncio
async def test_release_unknown_lease_returns_false(mgr):
    """Releasing an id that was never granted reports False, not an error."""
    assert await mgr.release("nonexistent-id") is False
@pytest.mark.asyncio
async def test_get_eviction_candidates_returns_lower_priority_leases(mgr):
    """Only leases with worse priority than the requester are eviction candidates."""
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=3000,
                        service="comfyui", priority=4)
    await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=2000,
                        service="ollama", priority=1)
    candidates = mgr.get_eviction_candidates(
        node_id="heimdall", gpu_id=0,
        needed_mb=3000, requester_priority=2
    )
    # ollama (priority 1) outranks the requester (2) and must not be offered.
    assert len(candidates) == 1
    assert candidates[0].holder_service == "comfyui"
@pytest.mark.asyncio
async def test_list_leases_for_gpu(mgr):
    """list_leases returns every active lease on the node/GPU pair."""
    for mb, svc, prio in ((1024, "peregrine", 1), (512, "kiwi", 2)):
        await mgr.try_grant(node_id="heimdall", gpu_id=0, mb=mb,
                            service=svc, priority=prio)
    active = mgr.list_leases(node_id="heimdall", gpu_id=0)
    assert len(active) == 2
def test_register_gpu_sets_total(mgr):
    """register_gpu (done in the fixture) records the GPU's total VRAM capacity."""
    assert mgr.gpu_total_mb("heimdall", 0) == 8192
@pytest.mark.asyncio
async def test_used_mb_tracks_grants():
    """used_mb reflects the sum of all granted lease sizes."""
    manager = LeaseManager()
    manager.register_gpu("heimdall", 0, 8192)
    for mb, svc, prio in ((3000, "a", 1), (2000, "b", 2)):
        await manager.try_grant("heimdall", 0, mb, svc, prio)
    assert manager.used_mb("heimdall", 0) == 3000 + 2000

View file

@ -1,47 +0,0 @@
import time
import pytest
from circuitforge_core.resources.models import VRAMLease, GpuInfo, NodeInfo
def test_vram_lease_create_assigns_unique_ids():
    """Two leases created with identical parameters still get distinct ids."""
    kwargs = dict(gpu_id=0, node_id="heimdall", mb=4096,
                  service="peregrine", priority=1)
    first = VRAMLease.create(**kwargs)
    second = VRAMLease.create(**kwargs)
    assert first.lease_id != second.lease_id
def test_vram_lease_create_with_ttl_sets_expiry():
    """ttl_s sets expires_at to creation time + ttl, bounded by before/after timestamps."""
    before = time.time()
    lease = VRAMLease.create(gpu_id=0, node_id="heimdall", mb=2048,
                             service="kiwi", priority=2, ttl_s=60.0)
    after = time.time()
    # Creation happened between `before` and `after`, so expiry is within that window + 60s.
    assert before + 60.0 <= lease.expires_at <= after + 60.0
def test_vram_lease_create_no_ttl_has_zero_expiry():
    """Without a ttl_s, the lease never expires (expires_at stays 0.0)."""
    no_ttl_lease = VRAMLease.create(
        gpu_id=0, node_id="heimdall", mb=1024, service="snipe", priority=2
    )
    assert no_ttl_lease.expires_at == 0.0
def test_vram_lease_is_immutable():
    """VRAMLease is frozen: mutating a field must raise."""
    frozen = VRAMLease.create(
        gpu_id=0, node_id="heimdall", mb=1024, service="snipe", priority=2
    )
    with pytest.raises((AttributeError, TypeError)):
        frozen.mb_granted = 999  # type: ignore
def test_gpu_info_fields():
    """GpuInfo stores the free-VRAM figure it was constructed with."""
    gpu = GpuInfo(
        gpu_id=0,
        name="RTX 4000",
        vram_total_mb=8192,
        vram_used_mb=2048,
        vram_free_mb=6144,
    )
    assert gpu.vram_free_mb == 6144
def test_node_info_fields():
    """NodeInfo keeps its id and attached GPU list."""
    single_gpu = [GpuInfo(gpu_id=0, name="RTX 4000", vram_total_mb=8192,
                          vram_used_mb=0, vram_free_mb=8192)]
    node = NodeInfo(
        node_id="heimdall",
        agent_url="http://localhost:7701",
        gpus=single_gpu,
        last_heartbeat=time.time(),
    )
    assert node.node_id == "heimdall"
    assert len(node.gpus) == 1

View file

@ -1,82 +0,0 @@
import pytest
from circuitforge_core.resources.coordinator.node_selector import select_node
from circuitforge_core.resources.coordinator.agent_supervisor import AgentRecord
from circuitforge_core.resources.models import GpuInfo
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
def _make_agent(node_id: str, free_mb: int, online: bool = True) -> AgentRecord:
    """Build an AgentRecord with one 8 GiB GPU having `free_mb` MB free."""
    r = AgentRecord(node_id=node_id, agent_url=f"http://{node_id}:7701")
    r.gpus = [GpuInfo(gpu_id=0, name="RTX", vram_total_mb=8192,
                      vram_used_mb=8192 - free_mb, vram_free_mb=free_mb)]
    r.online = online
    return r
def test_selects_node_with_most_free_vram():
    """With no warm node, selection picks the node with the most free VRAM."""
    registry = ProfileRegistry()
    fleet = {
        "a": _make_agent("a", free_mb=2000),
        "b": _make_agent("b", free_mb=6000),
    }
    assert select_node(fleet, "vllm", registry, resident_keys=set()) == ("b", 0)
def test_prefers_warm_node_even_with_less_free_vram():
    """A node already hosting the service wins over a colder node with more free VRAM."""
    registry = ProfileRegistry()
    fleet = {
        "a": _make_agent("a", free_mb=2000),
        "b": _make_agent("b", free_mb=6000),
    }
    # "a:vllm" marks node "a" as already running vllm (warm).
    chosen = select_node(fleet, "vllm", registry, resident_keys={"a:vllm"})
    assert chosen == ("a", 0)
def test_excludes_offline_nodes():
    """Offline nodes are never selected, regardless of free VRAM."""
    registry = ProfileRegistry()
    fleet = {
        "a": _make_agent("a", free_mb=8000, online=False),
        "b": _make_agent("b", free_mb=2000, online=True),
    }
    assert select_node(fleet, "vllm", registry, resident_keys=set()) == ("b", 0)
def test_returns_none_when_no_node_has_profile_for_service():
    """An unknown service yields no placement at all."""
    fleet = {"a": _make_agent("a", free_mb=8000)}
    registry = ProfileRegistry()
    chosen = select_node(fleet, "cf-nonexistent-service", registry, resident_keys=set())
    assert chosen is None
def test_returns_none_when_no_agents():
    """With an empty fleet there is nothing to select."""
    assert select_node({}, "vllm", ProfileRegistry(), resident_keys=set()) is None
def test_prefers_node_that_fully_fits_service_over_one_that_does_not():
    """can_fit requires free_mb >= service max_mb (full ceiling, not half).
    9500 MB guarantees above all profile ceilings (max is 9000); 1000 MB is below all.
    """
    agents = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=9500),
    }
    registry = ProfileRegistry()
    result = select_node(agents, "vllm", registry, resident_keys=set())
    # "b" is the only node in the preferred (can_fit) pool
    assert result == ("b", 0)
def test_falls_back_to_best_effort_when_no_node_fully_fits():
    """When nothing can_fit, select_node returns the best-VRAM node as fallback."""
    agents = {
        "a": _make_agent("a", free_mb=1000),
        "b": _make_agent("b", free_mb=2000),
    }
    registry = ProfileRegistry()
    # Neither has enough free VRAM; fallback picks highest effective_free_mb
    result = select_node(agents, "vllm", registry, resident_keys=set())
    assert result == ("b", 0)

View file

@ -1,87 +0,0 @@
# tests/test_resources/test_node_store.py
"""Unit tests for NodeStore — SQLite persistence layer for known agent nodes."""
from __future__ import annotations
import time
from pathlib import Path
import pytest
from circuitforge_core.resources.coordinator.node_store import NodeStore
@pytest.fixture
def store(tmp_path: Path) -> NodeStore:
    """Fresh NodeStore backed by a throwaway SQLite file."""
    return NodeStore(db_path=tmp_path / "test-nodes.db")
def test_upsert_and_all(store: NodeStore) -> None:
    """A single upsert is returned verbatim by all()."""
    store.upsert("heimdall", "http://127.0.0.1:7701")
    rows = store.all()
    assert len(rows) == 1
    first = rows[0]
    assert first == ("heimdall", "http://127.0.0.1:7701")
def test_upsert_updates_url(store: NodeStore) -> None:
    """Upserting the same node id twice keeps one row with the newest URL."""
    for url in ("http://10.1.10.10:7701", "http://10.1.10.10:7702"):
        store.upsert("navi", url)
    rows = store.all()
    assert len(rows) == 1
    assert rows[0][1] == "http://10.1.10.10:7702"
def test_multiple_nodes(store: NodeStore) -> None:
    """Distinct node ids accumulate as separate rows."""
    fleet = {
        "heimdall": "http://127.0.0.1:7701",
        "navi": "http://10.1.10.10:7701",
        "strahl": "http://10.1.10.20:7701",
    }
    for node_id, url in fleet.items():
        store.upsert(node_id, url)
    assert len(store.all()) == 3
def test_remove(store: NodeStore) -> None:
    """remove() deletes only the named node."""
    store.upsert("heimdall", "http://127.0.0.1:7701")
    store.upsert("navi", "http://10.1.10.10:7701")
    store.remove("navi")
    remaining = {row[0] for row in store.all()}
    assert "navi" not in remaining
    assert "heimdall" in remaining
def test_prune_stale_removes_old_entries(store: NodeStore) -> None:
    """prune_stale deletes nodes whose last_seen exceeds max_age_days."""
    # Insert a node with a last_seen in the distant past
    # (raw SQL because upsert() would stamp the current time).
    store._conn.execute(
        "INSERT INTO known_nodes (node_id, agent_url, last_seen) VALUES (?, ?, ?)",
        ("ghost", "http://dead:7701", time.time() - 40 * 86400),
    )
    store._conn.commit()
    store.upsert("live", "http://live:7701")
    removed = store.prune_stale(max_age_days=30)
    assert removed == 1
    ids = [r[0] for r in store.all()]
    assert "ghost" not in ids
    assert "live" in ids
def test_prune_stale_keeps_recent(store: NodeStore) -> None:
    """Recently-seen nodes survive prune_stale."""
    store.upsert("recent", "http://recent:7701")
    assert store.prune_stale(max_age_days=30) == 0
    assert len(store.all()) == 1
def test_all_empty(store: NodeStore) -> None:
    """A fresh store has no rows."""
    rows = store.all()
    assert rows == []
def test_db_persists_across_instances(tmp_path: Path) -> None:
    """Data written by one NodeStore instance is visible to a new one on the same file."""
    db = tmp_path / "shared.db"
    s1 = NodeStore(db_path=db)
    s1.upsert("navi", "http://10.1.10.10:7701")
    s1.close()  # flush and release the SQLite handle before reopening
    s2 = NodeStore(db_path=db)
    rows = s2.all()
    assert len(rows) == 1
    assert rows[0][0] == "navi"
    s2.close()

View file

@ -1,176 +0,0 @@
# tests/test_resources/test_ollama_adopt.py
"""
Tests for the Ollama adopt-if-running path:
- ProcessSpec: adopt and health_path fields parsed from YAML
- ServiceManager.start(): adopt path claims running service; falls through if not running
- ServiceManager.is_running(): adopt path uses health probe, not proc table
- ServiceInstance.health_path persists through upsert_instance
- Probe loop uses inst.health_path instead of hardcoded /health
"""
from __future__ import annotations
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
from circuitforge_core.resources.agent.service_manager import ServiceManager
from circuitforge_core.resources.coordinator.service_registry import ServiceRegistry
from circuitforge_core.resources.profiles.schema import GpuProfile, ProcessSpec, ServiceProfile, load_profile
# ── ProcessSpec schema ────────────────────────────────────────────────────────
def test_process_spec_defaults():
    """A minimal ProcessSpec defaults to adopt=False and health_path=/health."""
    minimal = ProcessSpec(exec_path="/usr/local/bin/ollama")
    assert minimal.adopt is False
    assert minimal.health_path == "/health"
def test_process_spec_adopt_fields():
    """Explicit adopt/health_path values are stored as given."""
    kwargs = dict(
        exec_path="/usr/local/bin/ollama",
        adopt=True,
        health_path="/api/tags",
        port=11434,
        host_port=11434,
    )
    spec = ProcessSpec(**kwargs)
    assert spec.adopt is True
    assert spec.health_path == "/api/tags"
def test_profile_yaml_parses_adopt(tmp_path: Path):
    """load_profile parses adopt/health_path/host_port from a process-managed service."""
    yaml_text = """\
schema_version: 1
name: test
services:
  ollama:
    max_mb: 4096
    priority: 1
    managed:
      type: process
      adopt: true
      exec_path: /usr/local/bin/ollama
      args_template: serve
      port: 11434
      host_port: 11434
      health_path: /api/tags
"""
    p = tmp_path / "profile.yaml"
    p.write_text(yaml_text)
    profile = load_profile(p)
    spec = profile.services["ollama"].managed
    # type: process must deserialize to a ProcessSpec with adopt fields intact.
    assert isinstance(spec, ProcessSpec)
    assert spec.adopt is True
    assert spec.health_path == "/api/tags"
    assert spec.host_port == 11434
# ── ServiceManager adopt path ─────────────────────────────────────────────────
def _make_manager_with_ollama(advertise_host: str = "127.0.0.1") -> ServiceManager:
    """Build a ServiceManager whose profile declares an adoptable Ollama process."""
    profile = GpuProfile(
        schema_version=1,
        name="test",
        services={
            "ollama": ServiceProfile(
                max_mb=4096,
                priority=1,
                managed=ProcessSpec(
                    exec_path="/usr/local/bin/ollama",
                    args_template="serve",
                    port=11434,
                    host_port=11434,
                    adopt=True,
                    health_path="/api/tags",
                ),
            )
        },
    )
    return ServiceManager(node_id="heimdall", profile=profile, advertise_host=advertise_host)
def test_start_adopt_claims_running_service():
    """When Ollama is already healthy, start() returns its URL without spawning a process."""
    mgr = _make_manager_with_ollama()
    with patch.object(mgr, "_probe_health", return_value=True) as mock_probe:
        url = mgr.start("ollama", gpu_id=0, params={})
        assert url == "http://127.0.0.1:11434"
        # The probe is hit exactly once, with the configured port and path.
        mock_probe.assert_called_once_with(11434, "/api/tags")
        assert "ollama" not in mgr._procs  # no subprocess spawned
def test_start_adopt_spawns_when_not_running():
    """When Ollama is not yet running, start() spawns it normally."""
    mgr = _make_manager_with_ollama()
    mock_proc = MagicMock()
    mock_proc.poll.return_value = None  # spawned process stays alive
    with patch.object(mgr, "_probe_health", return_value=False), \
         patch("subprocess.Popen", return_value=mock_proc) as mock_popen:
        url = mgr.start("ollama", gpu_id=0, params={})
        assert url == "http://127.0.0.1:11434"
        mock_popen.assert_called_once()
        assert "ollama" in mgr._procs
def test_is_running_adopt_uses_health_probe():
    """For adopt=True services, liveness comes from the health probe alone."""
    mgr = _make_manager_with_ollama()
    for healthy in (True, False):
        with patch.object(mgr, "_probe_health", return_value=healthy):
            assert mgr.is_running("ollama") is healthy
def test_probe_health_returns_true_on_200():
    """_probe_health treats an HTTP 200 from the health endpoint as healthy."""
    mgr = _make_manager_with_ollama()
    mock_resp = MagicMock()
    mock_resp.status = 200
    # urlopen is used as a context manager, so __enter__/__exit__ are stubbed
    # on the instance (MagicMock dunders must be set explicitly).
    mock_resp.__enter__ = lambda s: mock_resp
    mock_resp.__exit__ = MagicMock(return_value=False)
    with patch("urllib.request.urlopen", return_value=mock_resp):
        assert mgr._probe_health(11434, "/api/tags") is True
def test_probe_health_returns_false_on_connection_error():
    """A connection failure is reported as unhealthy, not raised."""
    mgr = _make_manager_with_ollama()
    with patch("urllib.request.urlopen", side_effect=OSError("refused")):
        healthy = mgr._probe_health(11434, "/api/tags")
    assert healthy is False
# ── ServiceRegistry health_path ───────────────────────────────────────────────
def test_upsert_instance_stores_health_path():
    """upsert_instance persists a custom health_path on the instance."""
    reg = ServiceRegistry()
    instance = reg.upsert_instance(
        service="ollama",
        node_id="heimdall",
        gpu_id=0,
        state="running",
        model=None,
        url="http://127.0.0.1:11434",
        health_path="/api/tags",
    )
    assert instance.health_path == "/api/tags"
def test_upsert_instance_default_health_path():
    """When omitted, health_path defaults to /health."""
    reg = ServiceRegistry()
    instance = reg.upsert_instance(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        state="starting",
        model="qwen",
        url="http://127.0.0.1:8000",
    )
    assert instance.health_path == "/health"
def test_all_gpu_profiles_have_ollama_managed_block():
    """Sanity check: all public GPU profiles now have a managed block for ollama."""
    # Local import keeps this optional dependency out of the module header.
    from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
    registry = ProfileRegistry()
    for profile in registry.list_public():
        svc = profile.services.get("ollama")
        if svc is None:
            continue  # profile may not define ollama
        assert svc.managed is not None, f"{profile.name}: ollama missing managed block"
        assert isinstance(svc.managed, ProcessSpec)
        assert svc.managed.adopt is True, f"{profile.name}: ollama adopt should be True"
        assert svc.managed.health_path == "/api/tags", f"{profile.name}: wrong health_path"

View file

@ -1,101 +0,0 @@
# tests/test_resources/test_profile_registry.py
import pytest
from unittest.mock import MagicMock
from circuitforge_core.resources.profiles.schema import (
GpuProfile, ServiceProfile, load_profile
)
from circuitforge_core.resources.coordinator.profile_registry import ProfileRegistry
def test_load_8gb_profile(tmp_path):
    """load_profile parses top-level fields and nested per-service settings."""
    yaml_content = """
schema_version: 1
name: single-gpu-8gb
vram_total_mb: 8192
eviction_timeout_s: 10.0
services:
  vllm:
    max_mb: 5120
    priority: 1
  cf-vision:
    max_mb: 2048
    priority: 2
    shared: true
    max_concurrent: 3
"""
    profile_file = tmp_path / "test.yaml"
    profile_file.write_text(yaml_content)
    profile = load_profile(profile_file)
    assert profile.name == "single-gpu-8gb"
    assert profile.schema_version == 1
    assert profile.vram_total_mb == 8192
    assert profile.eviction_timeout_s == 10.0
    assert "vllm" in profile.services
    assert profile.services["vllm"].max_mb == 5120
    assert profile.services["vllm"].priority == 1
    assert profile.services["cf-vision"].shared is True
    assert profile.services["cf-vision"].max_concurrent == 3
def test_load_profile_rejects_wrong_schema_version(tmp_path):
    """An unsupported schema_version raises ValueError mentioning the field."""
    profile_file = tmp_path / "future.yaml"
    profile_file.write_text("schema_version: 99\nname: future\n")
    with pytest.raises(ValueError, match="schema_version"):
        load_profile(profile_file)
def test_service_profile_defaults():
    """A minimal ServiceProfile gets the documented default values."""
    profile = ServiceProfile(max_mb=1024, priority=2)
    assert profile.backend is None
    assert profile.consumers == []
    assert profile.shared is False
    assert profile.always_on is False
    assert profile.max_concurrent == 1
def test_profile_registry_loads_public_profiles():
    """The built-in public profiles include the standard single-GPU tiers."""
    names = {p.name for p in ProfileRegistry().list_public()}
    assert {"single-gpu-8gb", "single-gpu-6gb", "single-gpu-2gb"} <= names
def test_profile_registry_auto_detect_selects_8gb():
    """8192 MB total VRAM maps to the single-gpu-8gb profile."""
    registry = ProfileRegistry()
    detected = registry.auto_detect([MagicMock(vram_total_mb=8192)])
    assert detected.name == "single-gpu-8gb"
def test_profile_registry_auto_detect_selects_6gb():
    """6144 MB total VRAM maps to the single-gpu-6gb profile."""
    registry = ProfileRegistry()
    detected = registry.auto_detect([MagicMock(vram_total_mb=6144)])
    assert detected.name == "single-gpu-6gb"
def test_profile_registry_auto_detect_selects_2gb():
    """2048 MB total VRAM maps to the single-gpu-2gb profile."""
    registry = ProfileRegistry()
    detected = registry.auto_detect([MagicMock(vram_total_mb=2048)])
    assert detected.name == "single-gpu-2gb"
def test_profile_registry_load_from_path(tmp_path):
    """Registry.load() parses an arbitrary profile file from disk."""
    yaml_content = (
        "schema_version: 1\nname: custom\n"
        "vram_total_mb: 12288\neviction_timeout_s: 5.0\n"
    )
    p = tmp_path / "custom.yaml"
    p.write_text(yaml_content)
    registry = ProfileRegistry()
    profile = registry.load(p)
    assert profile.name == "custom"
    assert profile.vram_total_mb == 12288

View file

@ -1,194 +0,0 @@
"""Tests for ServiceManager ProcessSpec support."""
from __future__ import annotations
from unittest.mock import MagicMock, patch
import pytest
from circuitforge_core.resources.agent.service_manager import ServiceManager
from circuitforge_core.resources.profiles.schema import (
GpuProfile,
ProcessSpec,
ServiceProfile,
)
def _make_profile(args_template: str = "--port {port} --gpu-id {gpu_id}") -> GpuProfile:
    """Profile with one process-managed service ("vllm") and one without a managed spec."""
    return GpuProfile(
        schema_version=1,
        name="test",
        vram_total_mb=8192,
        services={
            "vllm": ServiceProfile(
                max_mb=5120,
                priority=1,
                managed=ProcessSpec(
                    exec_path="/usr/bin/python",
                    args_template=args_template,
                    port=8000,
                    host_port=8000,
                    cwd="/tmp",
                ),
            ),
            # Service with no ProcessSpec: manager must treat it as unmanageable.
            "no_managed": ServiceProfile(max_mb=1024, priority=2),
        },
    )
@pytest.fixture
def manager():
    """ServiceManager bound to the default test profile, advertising localhost."""
    return ServiceManager(node_id="test-node", profile=_make_profile(), advertise_host="127.0.0.1")
# ---------------------------------------------------------------------------
# is_running
# ---------------------------------------------------------------------------
def test_is_running_returns_false_when_no_proc(manager):
    """With no tracked process, the service is not running."""
    assert manager.is_running("vllm") is False
def test_is_running_returns_false_when_proc_exited(manager):
    """A tracked process that has exited (poll() != None) counts as stopped."""
    dead_proc = MagicMock()
    dead_proc.poll.return_value = 1  # exit code set → process is gone
    manager._procs["vllm"] = dead_proc
    assert manager.is_running("vllm") is False
def test_is_running_returns_false_when_port_not_listening(manager):
    """Alive process but closed port → not running."""
    live_proc = MagicMock()
    live_proc.poll.return_value = None
    manager._procs["vllm"] = live_proc
    with patch("socket.create_connection", side_effect=OSError("refused")):
        running = manager.is_running("vllm")
    assert running is False
def test_is_running_returns_true_when_proc_alive_and_port_open(manager):
    """Running means: child process alive AND its port accepts a TCP connection."""
    mock_proc = MagicMock()
    mock_proc.poll.return_value = None  # still running
    manager._procs["vllm"] = mock_proc
    mock_socket = MagicMock()
    # create_connection is used as a context manager, so stub __enter__/__exit__.
    mock_socket.__enter__ = MagicMock(return_value=mock_socket)
    mock_socket.__exit__ = MagicMock(return_value=False)
    with patch("socket.create_connection", return_value=mock_socket):
        assert manager.is_running("vllm") is True
def test_is_running_unknown_service_returns_false(manager):
    """Services absent from the profile are reported as not running."""
    assert not manager.is_running("nonexistent")
def test_is_running_no_managed_spec_returns_false(manager):
    """A profile entry without a managed spec can never be 'running'."""
    assert not manager.is_running("no_managed")
# ---------------------------------------------------------------------------
# start
# ---------------------------------------------------------------------------
def test_start_launches_process_and_returns_url(manager):
    """start() renders the args template, spawns the process, and returns the URL."""
    with patch("subprocess.Popen") as mock_popen, \
         patch.object(manager, "is_running", return_value=False):
        mock_popen.return_value = MagicMock()
        url = manager.start("vllm", gpu_id=0, params={"model": "mymodel"})
        assert url == "http://127.0.0.1:8000"
        mock_popen.assert_called_once()
        call_args = mock_popen.call_args
        cmd = call_args[0][0]
        # Command line: exec_path followed by the rendered template tokens.
        assert cmd[0] == "/usr/bin/python"
        assert "--port" in cmd
        assert "8000" in cmd
        assert "--gpu-id" in cmd
        assert "0" in cmd
def test_start_returns_url_immediately_when_already_running(manager):
    """start() is idempotent: a running service yields its URL without a respawn."""
    with patch.object(manager, "is_running", return_value=True), \
         patch("subprocess.Popen") as spawn:
        url = manager.start("vllm", gpu_id=0, params={})
    assert url == "http://127.0.0.1:8000"
    spawn.assert_not_called()
def test_start_raises_for_unknown_service(manager):
    """Starting a service missing from the profile is a ValueError."""
    with pytest.raises(ValueError, match="not in profile"):
        manager.start("nonexistent", gpu_id=0, params={})
def test_start_stores_proc_in_procs(manager):
    """The spawned Popen handle is retained for later stop()/is_running()."""
    fake_proc = MagicMock()
    with patch("subprocess.Popen", return_value=fake_proc), \
         patch.object(manager, "is_running", return_value=False):
        manager.start("vllm", gpu_id=0, params={})
    assert manager._procs["vllm"] is fake_proc
# ---------------------------------------------------------------------------
# stop
# ---------------------------------------------------------------------------
def test_stop_terminates_running_process(manager):
    """stop() terminates, waits on, and forgets the tracked process."""
    proc = MagicMock()
    manager._procs["vllm"] = proc
    assert manager.stop("vllm") is True
    proc.terminate.assert_called_once()
    proc.wait.assert_called_once()
    assert "vllm" not in manager._procs
def test_stop_kills_process_that_wont_terminate(manager):
    """If the graceful wait() fails, stop() escalates to kill() and still succeeds."""
    mock_proc = MagicMock()
    mock_proc.wait.side_effect = Exception("timeout")
    manager._procs["vllm"] = mock_proc
    result = manager.stop("vllm")
    assert result is True
    mock_proc.kill.assert_called_once()
def test_stop_returns_true_when_no_proc_tracked(manager):
    """Stopping an already-stopped service is an idempotent success."""
    assert manager.stop("vllm") is True
def test_stop_returns_false_for_unknown_service(manager):
    """stop() refuses services that are not in the profile."""
    assert manager.stop("nonexistent") is False
# ---------------------------------------------------------------------------
# list_running / get_url
# ---------------------------------------------------------------------------
def test_list_running_returns_running_services(manager):
    """list_running reports exactly the services whose is_running() is True."""
    with patch.object(manager, "is_running", side_effect=lambda svc: svc == "vllm"):
        running = manager.list_running()
    assert running == ["vllm"]
def test_get_url_returns_none_when_not_running(manager):
    """get_url() yields None for a service that is not up."""
    with patch.object(manager, "is_running", return_value=False):
        url = manager.get_url("vllm")
    assert url is None
def test_get_url_returns_url_when_running(manager):
    """get_url() returns the configured endpoint for a running service."""
    with patch.object(manager, "is_running", return_value=True):
        url = manager.get_url("vllm")
    assert url == "http://127.0.0.1:8000"

View file

@ -1,86 +0,0 @@
import time
import dataclasses
import pytest
from circuitforge_core.resources.coordinator.service_registry import (
ServiceRegistry, ServiceAllocation, ServiceInstance,
)
@pytest.fixture
def registry():
    """Provide a fresh, empty ServiceRegistry for each test."""
    return ServiceRegistry()
def test_allocate_creates_allocation(registry):
    """allocate() returns an allocation echoing its inputs with a fresh id."""
    allocation = registry.allocate(
        service="vllm",
        node_id="heimdall",
        gpu_id=0,
        model="Ouro-1.4B",
        url="http://heimdall:8000",
        caller="test",
        ttl_s=300.0,
    )
    assert allocation.service == "vllm"
    assert allocation.node_id == "heimdall"
    # allocation_id should be a non-empty (UUID) string — truthiness suffices.
    assert allocation.allocation_id
def test_active_allocations_count(registry):
    """Two live allocations on the same service/node/GPU slot are both counted."""
    for caller in ("a", "b"):
        registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", caller, 300.0)
    assert registry.active_allocations("vllm", "heimdall", 0) == 2
def test_release_decrements_count(registry):
    """Releasing the only allocation drops the active count back to zero."""
    handle = registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "a", 300.0)
    registry.release(handle.allocation_id)
    assert registry.active_allocations("vllm", "heimdall", 0) == 0
def test_release_nonexistent_returns_false(registry):
    """Releasing an unknown allocation id reports failure rather than raising."""
    assert registry.release("nonexistent-id") is False
def test_upsert_instance_sets_running_state(registry):
    """upsert_instance() registers exactly one instance in the given state."""
    registry.upsert_instance(
        "vllm", "heimdall", 0,
        state="running",
        model="Ouro-1.4B",
        url="http://heimdall:8000",
    )
    registered = registry.all_instances()
    assert len(registered) == 1
    assert registered[0].state == "running"
def test_release_last_alloc_marks_instance_idle(registry):
    """Releasing the final allocation flips the instance to idle with a timestamp."""
    registry.upsert_instance("vllm", "heimdall", 0, state="running",
                             model="Ouro-1.4B", url="http://heimdall:8000")
    handle = registry.allocate(
        "vllm", "heimdall", 0, "Ouro-1.4B", "http://heimdall:8000", "a", 300.0
    )
    registry.release(handle.allocation_id)

    idled = registry.all_instances()[0]
    assert idled.state == "idle"
    assert idled.idle_since is not None
def test_new_alloc_on_idle_instance_marks_it_running(registry):
    """Allocating against an idle instance transitions it back to running."""
    registry.upsert_instance(
        "vllm", "heimdall", 0,
        state="idle", model="M", url="http://h:8000",
    )
    registry.allocate("vllm", "heimdall", 0, "M", "http://h:8000", "x", 300.0)

    revived = registry.all_instances()[0]
    assert revived.state == "running"
def test_sweep_expired_allocations(registry):
    """sweep_expired_allocations() reaps TTL-expired allocations and idles the instance."""
    # A running instance must exist so the sweep's idle-transition has a target.
    registry.upsert_instance("vllm", "heimdall", 0, state="running",
                             model="M", url="http://h:8000")
    short_lived = registry.allocate(
        "vllm", "heimdall", 0, "M", "http://h:8000", "caller", ttl_s=1
    )
    assert registry.active_allocations("vllm", "heimdall", 0) == 1

    # NOTE(review): wall-clock sleep makes this test slow/timing-sensitive;
    # consider injecting a clock into ServiceRegistry so TTL expiry can be faked.
    time.sleep(1.1)
    swept = registry.sweep_expired_allocations()

    assert short_lived.allocation_id in swept
    assert registry.active_allocations("vllm", "heimdall", 0) == 0
    # With no live allocations remaining, the instance must now be idle.
    survivor = registry.all_instances()[0]
    assert survivor.state == "idle"
    assert survivor.idle_since is not None