"""Avocet — cf-orch benchmark integration API. Wraps cf-orch's benchmark.py script and exposes it via the Avocet API. Config is read from label_tool.yaml under the `cforch:` key. All endpoints are registered on `router` (a FastAPI APIRouter). api.py includes this router with prefix="/api/cforch". Module-level globals (_CONFIG_DIR, _BENCH_RUNNING, _bench_proc) follow the same testability pattern as sft.py — override _CONFIG_DIR via set_config_dir() in test fixtures. """ from __future__ import annotations import json import logging import os import re import select as _select import subprocess as _subprocess import tempfile from pathlib import Path from typing import Any import urllib.parse import yaml from fastapi import APIRouter, HTTPException from fastapi.responses import StreamingResponse logger = logging.getLogger(__name__) _ROOT = Path(__file__).parent.parent _CONFIG_DIR: Path | None = None # override in tests _BENCH_RUNNING: bool = False _bench_proc: Any = None # live Popen object while benchmark runs router = APIRouter() # ── Testability seams ────────────────────────────────────────────────────────── def set_config_dir(path: Path | None) -> None: global _CONFIG_DIR _CONFIG_DIR = path # ── Internal helpers ─────────────────────────────────────────────────────────── def _config_file() -> Path: if _CONFIG_DIR is not None: return _CONFIG_DIR / "label_tool.yaml" return _ROOT / "config" / "label_tool.yaml" def _load_cforch_config() -> dict: """Read label_tool.yaml cforch section, falling back to environment variables. Priority (highest to lowest): 1. label_tool.yaml cforch: key 2. Environment variables (CF_ORCH_URL, CF_LICENSE_KEY, OLLAMA_HOST, OLLAMA_MODEL) """ f = _config_file() file_cfg: dict = {} if f.exists(): try: raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {} file_cfg = raw.get("cforch", {}) or {} except yaml.YAMLError as exc: logger.warning("Failed to parse cforch config %s: %s", f, exc) # Env var fallbacks — only used when the yaml key is absent or empty def _coalesce(file_val: str, env_key: str) -> str: return file_val if file_val else os.environ.get(env_key, "") return { **file_cfg, "coordinator_url": _coalesce(file_cfg.get("coordinator_url", ""), "CF_ORCH_URL"), "license_key": _coalesce(file_cfg.get("license_key", ""), "CF_LICENSE_KEY"), "ollama_url": _coalesce(file_cfg.get("ollama_url", ""), "OLLAMA_HOST"), "ollama_model": _coalesce(file_cfg.get("ollama_model", ""), "OLLAMA_MODEL"), "judge_url": _coalesce(file_cfg.get("judge_url", ""), "CF_JUDGE_URL"), "hf_token": _coalesce(file_cfg.get("hf_token", ""), "HF_TOKEN"), } def _validate_service_url(url: str, param_name: str) -> str: """Validate that a URL is a well-formed http/https URL with a hostname. Guards against SSRF: only http/https is allowed; the URL must have a non-empty host. Does not enforce an allowlist — call sites are internal tooling, not a public API. """ if not url: return url try: parsed = urllib.parse.urlparse(url) except Exception: raise HTTPException(400, f"{param_name}: not a valid URL") if parsed.scheme not in ("http", "https"): raise HTTPException(400, f"{param_name}: URL must start with http:// or https://") if not parsed.hostname: raise HTTPException(400, f"{param_name}: URL has no hostname") return url def _strip_ansi(text: str) -> str: """Remove ANSI escape codes from a string.""" return re.sub(r'\x1b\[[0-9;]*m', '', text) def _find_latest_summary(results_dir: str | None) -> Path | None: """Find the newest summary.json under results_dir, or None if not found.""" if not results_dir: return None rdir = Path(results_dir) if not rdir.exists(): return None # Subdirs are named YYYY-MM-DD-HHMMSS; sort lexicographically for chronological order subdirs = sorted( [d for d in rdir.iterdir() if d.is_dir()], key=lambda d: d.name, ) for subdir in reversed(subdirs): summary = subdir / "summary.json" if summary.exists(): return summary return None # ── GET /tasks ───────────────────────────────────────────────────────────────── @router.get("/tasks") def get_tasks() -> dict: """Return task list from bench_tasks.yaml.""" cfg = _load_cforch_config() tasks_path = cfg.get("bench_tasks", "") if not tasks_path: return {"tasks": [], "types": []} p = Path(tasks_path) if not p.exists(): return {"tasks": [], "types": []} try: raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {} except yaml.YAMLError as exc: logger.warning("Failed to parse bench_tasks.yaml %s: %s", p, exc) return {"tasks": [], "types": []} tasks_raw = raw.get("tasks", []) or [] tasks: list[dict] = [] seen_types: list[str] = [] types_set: set[str] = set() for t in tasks_raw: if not isinstance(t, dict): continue tasks.append({ "id": t.get("id", ""), "name": t.get("name", ""), "type": t.get("type", ""), "prompt": (t.get("prompt") or "").strip(), "system": (t.get("system") or "").strip(), }) task_type = t.get("type", "") if task_type and task_type not in types_set: seen_types.append(task_type) types_set.add(task_type) return {"tasks": tasks, "types": seen_types} # ── GET /models ──────────────────────────────────────────────────────────────── # Services and roles surfaced in the benchmark model picker. # Covers all cf-orch service types that benchmark.py can route tasks to. _BENCH_SERVICES = frozenset({ "cf-text", "vllm", # LLM text generation "cf-stt", # speech-to-text "cf-tts", # text-to-speech "cf-vision", # image classification / embedding "cf-voice", # audio context classification }) _BENCH_ROLES = frozenset({ "generator", "vlm", # LLM roles "stt", "alm", # speech recognition "tts", # speech synthesis "vision", "embedding", # image understanding "classifier", # audio classification (cf-voice) }) @router.get("/models") def get_models() -> dict: """Return model list from bench_models.yaml merged with locally installed models. bench_models.yaml entries are listed first and take precedence; any installed model whose repo_id is already present in the YAML is skipped. Only models whose service is in _BENCH_SERVICES (cf-text, vllm, cf-stt, cf-tts, cf-vision, cf-voice) are surfaced from the installed registry. """ cfg = _load_cforch_config() models_path = cfg.get("bench_models", "") models: list[dict] = [] bench_ids: set[str] = set() if models_path: p = Path(models_path) if p.exists(): try: raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {} except yaml.YAMLError as exc: logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc) raw = {} for m in (raw.get("models", []) or []): if not isinstance(m, dict): continue model_id = m.get("id", "") models.append({ "name": m.get("name", ""), "id": model_id, "service": m.get("service", "ollama"), "tags": m.get("tags", []) or [], "vram_estimate_mb": m.get("vram_estimate_mb", 0), }) if model_id: bench_ids.add(model_id) # Merge installed generator models not already in bench_models.yaml. try: from app.models import list_installed # local import avoids circular dependency at module load for installed in list_installed(): model_id: str = installed.get("model_id") or "" service: str = installed.get("service") or "" role: str = installed.get("role") or "" if not model_id: continue if service not in _BENCH_SERVICES or role not in _BENCH_ROLES: continue if model_id in bench_ids: continue display_name = model_id.split("/", 1)[-1] if "/" in model_id else model_id models.append({ "name": display_name, "id": model_id, "service": service, "tags": [role], "vram_estimate_mb": installed.get("vram_mb") or 0, }) bench_ids.add(model_id) except Exception as exc: logger.warning("Could not merge installed models into model list: %s", exc) return {"models": models} # ── GET /run ─────────────────────────────────────────────────────────────────── @router.get("/nodes") def get_nodes() -> dict: """Proxy the coordinator's /api/nodes list, returning node_id + online status. Online is inferred from last_heartbeat: any node with a recent heartbeat is online. Returns an empty list if the coordinator is unreachable. """ cfg = _load_cforch_config() coordinator_url = cfg.get("coordinator_url", "").rstrip("/") if not coordinator_url: return {"nodes": []} try: import httpx as _httpx resp = _httpx.get(f"{coordinator_url}/api/nodes", timeout=5.0) resp.raise_for_status() raw_nodes = resp.json().get("nodes", []) return { "nodes": [ { "node_id": n.get("node_id", ""), "online": n.get("last_heartbeat") is not None, "gpus": [ { "gpu_id": g.get("gpu_id"), "name": g.get("name", ""), "vram_total_mb": g.get("vram_total_mb", 0), "vram_free_mb": g.get("vram_free_mb", 0), } for g in n.get("gpus", []) ], } for n in raw_nodes ] } except Exception as exc: logger.warning("Could not fetch nodes from coordinator: %s", exc) return {"nodes": []} @router.get("/run") def run_benchmark( task_ids: str = "", model_ids: str = "", model_tags: str = "", coordinator_url: str = "", ollama_url: str = "", judge_url: str = "", judge_backend: str = "chat", workers: int = 1, node_ids: str = "", ) -> StreamingResponse: """Spawn cf-orch benchmark.py and stream stdout as SSE progress events.""" global _BENCH_RUNNING, _bench_proc # Check if the process is actually still alive; reset stale flag if not. if _BENCH_RUNNING: if _bench_proc is not None and _bench_proc.poll() is None: raise HTTPException(409, "A benchmark is already running") _BENCH_RUNNING = False _bench_proc = None cfg = _load_cforch_config() bench_script = cfg.get("bench_script", "") bench_tasks = cfg.get("bench_tasks", "") bench_models = cfg.get("bench_models", "") results_dir = cfg.get("results_dir", "") python_bin = cfg.get("python_bin", "/devl/miniconda3/envs/cf/bin/python") cfg_coordinator = cfg.get("coordinator_url", "") cfg_ollama = cfg.get("ollama_url", "") cfg_license_key = cfg.get("license_key", "") cfg_judge_url = cfg.get("judge_url", "") # Validate URL params before spawning the subprocess. # _validate_service_url raises HTTPException on bad input (caught by FastAPI before streaming starts). _validate_service_url(coordinator_url, "coordinator_url") _validate_service_url(ollama_url, "ollama_url") _validate_service_url(judge_url, "judge_url") def generate(): global _BENCH_RUNNING, _bench_proc if not bench_script or not Path(bench_script).exists(): yield f"data: {json.dumps({'type': 'error', 'message': 'bench_script not configured or not found'})}\n\n" return # Build effective models file: bench_models.yaml + any installed models # whose IDs were selected but are absent from the YAML (e.g. downloaded # via the Models view). Written to a temp file so benchmark.py sees one # unified list; cleaned up in the finally block. effective_models_file = bench_models _tmp_models_path: str | None = None if model_ids and bench_models and Path(bench_models).exists(): requested_ids = set(model_ids.split(",")) try: raw_bench = yaml.safe_load(Path(bench_models).read_text(encoding="utf-8")) or {} bench_entries: list[dict] = raw_bench.get("models", []) or [] bench_id_set = {m.get("id", "") for m in bench_entries if isinstance(m, dict)} missing_ids = requested_ids - bench_id_set if missing_ids: from app.models import list_installed installed_map = { m["model_id"]: m for m in list_installed() if m.get("model_id") and m.get("service") in _BENCH_SERVICES } extra: list[dict] = [] for mid in missing_ids: if mid in installed_map: inst = installed_map[mid] entry: dict[str, Any] = { "id": mid, "name": mid.split("/", 1)[-1] if "/" in mid else mid, "service": inst.get("service", "cf-text"), "vram_estimate_mb": inst.get("vram_mb") or 0, "tags": [inst.get("role", "generator")], "temperature": 0.0, } local_path = inst.get("path", "") or inst.get("local_path", "") if local_path: entry["model_path"] = local_path extra.append(entry) if extra: merged = {"models": bench_entries + extra} tf = tempfile.NamedTemporaryFile( mode="w", suffix=".yaml", delete=False, prefix="avocet_bench_models_", ) yaml.dump(merged, tf) tf.close() _tmp_models_path = tf.name effective_models_file = _tmp_models_path except Exception as exc: logger.warning("Could not merge installed models into temp bench file: %s", exc) cmd = [ python_bin, bench_script, "--tasks", bench_tasks, "--models", effective_models_file, "--output", results_dir, ] if task_ids: cmd.extend(["--filter-tasks"] + task_ids.split(",")) if model_ids: cmd.extend(["--filter-models"] + model_ids.split(",")) if model_tags: cmd.extend(["--filter-tags"] + model_tags.split(",")) # query param overrides config, config overrides env var (already resolved by _load_cforch_config) effective_coordinator = coordinator_url if coordinator_url else cfg_coordinator effective_ollama = ollama_url if ollama_url else cfg_ollama if effective_coordinator: cmd.extend(["--coordinator", effective_coordinator]) if effective_ollama: cmd.extend(["--ollama-url", effective_ollama]) effective_judge = judge_url if judge_url else cfg_judge_url if effective_judge: cmd.extend(["--judge-url", effective_judge]) if judge_backend and judge_backend != "chat": cmd.extend(["--judge-backend", judge_backend]) if workers > 1: cmd.extend(["--workers", str(workers)]) if node_ids: cmd.extend(["--nodes"] + node_ids.split(",")) # Pass license key as env var so subprocess can authenticate with cf-orch proc_env = {**os.environ} if cfg_license_key: proc_env["CF_LICENSE_KEY"] = cfg_license_key _BENCH_RUNNING = True try: proc = _subprocess.Popen( cmd, stdout=_subprocess.PIPE, stderr=_subprocess.STDOUT, text=True, bufsize=1, env=proc_env, ) _bench_proc = proc _IDLE_TIMEOUT_S = 120 # kill if no output for 2 minutes (node crash) try: while True: ready = _select.select([proc.stdout], [], [], _IDLE_TIMEOUT_S) if not ready[0]: # No output for IDLE_TIMEOUT_S — node likely crashed proc.terminate() try: proc.wait(timeout=5) except _subprocess.TimeoutExpired: proc.kill() msg = f"Benchmark timed out — no output for {_IDLE_TIMEOUT_S}s (cluster node may have crashed)" yield f"data: {json.dumps({'type': 'error', 'message': msg})}\n\n" break line = proc.stdout.readline() if not line: break line = _strip_ansi(line.rstrip()) if line: yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n" proc.wait() if proc.returncode == 0: summary_path = _find_latest_summary(results_dir) if summary_path is not None: try: summary = json.loads(summary_path.read_text(encoding="utf-8")) yield f"data: {json.dumps({'type': 'result', 'summary': summary})}\n\n" except Exception as exc: logger.warning("Failed to read summary.json: %s", exc) yield f"data: {json.dumps({'type': 'complete'})}\n\n" else: yield f"data: {json.dumps({'type': 'error', 'message': f'Process exited with code {proc.returncode}'})}\n\n" finally: _bench_proc = None except Exception as exc: yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n" finally: _BENCH_RUNNING = False if _tmp_models_path: try: os.unlink(_tmp_models_path) except OSError: pass return StreamingResponse( generate(), media_type="text/event-stream", headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"}, ) # ── GET /config ──────────────────────────────────────────────────────────────── @router.get("/config") def get_cforch_config() -> dict: """Return resolved cf-orch connection config (env vars merged with yaml). Redacts license_key — only returns whether it is set, not the value. Used by the Settings UI to show current connection state. """ cfg = _load_cforch_config() return { "coordinator_url": cfg.get("coordinator_url", ""), "ollama_url": cfg.get("ollama_url", ""), "ollama_model": cfg.get("ollama_model", ""), "judge_url": cfg.get("judge_url", ""), "license_key_set": bool(cfg.get("license_key", "")), "source": "env" if not _config_file().exists() else "yaml+env", } # ── GET /results ─────────────────────────────────────────────────────────────── @router.get("/results") def get_results() -> list: """Return the latest benchmark summary.json from results_dir.""" cfg = _load_cforch_config() results_dir = cfg.get("results_dir", "") summary_path = _find_latest_summary(results_dir) if summary_path is None: raise HTTPException(404, "No benchmark results found") try: return json.loads(summary_path.read_text(encoding="utf-8")) except Exception as exc: raise HTTPException(500, f"Failed to read summary.json: {exc}") from exc # ── POST /cancel ─────────────────────────────────────────────────────────────── @router.post("/cancel") def cancel_benchmark() -> dict: """Kill the running benchmark subprocess.""" global _BENCH_RUNNING, _bench_proc if not _BENCH_RUNNING: raise HTTPException(404, "No benchmark is currently running") if _bench_proc is not None: try: _bench_proc.terminate() except Exception as exc: logger.warning("Failed to terminate benchmark process: %s", exc) _BENCH_RUNNING = False _bench_proc = None return {"status": "cancelled"}