feat: plans benchmark harness — model scoring for CF planning prompts

Adds benchmark_plans.py script, plans_bench API router, PlansBenchTab Vue
component, and registers /api/plans-bench in api.py. Also extends models
registry (cf-text catalog integration), cforch client, LlmEvalTab, and
ModelsView with cf-orch fleet support. Wires Planning mode into BenchmarkView.
This commit is contained in:
pyr0ball 2026-05-02 23:36:04 -07:00
parent e11db5ccd9
commit bce932461a
14 changed files with 3137 additions and 59 deletions

View file

@ -17,3 +17,7 @@ CF_LICENSE_KEY=CFG-AVCT-xxxx-xxxx-xxxx
# Set one of these to use a cloud LLM instead of a local model.
# ANTHROPIC_API_KEY=sk-ant-...
# OPENAI_API_KEY=sk-...
# ── HuggingFace (required for gated/terms-restricted model downloads) ─────────
# Generate at https://huggingface.co/settings/tokens and accept model terms first.
# HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx

View file

@ -34,6 +34,12 @@ app.include_router(eval_router, prefix="/api")
from app.train.train import router as train_router
app.include_router(train_router, prefix="/api/train")
from app.plans_bench import router as plans_bench_router
app.include_router(plans_bench_router, prefix="/api/plans-bench")
# In-memory last-action store (single user, local tool — in-memory is fine)
_last_action: dict | None = None
from app.dashboard import router as dashboard_router
app.include_router(dashboard_router, prefix="/api")

View file

@ -17,9 +17,12 @@ import logging
import os
import re
import subprocess as _subprocess
import tempfile
from pathlib import Path
from typing import Any
import urllib.parse
import yaml
from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse
@ -75,9 +78,31 @@ def _load_cforch_config() -> dict:
"license_key": _coalesce(file_cfg.get("license_key", ""), "CF_LICENSE_KEY"),
"ollama_url": _coalesce(file_cfg.get("ollama_url", ""), "OLLAMA_HOST"),
"ollama_model": _coalesce(file_cfg.get("ollama_model", ""), "OLLAMA_MODEL"),
"judge_url": _coalesce(file_cfg.get("judge_url", ""), "CF_JUDGE_URL"),
"hf_token": _coalesce(file_cfg.get("hf_token", ""), "HF_TOKEN"),
}
def _validate_service_url(url: str, param_name: str) -> str:
"""Validate that a URL is a well-formed http/https URL with a hostname.
Guards against SSRF: only http/https is allowed; the URL must have a
non-empty host. Does not enforce an allowlist call sites are internal
tooling, not a public API.
"""
if not url:
return url
try:
parsed = urllib.parse.urlparse(url)
except Exception:
raise HTTPException(400, f"{param_name}: not a valid URL")
if parsed.scheme not in ("http", "https"):
raise HTTPException(400, f"{param_name}: URL must start with http:// or https://")
if not parsed.hostname:
raise HTTPException(400, f"{param_name}: URL has no hostname")
return url
def _strip_ansi(text: str) -> str:
"""Remove ANSI escape codes from a string."""
return re.sub(r'\x1b\[[0-9;]*m', '', text)
@ -147,48 +172,141 @@ def get_tasks() -> dict:
# ── GET /models ────────────────────────────────────────────────────────────────
# Services and roles surfaced in the benchmark model picker.
# Covers all cf-orch service types that benchmark.py can route tasks to.
_BENCH_SERVICES = frozenset({
"cf-text", "vllm", # LLM text generation
"cf-stt", # speech-to-text
"cf-tts", # text-to-speech
"cf-vision", # image classification / embedding
"cf-voice", # audio context classification
})
_BENCH_ROLES = frozenset({
"generator", "vlm", # LLM roles
"stt", "alm", # speech recognition
"tts", # speech synthesis
"vision", "embedding", # image understanding
"classifier", # audio classification (cf-voice)
})
@router.get("/models")
def get_models() -> dict:
"""Return model list from bench_models.yaml."""
"""Return model list from bench_models.yaml merged with locally installed models.
bench_models.yaml entries are listed first and take precedence; any installed
model whose repo_id is already present in the YAML is skipped. Only models
whose service is in _BENCH_SERVICES (cf-text, vllm, cf-stt, cf-tts, cf-vision,
cf-voice) are surfaced from the installed registry.
"""
cfg = _load_cforch_config()
models_path = cfg.get("bench_models", "")
if not models_path:
return {"models": []}
p = Path(models_path)
if not p.exists():
return {"models": []}
try:
raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
except yaml.YAMLError as exc:
logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
return {"models": []}
models_raw = raw.get("models", []) or []
models: list[dict] = []
for m in models_raw:
if not isinstance(m, dict):
continue
models.append({
"name": m.get("name", ""),
"id": m.get("id", ""),
"service": m.get("service", "ollama"),
"tags": m.get("tags", []) or [],
"vram_estimate_mb": m.get("vram_estimate_mb", 0),
})
bench_ids: set[str] = set()
if models_path:
p = Path(models_path)
if p.exists():
try:
raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
except yaml.YAMLError as exc:
logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
raw = {}
for m in (raw.get("models", []) or []):
if not isinstance(m, dict):
continue
model_id = m.get("id", "")
models.append({
"name": m.get("name", ""),
"id": model_id,
"service": m.get("service", "ollama"),
"tags": m.get("tags", []) or [],
"vram_estimate_mb": m.get("vram_estimate_mb", 0),
})
if model_id:
bench_ids.add(model_id)
# Merge installed generator models not already in bench_models.yaml.
try:
from app.models import list_installed # local import avoids circular dependency at module load
for installed in list_installed():
model_id: str = installed.get("model_id") or ""
service: str = installed.get("service") or ""
role: str = installed.get("role") or ""
if not model_id:
continue
if service not in _BENCH_SERVICES or role not in _BENCH_ROLES:
continue
if model_id in bench_ids:
continue
display_name = model_id.split("/", 1)[-1] if "/" in model_id else model_id
models.append({
"name": display_name,
"id": model_id,
"service": service,
"tags": [role],
"vram_estimate_mb": installed.get("vram_mb") or 0,
})
bench_ids.add(model_id)
except Exception as exc:
logger.warning("Could not merge installed models into model list: %s", exc)
return {"models": models}
# ── GET /run ───────────────────────────────────────────────────────────────────
@router.get("/nodes")
def get_nodes() -> dict:
"""Proxy the coordinator's /api/nodes list, returning node_id + online status.
Online is inferred from last_heartbeat: any node with a recent heartbeat is online.
Returns an empty list if the coordinator is unreachable.
"""
cfg = _load_cforch_config()
coordinator_url = cfg.get("coordinator_url", "").rstrip("/")
if not coordinator_url:
return {"nodes": []}
try:
import httpx as _httpx
resp = _httpx.get(f"{coordinator_url}/api/nodes", timeout=5.0)
resp.raise_for_status()
raw_nodes = resp.json().get("nodes", [])
return {
"nodes": [
{
"node_id": n.get("node_id", ""),
"online": n.get("last_heartbeat") is not None,
"gpus": [
{
"gpu_id": g.get("gpu_id"),
"name": g.get("name", ""),
"vram_total_mb": g.get("vram_total_mb", 0),
"vram_free_mb": g.get("vram_free_mb", 0),
}
for g in n.get("gpus", [])
],
}
for n in raw_nodes
]
}
except Exception as exc:
logger.warning("Could not fetch nodes from coordinator: %s", exc)
return {"nodes": []}
@router.get("/run")
def run_benchmark(
task_ids: str = "",
model_ids: str = "",
model_tags: str = "",
coordinator_url: str = "",
ollama_url: str = "",
judge_url: str = "",
judge_backend: str = "chat",
workers: int = 1,
node_ids: str = "",
) -> StreamingResponse:
"""Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
global _BENCH_RUNNING, _bench_proc
@ -205,6 +323,13 @@ def run_benchmark(
cfg_coordinator = cfg.get("coordinator_url", "")
cfg_ollama = cfg.get("ollama_url", "")
cfg_license_key = cfg.get("license_key", "")
cfg_judge_url = cfg.get("judge_url", "")
# Validate URL params before spawning the subprocess.
# _validate_service_url raises HTTPException on bad input (caught by FastAPI before streaming starts).
_validate_service_url(coordinator_url, "coordinator_url")
_validate_service_url(ollama_url, "ollama_url")
_validate_service_url(judge_url, "judge_url")
def generate():
global _BENCH_RUNNING, _bench_proc
@ -213,16 +338,68 @@ def run_benchmark(
yield f"data: {json.dumps({'type': 'error', 'message': 'bench_script not configured or not found'})}\n\n"
return
# Build effective models file: bench_models.yaml + any installed models
# whose IDs were selected but are absent from the YAML (e.g. downloaded
# via the Models view). Written to a temp file so benchmark.py sees one
# unified list; cleaned up in the finally block.
effective_models_file = bench_models
_tmp_models_path: str | None = None
if model_ids and bench_models and Path(bench_models).exists():
requested_ids = set(model_ids.split(","))
try:
raw_bench = yaml.safe_load(Path(bench_models).read_text(encoding="utf-8")) or {}
bench_entries: list[dict] = raw_bench.get("models", []) or []
bench_id_set = {m.get("id", "") for m in bench_entries if isinstance(m, dict)}
missing_ids = requested_ids - bench_id_set
if missing_ids:
from app.models import list_installed
installed_map = {
m["model_id"]: m
for m in list_installed()
if m.get("model_id") and m.get("service") in _BENCH_SERVICES
}
extra: list[dict] = []
for mid in missing_ids:
if mid in installed_map:
inst = installed_map[mid]
entry: dict[str, Any] = {
"id": mid,
"name": mid.split("/", 1)[-1] if "/" in mid else mid,
"service": inst.get("service", "cf-text"),
"vram_estimate_mb": inst.get("vram_mb") or 0,
"tags": [inst.get("role", "generator")],
"temperature": 0.0,
}
local_path = inst.get("path", "") or inst.get("local_path", "")
if local_path:
entry["model_path"] = local_path
extra.append(entry)
if extra:
merged = {"models": bench_entries + extra}
tf = tempfile.NamedTemporaryFile(
mode="w", suffix=".yaml", delete=False,
prefix="avocet_bench_models_",
)
yaml.dump(merged, tf)
tf.close()
_tmp_models_path = tf.name
effective_models_file = _tmp_models_path
except Exception as exc:
logger.warning("Could not merge installed models into temp bench file: %s", exc)
cmd = [
python_bin,
bench_script,
"--tasks", bench_tasks,
"--models", bench_models,
"--models", effective_models_file,
"--output", results_dir,
]
if task_ids:
cmd.extend(["--filter-tasks"] + task_ids.split(","))
if model_ids:
cmd.extend(["--filter-models"] + model_ids.split(","))
if model_tags:
cmd.extend(["--filter-tags"] + model_tags.split(","))
@ -233,6 +410,15 @@ def run_benchmark(
cmd.extend(["--coordinator", effective_coordinator])
if effective_ollama:
cmd.extend(["--ollama-url", effective_ollama])
effective_judge = judge_url if judge_url else cfg_judge_url
if effective_judge:
cmd.extend(["--judge-url", effective_judge])
if judge_backend and judge_backend != "chat":
cmd.extend(["--judge-backend", judge_backend])
if workers > 1:
cmd.extend(["--workers", str(workers)])
if node_ids:
cmd.extend(["--nodes"] + node_ids.split(","))
# Pass license key as env var so subprocess can authenticate with cf-orch
proc_env = {**os.environ}
@ -273,6 +459,11 @@ def run_benchmark(
yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
finally:
_BENCH_RUNNING = False
if _tmp_models_path:
try:
os.unlink(_tmp_models_path)
except OSError:
pass
return StreamingResponse(
generate(),
@ -295,6 +486,7 @@ def get_cforch_config() -> dict:
"coordinator_url": cfg.get("coordinator_url", ""),
"ollama_url": cfg.get("ollama_url", ""),
"ollama_model": cfg.get("ollama_model", ""),
"judge_url": cfg.get("judge_url", ""),
"license_key_set": bool(cfg.get("license_key", "")),
"source": "env" if not _config_file().exists() else "yaml+env",
}
@ -303,7 +495,7 @@ def get_cforch_config() -> dict:
# ── GET /results ───────────────────────────────────────────────────────────────
@router.get("/results")
def get_results() -> dict:
def get_results() -> list:
"""Return the latest benchmark summary.json from results_dir."""
cfg = _load_cforch_config()
results_dir = cfg.get("results_dir", "")

View file

@ -15,6 +15,7 @@ from __future__ import annotations
import json
import logging
import os
import re
import shutil
import threading
from datetime import datetime, timezone
@ -60,6 +61,30 @@ _CF_ORCH_PROFILES_DIR: Path = Path(
router = APIRouter()
# ── HuggingFace auth ─────────────────────────────────────────────────────────
def _get_hf_token() -> str | None:
"""Return HF token from label_tool.yaml, then HF_TOKEN / HUGGING_FACE_HUB_TOKEN env vars."""
config_file = _ROOT / "config" / "label_tool.yaml"
if config_file.exists():
try:
import yaml as _yaml
raw = _yaml.safe_load(config_file.read_text(encoding="utf-8")) or {}
token = (raw.get("hf_token") or raw.get("cforch", {}).get("hf_token") or "").strip()
if token:
return token
except Exception:
pass
return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
# ── GGUF quantization detection ───────────────────────────────────────────────
# Matches quant identifiers in GGUF filenames: Q4_K_M, Q8_0, F16, IQ3_M, etc.
_QUANT_RE = re.compile(
r'[._-]((?:IQ\d|Q\d)[A-Z0-9_]*|F16|BF16)\.gguf$',
re.IGNORECASE,
)
# ── Download progress shared state ────────────────────────────────────────────
# Updated by the background download thread; read by GET /download/stream.
_download_progress: dict[str, Any] = {}
@ -91,12 +116,15 @@ _TAG_TO_INFO: dict[str, _TagInfo] = {
"audio-classification": {"adapter": None, "role": "classifier", "service": "cf-voice"},
# TTS — cf-tts text-to-speech service
"text-to-speech": {"adapter": None, "role": "tts", "service": "cf-tts"},
# Vision — cf-vision image classification / embedding / VLM service
# Vision classifiers / embedders — cf-vision (SigLIP/CLIP-style models)
"image-classification": {"adapter": None, "role": "vision", "service": "cf-vision"},
"zero-shot-image-classification": {"adapter": None, "role": "vision", "service": "cf-vision"},
"image-feature-extraction": {"adapter": None, "role": "embedding", "service": "cf-vision"},
"image-text-to-text": {"adapter": None, "role": "vlm", "service": "cf-vision"},
"visual-question-answering": {"adapter": None, "role": "vlm", "service": "cf-vision"},
# Generative VLMs (image+text → text) — run under vllm, not cf-vision.
# cf-vision is a classifier/embedder service; generative VLMs like Qwen-VL,
# LLaVA, and InternVL are textgen models that happen to accept image inputs.
"image-text-to-text": {"adapter": None, "role": "vlm", "service": "vllm"},
"visual-question-answering": {"adapter": None, "role": "vlm", "service": "vllm"},
# Image generation — cf-image (text → image; distinct from cf-vision image understanding)
"text-to-image": {"adapter": None, "role": "image-gen", "service": "cf-image"},
# Embedding — cf-core shared embedding layer
@ -195,10 +223,17 @@ def _get_queue_entry(entry_id: str) -> dict | None:
def _catalog_key(repo_id: str) -> str:
"""Derive a readable catalog key from repo_id.
ibm-granite/granite-4.1-8b granite-4.1-8b
facebook/bart-large-cnn bart-large-cnn
ibm-granite/granite-4.1-8b granite-4.1-8b
facebook/bart-large-cnn bart-large-cnn
WithinUsAI/Opus4.7-GODs.Ghost.Codex-4B.GGuF opus4.7-gods.ghost.codex-4b
The coordinator skips catalog lookup for keys ending in ".gguf" (treats them
as direct file paths). Strip the suffix so GGUF repo names produce valid keys.
"""
return repo_id.split("/", 1)[-1].lower()
key = repo_id.split("/", 1)[-1].lower()
if key.endswith(".gguf"):
key = key[:-5]
return key
def _insert_catalog_entry(content: str, entry_lines: str) -> str:
@ -290,6 +325,15 @@ def _register_in_node_catalogs(
max_mb: int = cf_text.get("max_mb", 0)
catalog: dict = cf_text.get("catalog") or {}
# If the node has a different local model dir, remap the NFS path.
model_base = cf_text.get("model_base_path", "").rstrip("/")
if model_base:
nfs_base = str(_CF_TEXT_MODELS_DIR).rstrip("/")
model_name = local_path.name
effective_path_str = f"{model_base}/{model_name}"
else:
effective_path_str = local_path_str
# Skip if key already exists
if model_key in catalog:
logger.debug("Key %r already in %s — skipping", model_key, yaml_file.name)
@ -301,10 +345,10 @@ def _register_in_node_catalogs(
for entry in catalog.values()
if isinstance(entry, dict)
}
if local_path_str in registered_paths or any(
p.startswith(local_path_str + "/") for p in registered_paths
if effective_path_str in registered_paths or any(
p.startswith(effective_path_str + "/") for p in registered_paths
):
logger.debug("Path %s already registered in %s — skipping", local_path_str, yaml_file.name)
logger.debug("Path %s already registered in %s — skipping", effective_path_str, yaml_file.name)
continue
# Determine whether model fits at FP16 or needs 4-bit
@ -330,12 +374,18 @@ def _register_in_node_catalogs(
if needs_4bit
else f" # FP16 file-size estimate"
)
env_block = (
f" env:\n"
f" CF_TEXT_4BIT: \"1\"\n"
if needs_4bit else ""
)
entry_block = (
f" # auto-registered by avocet on download\n"
f" {model_key}:\n"
f" path: {local_path_str}\n"
f" path: {effective_path_str}\n"
f" vram_mb: {vram_for_node}{vram_comment}\n"
f" description: \"{desc}\"\n"
f"{env_block}"
)
new_content = _insert_catalog_entry(content, entry_block)
@ -388,12 +438,17 @@ def _run_download(
role: str | None = None,
service: str | None = None,
model_size_bytes: int = 0,
quant_pattern: str | None = None,
) -> None:
"""Background thread: download model via huggingface_hub.snapshot_download.
model_size_bytes is the sum of file sizes reported by the HF API (siblings).
It is used to estimate vram_mb and written to model_info.json so cf-orch can
budget VRAM when allocating a cf-text instance for this model.
quant_pattern: when set, restricts snapshot_download to only files matching
*{quant_pattern}*.gguf (plus metadata). Avoids downloading every quant variant
from GGUF-only repos like bartowski/*.
"""
global _download_progress
local_dir = _model_dir_for(repo_id, service)
@ -422,10 +477,20 @@ def _run_download(
local_dir.mkdir(parents=True, exist_ok=True)
poll_thread.start()
snapshot_download(
repo_id=repo_id,
local_dir=str(local_dir),
)
dl_kwargs: dict[str, Any] = {"repo_id": repo_id, "local_dir": str(local_dir)}
hf_token = _get_hf_token()
if hf_token:
dl_kwargs["token"] = hf_token
if quant_pattern:
# Include both cases: repos use mixed conventions (Q6_K vs q6_k).
dl_kwargs["allow_patterns"] = [
f"*{quant_pattern.upper()}*.gguf",
f"*{quant_pattern.lower()}*.gguf",
"*.json",
"README.md",
]
snapshot_download(**dl_kwargs)
# Estimate VRAM from reported file size.
# HF siblings sizes are pre-quantisation file sizes; add 10% for KV cache
@ -531,9 +596,31 @@ def lookup_model(repo_id: str) -> dict:
)
logger.warning("Unsupported pipeline_tag %r for %s", pipeline_tag, repo_id)
# Estimate model size from siblings list
# Detect GGUF files and parse quant names from siblings list.
# For GGUF-only repos (bartowski, TheBloke, etc.) this lets the UI show
# a per-quant size picker instead of downloading every variant.
siblings = data.get("siblings") or []
model_size_bytes: int = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))
gguf_files: list[dict] = []
for s in siblings:
if not isinstance(s, dict):
continue
fname: str = s.get("rfilename", "")
if not fname.lower().endswith(".gguf"):
continue
m = _QUANT_RE.search(fname)
gguf_files.append({
"filename": fname,
"size": s.get("size", 0) or 0,
"quant_name": m.group(1).upper() if m else None,
})
gguf_files.sort(key=lambda f: f["size"])
# model_size_bytes: total of all siblings (for non-GGUF repos) or all GGUFs only.
# For GGUF repos the frontend will substitute the selected quant's size on submit.
if gguf_files:
model_size_bytes: int = sum(f["size"] for f in gguf_files)
else:
model_size_bytes = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))
# Description: first 300 chars of card data (modelId field used as fallback)
card_data = data.get("cardData") or {}
@ -549,6 +636,7 @@ def lookup_model(repo_id: str) -> dict:
"compatible": compatible,
"warning": warning,
"model_size_bytes": model_size_bytes,
"gguf_files": gguf_files if gguf_files else None,
"description": description,
"tags": data.get("tags") or [],
"downloads": data.get("downloads") or 0,
@ -579,6 +667,9 @@ class QueueAddRequest(BaseModel):
# Stored in the queue entry so approve can pass it to _run_download
# without a second HF API round-trip.
model_size_bytes: int = 0
# GGUF quantization pattern (e.g. "Q5_K_M"). When set, snapshot_download
# restricts to *{quant_pattern}*.gguf instead of fetching all variants.
quant_pattern: str | None = None
@router.post("/queue", status_code=201)
@ -597,6 +688,7 @@ def add_to_queue(req: QueueAddRequest) -> dict:
"role": req.role,
"service": req.service,
"model_size_bytes": req.model_size_bytes,
"quant_pattern": req.quant_pattern,
"status": "pending",
"queued_at": datetime.now(timezone.utc).isoformat(),
}
@ -629,6 +721,7 @@ def approve_queue_entry(entry_id: str) -> dict:
entry.get("role"),
entry.get("service"),
entry.get("model_size_bytes", 0),
entry.get("quant_pattern"),
),
daemon=True,
name=f"model-download-{entry_id}",
@ -638,6 +731,32 @@ def approve_queue_entry(entry_id: str) -> dict:
return {"ok": True}
# ── PATCH /queue/{id} ─────────────────────────────────────────────────────────
class QueuePatchRequest(BaseModel):
service: str | None = None
role: str | None = None
@router.patch("/queue/{entry_id}")
def patch_queue_entry(entry_id: str, body: QueuePatchRequest) -> dict:
"""Update mutable fields (service, role) on a pending queue entry."""
entry = _get_queue_entry(entry_id)
if entry is None:
raise HTTPException(404, f"Queue entry {entry_id!r} not found")
if entry.get("status") != "pending":
raise HTTPException(409, f"Only pending entries can be patched (current: {entry.get('status')!r})")
updates: dict = {}
if body.service is not None:
updates["service"] = body.service
if body.role is not None:
updates["role"] = body.role
updated = _update_queue_entry(entry_id, updates)
return updated or {}
# ── DELETE /queue/{id} ─────────────────────────────────────────────────────────
@router.delete("/queue/{entry_id}")

View file

@ -41,11 +41,15 @@ cforch:
# Python interpreter with cf-orch installed
python_bin: /devl/miniconda3/envs/cf/bin/python
# Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST
# Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST / CF_JUDGE_URL / HF_TOKEN
# coordinator_url: http://localhost:7700
# license_key: CFG-AVCT-xxxx-xxxx-xxxx
# ollama_url: http://localhost:11434
# ollama_model: llama3.2:3b
# judge_url: http://10.1.10.158:8008 # Sif cf-text — LLM-as-judge secondary scorer
# judge_url: http://10.1.10.71:8008 # Heimdall cf-text (alternative)
# Or set CF_JUDGE_URL. Populates the Judge URL field in the LLM Eval UI automatically.
# hf_token: hf_xxxxxxxxxxxxxxxxxxxx # HuggingFace token — required for gated/terms-restricted models
# Imitate tab — pull real samples from sibling CF product APIs and run them
# through local LLMs to build a corrections dataset.

View file

@ -90,6 +90,12 @@ usage() {
echo -e " ${GREEN}score [args]${NC} Shortcut: --score [args]"
echo -e " ${GREEN}compare [args]${NC} Shortcut: --compare [args]"
echo ""
echo " Planning Benchmark:"
echo -e " ${GREEN}plans-bench [args]${NC} Run benchmark_plans.py (args passed through)"
echo -e " ${GREEN}plans-list${NC} Shortcut: --list-models"
echo -e " ${GREEN}plans-run <model> [args]${NC} Run a single model (--verbose auto-added)"
echo -e " ${GREEN}plans-compare <m1> <m2> [more]${NC} Compare models side-by-side"
echo ""
echo " Writing Style Benchmark:"
echo -e " ${GREEN}style-bench [args]${NC} Run benchmark_style.py (args passed through)"
echo -e " ${GREEN}style-list${NC} List available ollama models for style bench"
@ -127,6 +133,8 @@ case "$CMD" in
fi
mkdir -p "$LOG_DIR"
API_LOG="${LOG_DIR}/api.log"
# Load .env if present — sets HF_TOKEN and other optional overrides.
[[ -f .env ]] && set -a && source .env && set +a
info "Building Vue SPA…"
(cd web && npm run build) >> "$API_LOG" 2>&1
info "Starting FastAPI on port ${API_PORT}"
@ -179,6 +187,9 @@ case "$CMD" in
mkdir -p "$LOG_DIR"
DEV_API_LOG="${LOG_DIR}/dev-api.log"
# Load .env if present — sets HF_TOKEN and other optional overrides.
[[ -f .env ]] && set -a && source .env && set +a
if [[ -f "$DEV_API_PID_FILE" ]] && kill -0 "$(<"$DEV_API_PID_FILE")" 2>/dev/null; then
warn "Dev API already running (PID $(<"$DEV_API_PID_FILE"))"
else
@ -255,6 +266,30 @@ case "$CMD" in
exec "$0" benchmark --compare "$@"
;;
plans-bench)
info "Running planning benchmark (${ENV_UI})…"
"$PYTHON_UI" scripts/benchmark_plans.py "$@"
;;
plans-list)
exec "$0" plans-bench --list-models
;;
plans-run)
if [[ $# -lt 1 ]]; then
error "Usage: ./manage.sh plans-run <model-key> [extra args]"
fi
MODEL="$1"; shift
exec "$0" plans-bench --model "$MODEL" --verbose "$@"
;;
plans-compare)
if [[ $# -lt 2 ]]; then
error "Usage: ./manage.sh plans-compare <model1> <model2> [more…]"
fi
exec "$0" plans-bench --compare "$@" --verbose
;;
style-bench)
info "Running writing style benchmark (${ENV_BM})…"
if [[ ! -x "$PYTHON_BM" ]]; then

719
scripts/benchmark_plans.py Normal file
View file

@ -0,0 +1,719 @@
#!/usr/bin/env python
"""CF-specific planning benchmark — compare base models before fine-tuning.
Sends held-out CircuitForge planning prompts to one or more models via the
cf-text (local) or cf-orch API, then scores responses against CF-specific
rubrics. Use this to select the best base model for SFT.
Scoring rubrics (each 0-1, summed to total/N):
- task_structure : uses checkbox syntax (- [ ]), git commit steps
- tier_awareness : mentions Free/Paid/Premium/Ultra tiers
- privacy_pillar : mentions privacy/local-inference/no-logging
- safety_pillar : mentions safety, human approval, or reversibility
- accessibility : mentions ND/accessibility/adaptive needs
- license_split : mentions MIT vs BSL or open-core model
- file_paths : uses plausible file path references
- cf_conventions : uses conda run -n cf, /Library/Development/, or known CF dirs
- paired_coherence : (paired only) plan references the design doc's feature name
- length_ok : 3002500 words (under-short = hallucination risk; over-long = padding)
Usage
-----
# List available model targets
python scripts/benchmark_plans.py --list-models
# Run all held-out prompts against a single model, print report
python scripts/benchmark_plans.py --model llama3.2-3b
# Compare two models side-by-side
python scripts/benchmark_plans.py --compare llama3.2-3b mistral-7b
# Run with a custom API base (cf-text default: http://localhost:8080/v1)
python scripts/benchmark_plans.py --model llama3.2-3b --api-base http://localhost:8080/v1
# Export detailed results JSON
python scripts/benchmark_plans.py --model llama3.2-3b --output data/bench_results.json
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Any
import httpx
# ── Paths ──────────────────────────────────────────────────────────────────────
_ROOT = Path(__file__).parent.parent
_DATA_DIR = _ROOT / "data"
CF_TEXT_BASE = "http://localhost:8080/v1"
CF_ORCH_BASE = "http://localhost:8090/v1"
CF_COORD_URL = "http://10.1.10.71:7700" # cf-orch coordinator (LAN)
# ── Held-out prompts ───────────────────────────────────────────────────────────
# These are NOT in the training export (no matching docs in circuitforge-plans/).
# Each prompt exercises a different CF planning domain.
HELD_OUT_PROMPTS: list[dict[str, Any]] = [
{
"id": "ho_001",
"name": "kiwi_barcode_ocr",
"domain": "feature_plan",
"prompt": (
"You are a senior engineer on Kiwi, a CircuitForge pantry-tracking product. "
"Write a detailed implementation plan for adding barcode scanning via device camera "
"and receipt OCR to the item-add flow.\n\n"
"The plan should include: file structure (create/modify), step-by-step task checklist "
"with checkboxes, any DB migrations, and git commit steps."
),
"expected_signals": ["task_structure", "file_paths", "cf_conventions"],
},
{
"id": "ho_002",
"name": "peregrine_ats_scoring",
"domain": "feature_design",
"prompt": (
"Write a design document for Peregrine: ATS keyword scoring for job applications.\n\n"
"Context: Peregrine users paste job descriptions and their resume. "
"We want to score how well the resume keywords match the JD and suggest rewrites. "
"Describe the architecture, data flow, and key design decisions."
),
"expected_signals": ["privacy_pillar", "tier_awareness", "license_split"],
},
{
"id": "ho_003",
"name": "tier_gate_local_llm",
"domain": "architecture",
"prompt": (
"Design the tier-gating architecture for a new CircuitForge product. "
"The product should:\n"
"- Default to local LLM inference for all tiers\n"
"- Unlock cloud LLM for Paid tier and above\n"
"- Keep fine-tuned model weights for Premium/Ultra only\n\n"
"Describe how the tier check integrates with the LLM router, "
"what happens when a Free user tries a Paid-tier feature, "
"and how BYOK (bring-your-own-key) fits in."
),
"expected_signals": ["tier_awareness", "privacy_pillar", "license_split"],
},
{
"id": "ho_004",
"name": "heimdall_webhook_plan",
"domain": "feature_plan",
"prompt": (
"Break the following Heimdall feature into a detailed implementation plan with "
"file structure and task checkboxes — Stripe webhook handler for subscription lifecycle.\n\n"
"Heimdall is the CircuitForge license server (FastAPI + SQLite). "
"The webhook needs to handle checkout.session.completed, "
"customer.subscription.updated, and customer.subscription.deleted events."
),
"expected_signals": ["task_structure", "file_paths", "safety_pillar"],
},
{
"id": "ho_005",
"name": "nd_accessible_onboarding",
"domain": "ux_design",
"prompt": (
"You are a product designer working on Harrier, a CircuitForge tool for "
"helping people navigate government benefits applications.\n\n"
"Design the onboarding flow for neurodivergent (ND) users. "
"Consider: ADHD time-blindness, executive function challenges, demand avoidance, "
"and rejection sensitivity. The flow should reduce cognitive load and "
"never use urgency or panic patterns."
),
"expected_signals": ["accessibility", "safety_pillar", "privacy_pillar"],
},
{
"id": "ho_006",
"name": "circuitforge_core_extraction",
"domain": "architecture",
"prompt": (
"Produce a CircuitForge-style design document for the following circuitforge-core "
"feature — shared ActivityPub federation module.\n\n"
"Background: Multiple CF products (Kiwi, Rook, Snipe) want to publish updates "
"to ActivityPub. Build it once in cf-core (MIT licensed) so all products can use it. "
"Design the module API, describe what belongs in MIT vs BSL, and note federation "
"privacy constraints."
),
"expected_signals": ["license_split", "privacy_pillar", "cf_conventions"],
},
{
"id": "ho_007",
"name": "snipe_trust_score_plan",
"domain": "feature_plan",
"prompt": (
"You are a senior engineer on Snipe, a CircuitForge eBay trust-scoring tool. "
"Write a step-by-step engineering plan for: seller trust score calculation.\n\n"
"The score should combine: feedback ratio, account age, item-specifics completeness, "
"listing photo quality, and shipping time accuracy. "
"Include file structure, test plan, and migration steps."
),
"expected_signals": ["task_structure", "file_paths", "safety_pillar"],
},
{
"id": "ho_008",
"name": "avocet_training_pipeline",
"domain": "feature_plan",
"prompt": (
"Break the following Avocet feature into a detailed implementation plan — "
"end-to-end fine-tuning pipeline from labeled JSONL to deployed GGUF model.\n\n"
"Avocet is the CircuitForge email classifier training tool. "
"The pipeline should: validate the dataset, run LoRA SFT via unsloth, "
"quantize to Q5_K_M GGUF, run the benchmark harness, and register the model "
"in the Avocet model queue if it beats the baseline."
),
"expected_signals": ["task_structure", "file_paths", "cf_conventions"],
},
{
"id": "ho_009",
"name": "privacy_data_flow",
"domain": "architecture",
"prompt": (
"Design the data privacy architecture for a CircuitForge cloud product. "
"Describe: what PII is collected, how it's stored, retention policy, "
"obfuscation strategy for cloud-side logs, and how consent is obtained "
"in plain language. The product handles job applications (resumes, cover letters)."
),
"expected_signals": ["privacy_pillar", "safety_pillar", "accessibility"],
},
{
"id": "ho_010",
"name": "git_workflow_doc",
"domain": "process_doc",
"prompt": (
"Write a developer process document for CircuitForge: conventional commit and "
"branch workflow for a BSL 1.1 open-core product.\n\n"
"Cover: commit message format (type: description), branch naming, "
"when to use feature branches vs direct main commits, "
"how the MIT/BSL split affects which commits go in which branch, "
"and how CI gates on gitleaks for secret scanning."
),
"expected_signals": ["license_split", "cf_conventions", "task_structure"],
},
]
# ── Rubric scoring ─────────────────────────────────────────────────────────────
_TASK_STRUCTURE_RE = re.compile(r"- \[ \]", re.MULTILINE)
_COMMIT_RE = re.compile(r"git commit|git add", re.IGNORECASE)
_TIER_RE = re.compile(r"\b(Free|Paid|Premium|Ultra)\s+tier|\btier\s+(Free|Paid|Premium|Ultra)", re.IGNORECASE)
_PRIVACY_RE = re.compile(r"\b(privacy|local.?inference|no.?logging|no.?pii|user.?data|data.?reten|obfuscat)", re.IGNORECASE)
_SAFETY_RE = re.compile(r"\b(human.?approv|reversib|safety|safe.?default|fail.?safe|harm)", re.IGNORECASE)
_A11Y_RE = re.compile(r"\b(neurodiverg|ND\b|accessib|adaptive|ADHD|autism|executive.?function|demand.?avoid)", re.IGNORECASE)
_LICENSE_RE = re.compile(r"\b(MIT|BSL|open.?core|proprietary|commercial.?licens)", re.IGNORECASE)
_FILE_PATH_RE = re.compile(r"(app/|tests?/|src/|scripts?/)\w[\w/.-]{3,}", re.IGNORECASE)
_CF_CONV_RE = re.compile(r"(conda run -n cf|/Library/Development/CircuitForge|circuitforge-core|manage\.sh)", re.IGNORECASE)
@dataclass
class RubricScore:
task_structure: float = 0.0
tier_awareness: float = 0.0
privacy_pillar: float = 0.0
safety_pillar: float = 0.0
accessibility: float = 0.0
license_split: float = 0.0
file_paths: float = 0.0
cf_conventions: float = 0.0
length_ok: float = 0.0
def total(self) -> float:
vals = [self.task_structure, self.tier_awareness, self.privacy_pillar,
self.safety_pillar, self.accessibility, self.license_split,
self.file_paths, self.cf_conventions, self.length_ok]
return sum(vals) / len(vals)
def as_dict(self) -> dict[str, float]:
return asdict(self)
def score_response(response: str, prompt_meta: dict[str, Any]) -> RubricScore:
words = len(response.split())
s = RubricScore()
# Task structure: needs checkboxes AND at least one commit step
checkbox_hits = len(_TASK_STRUCTURE_RE.findall(response))
has_commit = bool(_COMMIT_RE.search(response))
s.task_structure = min(1.0, checkbox_hits / 5) * 0.7 + (0.3 if has_commit else 0.0)
# Tier awareness
s.tier_awareness = min(1.0, len(_TIER_RE.findall(response)) / 2)
# Privacy pillar
s.privacy_pillar = min(1.0, len(_PRIVACY_RE.findall(response)) / 3)
# Safety pillar
s.safety_pillar = min(1.0, len(_SAFETY_RE.findall(response)) / 2)
# Accessibility
s.accessibility = min(1.0, len(_A11Y_RE.findall(response)) / 2)
# License split awareness
s.license_split = min(1.0, len(_LICENSE_RE.findall(response)) / 2)
# File paths: at least 3 plausible path references
s.file_paths = min(1.0, len(_FILE_PATH_RE.findall(response)) / 3)
# CF conventions
s.cf_conventions = min(1.0, len(_CF_CONV_RE.findall(response)) / 2)
# Length: 2002500 words is healthy; outside = partial credit
if 200 <= words <= 2500:
s.length_ok = 1.0
elif words < 200:
s.length_ok = words / 200
else:
s.length_ok = max(0.0, 1.0 - (words - 2500) / 2500)
return s
# ── Model client ───────────────────────────────────────────────────────────────
# Registry of named model targets (shorthand → {api_base, model_name})
MODEL_REGISTRY: dict[str, dict[str, str]] = {
"deepseek-r1-1.5b": {
"api_base": CF_TEXT_BASE,
"model": "deepseek-r1-1.5b",
"description": "DeepSeek R1 1.5B distill (cf-orch catalog key)",
},
"deepseek-r1-7b-4bit": {
"api_base": CF_TEXT_BASE,
"model": "deepseek-r1-7b-4bit",
"description": "DeepSeek R1 7B distill, 4-bit (cf-orch catalog key)",
},
"deepseek-coder-6.7b-4bit": {
"api_base": CF_TEXT_BASE,
"model": "deepseek-coder-6.7b-4bit",
"description": "DeepSeek Coder 6.7B instruct, 4-bit (cf-orch catalog key)",
},
"granite-4.1-8b": {
"api_base": CF_TEXT_BASE,
"model": "granite-4.1-8b",
"description": "IBM Granite 4.1 8B, 4-bit (cf-orch catalog key)",
},
"qwen2.5-3b": {
"api_base": CF_TEXT_BASE,
"model": "qwen2.5-3b",
"description": "Qwen 2.5 3B Q4 GGUF (cf-orch catalog key, navi only)",
},
"qwen2.5-7b": {
"api_base": CF_TEXT_BASE,
"model": "qwen2.5-7b",
"description": "Qwen 2.5 7B Q4 GGUF (cf-orch catalog key, navi only)",
},
}
# ── cf-orch allocation ─────────────────────────────────────────────────────────
def _cforch_allocate(
model_id: str,
cforch_url: str,
startup_timeout_s: float = 300.0,
) -> tuple[str, str] | None:
"""Allocate a cf-text instance for model_id via the cf-orch coordinator.
Returns (service_url, allocation_id) on success, None on failure.
service_url is the direct node URL exposing /v1/chat/completions.
"""
try:
resp = httpx.post(
f"{cforch_url}/api/services/cf-text/allocate",
json={
"model_candidates": [model_id],
"caller": "avocet",
"pipeline": "plans_benchmark",
},
timeout=120.0,
)
resp.raise_for_status()
data = resp.json()
service_url: str = data["url"]
allocation_id: str = data.get("allocation_id", "")
node_id: str = data.get("node_id", "")
gpu_id: int | None = data.get("gpu_id")
if data.get("started", False) and not data.get("warm", True):
# Use \n so the SSE generator sees the line immediately
print(f" [cold start] loading {model_id!r} — polling every 3s…", flush=True)
t0 = time.monotonic()
deadline = t0 + startup_timeout_s
probe_misses = 0
while time.monotonic() < deadline:
elapsed = time.monotonic() - t0
try:
status = httpx.get(f"{cforch_url}/api/services/cf-text/status", timeout=5.0)
if status.is_success:
instances = status.json().get("instances", [])
match = next(
(i for i in instances
if i.get("node_id") == node_id and i.get("gpu_id") == gpu_id),
None,
)
if match:
probe_misses = 0
state = match.get("state", "")
if state == "running":
print(f" [cold start] ready in {elapsed:.0f}s", flush=True)
return service_url, allocation_id
elif state == "stopped":
print(f" [cold start] failed — service stopped after {elapsed:.0f}s", flush=True)
return None
else:
# still starting — emit keepalive so SSE stream stays alive
print(f" [cold start] state={state!r} elapsed={elapsed:.0f}s", flush=True)
else:
probe_misses += 1
print(f" [cold start] waiting… elapsed={elapsed:.0f}s", flush=True)
if probe_misses >= 6:
try:
h = httpx.get(f"{service_url}/health", timeout=3.0)
if h.is_success:
print(f" [cold start] ready via health check in {elapsed:.0f}s", flush=True)
return service_url, allocation_id
except Exception:
pass
else:
print(f" [cold start] status poll returned {status.status_code}, elapsed={elapsed:.0f}s", flush=True)
except Exception as poll_exc:
print(f" [cold start] poll error: {poll_exc} elapsed={elapsed:.0f}s", flush=True)
time.sleep(3.0)
print(f" [cold start] timed out after {time.monotonic()-t0:.0f}s", flush=True)
return None
return service_url, allocation_id
except Exception as exc:
print(f"[warn] cf-orch allocation failed for {model_id!r}: {exc}", file=sys.stderr)
return None
def _call_model_direct(service_url: str, model: str, prompt: str, timeout: int = 600) -> tuple[str, float]:
"""Call an OpenAI-compatible /v1/chat/completions on a direct service URL."""
t0 = time.monotonic()
resp = httpx.post(
f"{service_url.rstrip('/')}/v1/chat/completions",
json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 2048,
"temperature": 0.2,
},
timeout=timeout,
)
resp.raise_for_status()
latency = time.monotonic() - t0
text = resp.json()["choices"][0]["message"]["content"]
return text, latency
def _call_model(api_base: str, model: str, prompt: str, timeout: int = 180) -> tuple[str, float]:
"""Call an OpenAI-compatible /chat/completions endpoint. Returns (text, latency_s)."""
t0 = time.monotonic()
resp = httpx.post(
f"{api_base}/chat/completions",
json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 2048,
"temperature": 0.2,
},
timeout=timeout,
)
resp.raise_for_status()
latency = time.monotonic() - t0
text = resp.json()["choices"][0]["message"]["content"]
return text, latency
# ── Benchmark runner ───────────────────────────────────────────────────────────
@dataclass
class PromptResult:
prompt_id: str
prompt_name: str
model_key: str
response: str
latency_s: float
word_count: int
scores: dict[str, float]
total_score: float
error: str | None = None
def run_benchmark(
model_key: str,
model_name: str,
prompts: list[dict[str, Any]] | None = None,
verbose: bool = False,
# cf-orch path
use_cforch: bool = False,
cforch_url: str = CF_COORD_URL,
# direct path (used when not cf-orch)
api_base: str = CF_TEXT_BASE,
) -> list[PromptResult]:
"""Run all prompts through one model. Uses cf-orch allocation when use_cforch=True."""
if prompts is None:
prompts = HELD_OUT_PROMPTS
# Allocate once per model when using cf-orch
service_url: str | None = None
if use_cforch:
print(f" Allocating {model_name!r} via cf-orch…", flush=True)
alloc = _cforch_allocate(model_name, cforch_url)
if alloc is None:
# Return all prompts as errors
return [
PromptResult(
prompt_id=p["id"], prompt_name=p["name"], model_key=model_key,
response="", latency_s=0.0, word_count=0, scores={}, total_score=0.0,
error=f"cf-orch allocation failed for {model_name!r}",
)
for p in prompts
]
service_url, _alloc_id = alloc
results: list[PromptResult] = []
for p in prompts:
if verbose:
print(f" [{p['id']}] {p['name']}", end="", flush=True)
try:
if service_url:
response, latency = _call_model_direct(service_url, model_name, p["prompt"])
else:
response, latency = _call_model(api_base, model_name, p["prompt"])
rubric = score_response(response, p)
result = PromptResult(
prompt_id=p["id"],
prompt_name=p["name"],
model_key=model_key,
response=response,
latency_s=round(latency, 2),
word_count=len(response.split()),
scores=rubric.as_dict(),
total_score=round(rubric.total(), 3),
)
if verbose:
print(f"score={result.total_score:.3f} ({result.word_count}w, {latency:.1f}s)")
except Exception as exc:
result = PromptResult(
prompt_id=p["id"],
prompt_name=p["name"],
model_key=model_key,
response="",
latency_s=0.0,
word_count=0,
scores={},
total_score=0.0,
error=str(exc),
)
if verbose:
print(f"ERROR: {exc}")
results.append(result)
return results
# ── Reporting ──────────────────────────────────────────────────────────────────
def _print_single_report(results: list[PromptResult], model_key: str) -> None:
ok = [r for r in results if not r.error]
err = [r for r in results if r.error]
if not ok:
print(f"\n[{model_key}] All {len(err)} prompts failed.\n")
return
avg_total = sum(r.total_score for r in ok) / len(ok)
avg_latency = sum(r.latency_s for r in ok) / len(ok)
# Aggregate per-rubric averages
rubric_keys = list(ok[0].scores.keys())
rubric_avgs = {k: sum(r.scores.get(k, 0) for r in ok) / len(ok) for k in rubric_keys}
print(f"\n{'='*60}")
print(f" Model : {model_key}")
print(f" Prompts: {len(ok)}/{len(results)} passed ({len(err)} errors)")
print(f" Overall score : {avg_total:.3f} (avg latency {avg_latency:.1f}s)")
print(f"\n Rubric breakdown:")
for k, v in sorted(rubric_avgs.items(), key=lambda x: -x[1]):
bar = "" * int(v * 20)
print(f" {k:<22} {v:.3f} {bar}")
print(f"\n Per-prompt scores:")
for r in sorted(ok, key=lambda x: -x.total_score):
flag = "" if r.total_score < 0.3 else " "
print(f" {flag} {r.prompt_id} {r.prompt_name:<35} {r.total_score:.3f} ({r.word_count}w)")
if err:
print(f"\n Errors:")
for r in err:
print(f" {r.prompt_id} {r.prompt_name}: {r.error}")
print(f"{'='*60}\n")
def _print_comparison_table(all_results: dict[str, list[PromptResult]]) -> None:
model_keys = list(all_results.keys())
prompt_ids = [p["id"] for p in HELD_OUT_PROMPTS]
# Scores by (model, prompt_id)
score_map: dict[tuple[str, str], float] = {}
for mk, results in all_results.items():
for r in results:
score_map[(mk, r.prompt_id)] = r.total_score if not r.error else 0.0
col_w = 10
header = f"{'Prompt':<35}" + "".join(f"{mk[:col_w-1]:<{col_w}}" for mk in model_keys)
print(f"\n{'='*len(header)}")
print(" COMPARISON TABLE")
print(f"{'='*len(header)}")
print(f" {header}")
print(f" {'-'*len(header)}")
for pid in prompt_ids:
pname = next(p["name"] for p in HELD_OUT_PROMPTS if p["id"] == pid)
row = f" {pname:<35}"
best = max(score_map.get((mk, pid), 0.0) for mk in model_keys)
for mk in model_keys:
v = score_map.get((mk, pid), 0.0)
marker = "*" if v == best and len(model_keys) > 1 else " "
row += f"{v:.3f}{marker} "
print(row)
print(f" {'-'*len(header)}")
avgs_row = f" {'AVERAGE':<35}"
best_avg = -1.0
avgs: dict[str, float] = {}
for mk in model_keys:
vals = [score_map.get((mk, pid), 0.0) for pid in prompt_ids]
avgs[mk] = sum(vals) / len(vals)
best_avg = max(best_avg, avgs[mk])
for mk in model_keys:
marker = "*" if avgs[mk] == best_avg and len(model_keys) > 1 else " "
avgs_row += f"{avgs[mk]:.3f}{marker} "
print(avgs_row)
print(f"{'='*len(header)}\n")
if len(model_keys) > 1:
winner = max(avgs, key=lambda k: avgs[k])
print(f" Winner: {winner} (avg {avgs[winner]:.3f})\n")
# ── CLI ────────────────────────────────────────────────────────────────────────
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("--list-models", action="store_true",
help="Print registered model shortcuts and exit")
parser.add_argument("--model", metavar="KEY",
help="Benchmark a single model (registry key or raw model name)")
parser.add_argument("--compare", nargs="+", metavar="KEY",
help="Compare two or more models side-by-side")
parser.add_argument("--cforch", action="store_true",
help="Route inference through cf-orch coordinator (allocate per model)")
parser.add_argument("--cforch-url", default=CF_COORD_URL, metavar="URL",
help=f"cf-orch coordinator URL (default: {CF_COORD_URL})")
parser.add_argument("--api-base", default=None,
help="Direct API base URL when not using cf-orch")
parser.add_argument("--model-name", default=None,
help="Override model name sent to API (single-model runs only)")
parser.add_argument("--prompts", nargs="+", metavar="ID",
help="Run only specific prompt IDs (e.g. ho_001 ho_003)")
parser.add_argument("--output", type=Path, default=None,
help="Write detailed JSON results to this path")
parser.add_argument("--workers", type=int, default=1, metavar="N",
help="Run N models concurrently (default 1). Set to number of available nodes.")
parser.add_argument("--verbose", "-v", action="store_true",
help="Print per-prompt progress")
args = parser.parse_args()
if args.list_models:
print("\nRegistered model shortcuts:")
for key, info in MODEL_REGISTRY.items():
print(f" {key:<20} {info['description']}")
print(f"\nDefault endpoints:")
print(f" direct {CF_TEXT_BASE}")
print(f" cf-orch {CF_COORD_URL}")
return
prompts = HELD_OUT_PROMPTS
if args.prompts:
ids = set(args.prompts)
prompts = [p for p in HELD_OUT_PROMPTS if p["id"] in ids]
if not prompts:
print(f"No prompts matched IDs: {args.prompts}", file=sys.stderr)
sys.exit(1)
model_keys: list[str] = []
if args.compare:
model_keys = args.compare
elif args.model:
model_keys = [args.model]
else:
parser.print_help()
sys.exit(0)
all_results: dict[str, list[PromptResult]] = {}
print_lock = threading.Lock()
def _run_one(mk: str) -> tuple[str, list[PromptResult]]:
if mk in MODEL_REGISTRY:
reg = MODEL_REGISTRY[mk]
model_name = args.model_name or reg["model"]
direct_base = args.api_base or reg["api_base"]
else:
model_name = args.model_name or mk
direct_base = args.api_base or CF_TEXT_BASE
if args.cforch:
with print_lock:
print(f"\nRunning [{mk}] via cf-orch ({args.cforch_url}) model={model_name}")
results = run_benchmark(
mk, model_name, prompts=prompts, verbose=args.verbose,
use_cforch=True, cforch_url=args.cforch_url,
)
else:
with print_lock:
print(f"\nRunning [{mk}] → {direct_base} model={model_name}")
results = run_benchmark(
mk, model_name, prompts=prompts, verbose=args.verbose,
api_base=direct_base,
)
with print_lock:
_print_single_report(results, mk)
return mk, results
workers = max(1, args.workers)
if workers == 1 or len(model_keys) == 1:
for mk in model_keys:
mk_out, results = _run_one(mk)
all_results[mk_out] = results
else:
with ThreadPoolExecutor(max_workers=workers) as pool:
futures = {pool.submit(_run_one, mk): mk for mk in model_keys}
for fut in as_completed(futures):
mk_out, results = fut.result()
all_results[mk_out] = results
if len(model_keys) > 1:
_print_comparison_table(all_results)
if args.output:
args.output.parent.mkdir(parents=True, exist_ok=True)
payload = {
mk: [asdict(r) for r in results]
for mk, results in all_results.items()
}
with open(args.output, "w", encoding="utf-8") as f:
json.dump(payload, f, indent=2, ensure_ascii=False)
print(f"Wrote detailed results to {args.output}")
if __name__ == "__main__":
main()

458
scripts/export_plans.py Normal file
View file

@ -0,0 +1,458 @@
"""Export circuitforge-plans/ documents as instruction-tuning JSONL pairs.
Each record is a HuggingFace chat-format example:
{
"id": "<sha256>",
"messages": [
{"role": "user", "content": "<reconstructed planning prompt>"},
{"role": "assistant", "content": "<cleaned document content>"}
],
"meta": {
"source": "peregrine/2026-03-03-feedback-button-design.md",
"product": "peregrine",
"doc_type": "design", # design | plan | spec | implementation | other
"date": "2026-03-03",
"paired_with": "...", # sibling path, or null
"word_count": 1847,
"pair_role": "context" # "context" | "target" | "standalone"
}
}
Pairing strategy
----------------
When a design doc and a plan doc share the same date + feature-name prefix,
they are treated as a pair:
- design plan: instruction = "Given this design doc, write the implementation plan."
context appended = full design doc content.
- Solo docs get a synthetic instruction from the title + first overview section.
Usage
-----
# Preview stats and 5 sample records
python scripts/export_plans.py --preview
# Write full output
python scripts/export_plans.py --output data/plan_pairs.jsonl
# Restrict to specific products
python scripts/export_plans.py --products peregrine,kiwi --output data/plan_pairs.jsonl
"""
from __future__ import annotations
import argparse
import hashlib
import json
import re
import sys
from pathlib import Path
from typing import Iterator
# ── Paths ──────────────────────────────────────────────────────────────────────
_SCRIPT_DIR = Path(__file__).parent
_AVOCET_ROOT = _SCRIPT_DIR.parent
_DEFAULT_PLANS_DIR = Path("/Library/Development/CircuitForge/circuitforge-plans")
_DEFAULT_OUTPUT = _AVOCET_ROOT / "data" / "plan_pairs.jsonl"
# ── Doc type detection ─────────────────────────────────────────────────────────
_TYPE_RE = re.compile(
r"-(design|plan|spec|implementation|specs|plans)s?$",
re.IGNORECASE,
)
_SKIP_DIRS = {"__pycache__", ".git", "node_modules"}
# Boilerplate lines to strip from document content before using as output.
_BOILERPLATE_RE = re.compile(
r"""
^\s*>\s*\*\*For\s+agentic\s+workers.* # superpowers agent hints
|^\s*>\s*REQUIRED\s+SUB-SKILL.*
|^\s*\*\*Date:\*\*.* # metadata header lines
|\*\*Status:\*\*\s*Complete.* # completed-feature noise
|\*\*Status:\*\*\s*Done.*
|\*\*Product:\*\*.*
|\*\*Repo:\*\*.*
|\*\*Tech\s+Stack:\*\*.*
|\*\*Candidate:\*\*.* # old synthetic personas
|^Candidate:.*
|^Team:.*
""",
re.VERBOSE | re.MULTILINE,
)
# Old repo/path names to normalise to current equivalents.
_PATH_NORMALIZATIONS: list[tuple[re.Pattern, str]] = [
(re.compile(r"/devl/job-seeker", re.IGNORECASE), "/Library/Development/CircuitForge/peregrine"),
(re.compile(r"\bjob-seeker\b", re.IGNORECASE), "peregrine"),
(re.compile(r"Alex Rivera", re.IGNORECASE), "[user]"),
]
# Instruction paraphrase templates per doc type.
# Each entry is (user_prefix, paired_prefix).
# {title}, {product}, {type_phrase}, {overview}, {design_context} are substituted.
_DESIGN_INSTRUCTIONS = [
"Write a design document for {product}: {title}.\n\nContext: {overview}",
"You are a software architect working on {product}. Draft a design spec for: {title}.\n\n{overview}",
"Produce a CircuitForge-style design document for the following {product} feature — {title}.\n\nBackground: {overview}",
]
_PLAN_INSTRUCTIONS = [
"Write an implementation plan for {product}: {title}.\n\nContext: {overview}",
"Break the following {product} feature into a detailed implementation plan with file structure and task checkboxes — {title}.\n\n{overview}",
"You are a senior engineer on {product}. Produce a step-by-step engineering plan for: {title}.\n\n{overview}",
]
_PAIRED_INSTRUCTIONS = [
(
"You are a software architect working on {product}, a CircuitForge product. "
"Given the following design document, write a detailed implementation plan "
"(file structure, task breakdown with checkboxes, migration steps if needed).\n\n"
"---\n{design_context}\n---"
),
(
"The following is a design spec for a {product} feature. "
"Produce a concrete implementation plan: file list, task checklist, any DB migrations needed.\n\n"
"---\n{design_context}\n---"
),
(
"Convert this {product} design document into an actionable implementation plan. "
"Include all files to create/modify, step-by-step tasks with checkboxes, and migration steps.\n\n"
"---\n{design_context}\n---"
),
]
def _doc_type(stem: str) -> str:
m = _TYPE_RE.search(stem)
if not m:
return "other"
raw = m.group(1).lower().rstrip("s")
return {"implementation": "plan"}.get(raw, raw)
def _date_feature(stem: str) -> tuple[str, str]:
"""Return (date, feature_slug) from '2026-03-03-feedback-button-design'."""
m = re.match(r"^(\d{4}-\d{2}-\d{2})-(.+?)(?:-(design|plan|spec|implementation)s?)?$", stem, re.I)
if m:
return m.group(1), m.group(2)
return "", stem
# ── Content extraction ─────────────────────────────────────────────────────────
def _extract_title(content: str) -> str:
m = re.search(r"^#\s+(.+)", content, re.MULTILINE)
return m.group(1).strip() if m else ""
def _extract_overview(content: str) -> str:
"""Return first substantive paragraph or h2 section body (≤300 chars)."""
# Superpowers plans have an explicit **Goal:** line — prefer that.
goal_m = re.search(r"\*\*Goal:\*\*\s*(.+)", content)
if goal_m:
return goal_m.group(1).strip()[:300]
# Otherwise use the body of the first h2 section.
h2_m = re.search(
r"^##\s+\d*\.?\s*.+\n([\s\S]+?)(?=^##|\Z)",
content,
re.MULTILINE,
)
if h2_m:
body = h2_m.group(1).strip()
# Strip markdown bullet/code noise for the instruction
body = re.sub(r"```[\s\S]*?```", "", body)
body = re.sub(r"`[^`]+`", lambda m: m.group().strip("`"), body)
body = re.sub(r"\*\*([^*]+)\*\*", r"\1", body)
body = re.sub(r"\s+", " ", body).strip()
return body[:300]
return ""
def _clean_content(content: str) -> str:
"""Remove boilerplate, normalize old paths/names, collapse whitespace."""
cleaned = _BOILERPLATE_RE.sub("", content)
for pattern, replacement in _PATH_NORMALIZATIONS:
cleaned = pattern.sub(replacement, cleaned)
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
return cleaned.strip()
def _quality_flags(content: str) -> list[str]:
"""Return a list of quality issue labels found in cleaned content."""
flags = []
if "Alex Rivera" in content or "[user]" in content:
flags.append("persona-residue")
if re.search(r"\bStatus:\s*(Complete|Done|Merged)\b", content):
flags.append("completed-status")
return flags
def _make_instruction(
title: str,
product: str,
doc_type: str,
overview: str,
design_context: str | None = None,
variant: int = 0,
) -> str:
"""Synthesise a natural planning prompt for this document.
variant: 0-2 selects which paraphrase template to use. Caller cycles
through all three to produce multiple training examples per document.
"""
product_label = product.replace("-", " ").title() if product else "CircuitForge"
idx = variant % 3
if design_context:
tmpl = _PAIRED_INSTRUCTIONS[idx]
return tmpl.format(
product=product_label,
design_context=design_context[:2500],
)
templates = _PLAN_INSTRUCTIONS if doc_type in ("plan",) else _DESIGN_INSTRUCTIONS
tmpl = templates[idx]
return tmpl.format(
product=product_label,
title=title,
overview=overview or "",
type_phrase="planning document",
)
def _record_id(content: str, source: str) -> str:
return hashlib.sha256(f"{source}:{content}".encode()).hexdigest()[:16]
# ── Pair discovery ─────────────────────────────────────────────────────────────
def _find_pairs(plans_dir: Path) -> dict[str, list[tuple[str, Path]]]:
"""Return {prefix_key → [(doc_type, path), ...]} for docs sharing date+feature."""
by_prefix: dict[str, list[tuple[str, Path]]] = {}
for path in plans_dir.rglob("*.md"):
if any(part in _SKIP_DIRS for part in path.parts):
continue
if path.name == "README.md":
continue
stem = path.stem
date, feature = _date_feature(stem)
if not date:
continue
key = str(path.parent / f"{date}-{feature}")
by_prefix.setdefault(key, []).append((_doc_type(stem), path))
return by_prefix
# ── Record generation ──────────────────────────────────────────────────────────
def _records_for_group(
doc_type_paths: list[tuple[str, Path]],
plans_dir: Path,
) -> Iterator[dict]:
"""Yield one or more training records for a group of related docs."""
# Separate design vs plan docs within this group
designs = [(t, p) for t, p in doc_type_paths if t in ("design", "spec")]
plans_ = [(t, p) for t, p in doc_type_paths if t in ("plan",)]
others = [(t, p) for t, p in doc_type_paths if t not in ("design", "spec", "plan")]
all_paths = doc_type_paths
if designs and plans_:
# Paired: yield a design→plan record (3 instruction variants)
design_type, design_path = designs[0]
plan_type, plan_path = plans_[0]
design_content = design_path.read_text(encoding="utf-8")
plan_content = plan_path.read_text(encoding="utf-8")
product = _product_from_path(plan_path, plans_dir)
title = _extract_title(plan_content) or plan_path.stem
cleaned = _clean_content(plan_content)
design_cleaned = _clean_content(design_content)
flags = _quality_flags(cleaned)
if len(cleaned.split()) >= 80:
rel_src = str(plan_path.relative_to(plans_dir))
rel_design = str(design_path.relative_to(plans_dir))
for variant in range(3):
instruction = _make_instruction(
title=title,
product=product,
doc_type="plan",
overview=_extract_overview(design_content),
design_context=design_cleaned,
variant=variant,
)
yield {
"id": _record_id(f"v{variant}:{cleaned}", rel_src),
"messages": [
{"role": "user", "content": instruction},
{"role": "assistant", "content": cleaned},
],
"meta": {
"source": rel_src,
"product": product,
"doc_type": "plan",
"date": _date_feature(plan_path.stem)[0],
"paired_with": rel_design,
"word_count": len(cleaned.split()),
"pair_role": "target",
"variant": variant,
"quality_flags": flags,
},
}
# Also yield the design doc as standalone variants
all_paths = [(t, p) for t, p in all_paths if p != plan_path]
# Remaining docs as standalone records (3 instruction variants each)
for doc_type, path in all_paths:
content = path.read_text(encoding="utf-8")
cleaned = _clean_content(content)
if len(cleaned.split()) < 80:
continue
product = _product_from_path(path, plans_dir)
title = _extract_title(content) or path.stem
overview = _extract_overview(content)
flags = _quality_flags(cleaned)
rel_src = str(path.relative_to(plans_dir))
for variant in range(3):
instruction = _make_instruction(
title=title,
product=product,
doc_type=doc_type,
overview=overview,
variant=variant,
)
yield {
"id": _record_id(f"v{variant}:{cleaned}", rel_src),
"messages": [
{"role": "user", "content": instruction},
{"role": "assistant", "content": cleaned},
],
"meta": {
"source": rel_src,
"product": product,
"doc_type": doc_type,
"date": _date_feature(path.stem)[0],
"paired_with": None,
"word_count": len(cleaned.split()),
"pair_role": "standalone",
"variant": variant,
"quality_flags": flags,
},
}
def _product_from_path(path: Path, plans_dir: Path) -> str:
rel = path.relative_to(plans_dir)
return rel.parts[0] if len(rel.parts) > 1 else "shared"
# ── Main export ────────────────────────────────────────────────────────────────
def export(
plans_dir: Path,
products: list[str] | None = None,
) -> list[dict]:
groups = _find_pairs(plans_dir)
records: list[dict] = []
seen_ids: set[str] = set()
for group_key, doc_type_paths in groups.items():
# Filter by product if requested
if products:
paths = [p for _, p in doc_type_paths]
prods = {_product_from_path(p, plans_dir) for p in paths}
if not prods.intersection(products):
continue
for record in _records_for_group(doc_type_paths, plans_dir):
if record["id"] not in seen_ids:
seen_ids.add(record["id"])
records.append(record)
return records
# ── CLI ────────────────────────────────────────────────────────────────────────
def _print_stats(records: list[dict]) -> None:
from collections import Counter
products = Counter(r["meta"]["product"] for r in records)
doc_types = Counter(r["meta"]["doc_type"] for r in records)
pair_roles = Counter(r["meta"]["pair_role"] for r in records)
wc = [r["meta"]["word_count"] for r in records]
wc.sort()
print(f"\n{'='*55}")
print(f" Total records: {len(records)}")
print(f" Word counts : min={wc[0]}, median={wc[len(wc)//2]}, max={wc[-1]}")
print(f"\n By product:")
for p, n in products.most_common():
print(f" {p:<22} {n}")
print(f"\n By doc type:")
for t, n in doc_types.most_common():
print(f" {t:<22} {n}")
print(f"\n Pair roles:")
for r, n in pair_roles.most_common():
print(f" {r:<22} {n}")
print(f"{'='*55}\n")
def _print_sample(records: list[dict], n: int = 3) -> None:
import random
sample = random.sample(records, min(n, len(records)))
for i, rec in enumerate(sample, 1):
meta = rec["meta"]
user_msg = rec["messages"][0]["content"]
asst_msg = rec["messages"][1]["content"]
print(f"\n{''*55}")
print(f"SAMPLE {i}/{n} [{meta['product']} / {meta['doc_type']} / {meta['pair_role']}]")
print(f"source: {meta['source']}")
print(f"\nUSER ({len(user_msg)} chars):\n{user_msg[:500]}{'...' if len(user_msg)>500 else ''}")
print(f"\nASSISTANT ({meta['word_count']} words):\n{asst_msg[:400]}{'...' if len(asst_msg)>400 else ''}")
print(f"\n{''*55}\n")
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("--plans-dir", type=Path, default=_DEFAULT_PLANS_DIR)
parser.add_argument("--output", type=Path, default=None,
help="Write JSONL to this path (omit for preview-only)")
parser.add_argument("--products", default=None,
help="Comma-separated product filter, e.g. peregrine,kiwi")
parser.add_argument("--preview", action="store_true",
help="Print stats + sample records, don't write output")
parser.add_argument("--samples", type=int, default=3,
help="Number of sample records to show in preview (default 3)")
args = parser.parse_args()
products = [p.strip() for p in args.products.split(",")] if args.products else None
print(f"Scanning {args.plans_dir}", file=sys.stderr)
records = export(args.plans_dir, products=products)
_print_stats(records)
if args.preview or args.output is None:
_print_sample(records, n=args.samples)
if args.output is None:
print("(Pass --output <path> to write JSONL)")
return
args.output.parent.mkdir(parents=True, exist_ok=True)
with open(args.output, "w", encoding="utf-8") as f:
for rec in records:
f.write(json.dumps(rec, ensure_ascii=False) + "\n")
print(f"Wrote {len(records)} records to {args.output}")
if __name__ == "__main__":
main()

View file

@ -14,7 +14,9 @@ from fastapi.testclient import TestClient
@pytest.fixture(autouse=True)
def reset_cforch_globals(tmp_path):
"""Redirect _CONFIG_DIR to tmp_path and reset running-state globals."""
"""Redirect _CONFIG_DIR to tmp_path, reset running-state globals, and stub
list_installed to return [] so real disk model directories don't bleed into
tests that don't exercise the installed-model merge path."""
from app import cforch as cforch_module
prev_config_dir = cforch_module._CONFIG_DIR
@ -25,7 +27,8 @@ def reset_cforch_globals(tmp_path):
cforch_module._BENCH_RUNNING = False
cforch_module._bench_proc = None
yield tmp_path
with patch("app.models.list_installed", return_value=[]):
yield tmp_path
cforch_module.set_config_dir(prev_config_dir)
cforch_module._BENCH_RUNNING = prev_running
@ -141,6 +144,35 @@ def test_models_parses_bench_models_yaml(client, config_dir, tmp_path):
assert m["vram_estimate_mb"] == 6000
def test_models_merges_installed_generators(client, config_dir, tmp_path):
"""Installed cf-text/vllm generator models appear in the model list,
deduplicated against bench_models.yaml entries."""
models_file = tmp_path / "bench_models.yaml"
_write_models_yaml(models_file, [
{"name": "llama3", "id": "llama3:8b", "service": "ollama", "tags": [], "vram_estimate_mb": 6000},
{"name": "already-there", "id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "tags": [], "vram_estimate_mb": 8000},
])
_write_config(config_dir, {"bench_models": str(models_file)})
fake_installed = [
# should be included — cf-text generator not already in YAML
{"model_id": "meta-llama/Llama-3.1-8B", "service": "cf-text", "role": "generator", "vram_mb": 16000},
# should be deduped — repo_id matches a YAML entry
{"model_id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "role": "generator", "vram_mb": 8000},
# should be excluded — classifier, not a generator
{"model_id": "cross-encoder/ms-marco-MiniLM-L6", "service": "avocet", "role": "reranker", "vram_mb": 500},
]
with patch("app.models.list_installed", return_value=fake_installed):
r = client.get("/api/cforch/models")
assert r.status_code == 200
ids = [m["id"] for m in r.json()["models"]]
assert "llama3:8b" in ids # from YAML
assert "ibm-granite/granite-4.1-8b" in ids # from YAML (not duplicated)
assert "meta-llama/Llama-3.1-8B" in ids # merged from installed
assert "cross-encoder/ms-marco-MiniLM-L6" not in ids # filtered out (reranker)
assert ids.count("ibm-granite/granite-4.1-8b") == 1 # no duplicate
# ── GET /run ───────────────────────────────────────────────────────────────────
def test_run_returns_409_when_already_running(client):

View file

@ -541,3 +541,84 @@ def test_delete_installed_name_with_slash_blocked(client):
except _HTTPException as exc:
assert exc.status_code in (400, 404)
raise
# ── Catalog registration ───────────────────────────────────────────────────────
_MINIMAL_YAML = """\
services:
cf-text:
max_mb: {max_mb}
catalog:
existing-model:
path: /some/path
vram_mb: 1000
description: "placeholder"
"""
def _make_node_yaml(tmp_path: Path, max_mb: int = 8192) -> Path:
p = tmp_path / "testnode.yaml"
p.write_text(_MINIMAL_YAML.format(max_mb=max_mb), encoding="utf-8")
return p
def test_catalog_registration_fp16_no_env_block(tmp_path):
"""When model fits at FP16, no env block should be written."""
from app import models as models_module
node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
updated = models_module._register_in_node_catalogs(
repo_id="org/SmallModel",
local_path=tmp_path / "org--SmallModel",
vram_mb_fp16=4000,
role="generator",
)
assert "testnode" in updated
content = node_yaml.read_text()
# _catalog_key strips org prefix and lowercases: "org/SmallModel" → "smallmodel"
assert "smallmodel:" in content
assert "CF_TEXT_4BIT" not in content
assert "env:" not in content
def test_catalog_registration_needs_4bit_writes_env_block(tmp_path):
"""When model only fits at 4-bit, env: CF_TEXT_4BIT: '1' must be written."""
from app import models as models_module
node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
updated = models_module._register_in_node_catalogs(
repo_id="org/BigModel",
local_path=tmp_path / "org--BigModel",
vram_mb_fp16=20000, # won't fit at FP16 on 8 GB
role="generator",
)
assert "testnode" in updated
content = node_yaml.read_text()
# _catalog_key: "org/BigModel" → "bigmodel"
assert "bigmodel:" in content
assert "env:" in content
assert 'CF_TEXT_4BIT: "1"' in content
assert "CF_TEXT_4BIT=1 required" in content # description note
def test_catalog_registration_too_large_skipped(tmp_path):
"""Model too large even at 4-bit should not be registered."""
from app import models as models_module
node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
updated = models_module._register_in_node_catalogs(
repo_id="org/HugeModel",
local_path=tmp_path / "org--HugeModel",
vram_mb_fp16=80000, # 4-bit ~22 GB, still won't fit on 8 GB
role="generator",
)
assert updated == []
content = node_yaml.read_text()
assert "hugemodel" not in content

View file

@ -21,21 +21,28 @@
:class="{ active: benchMode === 'style' }"
@click="benchMode = 'style'"
> Writing Style</button>
<button
class="mode-btn"
:class="{ active: benchMode === 'plans' }"
@click="benchMode = 'plans'"
>📐 Planning</button>
</div>
<ClassifierTab v-if="benchMode === 'classifier'" />
<LlmEvalTab v-if="benchMode === 'llm'" />
<StyleTab v-if="benchMode === 'style'" />
<ClassifierTab v-if="benchMode === 'classifier'" />
<LlmEvalTab v-if="benchMode === 'llm'" />
<StyleTab v-if="benchMode === 'style'" />
<PlansBenchTab v-if="benchMode === 'plans'" />
</div>
</template>
<script setup lang="ts">
import { ref } from 'vue'
import ClassifierTab from './ClassifierTab.vue'
import LlmEvalTab from './LlmEvalTab.vue'
import StyleTab from './StyleTab.vue'
import ClassifierTab from './ClassifierTab.vue'
import LlmEvalTab from './LlmEvalTab.vue'
import StyleTab from './StyleTab.vue'
import PlansBenchTab from './PlansBenchTab.vue'
type BenchMode = 'classifier' | 'llm' | 'style'
type BenchMode = 'classifier' | 'llm' | 'style' | 'plans'
const benchMode = ref<BenchMode>('classifier')
</script>

View file

@ -6,6 +6,8 @@
<summary class="picker-summary">
<span class="picker-title">📋 Task Selection</span>
<span class="picker-badge">{{ llmTaskBadge }}</span>
<button class="picker-bulk-btn" @click.stop.prevent="selectAllTasks()">All</button>
<button class="picker-bulk-btn" @click.stop.prevent="clearAllTasks()">None</button>
</summary>
<div class="picker-body">
<div v-if="llmTasksLoading" class="picker-loading">Loading tasks</div>
@ -44,6 +46,8 @@
<summary class="picker-summary">
<span class="picker-title">🎯 Model Selection</span>
<span class="picker-badge">{{ llmModelBadge }}</span>
<button class="picker-bulk-btn" @click.stop.prevent="selectAllModels()">All</button>
<button class="picker-bulk-btn" @click.stop.prevent="clearAllModels()">None</button>
</summary>
<div class="picker-body">
<div v-if="llmModelsLoading" class="picker-loading">Loading models</div>
@ -78,6 +82,33 @@
</div>
</details>
<!-- Node Selection -->
<div class="node-picker" v-if="llmNodes.length > 0">
<span class="node-picker-label">Nodes:</span>
<label
v-for="node in llmNodes"
:key="node.node_id"
class="node-chip"
:class="{ 'node-chip--off': !enabledNodes.has(node.node_id), 'node-chip--offline': !node.online }"
:title="node.online ? `${node.node_id} — ${node.gpus.length} GPU(s)` : `${node.node_id} — offline`"
>
<input
type="checkbox"
class="node-chip-check"
:checked="enabledNodes.has(node.node_id)"
:disabled="!node.online || llmRunning"
@change="toggleNode(node.node_id, ($event.target as HTMLInputElement).checked)"
/>
{{ node.node_id }}
<span class="node-chip-status" v-if="!node.online">offline</span>
</label>
<span class="node-picker-hint">
{{ enabledNodeIds.length === llmNodes.filter(n => n.online).length
? 'auto-routing (all nodes)'
: `restricted to: ${enabledNodeIds.join(', ')}` }}
</span>
</div>
<!-- Run Controls -->
<div class="run-controls">
<button
@ -88,6 +119,24 @@
{{ llmRunning ? '⏳ Running…' : '▶ Run LLM Eval' }}
</button>
<button v-if="llmRunning" class="btn-cancel" @click="cancelLlmBenchmark"> Cancel</button>
<input
v-model="llmJudgeUrl"
class="judge-url-input"
placeholder="Judge URL — leave empty to skip LLM judge scoring"
:disabled="llmRunning"
title="Optional: URL of a running cf-text service (e.g. http://10.1.10.158:8008). When set, each LLM response gets a secondary score from the judge model — adds a 'judge' column to results. Empty = primary quality scoring only."
/>
<label class="workers-label" title="Run this many models concurrently (requires multiple GPUs)">
<span class="workers-prefix">workers</span>
<input
v-model.number="llmWorkers"
type="number"
min="1"
max="8"
class="workers-input"
:disabled="llmRunning"
/>
</label>
<span v-if="selectedLlmTasks.size === 0 || selectedLlmModels.size === 0" class="run-hint">
Select at least one task and one model to run.
</span>
@ -119,6 +168,7 @@
<tr>
<th class="hm-label-col">Model</th>
<th class="hm-model-col">overall</th>
<th v-if="llmHasJudge" class="hm-model-col hm-judge-col">judge</th>
<th v-for="col in llmTaskTypeCols" :key="col" class="hm-model-col">{{ col }}</th>
<th class="hm-model-col">tok/s</th>
</tr>
@ -130,6 +180,12 @@
class="hm-value-cell"
:class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }"
>{{ pct(row.avg_quality_score) }}</td>
<td
v-if="llmHasJudge"
class="hm-value-cell hm-judge-cell"
:class="{ 'bt-best': llmBestByCol['judge'] === row.model_id }"
title="LLM-as-judge secondary score"
>{{ row.avg_judge_score != null ? pct(row.avg_judge_score) : '—' }}</td>
<td
v-for="col in llmTaskTypeCols"
:key="col"
@ -168,6 +224,12 @@ interface CfOrchModel {
vram_estimate_mb?: number
}
interface CfOrchNode {
node_id: string
online: boolean
gpus: { gpu_id: number; name: string; vram_total_mb: number; vram_free_mb: number }[]
}
interface LlmModelResult {
model_name: string
model_id: string
@ -175,9 +237,11 @@ interface LlmModelResult {
avg_tokens_per_sec: number
avg_completion_ms: number
avg_quality_score: number
avg_judge_score: number | null
finetune_candidates: number
error_count: number
quality_by_task_type: Record<string, number>
judge_score_by_task_type?: Record<string, number>
}
// State
@ -195,6 +259,10 @@ const llmError = ref('')
const llmResults = ref<LlmModelResult[]>([])
const llmEventSource = ref<EventSource | null>(null)
const llmLogEl = ref<HTMLElement | null>(null)
const llmJudgeUrl = ref('')
const llmWorkers = ref(1)
const llmNodes = ref<CfOrchNode[]>([])
const enabledNodes = ref<Set<string>>(new Set())
// Computed
const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
@ -239,6 +307,14 @@ const llmTaskTypeCols = computed(() => {
return [...types].sort()
})
const llmHasJudge = computed(() =>
llmResults.value.some(r => r.avg_judge_score != null)
)
const enabledNodeIds = computed(() =>
llmNodes.value.filter(n => n.online && enabledNodes.value.has(n.node_id)).map(n => n.node_id)
)
const llmBestByCol = computed((): Record<string, string> => {
const best: Record<string, string> = {}
if (llmResults.value.length === 0) return best
@ -249,6 +325,16 @@ const llmBestByCol = computed((): Record<string, string> => {
}
best['overall'] = bestId
if (llmHasJudge.value) {
bestId = ''; bestVal = -Infinity
for (const r of llmResults.value) {
if (r.avg_judge_score != null && r.avg_judge_score > bestVal) {
bestVal = r.avg_judge_score; bestId = r.model_id
}
}
best['judge'] = bestId
}
for (const col of llmTaskTypeCols.value) {
bestId = ''; bestVal = -Infinity
for (const r of llmResults.value) {
@ -306,6 +392,15 @@ function toggleService(models: CfOrchModel[], checked: boolean) {
}
selectedLlmModels.value = next
}
function selectAllTasks() { selectedLlmTasks.value = new Set(llmTasks.value.map(t => t.id)) }
function clearAllTasks() { selectedLlmTasks.value = new Set() }
function selectAllModels() { selectedLlmModels.value = new Set(llmModels.value.map(m => m.id)) }
function clearAllModels() { selectedLlmModels.value = new Set() }
function toggleNode(id: string, checked: boolean) {
const next = new Set(enabledNodes.value)
checked ? next.add(id) : next.delete(id)
enabledNodes.value = next
}
// Data loaders
async function loadLlmTasks() {
@ -335,6 +430,21 @@ async function loadLlmResults() {
}
}
async function loadLlmConfig() {
const { data } = await useApiFetch<{ judge_url?: string }>('/api/cforch/config')
if (data?.judge_url && !llmJudgeUrl.value) {
llmJudgeUrl.value = data.judge_url
}
}
async function loadLlmNodes() {
const { data } = await useApiFetch<{ nodes: CfOrchNode[] }>('/api/cforch/nodes')
if (data?.nodes) {
llmNodes.value = data.nodes
enabledNodes.value = new Set(data.nodes.filter(n => n.online).map(n => n.node_id))
}
}
// Run / cancel
function startLlmBenchmark() {
llmRunning.value = true
@ -344,6 +454,15 @@ function startLlmBenchmark() {
const params = new URLSearchParams()
const taskIds = [...selectedLlmTasks.value].join(',')
if (taskIds) params.set('task_ids', taskIds)
const modelIds = [...selectedLlmModels.value].join(',')
if (modelIds) params.set('model_ids', modelIds)
if (llmJudgeUrl.value.trim()) params.set('judge_url', llmJudgeUrl.value.trim())
if (llmWorkers.value > 1) params.set('workers', String(llmWorkers.value))
const onlineNodeIds = llmNodes.value.filter(n => n.online).map(n => n.node_id)
const isRestricted = enabledNodeIds.value.length < onlineNodeIds.length
if (isRestricted && enabledNodeIds.value.length > 0) {
params.set('node_ids', enabledNodeIds.value.join(','))
}
const es = new EventSource(`/api/cforch/run?${params}`)
llmEventSource.value = es
@ -387,6 +506,8 @@ onMounted(() => {
loadLlmTasks()
loadLlmModels()
loadLlmResults()
loadLlmConfig()
loadLlmNodes()
})
</script>
@ -451,6 +572,43 @@ onMounted(() => {
color: var(--color-text-secondary, #6b7a99);
}
.judge-url-input {
flex: 1;
min-width: 14rem;
max-width: 24rem;
padding: 0.35rem 0.6rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.375rem;
background: var(--color-surface, #fff);
color: var(--color-text, #1a2338);
font-size: 0.8rem;
font-family: var(--font-mono, monospace);
}
.judge-url-input:disabled { opacity: 0.5; }
.judge-url-input::placeholder { color: var(--color-text-secondary, #6b7a99); }
.workers-label {
display: flex;
align-items: center;
gap: 0.3rem;
font-size: 0.8rem;
color: var(--color-text-secondary, #6b7a99);
white-space: nowrap;
}
.workers-prefix { font-family: var(--font-mono, monospace); }
.workers-input {
width: 3.2rem;
padding: 0.35rem 0.4rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.375rem;
background: var(--color-surface, #fff);
color: var(--color-text, #1a2338);
font-size: 0.8rem;
font-family: var(--font-mono, monospace);
text-align: center;
}
.workers-input:disabled { opacity: 0.5; }
/* ── Run log ────────────────────────────────────────────── */
.run-log {
border: 1px solid var(--color-border, #d0d7e8);
@ -592,6 +750,15 @@ onMounted(() => {
white-space: nowrap;
}
.hm-judge-col {
background: color-mix(in srgb, var(--color-surface-raised, #e4ebf5) 80%, #c6d5f5);
}
.hm-judge-cell {
background: color-mix(in srgb, var(--color-surface, #fff) 85%, #c6d5f5);
font-style: italic;
opacity: 0.9;
}
/* ── Model Picker ───────────────────────────────────────── */
.model-picker {
border: 1px solid var(--color-border, #d0d7e8);
@ -630,6 +797,24 @@ details[open] .picker-summary::before { content: '▼ '; }
margin-left: auto;
}
.picker-bulk-btn {
padding: 0.1rem 0.45rem;
font-size: 0.7rem;
font-family: var(--font-mono, monospace);
background: var(--color-surface, #fff);
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.25rem;
color: var(--color-text-secondary, #6b7a99);
cursor: pointer;
transition: background 0.12s, color 0.12s;
flex-shrink: 0;
}
.picker-bulk-btn:hover {
background: var(--app-primary, #2A6080);
color: #fff;
border-color: var(--app-primary, #2A6080);
}
.picker-body {
padding: 0.75rem;
border-top: 1px solid var(--color-border, #d0d7e8);
@ -712,4 +897,61 @@ details[open] .picker-summary::before { content: '▼ '; }
.picker-model-list { padding-left: 0; }
.picker-model-name { max-width: 14ch; }
}
/* ── Node picker ────────────────────────────────────── */
.node-picker {
display: flex;
align-items: center;
gap: 0.5rem;
flex-wrap: wrap;
padding: 0.5rem 0.75rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
background: var(--color-surface-raised, #e4ebf5);
}
.node-picker-label {
font-size: 0.78rem;
font-weight: 600;
color: var(--color-text-secondary, #6b7a99);
text-transform: uppercase;
letter-spacing: 0.04em;
white-space: nowrap;
}
.node-chip {
display: inline-flex;
align-items: center;
gap: 0.3rem;
padding: 0.2rem 0.55rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 1rem;
background: var(--color-surface, #fff);
font-size: 0.78rem;
font-family: var(--font-mono, monospace);
color: var(--color-text, #1a2338);
cursor: pointer;
transition: background 0.12s, opacity 0.12s;
}
.node-chip--off {
opacity: 0.45;
background: transparent;
}
.node-chip--offline {
opacity: 0.35;
cursor: not-allowed;
font-style: italic;
}
.node-chip-check { accent-color: var(--app-primary, #2A6080); }
.node-chip-status {
font-size: 0.66rem;
color: var(--color-text-secondary, #6b7a99);
}
.node-picker-hint {
font-size: 0.72rem;
color: var(--color-text-secondary, #6b7a99);
font-family: var(--font-mono, monospace);
margin-left: auto;
}
</style>

View file

@ -51,8 +51,31 @@
<span v-if="lookupResult.adapter_recommendation" class="chip chip-adapter">
{{ lookupResult.adapter_recommendation }}
</span>
<span v-if="lookupResult.size != null" class="preview-size">
{{ humanBytes(lookupResult.size) }}
<span v-if="selectedQuantSize > 0" class="preview-size">
{{ humanBytes(selectedQuantSize) }}
</span>
</div>
<!-- GGUF quantization picker only shown for GGUF repos -->
<div v-if="lookupResult.gguf_files?.length" class="quant-picker">
<label class="quant-label" for="quant-select">Quantization</label>
<select
id="quant-select"
v-model="selectedQuant"
class="quant-select"
aria-label="Select quantization variant"
>
<option :value="null" disabled>Select quantization</option>
<option
v-for="f in lookupResult.gguf_files"
:key="f.filename"
:value="f.quant_name ?? f.filename"
>
{{ f.quant_name ?? f.filename }} {{ humanBytes(f.size) }}
</option>
</select>
<span class="quant-hint">
Q5_K_M or Q6_K recommended for 8 GB GPUs. Q8_0 for max quality.
</span>
</div>
@ -67,7 +90,7 @@
<button
class="btn-primary btn-add-queue"
:disabled="lookupResult.already_installed || lookupResult.already_queued || addingToQueue"
:disabled="!canAddToQueue"
@click="addToQueue"
>
{{ addingToQueue ? 'Adding…' : 'Add to queue' }}
@ -99,9 +122,39 @@
<span v-if="model.role" class="chip chip-role">{{ model.role }}</span>
<span v-if="model.service" class="chip" :class="serviceChipClass(model.service)">{{ model.service }}</span>
<span v-if="model.adapter_recommendation" class="chip chip-adapter">{{ model.adapter_recommendation }}</span>
<span v-if="model.quant_pattern" class="chip chip-quant">{{ model.quant_pattern }}</span>
</div>
<!-- Allow manual service/role assignment for unrecognized pipeline tags -->
<div v-if="!model.service" class="classify-row queue-classify">
<select
class="classify-select"
:value="classifyDraft[model.id]?.service ?? ''"
@change="onServiceChange(model.id, ($event.target as HTMLSelectElement).value)"
aria-label="Assign service"
>
<option value="" disabled>Service</option>
<option v-for="svc in CLASSIFIABLE_SERVICES" :key="svc.value" :value="svc.value">{{ svc.label }}</option>
</select>
<select
class="classify-select"
:value="classifyDraft[model.id]?.role ?? ''"
:disabled="!classifyDraft[model.id]?.service"
@change="(e) => setClassifyRole(model.id, (e.target as HTMLSelectElement).value)"
aria-label="Assign role"
>
<option value="" disabled>Role</option>
<option
v-for="role in rolesForService(classifyDraft[model.id]?.service ?? '')"
:key="role"
:value="role"
>{{ role }}</option>
</select>
</div>
<div class="model-card-actions">
<button class="btn-primary btn-sm" @click="approveModel(model.id)">
<button
class="btn-primary btn-sm"
@click="approveModel(model.id, classifyDraft[model.id])"
>
Approve download
</button>
</div>
@ -252,6 +305,12 @@ import { ref, computed, onMounted, onUnmounted } from 'vue'
// Type definitions
interface GgufFile {
filename: string
size: number
quant_name: string | null
}
interface LookupResult {
repo_id: string
pipeline_tag: string | null
@ -260,7 +319,8 @@ interface LookupResult {
service: string | null
compatible: boolean
warning: string | null
size: number | null
model_size_bytes: number
gguf_files: GgufFile[] | null
description: string | null
already_installed: boolean
already_queued: boolean
@ -274,6 +334,7 @@ interface QueuedModel {
adapter_recommendation: string | null
role: string | null
service: string | null
quant_pattern: string | null
}
interface InstalledModel {
@ -302,6 +363,26 @@ const lookupLoading = ref(false)
const lookupError = ref<string | null>(null)
const lookupResult = ref<LookupResult | null>(null)
const addingToQueue = ref(false)
const selectedQuant = ref<string | null>(null)
// Size of the selected GGUF file, or total model size for non-GGUF repos.
const selectedQuantSize = computed<number>(() => {
const r = lookupResult.value
if (!r) return 0
if (r.gguf_files?.length && selectedQuant.value) {
const f = r.gguf_files.find(f => (f.quant_name ?? f.filename) === selectedQuant.value)
return f?.size ?? r.model_size_bytes
}
return r.model_size_bytes
})
// Disable "Add to queue" when a GGUF repo but no quant chosen yet.
const canAddToQueue = computed(() => {
const r = lookupResult.value
if (!r || r.already_installed || r.already_queued || addingToQueue.value) return false
if (r.gguf_files?.length && !selectedQuant.value) return false
return true
})
const queuedModels = ref<QueuedModel[]>([])
const installedModels = ref<InstalledModel[]>([])
@ -411,6 +492,7 @@ async function doLookup() {
lookupLoading.value = true
lookupError.value = null
lookupResult.value = null
selectedQuant.value = null
try {
const res = await fetch(`/api/models/lookup?repo_id=${encodeURIComponent(repoId)}`)
@ -442,7 +524,15 @@ async function addToQueue() {
const res = await fetch('/api/models/queue', {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ repo_id, pipeline_tag, adapter_recommendation, role, service }),
body: JSON.stringify({
repo_id,
pipeline_tag,
adapter_recommendation,
role,
service,
model_size_bytes: selectedQuantSize.value,
quant_pattern: selectedQuant.value,
}),
})
if (res.ok) {
lookupResult.value = { ...lookupResult.value, already_queued: true }
@ -454,8 +544,16 @@ async function addToQueue() {
}
}
async function approveModel(id: string) {
async function approveModel(id: string, draft?: { service: string; role: string }) {
try {
// If the user picked a service/role for an unrecognized model, patch it first.
if (draft?.service && draft?.role) {
await fetch(`/api/models/queue/${encodeURIComponent(id)}`, {
method: 'PATCH',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ service: draft.service, role: draft.role }),
})
}
const res = await fetch(`/api/models/queue/${encodeURIComponent(id)}/approve`, { method: 'POST' })
if (res.ok) {
await loadQueue()
@ -774,6 +872,44 @@ onUnmounted(() => {
align-self: flex-start;
}
/* ── Quant picker ── */
.quant-picker {
display: flex;
flex-direction: column;
gap: 0.35rem;
}
.quant-label {
font-size: 0.8rem;
font-weight: 600;
color: var(--color-text-muted, #4a5c7a);
text-transform: uppercase;
letter-spacing: 0.04em;
}
.quant-select {
padding: 0.4rem 0.6rem;
border: 1px solid var(--color-border, #a8b8d0);
border-radius: var(--radius-md, 0.5rem);
background: var(--color-surface, #f0f4fb);
color: var(--color-text, #1a2338);
font-size: 0.9rem;
font-family: var(--font-mono, monospace);
cursor: pointer;
}
.quant-hint {
font-size: 0.78rem;
color: var(--color-text-muted, #4a5c7a);
}
.chip-quant {
background: color-mix(in srgb, var(--color-primary, #2A6080) 15%, transparent);
color: var(--color-primary, #2A6080);
font-family: var(--font-mono, monospace);
font-size: 0.75rem;
}
/* ── Model cards (queue + downloads) ── */
.model-card {
border: 1px solid var(--color-border, #a8b8d0);

File diff suppressed because it is too large Load diff