feat: plans benchmark harness — model scoring for CF planning prompts

Adds benchmark_plans.py script, plans_bench API router, PlansBenchTab Vue
component, and registers /api/plans-bench in api.py. Also extends models
registry (cf-text catalog integration), cforch client, LlmEvalTab, and
ModelsView with cf-orch fleet support. Wires Planning mode into BenchmarkView.
This commit is contained in:
pyr0ball 2026-05-02 23:36:04 -07:00
parent e11db5ccd9
commit bce932461a
14 changed files with 3137 additions and 59 deletions

View file

@ -17,3 +17,7 @@ CF_LICENSE_KEY=CFG-AVCT-xxxx-xxxx-xxxx
# Set one of these to use a cloud LLM instead of a local model. # Set one of these to use a cloud LLM instead of a local model.
# ANTHROPIC_API_KEY=sk-ant-... # ANTHROPIC_API_KEY=sk-ant-...
# OPENAI_API_KEY=sk-... # OPENAI_API_KEY=sk-...
# ── HuggingFace (required for gated/terms-restricted model downloads) ─────────
# Generate at https://huggingface.co/settings/tokens and accept model terms first.
# HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx

View file

@ -34,6 +34,12 @@ app.include_router(eval_router, prefix="/api")
from app.train.train import router as train_router from app.train.train import router as train_router
app.include_router(train_router, prefix="/api/train") app.include_router(train_router, prefix="/api/train")
from app.plans_bench import router as plans_bench_router
app.include_router(plans_bench_router, prefix="/api/plans-bench")
# In-memory last-action store (single user, local tool — in-memory is fine)
_last_action: dict | None = None
from app.dashboard import router as dashboard_router from app.dashboard import router as dashboard_router
app.include_router(dashboard_router, prefix="/api") app.include_router(dashboard_router, prefix="/api")

View file

@ -17,9 +17,12 @@ import logging
import os import os
import re import re
import subprocess as _subprocess import subprocess as _subprocess
import tempfile
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
import urllib.parse
import yaml import yaml
from fastapi import APIRouter, HTTPException from fastapi import APIRouter, HTTPException
from fastapi.responses import StreamingResponse from fastapi.responses import StreamingResponse
@ -75,9 +78,31 @@ def _load_cforch_config() -> dict:
"license_key": _coalesce(file_cfg.get("license_key", ""), "CF_LICENSE_KEY"), "license_key": _coalesce(file_cfg.get("license_key", ""), "CF_LICENSE_KEY"),
"ollama_url": _coalesce(file_cfg.get("ollama_url", ""), "OLLAMA_HOST"), "ollama_url": _coalesce(file_cfg.get("ollama_url", ""), "OLLAMA_HOST"),
"ollama_model": _coalesce(file_cfg.get("ollama_model", ""), "OLLAMA_MODEL"), "ollama_model": _coalesce(file_cfg.get("ollama_model", ""), "OLLAMA_MODEL"),
"judge_url": _coalesce(file_cfg.get("judge_url", ""), "CF_JUDGE_URL"),
"hf_token": _coalesce(file_cfg.get("hf_token", ""), "HF_TOKEN"),
} }
def _validate_service_url(url: str, param_name: str) -> str:
"""Validate that a URL is a well-formed http/https URL with a hostname.
Guards against SSRF: only http/https is allowed; the URL must have a
non-empty host. Does not enforce an allowlist call sites are internal
tooling, not a public API.
"""
if not url:
return url
try:
parsed = urllib.parse.urlparse(url)
except Exception:
raise HTTPException(400, f"{param_name}: not a valid URL")
if parsed.scheme not in ("http", "https"):
raise HTTPException(400, f"{param_name}: URL must start with http:// or https://")
if not parsed.hostname:
raise HTTPException(400, f"{param_name}: URL has no hostname")
return url
def _strip_ansi(text: str) -> str: def _strip_ansi(text: str) -> str:
"""Remove ANSI escape codes from a string.""" """Remove ANSI escape codes from a string."""
return re.sub(r'\x1b\[[0-9;]*m', '', text) return re.sub(r'\x1b\[[0-9;]*m', '', text)
@ -147,48 +172,141 @@ def get_tasks() -> dict:
# ── GET /models ──────────────────────────────────────────────────────────────── # ── GET /models ────────────────────────────────────────────────────────────────
# Services and roles surfaced in the benchmark model picker.
# Covers all cf-orch service types that benchmark.py can route tasks to.
_BENCH_SERVICES = frozenset({
"cf-text", "vllm", # LLM text generation
"cf-stt", # speech-to-text
"cf-tts", # text-to-speech
"cf-vision", # image classification / embedding
"cf-voice", # audio context classification
})
_BENCH_ROLES = frozenset({
"generator", "vlm", # LLM roles
"stt", "alm", # speech recognition
"tts", # speech synthesis
"vision", "embedding", # image understanding
"classifier", # audio classification (cf-voice)
})
@router.get("/models") @router.get("/models")
def get_models() -> dict: def get_models() -> dict:
"""Return model list from bench_models.yaml.""" """Return model list from bench_models.yaml merged with locally installed models.
bench_models.yaml entries are listed first and take precedence; any installed
model whose repo_id is already present in the YAML is skipped. Only models
whose service is in _BENCH_SERVICES (cf-text, vllm, cf-stt, cf-tts, cf-vision,
cf-voice) are surfaced from the installed registry.
"""
cfg = _load_cforch_config() cfg = _load_cforch_config()
models_path = cfg.get("bench_models", "") models_path = cfg.get("bench_models", "")
if not models_path:
return {"models": []}
p = Path(models_path)
if not p.exists():
return {"models": []}
try:
raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
except yaml.YAMLError as exc:
logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
return {"models": []}
models_raw = raw.get("models", []) or []
models: list[dict] = [] models: list[dict] = []
for m in models_raw: bench_ids: set[str] = set()
if not isinstance(m, dict):
continue if models_path:
models.append({ p = Path(models_path)
"name": m.get("name", ""), if p.exists():
"id": m.get("id", ""), try:
"service": m.get("service", "ollama"), raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
"tags": m.get("tags", []) or [], except yaml.YAMLError as exc:
"vram_estimate_mb": m.get("vram_estimate_mb", 0), logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
}) raw = {}
for m in (raw.get("models", []) or []):
if not isinstance(m, dict):
continue
model_id = m.get("id", "")
models.append({
"name": m.get("name", ""),
"id": model_id,
"service": m.get("service", "ollama"),
"tags": m.get("tags", []) or [],
"vram_estimate_mb": m.get("vram_estimate_mb", 0),
})
if model_id:
bench_ids.add(model_id)
# Merge installed generator models not already in bench_models.yaml.
try:
from app.models import list_installed # local import avoids circular dependency at module load
for installed in list_installed():
model_id: str = installed.get("model_id") or ""
service: str = installed.get("service") or ""
role: str = installed.get("role") or ""
if not model_id:
continue
if service not in _BENCH_SERVICES or role not in _BENCH_ROLES:
continue
if model_id in bench_ids:
continue
display_name = model_id.split("/", 1)[-1] if "/" in model_id else model_id
models.append({
"name": display_name,
"id": model_id,
"service": service,
"tags": [role],
"vram_estimate_mb": installed.get("vram_mb") or 0,
})
bench_ids.add(model_id)
except Exception as exc:
logger.warning("Could not merge installed models into model list: %s", exc)
return {"models": models} return {"models": models}
# ── GET /run ─────────────────────────────────────────────────────────────────── # ── GET /run ───────────────────────────────────────────────────────────────────
@router.get("/nodes")
def get_nodes() -> dict:
"""Proxy the coordinator's /api/nodes list, returning node_id + online status.
Online is inferred from last_heartbeat: any node with a recent heartbeat is online.
Returns an empty list if the coordinator is unreachable.
"""
cfg = _load_cforch_config()
coordinator_url = cfg.get("coordinator_url", "").rstrip("/")
if not coordinator_url:
return {"nodes": []}
try:
import httpx as _httpx
resp = _httpx.get(f"{coordinator_url}/api/nodes", timeout=5.0)
resp.raise_for_status()
raw_nodes = resp.json().get("nodes", [])
return {
"nodes": [
{
"node_id": n.get("node_id", ""),
"online": n.get("last_heartbeat") is not None,
"gpus": [
{
"gpu_id": g.get("gpu_id"),
"name": g.get("name", ""),
"vram_total_mb": g.get("vram_total_mb", 0),
"vram_free_mb": g.get("vram_free_mb", 0),
}
for g in n.get("gpus", [])
],
}
for n in raw_nodes
]
}
except Exception as exc:
logger.warning("Could not fetch nodes from coordinator: %s", exc)
return {"nodes": []}
@router.get("/run") @router.get("/run")
def run_benchmark( def run_benchmark(
task_ids: str = "", task_ids: str = "",
model_ids: str = "",
model_tags: str = "", model_tags: str = "",
coordinator_url: str = "", coordinator_url: str = "",
ollama_url: str = "", ollama_url: str = "",
judge_url: str = "",
judge_backend: str = "chat",
workers: int = 1,
node_ids: str = "",
) -> StreamingResponse: ) -> StreamingResponse:
"""Spawn cf-orch benchmark.py and stream stdout as SSE progress events.""" """Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
global _BENCH_RUNNING, _bench_proc global _BENCH_RUNNING, _bench_proc
@ -205,6 +323,13 @@ def run_benchmark(
cfg_coordinator = cfg.get("coordinator_url", "") cfg_coordinator = cfg.get("coordinator_url", "")
cfg_ollama = cfg.get("ollama_url", "") cfg_ollama = cfg.get("ollama_url", "")
cfg_license_key = cfg.get("license_key", "") cfg_license_key = cfg.get("license_key", "")
cfg_judge_url = cfg.get("judge_url", "")
# Validate URL params before spawning the subprocess.
# _validate_service_url raises HTTPException on bad input (caught by FastAPI before streaming starts).
_validate_service_url(coordinator_url, "coordinator_url")
_validate_service_url(ollama_url, "ollama_url")
_validate_service_url(judge_url, "judge_url")
def generate(): def generate():
global _BENCH_RUNNING, _bench_proc global _BENCH_RUNNING, _bench_proc
@ -213,16 +338,68 @@ def run_benchmark(
yield f"data: {json.dumps({'type': 'error', 'message': 'bench_script not configured or not found'})}\n\n" yield f"data: {json.dumps({'type': 'error', 'message': 'bench_script not configured or not found'})}\n\n"
return return
# Build effective models file: bench_models.yaml + any installed models
# whose IDs were selected but are absent from the YAML (e.g. downloaded
# via the Models view). Written to a temp file so benchmark.py sees one
# unified list; cleaned up in the finally block.
effective_models_file = bench_models
_tmp_models_path: str | None = None
if model_ids and bench_models and Path(bench_models).exists():
requested_ids = set(model_ids.split(","))
try:
raw_bench = yaml.safe_load(Path(bench_models).read_text(encoding="utf-8")) or {}
bench_entries: list[dict] = raw_bench.get("models", []) or []
bench_id_set = {m.get("id", "") for m in bench_entries if isinstance(m, dict)}
missing_ids = requested_ids - bench_id_set
if missing_ids:
from app.models import list_installed
installed_map = {
m["model_id"]: m
for m in list_installed()
if m.get("model_id") and m.get("service") in _BENCH_SERVICES
}
extra: list[dict] = []
for mid in missing_ids:
if mid in installed_map:
inst = installed_map[mid]
entry: dict[str, Any] = {
"id": mid,
"name": mid.split("/", 1)[-1] if "/" in mid else mid,
"service": inst.get("service", "cf-text"),
"vram_estimate_mb": inst.get("vram_mb") or 0,
"tags": [inst.get("role", "generator")],
"temperature": 0.0,
}
local_path = inst.get("path", "") or inst.get("local_path", "")
if local_path:
entry["model_path"] = local_path
extra.append(entry)
if extra:
merged = {"models": bench_entries + extra}
tf = tempfile.NamedTemporaryFile(
mode="w", suffix=".yaml", delete=False,
prefix="avocet_bench_models_",
)
yaml.dump(merged, tf)
tf.close()
_tmp_models_path = tf.name
effective_models_file = _tmp_models_path
except Exception as exc:
logger.warning("Could not merge installed models into temp bench file: %s", exc)
cmd = [ cmd = [
python_bin, python_bin,
bench_script, bench_script,
"--tasks", bench_tasks, "--tasks", bench_tasks,
"--models", bench_models, "--models", effective_models_file,
"--output", results_dir, "--output", results_dir,
] ]
if task_ids: if task_ids:
cmd.extend(["--filter-tasks"] + task_ids.split(",")) cmd.extend(["--filter-tasks"] + task_ids.split(","))
if model_ids:
cmd.extend(["--filter-models"] + model_ids.split(","))
if model_tags: if model_tags:
cmd.extend(["--filter-tags"] + model_tags.split(",")) cmd.extend(["--filter-tags"] + model_tags.split(","))
@ -233,6 +410,15 @@ def run_benchmark(
cmd.extend(["--coordinator", effective_coordinator]) cmd.extend(["--coordinator", effective_coordinator])
if effective_ollama: if effective_ollama:
cmd.extend(["--ollama-url", effective_ollama]) cmd.extend(["--ollama-url", effective_ollama])
effective_judge = judge_url if judge_url else cfg_judge_url
if effective_judge:
cmd.extend(["--judge-url", effective_judge])
if judge_backend and judge_backend != "chat":
cmd.extend(["--judge-backend", judge_backend])
if workers > 1:
cmd.extend(["--workers", str(workers)])
if node_ids:
cmd.extend(["--nodes"] + node_ids.split(","))
# Pass license key as env var so subprocess can authenticate with cf-orch # Pass license key as env var so subprocess can authenticate with cf-orch
proc_env = {**os.environ} proc_env = {**os.environ}
@ -273,6 +459,11 @@ def run_benchmark(
yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n" yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
finally: finally:
_BENCH_RUNNING = False _BENCH_RUNNING = False
if _tmp_models_path:
try:
os.unlink(_tmp_models_path)
except OSError:
pass
return StreamingResponse( return StreamingResponse(
generate(), generate(),
@ -295,6 +486,7 @@ def get_cforch_config() -> dict:
"coordinator_url": cfg.get("coordinator_url", ""), "coordinator_url": cfg.get("coordinator_url", ""),
"ollama_url": cfg.get("ollama_url", ""), "ollama_url": cfg.get("ollama_url", ""),
"ollama_model": cfg.get("ollama_model", ""), "ollama_model": cfg.get("ollama_model", ""),
"judge_url": cfg.get("judge_url", ""),
"license_key_set": bool(cfg.get("license_key", "")), "license_key_set": bool(cfg.get("license_key", "")),
"source": "env" if not _config_file().exists() else "yaml+env", "source": "env" if not _config_file().exists() else "yaml+env",
} }
@ -303,7 +495,7 @@ def get_cforch_config() -> dict:
# ── GET /results ─────────────────────────────────────────────────────────────── # ── GET /results ───────────────────────────────────────────────────────────────
@router.get("/results") @router.get("/results")
def get_results() -> dict: def get_results() -> list:
"""Return the latest benchmark summary.json from results_dir.""" """Return the latest benchmark summary.json from results_dir."""
cfg = _load_cforch_config() cfg = _load_cforch_config()
results_dir = cfg.get("results_dir", "") results_dir = cfg.get("results_dir", "")

View file

@ -15,6 +15,7 @@ from __future__ import annotations
import json import json
import logging import logging
import os import os
import re
import shutil import shutil
import threading import threading
from datetime import datetime, timezone from datetime import datetime, timezone
@ -60,6 +61,30 @@ _CF_ORCH_PROFILES_DIR: Path = Path(
router = APIRouter() router = APIRouter()
# ── HuggingFace auth ─────────────────────────────────────────────────────────
def _get_hf_token() -> str | None:
"""Return HF token from label_tool.yaml, then HF_TOKEN / HUGGING_FACE_HUB_TOKEN env vars."""
config_file = _ROOT / "config" / "label_tool.yaml"
if config_file.exists():
try:
import yaml as _yaml
raw = _yaml.safe_load(config_file.read_text(encoding="utf-8")) or {}
token = (raw.get("hf_token") or raw.get("cforch", {}).get("hf_token") or "").strip()
if token:
return token
except Exception:
pass
return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
# ── GGUF quantization detection ───────────────────────────────────────────────
# Matches quant identifiers in GGUF filenames: Q4_K_M, Q8_0, F16, IQ3_M, etc.
_QUANT_RE = re.compile(
r'[._-]((?:IQ\d|Q\d)[A-Z0-9_]*|F16|BF16)\.gguf$',
re.IGNORECASE,
)
# ── Download progress shared state ──────────────────────────────────────────── # ── Download progress shared state ────────────────────────────────────────────
# Updated by the background download thread; read by GET /download/stream. # Updated by the background download thread; read by GET /download/stream.
_download_progress: dict[str, Any] = {} _download_progress: dict[str, Any] = {}
@ -91,12 +116,15 @@ _TAG_TO_INFO: dict[str, _TagInfo] = {
"audio-classification": {"adapter": None, "role": "classifier", "service": "cf-voice"}, "audio-classification": {"adapter": None, "role": "classifier", "service": "cf-voice"},
# TTS — cf-tts text-to-speech service # TTS — cf-tts text-to-speech service
"text-to-speech": {"adapter": None, "role": "tts", "service": "cf-tts"}, "text-to-speech": {"adapter": None, "role": "tts", "service": "cf-tts"},
# Vision — cf-vision image classification / embedding / VLM service # Vision classifiers / embedders — cf-vision (SigLIP/CLIP-style models)
"image-classification": {"adapter": None, "role": "vision", "service": "cf-vision"}, "image-classification": {"adapter": None, "role": "vision", "service": "cf-vision"},
"zero-shot-image-classification": {"adapter": None, "role": "vision", "service": "cf-vision"}, "zero-shot-image-classification": {"adapter": None, "role": "vision", "service": "cf-vision"},
"image-feature-extraction": {"adapter": None, "role": "embedding", "service": "cf-vision"}, "image-feature-extraction": {"adapter": None, "role": "embedding", "service": "cf-vision"},
"image-text-to-text": {"adapter": None, "role": "vlm", "service": "cf-vision"}, # Generative VLMs (image+text → text) — run under vllm, not cf-vision.
"visual-question-answering": {"adapter": None, "role": "vlm", "service": "cf-vision"}, # cf-vision is a classifier/embedder service; generative VLMs like Qwen-VL,
# LLaVA, and InternVL are textgen models that happen to accept image inputs.
"image-text-to-text": {"adapter": None, "role": "vlm", "service": "vllm"},
"visual-question-answering": {"adapter": None, "role": "vlm", "service": "vllm"},
# Image generation — cf-image (text → image; distinct from cf-vision image understanding) # Image generation — cf-image (text → image; distinct from cf-vision image understanding)
"text-to-image": {"adapter": None, "role": "image-gen", "service": "cf-image"}, "text-to-image": {"adapter": None, "role": "image-gen", "service": "cf-image"},
# Embedding — cf-core shared embedding layer # Embedding — cf-core shared embedding layer
@ -195,10 +223,17 @@ def _get_queue_entry(entry_id: str) -> dict | None:
def _catalog_key(repo_id: str) -> str: def _catalog_key(repo_id: str) -> str:
"""Derive a readable catalog key from repo_id. """Derive a readable catalog key from repo_id.
ibm-granite/granite-4.1-8b granite-4.1-8b ibm-granite/granite-4.1-8b granite-4.1-8b
facebook/bart-large-cnn bart-large-cnn facebook/bart-large-cnn bart-large-cnn
WithinUsAI/Opus4.7-GODs.Ghost.Codex-4B.GGuF opus4.7-gods.ghost.codex-4b
The coordinator skips catalog lookup for keys ending in ".gguf" (treats them
as direct file paths). Strip the suffix so GGUF repo names produce valid keys.
""" """
return repo_id.split("/", 1)[-1].lower() key = repo_id.split("/", 1)[-1].lower()
if key.endswith(".gguf"):
key = key[:-5]
return key
def _insert_catalog_entry(content: str, entry_lines: str) -> str: def _insert_catalog_entry(content: str, entry_lines: str) -> str:
@ -290,6 +325,15 @@ def _register_in_node_catalogs(
max_mb: int = cf_text.get("max_mb", 0) max_mb: int = cf_text.get("max_mb", 0)
catalog: dict = cf_text.get("catalog") or {} catalog: dict = cf_text.get("catalog") or {}
# If the node has a different local model dir, remap the NFS path.
model_base = cf_text.get("model_base_path", "").rstrip("/")
if model_base:
nfs_base = str(_CF_TEXT_MODELS_DIR).rstrip("/")
model_name = local_path.name
effective_path_str = f"{model_base}/{model_name}"
else:
effective_path_str = local_path_str
# Skip if key already exists # Skip if key already exists
if model_key in catalog: if model_key in catalog:
logger.debug("Key %r already in %s — skipping", model_key, yaml_file.name) logger.debug("Key %r already in %s — skipping", model_key, yaml_file.name)
@ -301,10 +345,10 @@ def _register_in_node_catalogs(
for entry in catalog.values() for entry in catalog.values()
if isinstance(entry, dict) if isinstance(entry, dict)
} }
if local_path_str in registered_paths or any( if effective_path_str in registered_paths or any(
p.startswith(local_path_str + "/") for p in registered_paths p.startswith(effective_path_str + "/") for p in registered_paths
): ):
logger.debug("Path %s already registered in %s — skipping", local_path_str, yaml_file.name) logger.debug("Path %s already registered in %s — skipping", effective_path_str, yaml_file.name)
continue continue
# Determine whether model fits at FP16 or needs 4-bit # Determine whether model fits at FP16 or needs 4-bit
@ -330,12 +374,18 @@ def _register_in_node_catalogs(
if needs_4bit if needs_4bit
else f" # FP16 file-size estimate" else f" # FP16 file-size estimate"
) )
env_block = (
f" env:\n"
f" CF_TEXT_4BIT: \"1\"\n"
if needs_4bit else ""
)
entry_block = ( entry_block = (
f" # auto-registered by avocet on download\n" f" # auto-registered by avocet on download\n"
f" {model_key}:\n" f" {model_key}:\n"
f" path: {local_path_str}\n" f" path: {effective_path_str}\n"
f" vram_mb: {vram_for_node}{vram_comment}\n" f" vram_mb: {vram_for_node}{vram_comment}\n"
f" description: \"{desc}\"\n" f" description: \"{desc}\"\n"
f"{env_block}"
) )
new_content = _insert_catalog_entry(content, entry_block) new_content = _insert_catalog_entry(content, entry_block)
@ -388,12 +438,17 @@ def _run_download(
role: str | None = None, role: str | None = None,
service: str | None = None, service: str | None = None,
model_size_bytes: int = 0, model_size_bytes: int = 0,
quant_pattern: str | None = None,
) -> None: ) -> None:
"""Background thread: download model via huggingface_hub.snapshot_download. """Background thread: download model via huggingface_hub.snapshot_download.
model_size_bytes is the sum of file sizes reported by the HF API (siblings). model_size_bytes is the sum of file sizes reported by the HF API (siblings).
It is used to estimate vram_mb and written to model_info.json so cf-orch can It is used to estimate vram_mb and written to model_info.json so cf-orch can
budget VRAM when allocating a cf-text instance for this model. budget VRAM when allocating a cf-text instance for this model.
quant_pattern: when set, restricts snapshot_download to only files matching
*{quant_pattern}*.gguf (plus metadata). Avoids downloading every quant variant
from GGUF-only repos like bartowski/*.
""" """
global _download_progress global _download_progress
local_dir = _model_dir_for(repo_id, service) local_dir = _model_dir_for(repo_id, service)
@ -422,10 +477,20 @@ def _run_download(
local_dir.mkdir(parents=True, exist_ok=True) local_dir.mkdir(parents=True, exist_ok=True)
poll_thread.start() poll_thread.start()
snapshot_download(
repo_id=repo_id, dl_kwargs: dict[str, Any] = {"repo_id": repo_id, "local_dir": str(local_dir)}
local_dir=str(local_dir), hf_token = _get_hf_token()
) if hf_token:
dl_kwargs["token"] = hf_token
if quant_pattern:
# Include both cases: repos use mixed conventions (Q6_K vs q6_k).
dl_kwargs["allow_patterns"] = [
f"*{quant_pattern.upper()}*.gguf",
f"*{quant_pattern.lower()}*.gguf",
"*.json",
"README.md",
]
snapshot_download(**dl_kwargs)
# Estimate VRAM from reported file size. # Estimate VRAM from reported file size.
# HF siblings sizes are pre-quantisation file sizes; add 10% for KV cache # HF siblings sizes are pre-quantisation file sizes; add 10% for KV cache
@ -531,9 +596,31 @@ def lookup_model(repo_id: str) -> dict:
) )
logger.warning("Unsupported pipeline_tag %r for %s", pipeline_tag, repo_id) logger.warning("Unsupported pipeline_tag %r for %s", pipeline_tag, repo_id)
# Estimate model size from siblings list # Detect GGUF files and parse quant names from siblings list.
# For GGUF-only repos (bartowski, TheBloke, etc.) this lets the UI show
# a per-quant size picker instead of downloading every variant.
siblings = data.get("siblings") or [] siblings = data.get("siblings") or []
model_size_bytes: int = sum(s.get("size", 0) for s in siblings if isinstance(s, dict)) gguf_files: list[dict] = []
for s in siblings:
if not isinstance(s, dict):
continue
fname: str = s.get("rfilename", "")
if not fname.lower().endswith(".gguf"):
continue
m = _QUANT_RE.search(fname)
gguf_files.append({
"filename": fname,
"size": s.get("size", 0) or 0,
"quant_name": m.group(1).upper() if m else None,
})
gguf_files.sort(key=lambda f: f["size"])
# model_size_bytes: total of all siblings (for non-GGUF repos) or all GGUFs only.
# For GGUF repos the frontend will substitute the selected quant's size on submit.
if gguf_files:
model_size_bytes: int = sum(f["size"] for f in gguf_files)
else:
model_size_bytes = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))
# Description: first 300 chars of card data (modelId field used as fallback) # Description: first 300 chars of card data (modelId field used as fallback)
card_data = data.get("cardData") or {} card_data = data.get("cardData") or {}
@ -549,6 +636,7 @@ def lookup_model(repo_id: str) -> dict:
"compatible": compatible, "compatible": compatible,
"warning": warning, "warning": warning,
"model_size_bytes": model_size_bytes, "model_size_bytes": model_size_bytes,
"gguf_files": gguf_files if gguf_files else None,
"description": description, "description": description,
"tags": data.get("tags") or [], "tags": data.get("tags") or [],
"downloads": data.get("downloads") or 0, "downloads": data.get("downloads") or 0,
@ -579,6 +667,9 @@ class QueueAddRequest(BaseModel):
# Stored in the queue entry so approve can pass it to _run_download # Stored in the queue entry so approve can pass it to _run_download
# without a second HF API round-trip. # without a second HF API round-trip.
model_size_bytes: int = 0 model_size_bytes: int = 0
# GGUF quantization pattern (e.g. "Q5_K_M"). When set, snapshot_download
# restricts to *{quant_pattern}*.gguf instead of fetching all variants.
quant_pattern: str | None = None
@router.post("/queue", status_code=201) @router.post("/queue", status_code=201)
@ -597,6 +688,7 @@ def add_to_queue(req: QueueAddRequest) -> dict:
"role": req.role, "role": req.role,
"service": req.service, "service": req.service,
"model_size_bytes": req.model_size_bytes, "model_size_bytes": req.model_size_bytes,
"quant_pattern": req.quant_pattern,
"status": "pending", "status": "pending",
"queued_at": datetime.now(timezone.utc).isoformat(), "queued_at": datetime.now(timezone.utc).isoformat(),
} }
@ -629,6 +721,7 @@ def approve_queue_entry(entry_id: str) -> dict:
entry.get("role"), entry.get("role"),
entry.get("service"), entry.get("service"),
entry.get("model_size_bytes", 0), entry.get("model_size_bytes", 0),
entry.get("quant_pattern"),
), ),
daemon=True, daemon=True,
name=f"model-download-{entry_id}", name=f"model-download-{entry_id}",
@ -638,6 +731,32 @@ def approve_queue_entry(entry_id: str) -> dict:
return {"ok": True} return {"ok": True}
# ── PATCH /queue/{id} ─────────────────────────────────────────────────────────
class QueuePatchRequest(BaseModel):
service: str | None = None
role: str | None = None
@router.patch("/queue/{entry_id}")
def patch_queue_entry(entry_id: str, body: QueuePatchRequest) -> dict:
"""Update mutable fields (service, role) on a pending queue entry."""
entry = _get_queue_entry(entry_id)
if entry is None:
raise HTTPException(404, f"Queue entry {entry_id!r} not found")
if entry.get("status") != "pending":
raise HTTPException(409, f"Only pending entries can be patched (current: {entry.get('status')!r})")
updates: dict = {}
if body.service is not None:
updates["service"] = body.service
if body.role is not None:
updates["role"] = body.role
updated = _update_queue_entry(entry_id, updates)
return updated or {}
# ── DELETE /queue/{id} ───────────────────────────────────────────────────────── # ── DELETE /queue/{id} ─────────────────────────────────────────────────────────
@router.delete("/queue/{entry_id}") @router.delete("/queue/{entry_id}")

View file

@ -41,11 +41,15 @@ cforch:
# Python interpreter with cf-orch installed # Python interpreter with cf-orch installed
python_bin: /devl/miniconda3/envs/cf/bin/python python_bin: /devl/miniconda3/envs/cf/bin/python
# Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST # Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST / CF_JUDGE_URL / HF_TOKEN
# coordinator_url: http://localhost:7700 # coordinator_url: http://localhost:7700
# license_key: CFG-AVCT-xxxx-xxxx-xxxx # license_key: CFG-AVCT-xxxx-xxxx-xxxx
# ollama_url: http://localhost:11434 # ollama_url: http://localhost:11434
# ollama_model: llama3.2:3b # ollama_model: llama3.2:3b
# judge_url: http://10.1.10.158:8008 # Sif cf-text — LLM-as-judge secondary scorer
# judge_url: http://10.1.10.71:8008 # Heimdall cf-text (alternative)
# Or set CF_JUDGE_URL. Populates the Judge URL field in the LLM Eval UI automatically.
# hf_token: hf_xxxxxxxxxxxxxxxxxxxx # HuggingFace token — required for gated/terms-restricted models
# Imitate tab — pull real samples from sibling CF product APIs and run them # Imitate tab — pull real samples from sibling CF product APIs and run them
# through local LLMs to build a corrections dataset. # through local LLMs to build a corrections dataset.

View file

@ -90,6 +90,12 @@ usage() {
echo -e " ${GREEN}score [args]${NC} Shortcut: --score [args]" echo -e " ${GREEN}score [args]${NC} Shortcut: --score [args]"
echo -e " ${GREEN}compare [args]${NC} Shortcut: --compare [args]" echo -e " ${GREEN}compare [args]${NC} Shortcut: --compare [args]"
echo "" echo ""
echo " Planning Benchmark:"
echo -e " ${GREEN}plans-bench [args]${NC} Run benchmark_plans.py (args passed through)"
echo -e " ${GREEN}plans-list${NC} Shortcut: --list-models"
echo -e " ${GREEN}plans-run <model> [args]${NC} Run a single model (--verbose auto-added)"
echo -e " ${GREEN}plans-compare <m1> <m2> [more]${NC} Compare models side-by-side"
echo ""
echo " Writing Style Benchmark:" echo " Writing Style Benchmark:"
echo -e " ${GREEN}style-bench [args]${NC} Run benchmark_style.py (args passed through)" echo -e " ${GREEN}style-bench [args]${NC} Run benchmark_style.py (args passed through)"
echo -e " ${GREEN}style-list${NC} List available ollama models for style bench" echo -e " ${GREEN}style-list${NC} List available ollama models for style bench"
@ -127,6 +133,8 @@ case "$CMD" in
fi fi
mkdir -p "$LOG_DIR" mkdir -p "$LOG_DIR"
API_LOG="${LOG_DIR}/api.log" API_LOG="${LOG_DIR}/api.log"
# Load .env if present — sets HF_TOKEN and other optional overrides.
[[ -f .env ]] && set -a && source .env && set +a
info "Building Vue SPA…" info "Building Vue SPA…"
(cd web && npm run build) >> "$API_LOG" 2>&1 (cd web && npm run build) >> "$API_LOG" 2>&1
info "Starting FastAPI on port ${API_PORT}" info "Starting FastAPI on port ${API_PORT}"
@ -179,6 +187,9 @@ case "$CMD" in
mkdir -p "$LOG_DIR" mkdir -p "$LOG_DIR"
DEV_API_LOG="${LOG_DIR}/dev-api.log" DEV_API_LOG="${LOG_DIR}/dev-api.log"
# Load .env if present — sets HF_TOKEN and other optional overrides.
[[ -f .env ]] && set -a && source .env && set +a
if [[ -f "$DEV_API_PID_FILE" ]] && kill -0 "$(<"$DEV_API_PID_FILE")" 2>/dev/null; then if [[ -f "$DEV_API_PID_FILE" ]] && kill -0 "$(<"$DEV_API_PID_FILE")" 2>/dev/null; then
warn "Dev API already running (PID $(<"$DEV_API_PID_FILE"))" warn "Dev API already running (PID $(<"$DEV_API_PID_FILE"))"
else else
@ -255,6 +266,30 @@ case "$CMD" in
exec "$0" benchmark --compare "$@" exec "$0" benchmark --compare "$@"
;; ;;
plans-bench)
info "Running planning benchmark (${ENV_UI})…"
"$PYTHON_UI" scripts/benchmark_plans.py "$@"
;;
plans-list)
exec "$0" plans-bench --list-models
;;
plans-run)
if [[ $# -lt 1 ]]; then
error "Usage: ./manage.sh plans-run <model-key> [extra args]"
fi
MODEL="$1"; shift
exec "$0" plans-bench --model "$MODEL" --verbose "$@"
;;
plans-compare)
if [[ $# -lt 2 ]]; then
error "Usage: ./manage.sh plans-compare <model1> <model2> [more…]"
fi
exec "$0" plans-bench --compare "$@" --verbose
;;
style-bench) style-bench)
info "Running writing style benchmark (${ENV_BM})…" info "Running writing style benchmark (${ENV_BM})…"
if [[ ! -x "$PYTHON_BM" ]]; then if [[ ! -x "$PYTHON_BM" ]]; then

719
scripts/benchmark_plans.py Normal file
View file

@ -0,0 +1,719 @@
#!/usr/bin/env python
"""CF-specific planning benchmark — compare base models before fine-tuning.
Sends held-out CircuitForge planning prompts to one or more models via the
cf-text (local) or cf-orch API, then scores responses against CF-specific
rubrics. Use this to select the best base model for SFT.
Scoring rubrics (each 0-1, summed to total/N):
- task_structure : uses checkbox syntax (- [ ]), git commit steps
- tier_awareness : mentions Free/Paid/Premium/Ultra tiers
- privacy_pillar : mentions privacy/local-inference/no-logging
- safety_pillar : mentions safety, human approval, or reversibility
- accessibility : mentions ND/accessibility/adaptive needs
- license_split : mentions MIT vs BSL or open-core model
- file_paths : uses plausible file path references
- cf_conventions : uses conda run -n cf, /Library/Development/, or known CF dirs
- paired_coherence : (paired only) plan references the design doc's feature name
- length_ok : 3002500 words (under-short = hallucination risk; over-long = padding)
Usage
-----
# List available model targets
python scripts/benchmark_plans.py --list-models
# Run all held-out prompts against a single model, print report
python scripts/benchmark_plans.py --model llama3.2-3b
# Compare two models side-by-side
python scripts/benchmark_plans.py --compare llama3.2-3b mistral-7b
# Run with a custom API base (cf-text default: http://localhost:8080/v1)
python scripts/benchmark_plans.py --model llama3.2-3b --api-base http://localhost:8080/v1
# Export detailed results JSON
python scripts/benchmark_plans.py --model llama3.2-3b --output data/bench_results.json
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Any
import httpx
# ── Paths ──────────────────────────────────────────────────────────────────────
_ROOT = Path(__file__).parent.parent
_DATA_DIR = _ROOT / "data"
CF_TEXT_BASE = "http://localhost:8080/v1"
CF_ORCH_BASE = "http://localhost:8090/v1"
CF_COORD_URL = "http://10.1.10.71:7700" # cf-orch coordinator (LAN)
# ── Held-out prompts ───────────────────────────────────────────────────────────
# These are NOT in the training export (no matching docs in circuitforge-plans/).
# Each prompt exercises a different CF planning domain.
HELD_OUT_PROMPTS: list[dict[str, Any]] = [
{
"id": "ho_001",
"name": "kiwi_barcode_ocr",
"domain": "feature_plan",
"prompt": (
"You are a senior engineer on Kiwi, a CircuitForge pantry-tracking product. "
"Write a detailed implementation plan for adding barcode scanning via device camera "
"and receipt OCR to the item-add flow.\n\n"
"The plan should include: file structure (create/modify), step-by-step task checklist "
"with checkboxes, any DB migrations, and git commit steps."
),
"expected_signals": ["task_structure", "file_paths", "cf_conventions"],
},
{
"id": "ho_002",
"name": "peregrine_ats_scoring",
"domain": "feature_design",
"prompt": (
"Write a design document for Peregrine: ATS keyword scoring for job applications.\n\n"
"Context: Peregrine users paste job descriptions and their resume. "
"We want to score how well the resume keywords match the JD and suggest rewrites. "
"Describe the architecture, data flow, and key design decisions."
),
"expected_signals": ["privacy_pillar", "tier_awareness", "license_split"],
},
{
"id": "ho_003",
"name": "tier_gate_local_llm",
"domain": "architecture",
"prompt": (
"Design the tier-gating architecture for a new CircuitForge product. "
"The product should:\n"
"- Default to local LLM inference for all tiers\n"
"- Unlock cloud LLM for Paid tier and above\n"
"- Keep fine-tuned model weights for Premium/Ultra only\n\n"
"Describe how the tier check integrates with the LLM router, "
"what happens when a Free user tries a Paid-tier feature, "
"and how BYOK (bring-your-own-key) fits in."
),
"expected_signals": ["tier_awareness", "privacy_pillar", "license_split"],
},
{
"id": "ho_004",
"name": "heimdall_webhook_plan",
"domain": "feature_plan",
"prompt": (
"Break the following Heimdall feature into a detailed implementation plan with "
"file structure and task checkboxes — Stripe webhook handler for subscription lifecycle.\n\n"
"Heimdall is the CircuitForge license server (FastAPI + SQLite). "
"The webhook needs to handle checkout.session.completed, "
"customer.subscription.updated, and customer.subscription.deleted events."
),
"expected_signals": ["task_structure", "file_paths", "safety_pillar"],
},
{
"id": "ho_005",
"name": "nd_accessible_onboarding",
"domain": "ux_design",
"prompt": (
"You are a product designer working on Harrier, a CircuitForge tool for "
"helping people navigate government benefits applications.\n\n"
"Design the onboarding flow for neurodivergent (ND) users. "
"Consider: ADHD time-blindness, executive function challenges, demand avoidance, "
"and rejection sensitivity. The flow should reduce cognitive load and "
"never use urgency or panic patterns."
),
"expected_signals": ["accessibility", "safety_pillar", "privacy_pillar"],
},
{
"id": "ho_006",
"name": "circuitforge_core_extraction",
"domain": "architecture",
"prompt": (
"Produce a CircuitForge-style design document for the following circuitforge-core "
"feature — shared ActivityPub federation module.\n\n"
"Background: Multiple CF products (Kiwi, Rook, Snipe) want to publish updates "
"to ActivityPub. Build it once in cf-core (MIT licensed) so all products can use it. "
"Design the module API, describe what belongs in MIT vs BSL, and note federation "
"privacy constraints."
),
"expected_signals": ["license_split", "privacy_pillar", "cf_conventions"],
},
{
"id": "ho_007",
"name": "snipe_trust_score_plan",
"domain": "feature_plan",
"prompt": (
"You are a senior engineer on Snipe, a CircuitForge eBay trust-scoring tool. "
"Write a step-by-step engineering plan for: seller trust score calculation.\n\n"
"The score should combine: feedback ratio, account age, item-specifics completeness, "
"listing photo quality, and shipping time accuracy. "
"Include file structure, test plan, and migration steps."
),
"expected_signals": ["task_structure", "file_paths", "safety_pillar"],
},
{
"id": "ho_008",
"name": "avocet_training_pipeline",
"domain": "feature_plan",
"prompt": (
"Break the following Avocet feature into a detailed implementation plan — "
"end-to-end fine-tuning pipeline from labeled JSONL to deployed GGUF model.\n\n"
"Avocet is the CircuitForge email classifier training tool. "
"The pipeline should: validate the dataset, run LoRA SFT via unsloth, "
"quantize to Q5_K_M GGUF, run the benchmark harness, and register the model "
"in the Avocet model queue if it beats the baseline."
),
"expected_signals": ["task_structure", "file_paths", "cf_conventions"],
},
{
"id": "ho_009",
"name": "privacy_data_flow",
"domain": "architecture",
"prompt": (
"Design the data privacy architecture for a CircuitForge cloud product. "
"Describe: what PII is collected, how it's stored, retention policy, "
"obfuscation strategy for cloud-side logs, and how consent is obtained "
"in plain language. The product handles job applications (resumes, cover letters)."
),
"expected_signals": ["privacy_pillar", "safety_pillar", "accessibility"],
},
{
"id": "ho_010",
"name": "git_workflow_doc",
"domain": "process_doc",
"prompt": (
"Write a developer process document for CircuitForge: conventional commit and "
"branch workflow for a BSL 1.1 open-core product.\n\n"
"Cover: commit message format (type: description), branch naming, "
"when to use feature branches vs direct main commits, "
"how the MIT/BSL split affects which commits go in which branch, "
"and how CI gates on gitleaks for secret scanning."
),
"expected_signals": ["license_split", "cf_conventions", "task_structure"],
},
]
# ── Rubric scoring ─────────────────────────────────────────────────────────────
_TASK_STRUCTURE_RE = re.compile(r"- \[ \]", re.MULTILINE)
_COMMIT_RE = re.compile(r"git commit|git add", re.IGNORECASE)
_TIER_RE = re.compile(r"\b(Free|Paid|Premium|Ultra)\s+tier|\btier\s+(Free|Paid|Premium|Ultra)", re.IGNORECASE)
_PRIVACY_RE = re.compile(r"\b(privacy|local.?inference|no.?logging|no.?pii|user.?data|data.?reten|obfuscat)", re.IGNORECASE)
_SAFETY_RE = re.compile(r"\b(human.?approv|reversib|safety|safe.?default|fail.?safe|harm)", re.IGNORECASE)
_A11Y_RE = re.compile(r"\b(neurodiverg|ND\b|accessib|adaptive|ADHD|autism|executive.?function|demand.?avoid)", re.IGNORECASE)
_LICENSE_RE = re.compile(r"\b(MIT|BSL|open.?core|proprietary|commercial.?licens)", re.IGNORECASE)
_FILE_PATH_RE = re.compile(r"(app/|tests?/|src/|scripts?/)\w[\w/.-]{3,}", re.IGNORECASE)
_CF_CONV_RE = re.compile(r"(conda run -n cf|/Library/Development/CircuitForge|circuitforge-core|manage\.sh)", re.IGNORECASE)
@dataclass
class RubricScore:
task_structure: float = 0.0
tier_awareness: float = 0.0
privacy_pillar: float = 0.0
safety_pillar: float = 0.0
accessibility: float = 0.0
license_split: float = 0.0
file_paths: float = 0.0
cf_conventions: float = 0.0
length_ok: float = 0.0
def total(self) -> float:
vals = [self.task_structure, self.tier_awareness, self.privacy_pillar,
self.safety_pillar, self.accessibility, self.license_split,
self.file_paths, self.cf_conventions, self.length_ok]
return sum(vals) / len(vals)
def as_dict(self) -> dict[str, float]:
return asdict(self)
def score_response(response: str, prompt_meta: dict[str, Any]) -> RubricScore:
words = len(response.split())
s = RubricScore()
# Task structure: needs checkboxes AND at least one commit step
checkbox_hits = len(_TASK_STRUCTURE_RE.findall(response))
has_commit = bool(_COMMIT_RE.search(response))
s.task_structure = min(1.0, checkbox_hits / 5) * 0.7 + (0.3 if has_commit else 0.0)
# Tier awareness
s.tier_awareness = min(1.0, len(_TIER_RE.findall(response)) / 2)
# Privacy pillar
s.privacy_pillar = min(1.0, len(_PRIVACY_RE.findall(response)) / 3)
# Safety pillar
s.safety_pillar = min(1.0, len(_SAFETY_RE.findall(response)) / 2)
# Accessibility
s.accessibility = min(1.0, len(_A11Y_RE.findall(response)) / 2)
# License split awareness
s.license_split = min(1.0, len(_LICENSE_RE.findall(response)) / 2)
# File paths: at least 3 plausible path references
s.file_paths = min(1.0, len(_FILE_PATH_RE.findall(response)) / 3)
# CF conventions
s.cf_conventions = min(1.0, len(_CF_CONV_RE.findall(response)) / 2)
# Length: 2002500 words is healthy; outside = partial credit
if 200 <= words <= 2500:
s.length_ok = 1.0
elif words < 200:
s.length_ok = words / 200
else:
s.length_ok = max(0.0, 1.0 - (words - 2500) / 2500)
return s
# ── Model client ───────────────────────────────────────────────────────────────
# Registry of named model targets (shorthand → {api_base, model_name})
MODEL_REGISTRY: dict[str, dict[str, str]] = {
"deepseek-r1-1.5b": {
"api_base": CF_TEXT_BASE,
"model": "deepseek-r1-1.5b",
"description": "DeepSeek R1 1.5B distill (cf-orch catalog key)",
},
"deepseek-r1-7b-4bit": {
"api_base": CF_TEXT_BASE,
"model": "deepseek-r1-7b-4bit",
"description": "DeepSeek R1 7B distill, 4-bit (cf-orch catalog key)",
},
"deepseek-coder-6.7b-4bit": {
"api_base": CF_TEXT_BASE,
"model": "deepseek-coder-6.7b-4bit",
"description": "DeepSeek Coder 6.7B instruct, 4-bit (cf-orch catalog key)",
},
"granite-4.1-8b": {
"api_base": CF_TEXT_BASE,
"model": "granite-4.1-8b",
"description": "IBM Granite 4.1 8B, 4-bit (cf-orch catalog key)",
},
"qwen2.5-3b": {
"api_base": CF_TEXT_BASE,
"model": "qwen2.5-3b",
"description": "Qwen 2.5 3B Q4 GGUF (cf-orch catalog key, navi only)",
},
"qwen2.5-7b": {
"api_base": CF_TEXT_BASE,
"model": "qwen2.5-7b",
"description": "Qwen 2.5 7B Q4 GGUF (cf-orch catalog key, navi only)",
},
}
# ── cf-orch allocation ─────────────────────────────────────────────────────────
def _cforch_allocate(
model_id: str,
cforch_url: str,
startup_timeout_s: float = 300.0,
) -> tuple[str, str] | None:
"""Allocate a cf-text instance for model_id via the cf-orch coordinator.
Returns (service_url, allocation_id) on success, None on failure.
service_url is the direct node URL exposing /v1/chat/completions.
"""
try:
resp = httpx.post(
f"{cforch_url}/api/services/cf-text/allocate",
json={
"model_candidates": [model_id],
"caller": "avocet",
"pipeline": "plans_benchmark",
},
timeout=120.0,
)
resp.raise_for_status()
data = resp.json()
service_url: str = data["url"]
allocation_id: str = data.get("allocation_id", "")
node_id: str = data.get("node_id", "")
gpu_id: int | None = data.get("gpu_id")
if data.get("started", False) and not data.get("warm", True):
# Use \n so the SSE generator sees the line immediately
print(f" [cold start] loading {model_id!r} — polling every 3s…", flush=True)
t0 = time.monotonic()
deadline = t0 + startup_timeout_s
probe_misses = 0
while time.monotonic() < deadline:
elapsed = time.monotonic() - t0
try:
status = httpx.get(f"{cforch_url}/api/services/cf-text/status", timeout=5.0)
if status.is_success:
instances = status.json().get("instances", [])
match = next(
(i for i in instances
if i.get("node_id") == node_id and i.get("gpu_id") == gpu_id),
None,
)
if match:
probe_misses = 0
state = match.get("state", "")
if state == "running":
print(f" [cold start] ready in {elapsed:.0f}s", flush=True)
return service_url, allocation_id
elif state == "stopped":
print(f" [cold start] failed — service stopped after {elapsed:.0f}s", flush=True)
return None
else:
# still starting — emit keepalive so SSE stream stays alive
print(f" [cold start] state={state!r} elapsed={elapsed:.0f}s", flush=True)
else:
probe_misses += 1
print(f" [cold start] waiting… elapsed={elapsed:.0f}s", flush=True)
if probe_misses >= 6:
try:
h = httpx.get(f"{service_url}/health", timeout=3.0)
if h.is_success:
print(f" [cold start] ready via health check in {elapsed:.0f}s", flush=True)
return service_url, allocation_id
except Exception:
pass
else:
print(f" [cold start] status poll returned {status.status_code}, elapsed={elapsed:.0f}s", flush=True)
except Exception as poll_exc:
print(f" [cold start] poll error: {poll_exc} elapsed={elapsed:.0f}s", flush=True)
time.sleep(3.0)
print(f" [cold start] timed out after {time.monotonic()-t0:.0f}s", flush=True)
return None
return service_url, allocation_id
except Exception as exc:
print(f"[warn] cf-orch allocation failed for {model_id!r}: {exc}", file=sys.stderr)
return None
def _call_model_direct(service_url: str, model: str, prompt: str, timeout: int = 600) -> tuple[str, float]:
"""Call an OpenAI-compatible /v1/chat/completions on a direct service URL."""
t0 = time.monotonic()
resp = httpx.post(
f"{service_url.rstrip('/')}/v1/chat/completions",
json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 2048,
"temperature": 0.2,
},
timeout=timeout,
)
resp.raise_for_status()
latency = time.monotonic() - t0
text = resp.json()["choices"][0]["message"]["content"]
return text, latency
def _call_model(api_base: str, model: str, prompt: str, timeout: int = 180) -> tuple[str, float]:
"""Call an OpenAI-compatible /chat/completions endpoint. Returns (text, latency_s)."""
t0 = time.monotonic()
resp = httpx.post(
f"{api_base}/chat/completions",
json={
"model": model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 2048,
"temperature": 0.2,
},
timeout=timeout,
)
resp.raise_for_status()
latency = time.monotonic() - t0
text = resp.json()["choices"][0]["message"]["content"]
return text, latency
# ── Benchmark runner ───────────────────────────────────────────────────────────
@dataclass
class PromptResult:
prompt_id: str
prompt_name: str
model_key: str
response: str
latency_s: float
word_count: int
scores: dict[str, float]
total_score: float
error: str | None = None
def run_benchmark(
model_key: str,
model_name: str,
prompts: list[dict[str, Any]] | None = None,
verbose: bool = False,
# cf-orch path
use_cforch: bool = False,
cforch_url: str = CF_COORD_URL,
# direct path (used when not cf-orch)
api_base: str = CF_TEXT_BASE,
) -> list[PromptResult]:
"""Run all prompts through one model. Uses cf-orch allocation when use_cforch=True."""
if prompts is None:
prompts = HELD_OUT_PROMPTS
# Allocate once per model when using cf-orch
service_url: str | None = None
if use_cforch:
print(f" Allocating {model_name!r} via cf-orch…", flush=True)
alloc = _cforch_allocate(model_name, cforch_url)
if alloc is None:
# Return all prompts as errors
return [
PromptResult(
prompt_id=p["id"], prompt_name=p["name"], model_key=model_key,
response="", latency_s=0.0, word_count=0, scores={}, total_score=0.0,
error=f"cf-orch allocation failed for {model_name!r}",
)
for p in prompts
]
service_url, _alloc_id = alloc
results: list[PromptResult] = []
for p in prompts:
if verbose:
print(f" [{p['id']}] {p['name']}", end="", flush=True)
try:
if service_url:
response, latency = _call_model_direct(service_url, model_name, p["prompt"])
else:
response, latency = _call_model(api_base, model_name, p["prompt"])
rubric = score_response(response, p)
result = PromptResult(
prompt_id=p["id"],
prompt_name=p["name"],
model_key=model_key,
response=response,
latency_s=round(latency, 2),
word_count=len(response.split()),
scores=rubric.as_dict(),
total_score=round(rubric.total(), 3),
)
if verbose:
print(f"score={result.total_score:.3f} ({result.word_count}w, {latency:.1f}s)")
except Exception as exc:
result = PromptResult(
prompt_id=p["id"],
prompt_name=p["name"],
model_key=model_key,
response="",
latency_s=0.0,
word_count=0,
scores={},
total_score=0.0,
error=str(exc),
)
if verbose:
print(f"ERROR: {exc}")
results.append(result)
return results
# ── Reporting ──────────────────────────────────────────────────────────────────
def _print_single_report(results: list[PromptResult], model_key: str) -> None:
ok = [r for r in results if not r.error]
err = [r for r in results if r.error]
if not ok:
print(f"\n[{model_key}] All {len(err)} prompts failed.\n")
return
avg_total = sum(r.total_score for r in ok) / len(ok)
avg_latency = sum(r.latency_s for r in ok) / len(ok)
# Aggregate per-rubric averages
rubric_keys = list(ok[0].scores.keys())
rubric_avgs = {k: sum(r.scores.get(k, 0) for r in ok) / len(ok) for k in rubric_keys}
print(f"\n{'='*60}")
print(f" Model : {model_key}")
print(f" Prompts: {len(ok)}/{len(results)} passed ({len(err)} errors)")
print(f" Overall score : {avg_total:.3f} (avg latency {avg_latency:.1f}s)")
print(f"\n Rubric breakdown:")
for k, v in sorted(rubric_avgs.items(), key=lambda x: -x[1]):
bar = "" * int(v * 20)
print(f" {k:<22} {v:.3f} {bar}")
print(f"\n Per-prompt scores:")
for r in sorted(ok, key=lambda x: -x.total_score):
flag = "" if r.total_score < 0.3 else " "
print(f" {flag} {r.prompt_id} {r.prompt_name:<35} {r.total_score:.3f} ({r.word_count}w)")
if err:
print(f"\n Errors:")
for r in err:
print(f" {r.prompt_id} {r.prompt_name}: {r.error}")
print(f"{'='*60}\n")
def _print_comparison_table(all_results: dict[str, list[PromptResult]]) -> None:
model_keys = list(all_results.keys())
prompt_ids = [p["id"] for p in HELD_OUT_PROMPTS]
# Scores by (model, prompt_id)
score_map: dict[tuple[str, str], float] = {}
for mk, results in all_results.items():
for r in results:
score_map[(mk, r.prompt_id)] = r.total_score if not r.error else 0.0
col_w = 10
header = f"{'Prompt':<35}" + "".join(f"{mk[:col_w-1]:<{col_w}}" for mk in model_keys)
print(f"\n{'='*len(header)}")
print(" COMPARISON TABLE")
print(f"{'='*len(header)}")
print(f" {header}")
print(f" {'-'*len(header)}")
for pid in prompt_ids:
pname = next(p["name"] for p in HELD_OUT_PROMPTS if p["id"] == pid)
row = f" {pname:<35}"
best = max(score_map.get((mk, pid), 0.0) for mk in model_keys)
for mk in model_keys:
v = score_map.get((mk, pid), 0.0)
marker = "*" if v == best and len(model_keys) > 1 else " "
row += f"{v:.3f}{marker} "
print(row)
print(f" {'-'*len(header)}")
avgs_row = f" {'AVERAGE':<35}"
best_avg = -1.0
avgs: dict[str, float] = {}
for mk in model_keys:
vals = [score_map.get((mk, pid), 0.0) for pid in prompt_ids]
avgs[mk] = sum(vals) / len(vals)
best_avg = max(best_avg, avgs[mk])
for mk in model_keys:
marker = "*" if avgs[mk] == best_avg and len(model_keys) > 1 else " "
avgs_row += f"{avgs[mk]:.3f}{marker} "
print(avgs_row)
print(f"{'='*len(header)}\n")
if len(model_keys) > 1:
winner = max(avgs, key=lambda k: avgs[k])
print(f" Winner: {winner} (avg {avgs[winner]:.3f})\n")
# ── CLI ────────────────────────────────────────────────────────────────────────
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("--list-models", action="store_true",
help="Print registered model shortcuts and exit")
parser.add_argument("--model", metavar="KEY",
help="Benchmark a single model (registry key or raw model name)")
parser.add_argument("--compare", nargs="+", metavar="KEY",
help="Compare two or more models side-by-side")
parser.add_argument("--cforch", action="store_true",
help="Route inference through cf-orch coordinator (allocate per model)")
parser.add_argument("--cforch-url", default=CF_COORD_URL, metavar="URL",
help=f"cf-orch coordinator URL (default: {CF_COORD_URL})")
parser.add_argument("--api-base", default=None,
help="Direct API base URL when not using cf-orch")
parser.add_argument("--model-name", default=None,
help="Override model name sent to API (single-model runs only)")
parser.add_argument("--prompts", nargs="+", metavar="ID",
help="Run only specific prompt IDs (e.g. ho_001 ho_003)")
parser.add_argument("--output", type=Path, default=None,
help="Write detailed JSON results to this path")
parser.add_argument("--workers", type=int, default=1, metavar="N",
help="Run N models concurrently (default 1). Set to number of available nodes.")
parser.add_argument("--verbose", "-v", action="store_true",
help="Print per-prompt progress")
args = parser.parse_args()
if args.list_models:
print("\nRegistered model shortcuts:")
for key, info in MODEL_REGISTRY.items():
print(f" {key:<20} {info['description']}")
print(f"\nDefault endpoints:")
print(f" direct {CF_TEXT_BASE}")
print(f" cf-orch {CF_COORD_URL}")
return
prompts = HELD_OUT_PROMPTS
if args.prompts:
ids = set(args.prompts)
prompts = [p for p in HELD_OUT_PROMPTS if p["id"] in ids]
if not prompts:
print(f"No prompts matched IDs: {args.prompts}", file=sys.stderr)
sys.exit(1)
model_keys: list[str] = []
if args.compare:
model_keys = args.compare
elif args.model:
model_keys = [args.model]
else:
parser.print_help()
sys.exit(0)
all_results: dict[str, list[PromptResult]] = {}
print_lock = threading.Lock()
def _run_one(mk: str) -> tuple[str, list[PromptResult]]:
if mk in MODEL_REGISTRY:
reg = MODEL_REGISTRY[mk]
model_name = args.model_name or reg["model"]
direct_base = args.api_base or reg["api_base"]
else:
model_name = args.model_name or mk
direct_base = args.api_base or CF_TEXT_BASE
if args.cforch:
with print_lock:
print(f"\nRunning [{mk}] via cf-orch ({args.cforch_url}) model={model_name}")
results = run_benchmark(
mk, model_name, prompts=prompts, verbose=args.verbose,
use_cforch=True, cforch_url=args.cforch_url,
)
else:
with print_lock:
print(f"\nRunning [{mk}] → {direct_base} model={model_name}")
results = run_benchmark(
mk, model_name, prompts=prompts, verbose=args.verbose,
api_base=direct_base,
)
with print_lock:
_print_single_report(results, mk)
return mk, results
workers = max(1, args.workers)
if workers == 1 or len(model_keys) == 1:
for mk in model_keys:
mk_out, results = _run_one(mk)
all_results[mk_out] = results
else:
with ThreadPoolExecutor(max_workers=workers) as pool:
futures = {pool.submit(_run_one, mk): mk for mk in model_keys}
for fut in as_completed(futures):
mk_out, results = fut.result()
all_results[mk_out] = results
if len(model_keys) > 1:
_print_comparison_table(all_results)
if args.output:
args.output.parent.mkdir(parents=True, exist_ok=True)
payload = {
mk: [asdict(r) for r in results]
for mk, results in all_results.items()
}
with open(args.output, "w", encoding="utf-8") as f:
json.dump(payload, f, indent=2, ensure_ascii=False)
print(f"Wrote detailed results to {args.output}")
if __name__ == "__main__":
main()

458
scripts/export_plans.py Normal file
View file

@ -0,0 +1,458 @@
"""Export circuitforge-plans/ documents as instruction-tuning JSONL pairs.
Each record is a HuggingFace chat-format example:
{
"id": "<sha256>",
"messages": [
{"role": "user", "content": "<reconstructed planning prompt>"},
{"role": "assistant", "content": "<cleaned document content>"}
],
"meta": {
"source": "peregrine/2026-03-03-feedback-button-design.md",
"product": "peregrine",
"doc_type": "design", # design | plan | spec | implementation | other
"date": "2026-03-03",
"paired_with": "...", # sibling path, or null
"word_count": 1847,
"pair_role": "context" # "context" | "target" | "standalone"
}
}
Pairing strategy
----------------
When a design doc and a plan doc share the same date + feature-name prefix,
they are treated as a pair:
- design plan: instruction = "Given this design doc, write the implementation plan."
context appended = full design doc content.
- Solo docs get a synthetic instruction from the title + first overview section.
Usage
-----
# Preview stats and 5 sample records
python scripts/export_plans.py --preview
# Write full output
python scripts/export_plans.py --output data/plan_pairs.jsonl
# Restrict to specific products
python scripts/export_plans.py --products peregrine,kiwi --output data/plan_pairs.jsonl
"""
from __future__ import annotations
import argparse
import hashlib
import json
import re
import sys
from pathlib import Path
from typing import Iterator
# ── Paths ──────────────────────────────────────────────────────────────────────
_SCRIPT_DIR = Path(__file__).parent
_AVOCET_ROOT = _SCRIPT_DIR.parent
_DEFAULT_PLANS_DIR = Path("/Library/Development/CircuitForge/circuitforge-plans")
_DEFAULT_OUTPUT = _AVOCET_ROOT / "data" / "plan_pairs.jsonl"
# ── Doc type detection ─────────────────────────────────────────────────────────
_TYPE_RE = re.compile(
r"-(design|plan|spec|implementation|specs|plans)s?$",
re.IGNORECASE,
)
_SKIP_DIRS = {"__pycache__", ".git", "node_modules"}
# Boilerplate lines to strip from document content before using as output.
_BOILERPLATE_RE = re.compile(
r"""
^\s*>\s*\*\*For\s+agentic\s+workers.* # superpowers agent hints
|^\s*>\s*REQUIRED\s+SUB-SKILL.*
|^\s*\*\*Date:\*\*.* # metadata header lines
|\*\*Status:\*\*\s*Complete.* # completed-feature noise
|\*\*Status:\*\*\s*Done.*
|\*\*Product:\*\*.*
|\*\*Repo:\*\*.*
|\*\*Tech\s+Stack:\*\*.*
|\*\*Candidate:\*\*.* # old synthetic personas
|^Candidate:.*
|^Team:.*
""",
re.VERBOSE | re.MULTILINE,
)
# Old repo/path names to normalise to current equivalents.
_PATH_NORMALIZATIONS: list[tuple[re.Pattern, str]] = [
(re.compile(r"/devl/job-seeker", re.IGNORECASE), "/Library/Development/CircuitForge/peregrine"),
(re.compile(r"\bjob-seeker\b", re.IGNORECASE), "peregrine"),
(re.compile(r"Alex Rivera", re.IGNORECASE), "[user]"),
]
# Instruction paraphrase templates per doc type.
# Each entry is (user_prefix, paired_prefix).
# {title}, {product}, {type_phrase}, {overview}, {design_context} are substituted.
_DESIGN_INSTRUCTIONS = [
"Write a design document for {product}: {title}.\n\nContext: {overview}",
"You are a software architect working on {product}. Draft a design spec for: {title}.\n\n{overview}",
"Produce a CircuitForge-style design document for the following {product} feature — {title}.\n\nBackground: {overview}",
]
_PLAN_INSTRUCTIONS = [
"Write an implementation plan for {product}: {title}.\n\nContext: {overview}",
"Break the following {product} feature into a detailed implementation plan with file structure and task checkboxes — {title}.\n\n{overview}",
"You are a senior engineer on {product}. Produce a step-by-step engineering plan for: {title}.\n\n{overview}",
]
_PAIRED_INSTRUCTIONS = [
(
"You are a software architect working on {product}, a CircuitForge product. "
"Given the following design document, write a detailed implementation plan "
"(file structure, task breakdown with checkboxes, migration steps if needed).\n\n"
"---\n{design_context}\n---"
),
(
"The following is a design spec for a {product} feature. "
"Produce a concrete implementation plan: file list, task checklist, any DB migrations needed.\n\n"
"---\n{design_context}\n---"
),
(
"Convert this {product} design document into an actionable implementation plan. "
"Include all files to create/modify, step-by-step tasks with checkboxes, and migration steps.\n\n"
"---\n{design_context}\n---"
),
]
def _doc_type(stem: str) -> str:
m = _TYPE_RE.search(stem)
if not m:
return "other"
raw = m.group(1).lower().rstrip("s")
return {"implementation": "plan"}.get(raw, raw)
def _date_feature(stem: str) -> tuple[str, str]:
"""Return (date, feature_slug) from '2026-03-03-feedback-button-design'."""
m = re.match(r"^(\d{4}-\d{2}-\d{2})-(.+?)(?:-(design|plan|spec|implementation)s?)?$", stem, re.I)
if m:
return m.group(1), m.group(2)
return "", stem
# ── Content extraction ─────────────────────────────────────────────────────────
def _extract_title(content: str) -> str:
m = re.search(r"^#\s+(.+)", content, re.MULTILINE)
return m.group(1).strip() if m else ""
def _extract_overview(content: str) -> str:
"""Return first substantive paragraph or h2 section body (≤300 chars)."""
# Superpowers plans have an explicit **Goal:** line — prefer that.
goal_m = re.search(r"\*\*Goal:\*\*\s*(.+)", content)
if goal_m:
return goal_m.group(1).strip()[:300]
# Otherwise use the body of the first h2 section.
h2_m = re.search(
r"^##\s+\d*\.?\s*.+\n([\s\S]+?)(?=^##|\Z)",
content,
re.MULTILINE,
)
if h2_m:
body = h2_m.group(1).strip()
# Strip markdown bullet/code noise for the instruction
body = re.sub(r"```[\s\S]*?```", "", body)
body = re.sub(r"`[^`]+`", lambda m: m.group().strip("`"), body)
body = re.sub(r"\*\*([^*]+)\*\*", r"\1", body)
body = re.sub(r"\s+", " ", body).strip()
return body[:300]
return ""
def _clean_content(content: str) -> str:
"""Remove boilerplate, normalize old paths/names, collapse whitespace."""
cleaned = _BOILERPLATE_RE.sub("", content)
for pattern, replacement in _PATH_NORMALIZATIONS:
cleaned = pattern.sub(replacement, cleaned)
cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
return cleaned.strip()
def _quality_flags(content: str) -> list[str]:
"""Return a list of quality issue labels found in cleaned content."""
flags = []
if "Alex Rivera" in content or "[user]" in content:
flags.append("persona-residue")
if re.search(r"\bStatus:\s*(Complete|Done|Merged)\b", content):
flags.append("completed-status")
return flags
def _make_instruction(
title: str,
product: str,
doc_type: str,
overview: str,
design_context: str | None = None,
variant: int = 0,
) -> str:
"""Synthesise a natural planning prompt for this document.
variant: 0-2 selects which paraphrase template to use. Caller cycles
through all three to produce multiple training examples per document.
"""
product_label = product.replace("-", " ").title() if product else "CircuitForge"
idx = variant % 3
if design_context:
tmpl = _PAIRED_INSTRUCTIONS[idx]
return tmpl.format(
product=product_label,
design_context=design_context[:2500],
)
templates = _PLAN_INSTRUCTIONS if doc_type in ("plan",) else _DESIGN_INSTRUCTIONS
tmpl = templates[idx]
return tmpl.format(
product=product_label,
title=title,
overview=overview or "",
type_phrase="planning document",
)
def _record_id(content: str, source: str) -> str:
return hashlib.sha256(f"{source}:{content}".encode()).hexdigest()[:16]
# ── Pair discovery ─────────────────────────────────────────────────────────────
def _find_pairs(plans_dir: Path) -> dict[str, list[tuple[str, Path]]]:
"""Return {prefix_key → [(doc_type, path), ...]} for docs sharing date+feature."""
by_prefix: dict[str, list[tuple[str, Path]]] = {}
for path in plans_dir.rglob("*.md"):
if any(part in _SKIP_DIRS for part in path.parts):
continue
if path.name == "README.md":
continue
stem = path.stem
date, feature = _date_feature(stem)
if not date:
continue
key = str(path.parent / f"{date}-{feature}")
by_prefix.setdefault(key, []).append((_doc_type(stem), path))
return by_prefix
# ── Record generation ──────────────────────────────────────────────────────────
def _records_for_group(
doc_type_paths: list[tuple[str, Path]],
plans_dir: Path,
) -> Iterator[dict]:
"""Yield one or more training records for a group of related docs."""
# Separate design vs plan docs within this group
designs = [(t, p) for t, p in doc_type_paths if t in ("design", "spec")]
plans_ = [(t, p) for t, p in doc_type_paths if t in ("plan",)]
others = [(t, p) for t, p in doc_type_paths if t not in ("design", "spec", "plan")]
all_paths = doc_type_paths
if designs and plans_:
# Paired: yield a design→plan record (3 instruction variants)
design_type, design_path = designs[0]
plan_type, plan_path = plans_[0]
design_content = design_path.read_text(encoding="utf-8")
plan_content = plan_path.read_text(encoding="utf-8")
product = _product_from_path(plan_path, plans_dir)
title = _extract_title(plan_content) or plan_path.stem
cleaned = _clean_content(plan_content)
design_cleaned = _clean_content(design_content)
flags = _quality_flags(cleaned)
if len(cleaned.split()) >= 80:
rel_src = str(plan_path.relative_to(plans_dir))
rel_design = str(design_path.relative_to(plans_dir))
for variant in range(3):
instruction = _make_instruction(
title=title,
product=product,
doc_type="plan",
overview=_extract_overview(design_content),
design_context=design_cleaned,
variant=variant,
)
yield {
"id": _record_id(f"v{variant}:{cleaned}", rel_src),
"messages": [
{"role": "user", "content": instruction},
{"role": "assistant", "content": cleaned},
],
"meta": {
"source": rel_src,
"product": product,
"doc_type": "plan",
"date": _date_feature(plan_path.stem)[0],
"paired_with": rel_design,
"word_count": len(cleaned.split()),
"pair_role": "target",
"variant": variant,
"quality_flags": flags,
},
}
# Also yield the design doc as standalone variants
all_paths = [(t, p) for t, p in all_paths if p != plan_path]
# Remaining docs as standalone records (3 instruction variants each)
for doc_type, path in all_paths:
content = path.read_text(encoding="utf-8")
cleaned = _clean_content(content)
if len(cleaned.split()) < 80:
continue
product = _product_from_path(path, plans_dir)
title = _extract_title(content) or path.stem
overview = _extract_overview(content)
flags = _quality_flags(cleaned)
rel_src = str(path.relative_to(plans_dir))
for variant in range(3):
instruction = _make_instruction(
title=title,
product=product,
doc_type=doc_type,
overview=overview,
variant=variant,
)
yield {
"id": _record_id(f"v{variant}:{cleaned}", rel_src),
"messages": [
{"role": "user", "content": instruction},
{"role": "assistant", "content": cleaned},
],
"meta": {
"source": rel_src,
"product": product,
"doc_type": doc_type,
"date": _date_feature(path.stem)[0],
"paired_with": None,
"word_count": len(cleaned.split()),
"pair_role": "standalone",
"variant": variant,
"quality_flags": flags,
},
}
def _product_from_path(path: Path, plans_dir: Path) -> str:
rel = path.relative_to(plans_dir)
return rel.parts[0] if len(rel.parts) > 1 else "shared"
# ── Main export ────────────────────────────────────────────────────────────────
def export(
plans_dir: Path,
products: list[str] | None = None,
) -> list[dict]:
groups = _find_pairs(plans_dir)
records: list[dict] = []
seen_ids: set[str] = set()
for group_key, doc_type_paths in groups.items():
# Filter by product if requested
if products:
paths = [p for _, p in doc_type_paths]
prods = {_product_from_path(p, plans_dir) for p in paths}
if not prods.intersection(products):
continue
for record in _records_for_group(doc_type_paths, plans_dir):
if record["id"] not in seen_ids:
seen_ids.add(record["id"])
records.append(record)
return records
# ── CLI ────────────────────────────────────────────────────────────────────────
def _print_stats(records: list[dict]) -> None:
from collections import Counter
products = Counter(r["meta"]["product"] for r in records)
doc_types = Counter(r["meta"]["doc_type"] for r in records)
pair_roles = Counter(r["meta"]["pair_role"] for r in records)
wc = [r["meta"]["word_count"] for r in records]
wc.sort()
print(f"\n{'='*55}")
print(f" Total records: {len(records)}")
print(f" Word counts : min={wc[0]}, median={wc[len(wc)//2]}, max={wc[-1]}")
print(f"\n By product:")
for p, n in products.most_common():
print(f" {p:<22} {n}")
print(f"\n By doc type:")
for t, n in doc_types.most_common():
print(f" {t:<22} {n}")
print(f"\n Pair roles:")
for r, n in pair_roles.most_common():
print(f" {r:<22} {n}")
print(f"{'='*55}\n")
def _print_sample(records: list[dict], n: int = 3) -> None:
import random
sample = random.sample(records, min(n, len(records)))
for i, rec in enumerate(sample, 1):
meta = rec["meta"]
user_msg = rec["messages"][0]["content"]
asst_msg = rec["messages"][1]["content"]
print(f"\n{''*55}")
print(f"SAMPLE {i}/{n} [{meta['product']} / {meta['doc_type']} / {meta['pair_role']}]")
print(f"source: {meta['source']}")
print(f"\nUSER ({len(user_msg)} chars):\n{user_msg[:500]}{'...' if len(user_msg)>500 else ''}")
print(f"\nASSISTANT ({meta['word_count']} words):\n{asst_msg[:400]}{'...' if len(asst_msg)>400 else ''}")
print(f"\n{''*55}\n")
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument("--plans-dir", type=Path, default=_DEFAULT_PLANS_DIR)
parser.add_argument("--output", type=Path, default=None,
help="Write JSONL to this path (omit for preview-only)")
parser.add_argument("--products", default=None,
help="Comma-separated product filter, e.g. peregrine,kiwi")
parser.add_argument("--preview", action="store_true",
help="Print stats + sample records, don't write output")
parser.add_argument("--samples", type=int, default=3,
help="Number of sample records to show in preview (default 3)")
args = parser.parse_args()
products = [p.strip() for p in args.products.split(",")] if args.products else None
print(f"Scanning {args.plans_dir}", file=sys.stderr)
records = export(args.plans_dir, products=products)
_print_stats(records)
if args.preview or args.output is None:
_print_sample(records, n=args.samples)
if args.output is None:
print("(Pass --output <path> to write JSONL)")
return
args.output.parent.mkdir(parents=True, exist_ok=True)
with open(args.output, "w", encoding="utf-8") as f:
for rec in records:
f.write(json.dumps(rec, ensure_ascii=False) + "\n")
print(f"Wrote {len(records)} records to {args.output}")
if __name__ == "__main__":
main()

View file

@ -14,7 +14,9 @@ from fastapi.testclient import TestClient
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def reset_cforch_globals(tmp_path): def reset_cforch_globals(tmp_path):
"""Redirect _CONFIG_DIR to tmp_path and reset running-state globals.""" """Redirect _CONFIG_DIR to tmp_path, reset running-state globals, and stub
list_installed to return [] so real disk model directories don't bleed into
tests that don't exercise the installed-model merge path."""
from app import cforch as cforch_module from app import cforch as cforch_module
prev_config_dir = cforch_module._CONFIG_DIR prev_config_dir = cforch_module._CONFIG_DIR
@ -25,7 +27,8 @@ def reset_cforch_globals(tmp_path):
cforch_module._BENCH_RUNNING = False cforch_module._BENCH_RUNNING = False
cforch_module._bench_proc = None cforch_module._bench_proc = None
yield tmp_path with patch("app.models.list_installed", return_value=[]):
yield tmp_path
cforch_module.set_config_dir(prev_config_dir) cforch_module.set_config_dir(prev_config_dir)
cforch_module._BENCH_RUNNING = prev_running cforch_module._BENCH_RUNNING = prev_running
@ -141,6 +144,35 @@ def test_models_parses_bench_models_yaml(client, config_dir, tmp_path):
assert m["vram_estimate_mb"] == 6000 assert m["vram_estimate_mb"] == 6000
def test_models_merges_installed_generators(client, config_dir, tmp_path):
"""Installed cf-text/vllm generator models appear in the model list,
deduplicated against bench_models.yaml entries."""
models_file = tmp_path / "bench_models.yaml"
_write_models_yaml(models_file, [
{"name": "llama3", "id": "llama3:8b", "service": "ollama", "tags": [], "vram_estimate_mb": 6000},
{"name": "already-there", "id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "tags": [], "vram_estimate_mb": 8000},
])
_write_config(config_dir, {"bench_models": str(models_file)})
fake_installed = [
# should be included — cf-text generator not already in YAML
{"model_id": "meta-llama/Llama-3.1-8B", "service": "cf-text", "role": "generator", "vram_mb": 16000},
# should be deduped — repo_id matches a YAML entry
{"model_id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "role": "generator", "vram_mb": 8000},
# should be excluded — classifier, not a generator
{"model_id": "cross-encoder/ms-marco-MiniLM-L6", "service": "avocet", "role": "reranker", "vram_mb": 500},
]
with patch("app.models.list_installed", return_value=fake_installed):
r = client.get("/api/cforch/models")
assert r.status_code == 200
ids = [m["id"] for m in r.json()["models"]]
assert "llama3:8b" in ids # from YAML
assert "ibm-granite/granite-4.1-8b" in ids # from YAML (not duplicated)
assert "meta-llama/Llama-3.1-8B" in ids # merged from installed
assert "cross-encoder/ms-marco-MiniLM-L6" not in ids # filtered out (reranker)
assert ids.count("ibm-granite/granite-4.1-8b") == 1 # no duplicate
# ── GET /run ─────────────────────────────────────────────────────────────────── # ── GET /run ───────────────────────────────────────────────────────────────────
def test_run_returns_409_when_already_running(client): def test_run_returns_409_when_already_running(client):

View file

@ -541,3 +541,84 @@ def test_delete_installed_name_with_slash_blocked(client):
except _HTTPException as exc: except _HTTPException as exc:
assert exc.status_code in (400, 404) assert exc.status_code in (400, 404)
raise raise
# ── Catalog registration ───────────────────────────────────────────────────────
_MINIMAL_YAML = """\
services:
cf-text:
max_mb: {max_mb}
catalog:
existing-model:
path: /some/path
vram_mb: 1000
description: "placeholder"
"""
def _make_node_yaml(tmp_path: Path, max_mb: int = 8192) -> Path:
p = tmp_path / "testnode.yaml"
p.write_text(_MINIMAL_YAML.format(max_mb=max_mb), encoding="utf-8")
return p
def test_catalog_registration_fp16_no_env_block(tmp_path):
"""When model fits at FP16, no env block should be written."""
from app import models as models_module
node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
updated = models_module._register_in_node_catalogs(
repo_id="org/SmallModel",
local_path=tmp_path / "org--SmallModel",
vram_mb_fp16=4000,
role="generator",
)
assert "testnode" in updated
content = node_yaml.read_text()
# _catalog_key strips org prefix and lowercases: "org/SmallModel" → "smallmodel"
assert "smallmodel:" in content
assert "CF_TEXT_4BIT" not in content
assert "env:" not in content
def test_catalog_registration_needs_4bit_writes_env_block(tmp_path):
"""When model only fits at 4-bit, env: CF_TEXT_4BIT: '1' must be written."""
from app import models as models_module
node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
updated = models_module._register_in_node_catalogs(
repo_id="org/BigModel",
local_path=tmp_path / "org--BigModel",
vram_mb_fp16=20000, # won't fit at FP16 on 8 GB
role="generator",
)
assert "testnode" in updated
content = node_yaml.read_text()
# _catalog_key: "org/BigModel" → "bigmodel"
assert "bigmodel:" in content
assert "env:" in content
assert 'CF_TEXT_4BIT: "1"' in content
assert "CF_TEXT_4BIT=1 required" in content # description note
def test_catalog_registration_too_large_skipped(tmp_path):
"""Model too large even at 4-bit should not be registered."""
from app import models as models_module
node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
updated = models_module._register_in_node_catalogs(
repo_id="org/HugeModel",
local_path=tmp_path / "org--HugeModel",
vram_mb_fp16=80000, # 4-bit ~22 GB, still won't fit on 8 GB
role="generator",
)
assert updated == []
content = node_yaml.read_text()
assert "hugemodel" not in content

View file

@ -21,21 +21,28 @@
:class="{ active: benchMode === 'style' }" :class="{ active: benchMode === 'style' }"
@click="benchMode = 'style'" @click="benchMode = 'style'"
> Writing Style</button> > Writing Style</button>
<button
class="mode-btn"
:class="{ active: benchMode === 'plans' }"
@click="benchMode = 'plans'"
>📐 Planning</button>
</div> </div>
<ClassifierTab v-if="benchMode === 'classifier'" /> <ClassifierTab v-if="benchMode === 'classifier'" />
<LlmEvalTab v-if="benchMode === 'llm'" /> <LlmEvalTab v-if="benchMode === 'llm'" />
<StyleTab v-if="benchMode === 'style'" /> <StyleTab v-if="benchMode === 'style'" />
<PlansBenchTab v-if="benchMode === 'plans'" />
</div> </div>
</template> </template>
<script setup lang="ts"> <script setup lang="ts">
import { ref } from 'vue' import { ref } from 'vue'
import ClassifierTab from './ClassifierTab.vue' import ClassifierTab from './ClassifierTab.vue'
import LlmEvalTab from './LlmEvalTab.vue' import LlmEvalTab from './LlmEvalTab.vue'
import StyleTab from './StyleTab.vue' import StyleTab from './StyleTab.vue'
import PlansBenchTab from './PlansBenchTab.vue'
type BenchMode = 'classifier' | 'llm' | 'style' type BenchMode = 'classifier' | 'llm' | 'style' | 'plans'
const benchMode = ref<BenchMode>('classifier') const benchMode = ref<BenchMode>('classifier')
</script> </script>

View file

@ -6,6 +6,8 @@
<summary class="picker-summary"> <summary class="picker-summary">
<span class="picker-title">📋 Task Selection</span> <span class="picker-title">📋 Task Selection</span>
<span class="picker-badge">{{ llmTaskBadge }}</span> <span class="picker-badge">{{ llmTaskBadge }}</span>
<button class="picker-bulk-btn" @click.stop.prevent="selectAllTasks()">All</button>
<button class="picker-bulk-btn" @click.stop.prevent="clearAllTasks()">None</button>
</summary> </summary>
<div class="picker-body"> <div class="picker-body">
<div v-if="llmTasksLoading" class="picker-loading">Loading tasks</div> <div v-if="llmTasksLoading" class="picker-loading">Loading tasks</div>
@ -44,6 +46,8 @@
<summary class="picker-summary"> <summary class="picker-summary">
<span class="picker-title">🎯 Model Selection</span> <span class="picker-title">🎯 Model Selection</span>
<span class="picker-badge">{{ llmModelBadge }}</span> <span class="picker-badge">{{ llmModelBadge }}</span>
<button class="picker-bulk-btn" @click.stop.prevent="selectAllModels()">All</button>
<button class="picker-bulk-btn" @click.stop.prevent="clearAllModels()">None</button>
</summary> </summary>
<div class="picker-body"> <div class="picker-body">
<div v-if="llmModelsLoading" class="picker-loading">Loading models</div> <div v-if="llmModelsLoading" class="picker-loading">Loading models</div>
@ -78,6 +82,33 @@
</div> </div>
</details> </details>
<!-- Node Selection -->
<div class="node-picker" v-if="llmNodes.length > 0">
<span class="node-picker-label">Nodes:</span>
<label
v-for="node in llmNodes"
:key="node.node_id"
class="node-chip"
:class="{ 'node-chip--off': !enabledNodes.has(node.node_id), 'node-chip--offline': !node.online }"
:title="node.online ? `${node.node_id} — ${node.gpus.length} GPU(s)` : `${node.node_id} — offline`"
>
<input
type="checkbox"
class="node-chip-check"
:checked="enabledNodes.has(node.node_id)"
:disabled="!node.online || llmRunning"
@change="toggleNode(node.node_id, ($event.target as HTMLInputElement).checked)"
/>
{{ node.node_id }}
<span class="node-chip-status" v-if="!node.online">offline</span>
</label>
<span class="node-picker-hint">
{{ enabledNodeIds.length === llmNodes.filter(n => n.online).length
? 'auto-routing (all nodes)'
: `restricted to: ${enabledNodeIds.join(', ')}` }}
</span>
</div>
<!-- Run Controls --> <!-- Run Controls -->
<div class="run-controls"> <div class="run-controls">
<button <button
@ -88,6 +119,24 @@
{{ llmRunning ? '⏳ Running…' : '▶ Run LLM Eval' }} {{ llmRunning ? '⏳ Running…' : '▶ Run LLM Eval' }}
</button> </button>
<button v-if="llmRunning" class="btn-cancel" @click="cancelLlmBenchmark"> Cancel</button> <button v-if="llmRunning" class="btn-cancel" @click="cancelLlmBenchmark"> Cancel</button>
<input
v-model="llmJudgeUrl"
class="judge-url-input"
placeholder="Judge URL — leave empty to skip LLM judge scoring"
:disabled="llmRunning"
title="Optional: URL of a running cf-text service (e.g. http://10.1.10.158:8008). When set, each LLM response gets a secondary score from the judge model — adds a 'judge' column to results. Empty = primary quality scoring only."
/>
<label class="workers-label" title="Run this many models concurrently (requires multiple GPUs)">
<span class="workers-prefix">workers</span>
<input
v-model.number="llmWorkers"
type="number"
min="1"
max="8"
class="workers-input"
:disabled="llmRunning"
/>
</label>
<span v-if="selectedLlmTasks.size === 0 || selectedLlmModels.size === 0" class="run-hint"> <span v-if="selectedLlmTasks.size === 0 || selectedLlmModels.size === 0" class="run-hint">
Select at least one task and one model to run. Select at least one task and one model to run.
</span> </span>
@ -119,6 +168,7 @@
<tr> <tr>
<th class="hm-label-col">Model</th> <th class="hm-label-col">Model</th>
<th class="hm-model-col">overall</th> <th class="hm-model-col">overall</th>
<th v-if="llmHasJudge" class="hm-model-col hm-judge-col">judge</th>
<th v-for="col in llmTaskTypeCols" :key="col" class="hm-model-col">{{ col }}</th> <th v-for="col in llmTaskTypeCols" :key="col" class="hm-model-col">{{ col }}</th>
<th class="hm-model-col">tok/s</th> <th class="hm-model-col">tok/s</th>
</tr> </tr>
@ -130,6 +180,12 @@
class="hm-value-cell" class="hm-value-cell"
:class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }" :class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }"
>{{ pct(row.avg_quality_score) }}</td> >{{ pct(row.avg_quality_score) }}</td>
<td
v-if="llmHasJudge"
class="hm-value-cell hm-judge-cell"
:class="{ 'bt-best': llmBestByCol['judge'] === row.model_id }"
title="LLM-as-judge secondary score"
>{{ row.avg_judge_score != null ? pct(row.avg_judge_score) : '—' }}</td>
<td <td
v-for="col in llmTaskTypeCols" v-for="col in llmTaskTypeCols"
:key="col" :key="col"
@ -168,6 +224,12 @@ interface CfOrchModel {
vram_estimate_mb?: number vram_estimate_mb?: number
} }
interface CfOrchNode {
node_id: string
online: boolean
gpus: { gpu_id: number; name: string; vram_total_mb: number; vram_free_mb: number }[]
}
interface LlmModelResult { interface LlmModelResult {
model_name: string model_name: string
model_id: string model_id: string
@ -175,9 +237,11 @@ interface LlmModelResult {
avg_tokens_per_sec: number avg_tokens_per_sec: number
avg_completion_ms: number avg_completion_ms: number
avg_quality_score: number avg_quality_score: number
avg_judge_score: number | null
finetune_candidates: number finetune_candidates: number
error_count: number error_count: number
quality_by_task_type: Record<string, number> quality_by_task_type: Record<string, number>
judge_score_by_task_type?: Record<string, number>
} }
// State // State
@ -195,6 +259,10 @@ const llmError = ref('')
const llmResults = ref<LlmModelResult[]>([]) const llmResults = ref<LlmModelResult[]>([])
const llmEventSource = ref<EventSource | null>(null) const llmEventSource = ref<EventSource | null>(null)
const llmLogEl = ref<HTMLElement | null>(null) const llmLogEl = ref<HTMLElement | null>(null)
const llmJudgeUrl = ref('')
const llmWorkers = ref(1)
const llmNodes = ref<CfOrchNode[]>([])
const enabledNodes = ref<Set<string>>(new Set())
// Computed // Computed
const llmTasksByType = computed((): Record<string, CfOrchTask[]> => { const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
@ -239,6 +307,14 @@ const llmTaskTypeCols = computed(() => {
return [...types].sort() return [...types].sort()
}) })
const llmHasJudge = computed(() =>
llmResults.value.some(r => r.avg_judge_score != null)
)
const enabledNodeIds = computed(() =>
llmNodes.value.filter(n => n.online && enabledNodes.value.has(n.node_id)).map(n => n.node_id)
)
const llmBestByCol = computed((): Record<string, string> => { const llmBestByCol = computed((): Record<string, string> => {
const best: Record<string, string> = {} const best: Record<string, string> = {}
if (llmResults.value.length === 0) return best if (llmResults.value.length === 0) return best
@ -249,6 +325,16 @@ const llmBestByCol = computed((): Record<string, string> => {
} }
best['overall'] = bestId best['overall'] = bestId
if (llmHasJudge.value) {
bestId = ''; bestVal = -Infinity
for (const r of llmResults.value) {
if (r.avg_judge_score != null && r.avg_judge_score > bestVal) {
bestVal = r.avg_judge_score; bestId = r.model_id
}
}
best['judge'] = bestId
}
for (const col of llmTaskTypeCols.value) { for (const col of llmTaskTypeCols.value) {
bestId = ''; bestVal = -Infinity bestId = ''; bestVal = -Infinity
for (const r of llmResults.value) { for (const r of llmResults.value) {
@ -306,6 +392,15 @@ function toggleService(models: CfOrchModel[], checked: boolean) {
} }
selectedLlmModels.value = next selectedLlmModels.value = next
} }
function selectAllTasks() { selectedLlmTasks.value = new Set(llmTasks.value.map(t => t.id)) }
function clearAllTasks() { selectedLlmTasks.value = new Set() }
function selectAllModels() { selectedLlmModels.value = new Set(llmModels.value.map(m => m.id)) }
function clearAllModels() { selectedLlmModels.value = new Set() }
function toggleNode(id: string, checked: boolean) {
const next = new Set(enabledNodes.value)
checked ? next.add(id) : next.delete(id)
enabledNodes.value = next
}
// Data loaders // Data loaders
async function loadLlmTasks() { async function loadLlmTasks() {
@ -335,6 +430,21 @@ async function loadLlmResults() {
} }
} }
async function loadLlmConfig() {
const { data } = await useApiFetch<{ judge_url?: string }>('/api/cforch/config')
if (data?.judge_url && !llmJudgeUrl.value) {
llmJudgeUrl.value = data.judge_url
}
}
async function loadLlmNodes() {
const { data } = await useApiFetch<{ nodes: CfOrchNode[] }>('/api/cforch/nodes')
if (data?.nodes) {
llmNodes.value = data.nodes
enabledNodes.value = new Set(data.nodes.filter(n => n.online).map(n => n.node_id))
}
}
// Run / cancel // Run / cancel
function startLlmBenchmark() { function startLlmBenchmark() {
llmRunning.value = true llmRunning.value = true
@ -344,6 +454,15 @@ function startLlmBenchmark() {
const params = new URLSearchParams() const params = new URLSearchParams()
const taskIds = [...selectedLlmTasks.value].join(',') const taskIds = [...selectedLlmTasks.value].join(',')
if (taskIds) params.set('task_ids', taskIds) if (taskIds) params.set('task_ids', taskIds)
const modelIds = [...selectedLlmModels.value].join(',')
if (modelIds) params.set('model_ids', modelIds)
if (llmJudgeUrl.value.trim()) params.set('judge_url', llmJudgeUrl.value.trim())
if (llmWorkers.value > 1) params.set('workers', String(llmWorkers.value))
const onlineNodeIds = llmNodes.value.filter(n => n.online).map(n => n.node_id)
const isRestricted = enabledNodeIds.value.length < onlineNodeIds.length
if (isRestricted && enabledNodeIds.value.length > 0) {
params.set('node_ids', enabledNodeIds.value.join(','))
}
const es = new EventSource(`/api/cforch/run?${params}`) const es = new EventSource(`/api/cforch/run?${params}`)
llmEventSource.value = es llmEventSource.value = es
@ -387,6 +506,8 @@ onMounted(() => {
loadLlmTasks() loadLlmTasks()
loadLlmModels() loadLlmModels()
loadLlmResults() loadLlmResults()
loadLlmConfig()
loadLlmNodes()
}) })
</script> </script>
@ -451,6 +572,43 @@ onMounted(() => {
color: var(--color-text-secondary, #6b7a99); color: var(--color-text-secondary, #6b7a99);
} }
.judge-url-input {
flex: 1;
min-width: 14rem;
max-width: 24rem;
padding: 0.35rem 0.6rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.375rem;
background: var(--color-surface, #fff);
color: var(--color-text, #1a2338);
font-size: 0.8rem;
font-family: var(--font-mono, monospace);
}
.judge-url-input:disabled { opacity: 0.5; }
.judge-url-input::placeholder { color: var(--color-text-secondary, #6b7a99); }
.workers-label {
display: flex;
align-items: center;
gap: 0.3rem;
font-size: 0.8rem;
color: var(--color-text-secondary, #6b7a99);
white-space: nowrap;
}
.workers-prefix { font-family: var(--font-mono, monospace); }
.workers-input {
width: 3.2rem;
padding: 0.35rem 0.4rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.375rem;
background: var(--color-surface, #fff);
color: var(--color-text, #1a2338);
font-size: 0.8rem;
font-family: var(--font-mono, monospace);
text-align: center;
}
.workers-input:disabled { opacity: 0.5; }
/* ── Run log ────────────────────────────────────────────── */ /* ── Run log ────────────────────────────────────────────── */
.run-log { .run-log {
border: 1px solid var(--color-border, #d0d7e8); border: 1px solid var(--color-border, #d0d7e8);
@ -592,6 +750,15 @@ onMounted(() => {
white-space: nowrap; white-space: nowrap;
} }
.hm-judge-col {
background: color-mix(in srgb, var(--color-surface-raised, #e4ebf5) 80%, #c6d5f5);
}
.hm-judge-cell {
background: color-mix(in srgb, var(--color-surface, #fff) 85%, #c6d5f5);
font-style: italic;
opacity: 0.9;
}
/* ── Model Picker ───────────────────────────────────────── */ /* ── Model Picker ───────────────────────────────────────── */
.model-picker { .model-picker {
border: 1px solid var(--color-border, #d0d7e8); border: 1px solid var(--color-border, #d0d7e8);
@ -630,6 +797,24 @@ details[open] .picker-summary::before { content: '▼ '; }
margin-left: auto; margin-left: auto;
} }
.picker-bulk-btn {
padding: 0.1rem 0.45rem;
font-size: 0.7rem;
font-family: var(--font-mono, monospace);
background: var(--color-surface, #fff);
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.25rem;
color: var(--color-text-secondary, #6b7a99);
cursor: pointer;
transition: background 0.12s, color 0.12s;
flex-shrink: 0;
}
.picker-bulk-btn:hover {
background: var(--app-primary, #2A6080);
color: #fff;
border-color: var(--app-primary, #2A6080);
}
.picker-body { .picker-body {
padding: 0.75rem; padding: 0.75rem;
border-top: 1px solid var(--color-border, #d0d7e8); border-top: 1px solid var(--color-border, #d0d7e8);
@ -712,4 +897,61 @@ details[open] .picker-summary::before { content: '▼ '; }
.picker-model-list { padding-left: 0; } .picker-model-list { padding-left: 0; }
.picker-model-name { max-width: 14ch; } .picker-model-name { max-width: 14ch; }
} }
/* ── Node picker ────────────────────────────────────── */
.node-picker {
display: flex;
align-items: center;
gap: 0.5rem;
flex-wrap: wrap;
padding: 0.5rem 0.75rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
background: var(--color-surface-raised, #e4ebf5);
}
.node-picker-label {
font-size: 0.78rem;
font-weight: 600;
color: var(--color-text-secondary, #6b7a99);
text-transform: uppercase;
letter-spacing: 0.04em;
white-space: nowrap;
}
.node-chip {
display: inline-flex;
align-items: center;
gap: 0.3rem;
padding: 0.2rem 0.55rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 1rem;
background: var(--color-surface, #fff);
font-size: 0.78rem;
font-family: var(--font-mono, monospace);
color: var(--color-text, #1a2338);
cursor: pointer;
transition: background 0.12s, opacity 0.12s;
}
.node-chip--off {
opacity: 0.45;
background: transparent;
}
.node-chip--offline {
opacity: 0.35;
cursor: not-allowed;
font-style: italic;
}
.node-chip-check { accent-color: var(--app-primary, #2A6080); }
.node-chip-status {
font-size: 0.66rem;
color: var(--color-text-secondary, #6b7a99);
}
.node-picker-hint {
font-size: 0.72rem;
color: var(--color-text-secondary, #6b7a99);
font-family: var(--font-mono, monospace);
margin-left: auto;
}
</style> </style>

View file

@ -51,8 +51,31 @@
<span v-if="lookupResult.adapter_recommendation" class="chip chip-adapter"> <span v-if="lookupResult.adapter_recommendation" class="chip chip-adapter">
{{ lookupResult.adapter_recommendation }} {{ lookupResult.adapter_recommendation }}
</span> </span>
<span v-if="lookupResult.size != null" class="preview-size"> <span v-if="selectedQuantSize > 0" class="preview-size">
{{ humanBytes(lookupResult.size) }} {{ humanBytes(selectedQuantSize) }}
</span>
</div>
<!-- GGUF quantization picker only shown for GGUF repos -->
<div v-if="lookupResult.gguf_files?.length" class="quant-picker">
<label class="quant-label" for="quant-select">Quantization</label>
<select
id="quant-select"
v-model="selectedQuant"
class="quant-select"
aria-label="Select quantization variant"
>
<option :value="null" disabled>Select quantization</option>
<option
v-for="f in lookupResult.gguf_files"
:key="f.filename"
:value="f.quant_name ?? f.filename"
>
{{ f.quant_name ?? f.filename }} {{ humanBytes(f.size) }}
</option>
</select>
<span class="quant-hint">
Q5_K_M or Q6_K recommended for 8 GB GPUs. Q8_0 for max quality.
</span> </span>
</div> </div>
@ -67,7 +90,7 @@
<button <button
class="btn-primary btn-add-queue" class="btn-primary btn-add-queue"
:disabled="lookupResult.already_installed || lookupResult.already_queued || addingToQueue" :disabled="!canAddToQueue"
@click="addToQueue" @click="addToQueue"
> >
{{ addingToQueue ? 'Adding…' : 'Add to queue' }} {{ addingToQueue ? 'Adding…' : 'Add to queue' }}
@ -99,9 +122,39 @@
<span v-if="model.role" class="chip chip-role">{{ model.role }}</span> <span v-if="model.role" class="chip chip-role">{{ model.role }}</span>
<span v-if="model.service" class="chip" :class="serviceChipClass(model.service)">{{ model.service }}</span> <span v-if="model.service" class="chip" :class="serviceChipClass(model.service)">{{ model.service }}</span>
<span v-if="model.adapter_recommendation" class="chip chip-adapter">{{ model.adapter_recommendation }}</span> <span v-if="model.adapter_recommendation" class="chip chip-adapter">{{ model.adapter_recommendation }}</span>
<span v-if="model.quant_pattern" class="chip chip-quant">{{ model.quant_pattern }}</span>
</div>
<!-- Allow manual service/role assignment for unrecognized pipeline tags -->
<div v-if="!model.service" class="classify-row queue-classify">
<select
class="classify-select"
:value="classifyDraft[model.id]?.service ?? ''"
@change="onServiceChange(model.id, ($event.target as HTMLSelectElement).value)"
aria-label="Assign service"
>
<option value="" disabled>Service</option>
<option v-for="svc in CLASSIFIABLE_SERVICES" :key="svc.value" :value="svc.value">{{ svc.label }}</option>
</select>
<select
class="classify-select"
:value="classifyDraft[model.id]?.role ?? ''"
:disabled="!classifyDraft[model.id]?.service"
@change="(e) => setClassifyRole(model.id, (e.target as HTMLSelectElement).value)"
aria-label="Assign role"
>
<option value="" disabled>Role</option>
<option
v-for="role in rolesForService(classifyDraft[model.id]?.service ?? '')"
:key="role"
:value="role"
>{{ role }}</option>
</select>
</div> </div>
<div class="model-card-actions"> <div class="model-card-actions">
<button class="btn-primary btn-sm" @click="approveModel(model.id)"> <button
class="btn-primary btn-sm"
@click="approveModel(model.id, classifyDraft[model.id])"
>
Approve download Approve download
</button> </button>
</div> </div>
@ -252,6 +305,12 @@ import { ref, computed, onMounted, onUnmounted } from 'vue'
// Type definitions // Type definitions
interface GgufFile {
filename: string
size: number
quant_name: string | null
}
interface LookupResult { interface LookupResult {
repo_id: string repo_id: string
pipeline_tag: string | null pipeline_tag: string | null
@ -260,7 +319,8 @@ interface LookupResult {
service: string | null service: string | null
compatible: boolean compatible: boolean
warning: string | null warning: string | null
size: number | null model_size_bytes: number
gguf_files: GgufFile[] | null
description: string | null description: string | null
already_installed: boolean already_installed: boolean
already_queued: boolean already_queued: boolean
@ -274,6 +334,7 @@ interface QueuedModel {
adapter_recommendation: string | null adapter_recommendation: string | null
role: string | null role: string | null
service: string | null service: string | null
quant_pattern: string | null
} }
interface InstalledModel { interface InstalledModel {
@ -302,6 +363,26 @@ const lookupLoading = ref(false)
const lookupError = ref<string | null>(null) const lookupError = ref<string | null>(null)
const lookupResult = ref<LookupResult | null>(null) const lookupResult = ref<LookupResult | null>(null)
const addingToQueue = ref(false) const addingToQueue = ref(false)
const selectedQuant = ref<string | null>(null)
// Size of the selected GGUF file, or total model size for non-GGUF repos.
const selectedQuantSize = computed<number>(() => {
const r = lookupResult.value
if (!r) return 0
if (r.gguf_files?.length && selectedQuant.value) {
const f = r.gguf_files.find(f => (f.quant_name ?? f.filename) === selectedQuant.value)
return f?.size ?? r.model_size_bytes
}
return r.model_size_bytes
})
// Disable "Add to queue" when a GGUF repo but no quant chosen yet.
const canAddToQueue = computed(() => {
const r = lookupResult.value
if (!r || r.already_installed || r.already_queued || addingToQueue.value) return false
if (r.gguf_files?.length && !selectedQuant.value) return false
return true
})
const queuedModels = ref<QueuedModel[]>([]) const queuedModels = ref<QueuedModel[]>([])
const installedModels = ref<InstalledModel[]>([]) const installedModels = ref<InstalledModel[]>([])
@ -411,6 +492,7 @@ async function doLookup() {
lookupLoading.value = true lookupLoading.value = true
lookupError.value = null lookupError.value = null
lookupResult.value = null lookupResult.value = null
selectedQuant.value = null
try { try {
const res = await fetch(`/api/models/lookup?repo_id=${encodeURIComponent(repoId)}`) const res = await fetch(`/api/models/lookup?repo_id=${encodeURIComponent(repoId)}`)
@ -442,7 +524,15 @@ async function addToQueue() {
const res = await fetch('/api/models/queue', { const res = await fetch('/api/models/queue', {
method: 'POST', method: 'POST',
headers: { 'Content-Type': 'application/json' }, headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ repo_id, pipeline_tag, adapter_recommendation, role, service }), body: JSON.stringify({
repo_id,
pipeline_tag,
adapter_recommendation,
role,
service,
model_size_bytes: selectedQuantSize.value,
quant_pattern: selectedQuant.value,
}),
}) })
if (res.ok) { if (res.ok) {
lookupResult.value = { ...lookupResult.value, already_queued: true } lookupResult.value = { ...lookupResult.value, already_queued: true }
@ -454,8 +544,16 @@ async function addToQueue() {
} }
} }
async function approveModel(id: string) { async function approveModel(id: string, draft?: { service: string; role: string }) {
try { try {
// If the user picked a service/role for an unrecognized model, patch it first.
if (draft?.service && draft?.role) {
await fetch(`/api/models/queue/${encodeURIComponent(id)}`, {
method: 'PATCH',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify({ service: draft.service, role: draft.role }),
})
}
const res = await fetch(`/api/models/queue/${encodeURIComponent(id)}/approve`, { method: 'POST' }) const res = await fetch(`/api/models/queue/${encodeURIComponent(id)}/approve`, { method: 'POST' })
if (res.ok) { if (res.ok) {
await loadQueue() await loadQueue()
@ -774,6 +872,44 @@ onUnmounted(() => {
align-self: flex-start; align-self: flex-start;
} }
/* ── Quant picker ── */
.quant-picker {
display: flex;
flex-direction: column;
gap: 0.35rem;
}
.quant-label {
font-size: 0.8rem;
font-weight: 600;
color: var(--color-text-muted, #4a5c7a);
text-transform: uppercase;
letter-spacing: 0.04em;
}
.quant-select {
padding: 0.4rem 0.6rem;
border: 1px solid var(--color-border, #a8b8d0);
border-radius: var(--radius-md, 0.5rem);
background: var(--color-surface, #f0f4fb);
color: var(--color-text, #1a2338);
font-size: 0.9rem;
font-family: var(--font-mono, monospace);
cursor: pointer;
}
.quant-hint {
font-size: 0.78rem;
color: var(--color-text-muted, #4a5c7a);
}
.chip-quant {
background: color-mix(in srgb, var(--color-primary, #2A6080) 15%, transparent);
color: var(--color-primary, #2A6080);
font-family: var(--font-mono, monospace);
font-size: 0.75rem;
}
/* ── Model cards (queue + downloads) ── */ /* ── Model cards (queue + downloads) ── */
.model-card { .model-card {
border: 1px solid var(--color-border, #a8b8d0); border: 1px solid var(--color-border, #a8b8d0);

File diff suppressed because it is too large Load diff