feat: plans benchmark harness — model scoring for CF planning prompts
Adds benchmark_plans.py script, plans_bench API router, PlansBenchTab Vue component, and registers /api/plans-bench in api.py. Also extends models registry (cf-text catalog integration), cforch client, LlmEvalTab, and ModelsView with cf-orch fleet support. Wires Planning mode into BenchmarkView.
This commit is contained in:
parent
e11db5ccd9
commit
bce932461a
14 changed files with 3137 additions and 59 deletions
|
|
@ -17,3 +17,7 @@ CF_LICENSE_KEY=CFG-AVCT-xxxx-xxxx-xxxx
|
|||
# Set one of these to use a cloud LLM instead of a local model.
|
||||
# ANTHROPIC_API_KEY=sk-ant-...
|
||||
# OPENAI_API_KEY=sk-...
|
||||
|
||||
# ── HuggingFace (required for gated/terms-restricted model downloads) ─────────
|
||||
# Generate at https://huggingface.co/settings/tokens and accept model terms first.
|
||||
# HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
|
||||
|
|
|
|||
|
|
@ -34,6 +34,12 @@ app.include_router(eval_router, prefix="/api")
|
|||
from app.train.train import router as train_router
|
||||
app.include_router(train_router, prefix="/api/train")
|
||||
|
||||
from app.plans_bench import router as plans_bench_router
|
||||
app.include_router(plans_bench_router, prefix="/api/plans-bench")
|
||||
|
||||
# In-memory last-action store (single user, local tool — in-memory is fine)
|
||||
_last_action: dict | None = None
|
||||
|
||||
from app.dashboard import router as dashboard_router
|
||||
app.include_router(dashboard_router, prefix="/api")
|
||||
|
||||
|
|
|
|||
220
app/cforch.py
220
app/cforch.py
|
|
@ -17,9 +17,12 @@ import logging
|
|||
import os
|
||||
import re
|
||||
import subprocess as _subprocess
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import urllib.parse
|
||||
|
||||
import yaml
|
||||
from fastapi import APIRouter, HTTPException
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
|
@ -75,9 +78,31 @@ def _load_cforch_config() -> dict:
|
|||
"license_key": _coalesce(file_cfg.get("license_key", ""), "CF_LICENSE_KEY"),
|
||||
"ollama_url": _coalesce(file_cfg.get("ollama_url", ""), "OLLAMA_HOST"),
|
||||
"ollama_model": _coalesce(file_cfg.get("ollama_model", ""), "OLLAMA_MODEL"),
|
||||
"judge_url": _coalesce(file_cfg.get("judge_url", ""), "CF_JUDGE_URL"),
|
||||
"hf_token": _coalesce(file_cfg.get("hf_token", ""), "HF_TOKEN"),
|
||||
}
|
||||
|
||||
|
||||
def _validate_service_url(url: str, param_name: str) -> str:
|
||||
"""Validate that a URL is a well-formed http/https URL with a hostname.
|
||||
|
||||
Guards against SSRF: only http/https is allowed; the URL must have a
|
||||
non-empty host. Does not enforce an allowlist — call sites are internal
|
||||
tooling, not a public API.
|
||||
"""
|
||||
if not url:
|
||||
return url
|
||||
try:
|
||||
parsed = urllib.parse.urlparse(url)
|
||||
except Exception:
|
||||
raise HTTPException(400, f"{param_name}: not a valid URL")
|
||||
if parsed.scheme not in ("http", "https"):
|
||||
raise HTTPException(400, f"{param_name}: URL must start with http:// or https://")
|
||||
if not parsed.hostname:
|
||||
raise HTTPException(400, f"{param_name}: URL has no hostname")
|
||||
return url
|
||||
|
||||
|
||||
def _strip_ansi(text: str) -> str:
|
||||
"""Remove ANSI escape codes from a string."""
|
||||
return re.sub(r'\x1b\[[0-9;]*m', '', text)
|
||||
|
|
@ -147,48 +172,141 @@ def get_tasks() -> dict:
|
|||
|
||||
# ── GET /models ────────────────────────────────────────────────────────────────
|
||||
|
||||
# Services and roles surfaced in the benchmark model picker.
|
||||
# Covers all cf-orch service types that benchmark.py can route tasks to.
|
||||
_BENCH_SERVICES = frozenset({
|
||||
"cf-text", "vllm", # LLM text generation
|
||||
"cf-stt", # speech-to-text
|
||||
"cf-tts", # text-to-speech
|
||||
"cf-vision", # image classification / embedding
|
||||
"cf-voice", # audio context classification
|
||||
})
|
||||
_BENCH_ROLES = frozenset({
|
||||
"generator", "vlm", # LLM roles
|
||||
"stt", "alm", # speech recognition
|
||||
"tts", # speech synthesis
|
||||
"vision", "embedding", # image understanding
|
||||
"classifier", # audio classification (cf-voice)
|
||||
})
|
||||
|
||||
|
||||
@router.get("/models")
|
||||
def get_models() -> dict:
|
||||
"""Return model list from bench_models.yaml."""
|
||||
"""Return model list from bench_models.yaml merged with locally installed models.
|
||||
|
||||
bench_models.yaml entries are listed first and take precedence; any installed
|
||||
model whose repo_id is already present in the YAML is skipped. Only models
|
||||
whose service is in _BENCH_SERVICES (cf-text, vllm, cf-stt, cf-tts, cf-vision,
|
||||
cf-voice) are surfaced from the installed registry.
|
||||
"""
|
||||
cfg = _load_cforch_config()
|
||||
models_path = cfg.get("bench_models", "")
|
||||
if not models_path:
|
||||
return {"models": []}
|
||||
|
||||
models: list[dict] = []
|
||||
bench_ids: set[str] = set()
|
||||
|
||||
if models_path:
|
||||
p = Path(models_path)
|
||||
if not p.exists():
|
||||
return {"models": []}
|
||||
|
||||
if p.exists():
|
||||
try:
|
||||
raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
|
||||
except yaml.YAMLError as exc:
|
||||
logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
|
||||
return {"models": []}
|
||||
|
||||
models_raw = raw.get("models", []) or []
|
||||
models: list[dict] = []
|
||||
for m in models_raw:
|
||||
raw = {}
|
||||
for m in (raw.get("models", []) or []):
|
||||
if not isinstance(m, dict):
|
||||
continue
|
||||
model_id = m.get("id", "")
|
||||
models.append({
|
||||
"name": m.get("name", ""),
|
||||
"id": m.get("id", ""),
|
||||
"id": model_id,
|
||||
"service": m.get("service", "ollama"),
|
||||
"tags": m.get("tags", []) or [],
|
||||
"vram_estimate_mb": m.get("vram_estimate_mb", 0),
|
||||
})
|
||||
if model_id:
|
||||
bench_ids.add(model_id)
|
||||
|
||||
# Merge installed generator models not already in bench_models.yaml.
|
||||
try:
|
||||
from app.models import list_installed # local import avoids circular dependency at module load
|
||||
for installed in list_installed():
|
||||
model_id: str = installed.get("model_id") or ""
|
||||
service: str = installed.get("service") or ""
|
||||
role: str = installed.get("role") or ""
|
||||
if not model_id:
|
||||
continue
|
||||
if service not in _BENCH_SERVICES or role not in _BENCH_ROLES:
|
||||
continue
|
||||
if model_id in bench_ids:
|
||||
continue
|
||||
display_name = model_id.split("/", 1)[-1] if "/" in model_id else model_id
|
||||
models.append({
|
||||
"name": display_name,
|
||||
"id": model_id,
|
||||
"service": service,
|
||||
"tags": [role],
|
||||
"vram_estimate_mb": installed.get("vram_mb") or 0,
|
||||
})
|
||||
bench_ids.add(model_id)
|
||||
except Exception as exc:
|
||||
logger.warning("Could not merge installed models into model list: %s", exc)
|
||||
|
||||
return {"models": models}
|
||||
|
||||
|
||||
# ── GET /run ───────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/nodes")
def get_nodes() -> dict:
    """Proxy the coordinator's /api/nodes list, returning node_id, online flag and GPUs.

    A node is reported as online when the coordinator supplies a non-null
    ``last_heartbeat`` for it; the age of the heartbeat is not inspected here.

    Returns:
        ``{"nodes": [...]}`` where each node has ``node_id``, ``online`` and a
        ``gpus`` list (``gpu_id``, ``name``, ``vram_total_mb``, ``vram_free_mb``).
        The list is empty when no coordinator URL is configured or the
        coordinator cannot be reached.
    """
    cfg = _load_cforch_config()
    coordinator_url = (cfg.get("coordinator_url") or "").rstrip("/")
    if not coordinator_url:
        return {"nodes": []}
    try:
        import httpx as _httpx
        resp = _httpx.get(f"{coordinator_url}/api/nodes", timeout=5.0)
        resp.raise_for_status()
        # "nodes": null is treated like a missing key rather than an error.
        raw_nodes = resp.json().get("nodes") or []
        return {
            "nodes": [
                {
                    "node_id": n.get("node_id", ""),
                    "online": n.get("last_heartbeat") is not None,
                    "gpus": [
                        {
                            "gpu_id": g.get("gpu_id"),
                            "name": g.get("name", ""),
                            "vram_total_mb": g.get("vram_total_mb", 0),
                            "vram_free_mb": g.get("vram_free_mb", 0),
                        }
                        # A node reporting "gpus": null (or a malformed GPU
                        # entry) must not discard the whole fleet listing.
                        for g in (n.get("gpus") or [])
                        if isinstance(g, dict)
                    ],
                }
                for n in raw_nodes
                if isinstance(n, dict)
            ]
        }
    except Exception as exc:
        # Best-effort endpoint: the UI degrades to "no nodes" instead of failing.
        logger.warning("Could not fetch nodes from coordinator: %s", exc)
        return {"nodes": []}
|
||||
|
||||
|
||||
@router.get("/run")
|
||||
def run_benchmark(
|
||||
task_ids: str = "",
|
||||
model_ids: str = "",
|
||||
model_tags: str = "",
|
||||
coordinator_url: str = "",
|
||||
ollama_url: str = "",
|
||||
judge_url: str = "",
|
||||
judge_backend: str = "chat",
|
||||
workers: int = 1,
|
||||
node_ids: str = "",
|
||||
) -> StreamingResponse:
|
||||
"""Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
|
||||
global _BENCH_RUNNING, _bench_proc
|
||||
|
|
@ -205,6 +323,13 @@ def run_benchmark(
|
|||
cfg_coordinator = cfg.get("coordinator_url", "")
|
||||
cfg_ollama = cfg.get("ollama_url", "")
|
||||
cfg_license_key = cfg.get("license_key", "")
|
||||
cfg_judge_url = cfg.get("judge_url", "")
|
||||
|
||||
# Validate URL params before spawning the subprocess.
|
||||
# _validate_service_url raises HTTPException on bad input (caught by FastAPI before streaming starts).
|
||||
_validate_service_url(coordinator_url, "coordinator_url")
|
||||
_validate_service_url(ollama_url, "ollama_url")
|
||||
_validate_service_url(judge_url, "judge_url")
|
||||
|
||||
def generate():
|
||||
global _BENCH_RUNNING, _bench_proc
|
||||
|
|
@ -213,16 +338,68 @@ def run_benchmark(
|
|||
yield f"data: {json.dumps({'type': 'error', 'message': 'bench_script not configured or not found'})}\n\n"
|
||||
return
|
||||
|
||||
# Build effective models file: bench_models.yaml + any installed models
|
||||
# whose IDs were selected but are absent from the YAML (e.g. downloaded
|
||||
# via the Models view). Written to a temp file so benchmark.py sees one
|
||||
# unified list; cleaned up in the finally block.
|
||||
effective_models_file = bench_models
|
||||
_tmp_models_path: str | None = None
|
||||
|
||||
if model_ids and bench_models and Path(bench_models).exists():
|
||||
requested_ids = set(model_ids.split(","))
|
||||
try:
|
||||
raw_bench = yaml.safe_load(Path(bench_models).read_text(encoding="utf-8")) or {}
|
||||
bench_entries: list[dict] = raw_bench.get("models", []) or []
|
||||
bench_id_set = {m.get("id", "") for m in bench_entries if isinstance(m, dict)}
|
||||
missing_ids = requested_ids - bench_id_set
|
||||
if missing_ids:
|
||||
from app.models import list_installed
|
||||
installed_map = {
|
||||
m["model_id"]: m
|
||||
for m in list_installed()
|
||||
if m.get("model_id") and m.get("service") in _BENCH_SERVICES
|
||||
}
|
||||
extra: list[dict] = []
|
||||
for mid in missing_ids:
|
||||
if mid in installed_map:
|
||||
inst = installed_map[mid]
|
||||
entry: dict[str, Any] = {
|
||||
"id": mid,
|
||||
"name": mid.split("/", 1)[-1] if "/" in mid else mid,
|
||||
"service": inst.get("service", "cf-text"),
|
||||
"vram_estimate_mb": inst.get("vram_mb") or 0,
|
||||
"tags": [inst.get("role", "generator")],
|
||||
"temperature": 0.0,
|
||||
}
|
||||
local_path = inst.get("path", "") or inst.get("local_path", "")
|
||||
if local_path:
|
||||
entry["model_path"] = local_path
|
||||
extra.append(entry)
|
||||
if extra:
|
||||
merged = {"models": bench_entries + extra}
|
||||
tf = tempfile.NamedTemporaryFile(
|
||||
mode="w", suffix=".yaml", delete=False,
|
||||
prefix="avocet_bench_models_",
|
||||
)
|
||||
yaml.dump(merged, tf)
|
||||
tf.close()
|
||||
_tmp_models_path = tf.name
|
||||
effective_models_file = _tmp_models_path
|
||||
except Exception as exc:
|
||||
logger.warning("Could not merge installed models into temp bench file: %s", exc)
|
||||
|
||||
cmd = [
|
||||
python_bin,
|
||||
bench_script,
|
||||
"--tasks", bench_tasks,
|
||||
"--models", bench_models,
|
||||
"--models", effective_models_file,
|
||||
"--output", results_dir,
|
||||
]
|
||||
|
||||
if task_ids:
|
||||
cmd.extend(["--filter-tasks"] + task_ids.split(","))
|
||||
if model_ids:
|
||||
cmd.extend(["--filter-models"] + model_ids.split(","))
|
||||
if model_tags:
|
||||
cmd.extend(["--filter-tags"] + model_tags.split(","))
|
||||
|
||||
|
|
@ -233,6 +410,15 @@ def run_benchmark(
|
|||
cmd.extend(["--coordinator", effective_coordinator])
|
||||
if effective_ollama:
|
||||
cmd.extend(["--ollama-url", effective_ollama])
|
||||
effective_judge = judge_url if judge_url else cfg_judge_url
|
||||
if effective_judge:
|
||||
cmd.extend(["--judge-url", effective_judge])
|
||||
if judge_backend and judge_backend != "chat":
|
||||
cmd.extend(["--judge-backend", judge_backend])
|
||||
if workers > 1:
|
||||
cmd.extend(["--workers", str(workers)])
|
||||
if node_ids:
|
||||
cmd.extend(["--nodes"] + node_ids.split(","))
|
||||
|
||||
# Pass license key as env var so subprocess can authenticate with cf-orch
|
||||
proc_env = {**os.environ}
|
||||
|
|
@ -273,6 +459,11 @@ def run_benchmark(
|
|||
yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
|
||||
finally:
|
||||
_BENCH_RUNNING = False
|
||||
if _tmp_models_path:
|
||||
try:
|
||||
os.unlink(_tmp_models_path)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
return StreamingResponse(
|
||||
generate(),
|
||||
|
|
@ -295,6 +486,7 @@ def get_cforch_config() -> dict:
|
|||
"coordinator_url": cfg.get("coordinator_url", ""),
|
||||
"ollama_url": cfg.get("ollama_url", ""),
|
||||
"ollama_model": cfg.get("ollama_model", ""),
|
||||
"judge_url": cfg.get("judge_url", ""),
|
||||
"license_key_set": bool(cfg.get("license_key", "")),
|
||||
"source": "env" if not _config_file().exists() else "yaml+env",
|
||||
}
|
||||
|
|
@ -303,7 +495,7 @@ def get_cforch_config() -> dict:
|
|||
# ── GET /results ───────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/results")
|
||||
def get_results() -> dict:
|
||||
def get_results() -> list:
|
||||
"""Return the latest benchmark summary.json from results_dir."""
|
||||
cfg = _load_cforch_config()
|
||||
results_dir = cfg.get("results_dir", "")
|
||||
|
|
|
|||
147
app/models.py
147
app/models.py
|
|
@ -15,6 +15,7 @@ from __future__ import annotations
|
|||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import threading
|
||||
from datetime import datetime, timezone
|
||||
|
|
@ -60,6 +61,30 @@ _CF_ORCH_PROFILES_DIR: Path = Path(
|
|||
|
||||
router = APIRouter()
|
||||
|
||||
# ── HuggingFace auth ─────────────────────────────────────────────────────────
|
||||
|
||||
def _get_hf_token() -> str | None:
    """Return the HuggingFace token, or None when none is configured.

    Lookup order:
      1. ``hf_token`` at the top level of config/label_tool.yaml
      2. ``cforch.hf_token`` in the same file
      3. the HF_TOKEN environment variable
      4. the HUGGING_FACE_HUB_TOKEN environment variable

    Surrounding whitespace is stripped and blank values are skipped. A config
    file that cannot be read or parsed is logged and then ignored, so the
    environment variables still apply.
    """
    config_file = _ROOT / "config" / "label_tool.yaml"
    if config_file.exists():
        try:
            import yaml as _yaml
            raw = _yaml.safe_load(config_file.read_text(encoding="utf-8")) or {}
            # An empty "cforch:" section parses to None, not {}.
            cforch_cfg = raw.get("cforch") or {}
            token = str(raw.get("hf_token") or cforch_cfg.get("hf_token") or "").strip()
            if token:
                return token
        except Exception as exc:
            # Deliberately best-effort, but surface the problem instead of
            # silently falling back to the environment.
            logger.warning("Could not read hf_token from %s: %s", config_file, exc)
    for env_var in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
        env_token = os.environ.get(env_var, "").strip()
        if env_token:
            return env_token
    return None
|
||||
|
||||
|
||||
# ── GGUF quantization detection ───────────────────────────────────────────────
|
||||
# Matches quant identifiers in GGUF filenames: Q4_K_M, Q8_0, F16, IQ3_M, etc.
|
||||
_QUANT_RE = re.compile(
|
||||
r'[._-]((?:IQ\d|Q\d)[A-Z0-9_]*|F16|BF16)\.gguf$',
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
# ── Download progress shared state ────────────────────────────────────────────
|
||||
# Updated by the background download thread; read by GET /download/stream.
|
||||
_download_progress: dict[str, Any] = {}
|
||||
|
|
@ -91,12 +116,15 @@ _TAG_TO_INFO: dict[str, _TagInfo] = {
|
|||
"audio-classification": {"adapter": None, "role": "classifier", "service": "cf-voice"},
|
||||
# TTS — cf-tts text-to-speech service
|
||||
"text-to-speech": {"adapter": None, "role": "tts", "service": "cf-tts"},
|
||||
# Vision — cf-vision image classification / embedding / VLM service
|
||||
# Vision classifiers / embedders — cf-vision (SigLIP/CLIP-style models)
|
||||
"image-classification": {"adapter": None, "role": "vision", "service": "cf-vision"},
|
||||
"zero-shot-image-classification": {"adapter": None, "role": "vision", "service": "cf-vision"},
|
||||
"image-feature-extraction": {"adapter": None, "role": "embedding", "service": "cf-vision"},
|
||||
"image-text-to-text": {"adapter": None, "role": "vlm", "service": "cf-vision"},
|
||||
"visual-question-answering": {"adapter": None, "role": "vlm", "service": "cf-vision"},
|
||||
# Generative VLMs (image+text → text) — run under vllm, not cf-vision.
|
||||
# cf-vision is a classifier/embedder service; generative VLMs like Qwen-VL,
|
||||
# LLaVA, and InternVL are textgen models that happen to accept image inputs.
|
||||
"image-text-to-text": {"adapter": None, "role": "vlm", "service": "vllm"},
|
||||
"visual-question-answering": {"adapter": None, "role": "vlm", "service": "vllm"},
|
||||
# Image generation — cf-image (text → image; distinct from cf-vision image understanding)
|
||||
"text-to-image": {"adapter": None, "role": "image-gen", "service": "cf-image"},
|
||||
# Embedding — cf-core shared embedding layer
|
||||
|
|
@ -197,8 +225,15 @@ def _catalog_key(repo_id: str) -> str:
|
|||
|
||||
ibm-granite/granite-4.1-8b → granite-4.1-8b
|
||||
facebook/bart-large-cnn → bart-large-cnn
|
||||
WithinUsAI/Opus4.7-GODs.Ghost.Codex-4B.GGuF → opus4.7-gods.ghost.codex-4b
|
||||
|
||||
The coordinator skips catalog lookup for keys ending in ".gguf" (treats them
|
||||
as direct file paths). Strip the suffix so GGUF repo names produce valid keys.
|
||||
"""
|
||||
return repo_id.split("/", 1)[-1].lower()
|
||||
key = repo_id.split("/", 1)[-1].lower()
|
||||
if key.endswith(".gguf"):
|
||||
key = key[:-5]
|
||||
return key
|
||||
|
||||
|
||||
def _insert_catalog_entry(content: str, entry_lines: str) -> str:
|
||||
|
|
@ -290,6 +325,15 @@ def _register_in_node_catalogs(
|
|||
max_mb: int = cf_text.get("max_mb", 0)
|
||||
catalog: dict = cf_text.get("catalog") or {}
|
||||
|
||||
# If the node has a different local model dir, remap the NFS path.
|
||||
model_base = cf_text.get("model_base_path", "").rstrip("/")
|
||||
if model_base:
|
||||
nfs_base = str(_CF_TEXT_MODELS_DIR).rstrip("/")
|
||||
model_name = local_path.name
|
||||
effective_path_str = f"{model_base}/{model_name}"
|
||||
else:
|
||||
effective_path_str = local_path_str
|
||||
|
||||
# Skip if key already exists
|
||||
if model_key in catalog:
|
||||
logger.debug("Key %r already in %s — skipping", model_key, yaml_file.name)
|
||||
|
|
@ -301,10 +345,10 @@ def _register_in_node_catalogs(
|
|||
for entry in catalog.values()
|
||||
if isinstance(entry, dict)
|
||||
}
|
||||
if local_path_str in registered_paths or any(
|
||||
p.startswith(local_path_str + "/") for p in registered_paths
|
||||
if effective_path_str in registered_paths or any(
|
||||
p.startswith(effective_path_str + "/") for p in registered_paths
|
||||
):
|
||||
logger.debug("Path %s already registered in %s — skipping", local_path_str, yaml_file.name)
|
||||
logger.debug("Path %s already registered in %s — skipping", effective_path_str, yaml_file.name)
|
||||
continue
|
||||
|
||||
# Determine whether model fits at FP16 or needs 4-bit
|
||||
|
|
@ -330,12 +374,18 @@ def _register_in_node_catalogs(
|
|||
if needs_4bit
|
||||
else f" # FP16 file-size estimate"
|
||||
)
|
||||
env_block = (
|
||||
f" env:\n"
|
||||
f" CF_TEXT_4BIT: \"1\"\n"
|
||||
if needs_4bit else ""
|
||||
)
|
||||
entry_block = (
|
||||
f" # auto-registered by avocet on download\n"
|
||||
f" {model_key}:\n"
|
||||
f" path: {local_path_str}\n"
|
||||
f" path: {effective_path_str}\n"
|
||||
f" vram_mb: {vram_for_node}{vram_comment}\n"
|
||||
f" description: \"{desc}\"\n"
|
||||
f"{env_block}"
|
||||
)
|
||||
|
||||
new_content = _insert_catalog_entry(content, entry_block)
|
||||
|
|
@ -388,12 +438,17 @@ def _run_download(
|
|||
role: str | None = None,
|
||||
service: str | None = None,
|
||||
model_size_bytes: int = 0,
|
||||
quant_pattern: str | None = None,
|
||||
) -> None:
|
||||
"""Background thread: download model via huggingface_hub.snapshot_download.
|
||||
|
||||
model_size_bytes is the sum of file sizes reported by the HF API (siblings).
|
||||
It is used to estimate vram_mb and written to model_info.json so cf-orch can
|
||||
budget VRAM when allocating a cf-text instance for this model.
|
||||
|
||||
quant_pattern: when set, restricts snapshot_download to only files matching
|
||||
*{quant_pattern}*.gguf (plus metadata). Avoids downloading every quant variant
|
||||
from GGUF-only repos like bartowski/*.
|
||||
"""
|
||||
global _download_progress
|
||||
local_dir = _model_dir_for(repo_id, service)
|
||||
|
|
@ -422,10 +477,20 @@ def _run_download(
|
|||
|
||||
local_dir.mkdir(parents=True, exist_ok=True)
|
||||
poll_thread.start()
|
||||
snapshot_download(
|
||||
repo_id=repo_id,
|
||||
local_dir=str(local_dir),
|
||||
)
|
||||
|
||||
dl_kwargs: dict[str, Any] = {"repo_id": repo_id, "local_dir": str(local_dir)}
|
||||
hf_token = _get_hf_token()
|
||||
if hf_token:
|
||||
dl_kwargs["token"] = hf_token
|
||||
if quant_pattern:
|
||||
# Include both cases: repos use mixed conventions (Q6_K vs q6_k).
|
||||
dl_kwargs["allow_patterns"] = [
|
||||
f"*{quant_pattern.upper()}*.gguf",
|
||||
f"*{quant_pattern.lower()}*.gguf",
|
||||
"*.json",
|
||||
"README.md",
|
||||
]
|
||||
snapshot_download(**dl_kwargs)
|
||||
|
||||
# Estimate VRAM from reported file size.
|
||||
# HF siblings sizes are pre-quantisation file sizes; add 10% for KV cache
|
||||
|
|
@ -531,9 +596,31 @@ def lookup_model(repo_id: str) -> dict:
|
|||
)
|
||||
logger.warning("Unsupported pipeline_tag %r for %s", pipeline_tag, repo_id)
|
||||
|
||||
# Estimate model size from siblings list
|
||||
# Detect GGUF files and parse quant names from siblings list.
|
||||
# For GGUF-only repos (bartowski, TheBloke, etc.) this lets the UI show
|
||||
# a per-quant size picker instead of downloading every variant.
|
||||
siblings = data.get("siblings") or []
|
||||
model_size_bytes: int = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))
|
||||
gguf_files: list[dict] = []
|
||||
for s in siblings:
|
||||
if not isinstance(s, dict):
|
||||
continue
|
||||
fname: str = s.get("rfilename", "")
|
||||
if not fname.lower().endswith(".gguf"):
|
||||
continue
|
||||
m = _QUANT_RE.search(fname)
|
||||
gguf_files.append({
|
||||
"filename": fname,
|
||||
"size": s.get("size", 0) or 0,
|
||||
"quant_name": m.group(1).upper() if m else None,
|
||||
})
|
||||
gguf_files.sort(key=lambda f: f["size"])
|
||||
|
||||
# model_size_bytes: total of all siblings (for non-GGUF repos) or all GGUFs only.
|
||||
# For GGUF repos the frontend will substitute the selected quant's size on submit.
|
||||
if gguf_files:
|
||||
model_size_bytes: int = sum(f["size"] for f in gguf_files)
|
||||
else:
|
||||
model_size_bytes = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))
|
||||
|
||||
# Description: first 300 chars of card data (modelId field used as fallback)
|
||||
card_data = data.get("cardData") or {}
|
||||
|
|
@ -549,6 +636,7 @@ def lookup_model(repo_id: str) -> dict:
|
|||
"compatible": compatible,
|
||||
"warning": warning,
|
||||
"model_size_bytes": model_size_bytes,
|
||||
"gguf_files": gguf_files if gguf_files else None,
|
||||
"description": description,
|
||||
"tags": data.get("tags") or [],
|
||||
"downloads": data.get("downloads") or 0,
|
||||
|
|
@ -579,6 +667,9 @@ class QueueAddRequest(BaseModel):
|
|||
# Stored in the queue entry so approve can pass it to _run_download
|
||||
# without a second HF API round-trip.
|
||||
model_size_bytes: int = 0
|
||||
# GGUF quantization pattern (e.g. "Q5_K_M"). When set, snapshot_download
|
||||
# restricts to *{quant_pattern}*.gguf instead of fetching all variants.
|
||||
quant_pattern: str | None = None
|
||||
|
||||
|
||||
@router.post("/queue", status_code=201)
|
||||
|
|
@ -597,6 +688,7 @@ def add_to_queue(req: QueueAddRequest) -> dict:
|
|||
"role": req.role,
|
||||
"service": req.service,
|
||||
"model_size_bytes": req.model_size_bytes,
|
||||
"quant_pattern": req.quant_pattern,
|
||||
"status": "pending",
|
||||
"queued_at": datetime.now(timezone.utc).isoformat(),
|
||||
}
|
||||
|
|
@ -629,6 +721,7 @@ def approve_queue_entry(entry_id: str) -> dict:
|
|||
entry.get("role"),
|
||||
entry.get("service"),
|
||||
entry.get("model_size_bytes", 0),
|
||||
entry.get("quant_pattern"),
|
||||
),
|
||||
daemon=True,
|
||||
name=f"model-download-{entry_id}",
|
||||
|
|
@ -638,6 +731,32 @@ def approve_queue_entry(entry_id: str) -> dict:
|
|||
return {"ok": True}
|
||||
|
||||
|
||||
# ── PATCH /queue/{id} ─────────────────────────────────────────────────────────
|
||||
|
||||
class QueuePatchRequest(BaseModel):
    """Body of PATCH /queue/{entry_id}; fields left as None are not modified."""

    service: str | None = None
    role: str | None = None


@router.patch("/queue/{entry_id}")
def patch_queue_entry(entry_id: str, body: QueuePatchRequest) -> dict:
    """Change the service and/or role of a queue entry that is still pending.

    Raises:
        HTTPException: 404 when no entry has this id, 409 when the entry is
            no longer in the "pending" state.

    Returns:
        The result of ``_update_queue_entry``, or an empty dict when that
        result is falsy.
    """
    entry = _get_queue_entry(entry_id)
    if entry is None:
        raise HTTPException(404, f"Queue entry {entry_id!r} not found")
    if entry.get("status") != "pending":
        raise HTTPException(409, f"Only pending entries can be patched (current: {entry.get('status')!r})")

    # Only fields the client actually supplied are forwarded to the store.
    candidates = (("service", body.service), ("role", body.role))
    updates: dict = {
        field_name: value
        for field_name, value in candidates
        if value is not None
    }

    return _update_queue_entry(entry_id, updates) or {}
|
||||
|
||||
|
||||
# ── DELETE /queue/{id} ─────────────────────────────────────────────────────────
|
||||
|
||||
@router.delete("/queue/{entry_id}")
|
||||
|
|
|
|||
|
|
@ -41,11 +41,15 @@ cforch:
|
|||
# Python interpreter with cf-orch installed
|
||||
python_bin: /devl/miniconda3/envs/cf/bin/python
|
||||
|
||||
# Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST
|
||||
# Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST / CF_JUDGE_URL / HF_TOKEN
|
||||
# coordinator_url: http://localhost:7700
|
||||
# license_key: CFG-AVCT-xxxx-xxxx-xxxx
|
||||
# ollama_url: http://localhost:11434
|
||||
# ollama_model: llama3.2:3b
|
||||
# judge_url: http://10.1.10.158:8008 # Sif cf-text — LLM-as-judge secondary scorer
|
||||
# judge_url: http://10.1.10.71:8008 # Heimdall cf-text (alternative)
|
||||
# Or set CF_JUDGE_URL. Populates the Judge URL field in the LLM Eval UI automatically.
|
||||
# hf_token: hf_xxxxxxxxxxxxxxxxxxxx # HuggingFace token — required for gated/terms-restricted models
|
||||
|
||||
# Imitate tab — pull real samples from sibling CF product APIs and run them
|
||||
# through local LLMs to build a corrections dataset.
|
||||
|
|
|
|||
35
manage.sh
35
manage.sh
|
|
@ -90,6 +90,12 @@ usage() {
|
|||
echo -e " ${GREEN}score [args]${NC} Shortcut: --score [args]"
|
||||
echo -e " ${GREEN}compare [args]${NC} Shortcut: --compare [args]"
|
||||
echo ""
|
||||
echo " Planning Benchmark:"
|
||||
echo -e " ${GREEN}plans-bench [args]${NC} Run benchmark_plans.py (args passed through)"
|
||||
echo -e " ${GREEN}plans-list${NC} Shortcut: --list-models"
|
||||
echo -e " ${GREEN}plans-run <model> [args]${NC} Run a single model (--verbose auto-added)"
|
||||
echo -e " ${GREEN}plans-compare <m1> <m2> [more]${NC} Compare models side-by-side"
|
||||
echo ""
|
||||
echo " Writing Style Benchmark:"
|
||||
echo -e " ${GREEN}style-bench [args]${NC} Run benchmark_style.py (args passed through)"
|
||||
echo -e " ${GREEN}style-list${NC} List available ollama models for style bench"
|
||||
|
|
@ -127,6 +133,8 @@ case "$CMD" in
|
|||
fi
|
||||
mkdir -p "$LOG_DIR"
|
||||
API_LOG="${LOG_DIR}/api.log"
|
||||
# Load .env if present — sets HF_TOKEN and other optional overrides.
|
||||
[[ -f .env ]] && set -a && source .env && set +a
|
||||
info "Building Vue SPA…"
|
||||
(cd web && npm run build) >> "$API_LOG" 2>&1
|
||||
info "Starting FastAPI on port ${API_PORT}…"
|
||||
|
|
@ -179,6 +187,9 @@ case "$CMD" in
|
|||
mkdir -p "$LOG_DIR"
|
||||
DEV_API_LOG="${LOG_DIR}/dev-api.log"
|
||||
|
||||
# Load .env if present — sets HF_TOKEN and other optional overrides.
|
||||
[[ -f .env ]] && set -a && source .env && set +a
|
||||
|
||||
if [[ -f "$DEV_API_PID_FILE" ]] && kill -0 "$(<"$DEV_API_PID_FILE")" 2>/dev/null; then
|
||||
warn "Dev API already running (PID $(<"$DEV_API_PID_FILE"))"
|
||||
else
|
||||
|
|
@ -255,6 +266,30 @@ case "$CMD" in
|
|||
exec "$0" benchmark --compare "$@"
|
||||
;;
|
||||
|
||||
plans-bench)
|
||||
info "Running planning benchmark (${ENV_UI})…"
|
||||
"$PYTHON_UI" scripts/benchmark_plans.py "$@"
|
||||
;;
|
||||
|
||||
plans-list)
|
||||
exec "$0" plans-bench --list-models
|
||||
;;
|
||||
|
||||
plans-run)
|
||||
if [[ $# -lt 1 ]]; then
|
||||
error "Usage: ./manage.sh plans-run <model-key> [extra args]"
|
||||
fi
|
||||
MODEL="$1"; shift
|
||||
exec "$0" plans-bench --model "$MODEL" --verbose "$@"
|
||||
;;
|
||||
|
||||
plans-compare)
|
||||
if [[ $# -lt 2 ]]; then
|
||||
error "Usage: ./manage.sh plans-compare <model1> <model2> [more…]"
|
||||
fi
|
||||
exec "$0" plans-bench --compare "$@" --verbose
|
||||
;;
|
||||
|
||||
style-bench)
|
||||
info "Running writing style benchmark (${ENV_BM})…"
|
||||
if [[ ! -x "$PYTHON_BM" ]]; then
|
||||
|
|
|
|||
719
scripts/benchmark_plans.py
Normal file
719
scripts/benchmark_plans.py
Normal file
|
|
@ -0,0 +1,719 @@
|
|||
#!/usr/bin/env python
|
||||
"""CF-specific planning benchmark — compare base models before fine-tuning.
|
||||
|
||||
Sends held-out CircuitForge planning prompts to one or more models via the
|
||||
cf-text (local) or cf-orch API, then scores responses against CF-specific
|
||||
rubrics. Use this to select the best base model for SFT.
|
||||
|
||||
Scoring rubrics (each 0-1, summed to total/N):
|
||||
- task_structure : uses checkbox syntax (- [ ]), git commit steps
|
||||
- tier_awareness : mentions Free/Paid/Premium/Ultra tiers
|
||||
- privacy_pillar : mentions privacy/local-inference/no-logging
|
||||
- safety_pillar : mentions safety, human approval, or reversibility
|
||||
- accessibility : mentions ND/accessibility/adaptive needs
|
||||
- license_split : mentions MIT vs BSL or open-core model
|
||||
- file_paths : uses plausible file path references
|
||||
- cf_conventions : uses conda run -n cf, /Library/Development/, or known CF dirs
|
||||
- paired_coherence : (paired only) plan references the design doc's feature name
|
||||
- length_ok : 300–2500 words (under-short = hallucination risk; over-long = padding)
|
||||
|
||||
Usage
|
||||
-----
|
||||
# List available model targets
|
||||
python scripts/benchmark_plans.py --list-models
|
||||
|
||||
# Run all held-out prompts against a single model, print report
|
||||
python scripts/benchmark_plans.py --model llama3.2-3b
|
||||
|
||||
# Compare two models side-by-side
|
||||
python scripts/benchmark_plans.py --compare llama3.2-3b mistral-7b
|
||||
|
||||
# Run with a custom API base (cf-text default: http://localhost:8080/v1)
|
||||
python scripts/benchmark_plans.py --model llama3.2-3b --api-base http://localhost:8080/v1
|
||||
|
||||
# Export detailed results JSON
|
||||
python scripts/benchmark_plans.py --model llama3.2-3b --output data/bench_results.json
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from dataclasses import dataclass, field, asdict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
# ── Paths ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
_ROOT = Path(__file__).parent.parent
|
||||
_DATA_DIR = _ROOT / "data"
|
||||
|
||||
CF_TEXT_BASE = "http://localhost:8080/v1"
|
||||
CF_ORCH_BASE = "http://localhost:8090/v1"
|
||||
CF_COORD_URL = "http://10.1.10.71:7700" # cf-orch coordinator (LAN)
|
||||
|
||||
# ── Held-out prompts ───────────────────────────────────────────────────────────
|
||||
# These are NOT in the training export (no matching docs in circuitforge-plans/).
|
||||
# Each prompt exercises a different CF planning domain.
|
||||
|
||||
HELD_OUT_PROMPTS: list[dict[str, Any]] = [
|
||||
{
|
||||
"id": "ho_001",
|
||||
"name": "kiwi_barcode_ocr",
|
||||
"domain": "feature_plan",
|
||||
"prompt": (
|
||||
"You are a senior engineer on Kiwi, a CircuitForge pantry-tracking product. "
|
||||
"Write a detailed implementation plan for adding barcode scanning via device camera "
|
||||
"and receipt OCR to the item-add flow.\n\n"
|
||||
"The plan should include: file structure (create/modify), step-by-step task checklist "
|
||||
"with checkboxes, any DB migrations, and git commit steps."
|
||||
),
|
||||
"expected_signals": ["task_structure", "file_paths", "cf_conventions"],
|
||||
},
|
||||
{
|
||||
"id": "ho_002",
|
||||
"name": "peregrine_ats_scoring",
|
||||
"domain": "feature_design",
|
||||
"prompt": (
|
||||
"Write a design document for Peregrine: ATS keyword scoring for job applications.\n\n"
|
||||
"Context: Peregrine users paste job descriptions and their resume. "
|
||||
"We want to score how well the resume keywords match the JD and suggest rewrites. "
|
||||
"Describe the architecture, data flow, and key design decisions."
|
||||
),
|
||||
"expected_signals": ["privacy_pillar", "tier_awareness", "license_split"],
|
||||
},
|
||||
{
|
||||
"id": "ho_003",
|
||||
"name": "tier_gate_local_llm",
|
||||
"domain": "architecture",
|
||||
"prompt": (
|
||||
"Design the tier-gating architecture for a new CircuitForge product. "
|
||||
"The product should:\n"
|
||||
"- Default to local LLM inference for all tiers\n"
|
||||
"- Unlock cloud LLM for Paid tier and above\n"
|
||||
"- Keep fine-tuned model weights for Premium/Ultra only\n\n"
|
||||
"Describe how the tier check integrates with the LLM router, "
|
||||
"what happens when a Free user tries a Paid-tier feature, "
|
||||
"and how BYOK (bring-your-own-key) fits in."
|
||||
),
|
||||
"expected_signals": ["tier_awareness", "privacy_pillar", "license_split"],
|
||||
},
|
||||
{
|
||||
"id": "ho_004",
|
||||
"name": "heimdall_webhook_plan",
|
||||
"domain": "feature_plan",
|
||||
"prompt": (
|
||||
"Break the following Heimdall feature into a detailed implementation plan with "
|
||||
"file structure and task checkboxes — Stripe webhook handler for subscription lifecycle.\n\n"
|
||||
"Heimdall is the CircuitForge license server (FastAPI + SQLite). "
|
||||
"The webhook needs to handle checkout.session.completed, "
|
||||
"customer.subscription.updated, and customer.subscription.deleted events."
|
||||
),
|
||||
"expected_signals": ["task_structure", "file_paths", "safety_pillar"],
|
||||
},
|
||||
{
|
||||
"id": "ho_005",
|
||||
"name": "nd_accessible_onboarding",
|
||||
"domain": "ux_design",
|
||||
"prompt": (
|
||||
"You are a product designer working on Harrier, a CircuitForge tool for "
|
||||
"helping people navigate government benefits applications.\n\n"
|
||||
"Design the onboarding flow for neurodivergent (ND) users. "
|
||||
"Consider: ADHD time-blindness, executive function challenges, demand avoidance, "
|
||||
"and rejection sensitivity. The flow should reduce cognitive load and "
|
||||
"never use urgency or panic patterns."
|
||||
),
|
||||
"expected_signals": ["accessibility", "safety_pillar", "privacy_pillar"],
|
||||
},
|
||||
{
|
||||
"id": "ho_006",
|
||||
"name": "circuitforge_core_extraction",
|
||||
"domain": "architecture",
|
||||
"prompt": (
|
||||
"Produce a CircuitForge-style design document for the following circuitforge-core "
|
||||
"feature — shared ActivityPub federation module.\n\n"
|
||||
"Background: Multiple CF products (Kiwi, Rook, Snipe) want to publish updates "
|
||||
"to ActivityPub. Build it once in cf-core (MIT licensed) so all products can use it. "
|
||||
"Design the module API, describe what belongs in MIT vs BSL, and note federation "
|
||||
"privacy constraints."
|
||||
),
|
||||
"expected_signals": ["license_split", "privacy_pillar", "cf_conventions"],
|
||||
},
|
||||
{
|
||||
"id": "ho_007",
|
||||
"name": "snipe_trust_score_plan",
|
||||
"domain": "feature_plan",
|
||||
"prompt": (
|
||||
"You are a senior engineer on Snipe, a CircuitForge eBay trust-scoring tool. "
|
||||
"Write a step-by-step engineering plan for: seller trust score calculation.\n\n"
|
||||
"The score should combine: feedback ratio, account age, item-specifics completeness, "
|
||||
"listing photo quality, and shipping time accuracy. "
|
||||
"Include file structure, test plan, and migration steps."
|
||||
),
|
||||
"expected_signals": ["task_structure", "file_paths", "safety_pillar"],
|
||||
},
|
||||
{
|
||||
"id": "ho_008",
|
||||
"name": "avocet_training_pipeline",
|
||||
"domain": "feature_plan",
|
||||
"prompt": (
|
||||
"Break the following Avocet feature into a detailed implementation plan — "
|
||||
"end-to-end fine-tuning pipeline from labeled JSONL to deployed GGUF model.\n\n"
|
||||
"Avocet is the CircuitForge email classifier training tool. "
|
||||
"The pipeline should: validate the dataset, run LoRA SFT via unsloth, "
|
||||
"quantize to Q5_K_M GGUF, run the benchmark harness, and register the model "
|
||||
"in the Avocet model queue if it beats the baseline."
|
||||
),
|
||||
"expected_signals": ["task_structure", "file_paths", "cf_conventions"],
|
||||
},
|
||||
{
|
||||
"id": "ho_009",
|
||||
"name": "privacy_data_flow",
|
||||
"domain": "architecture",
|
||||
"prompt": (
|
||||
"Design the data privacy architecture for a CircuitForge cloud product. "
|
||||
"Describe: what PII is collected, how it's stored, retention policy, "
|
||||
"obfuscation strategy for cloud-side logs, and how consent is obtained "
|
||||
"in plain language. The product handles job applications (resumes, cover letters)."
|
||||
),
|
||||
"expected_signals": ["privacy_pillar", "safety_pillar", "accessibility"],
|
||||
},
|
||||
{
|
||||
"id": "ho_010",
|
||||
"name": "git_workflow_doc",
|
||||
"domain": "process_doc",
|
||||
"prompt": (
|
||||
"Write a developer process document for CircuitForge: conventional commit and "
|
||||
"branch workflow for a BSL 1.1 open-core product.\n\n"
|
||||
"Cover: commit message format (type: description), branch naming, "
|
||||
"when to use feature branches vs direct main commits, "
|
||||
"how the MIT/BSL split affects which commits go in which branch, "
|
||||
"and how CI gates on gitleaks for secret scanning."
|
||||
),
|
||||
"expected_signals": ["license_split", "cf_conventions", "task_structure"],
|
||||
},
|
||||
]
|
||||
|
||||
# ── Rubric scoring ─────────────────────────────────────────────────────────────
|
||||
|
||||
_TASK_STRUCTURE_RE = re.compile(r"- \[ \]", re.MULTILINE)
|
||||
_COMMIT_RE = re.compile(r"git commit|git add", re.IGNORECASE)
|
||||
_TIER_RE = re.compile(r"\b(Free|Paid|Premium|Ultra)\s+tier|\btier\s+(Free|Paid|Premium|Ultra)", re.IGNORECASE)
|
||||
_PRIVACY_RE = re.compile(r"\b(privacy|local.?inference|no.?logging|no.?pii|user.?data|data.?reten|obfuscat)", re.IGNORECASE)
|
||||
_SAFETY_RE = re.compile(r"\b(human.?approv|reversib|safety|safe.?default|fail.?safe|harm)", re.IGNORECASE)
|
||||
_A11Y_RE = re.compile(r"\b(neurodiverg|ND\b|accessib|adaptive|ADHD|autism|executive.?function|demand.?avoid)", re.IGNORECASE)
|
||||
_LICENSE_RE = re.compile(r"\b(MIT|BSL|open.?core|proprietary|commercial.?licens)", re.IGNORECASE)
|
||||
_FILE_PATH_RE = re.compile(r"(app/|tests?/|src/|scripts?/)\w[\w/.-]{3,}", re.IGNORECASE)
|
||||
_CF_CONV_RE = re.compile(r"(conda run -n cf|/Library/Development/CircuitForge|circuitforge-core|manage\.sh)", re.IGNORECASE)
|
||||
|
||||
|
||||
@dataclass
class RubricScore:
    """Container for the individual rubric scores of one response.

    Every field defaults to 0.0; ``total()`` averages all of them with equal
    weight.
    """

    task_structure: float = 0.0
    tier_awareness: float = 0.0
    privacy_pillar: float = 0.0
    safety_pillar: float = 0.0
    accessibility: float = 0.0
    license_split: float = 0.0
    file_paths: float = 0.0
    cf_conventions: float = 0.0
    length_ok: float = 0.0

    def total(self) -> float:
        """Return the unweighted mean of every rubric field."""
        # asdict() yields the fields in declaration order, so the summation
        # order is the same as listing them by hand.
        parts = list(asdict(self).values())
        return sum(parts) / len(parts)

    def as_dict(self) -> dict[str, float]:
        """Return the scores as a plain ``{rubric_name: score}`` mapping."""
        return asdict(self)
|
||||
|
||||
|
||||
def score_response(response: str, prompt_meta: dict[str, Any]) -> RubricScore:
    """Score a model response against the regex-based rubrics.

    Each keyword rubric is ``matches / target`` capped at 1.0. ``prompt_meta``
    is part of the signature but is currently not consulted by the scoring
    logic.
    """

    def _capped(pattern: re.Pattern, target: int) -> float:
        # Fraction of the target hit count reached, never more than 1.0.
        return min(1.0, len(pattern.findall(response)) / target)

    # Length: 200–2500 words earns full credit; shorter responses scale up
    # linearly from 0, longer ones decay linearly to 0 at 5000 words.
    word_total = len(response.split())
    if word_total < 200:
        length_score = word_total / 200
    elif word_total <= 2500:
        length_score = 1.0
    else:
        length_score = max(0.0, 1.0 - (word_total - 2500) / 2500)

    # Task structure: 70% from unchecked checkboxes (5 earns full marks),
    # 30% from mentioning at least one git add/commit step.
    commit_bonus = 0.3 if _COMMIT_RE.search(response) else 0.0

    return RubricScore(
        task_structure=_capped(_TASK_STRUCTURE_RE, 5) * 0.7 + commit_bonus,
        tier_awareness=_capped(_TIER_RE, 2),
        privacy_pillar=_capped(_PRIVACY_RE, 3),
        safety_pillar=_capped(_SAFETY_RE, 2),
        accessibility=_capped(_A11Y_RE, 2),
        license_split=_capped(_LICENSE_RE, 2),
        # File paths: three plausible path references earn full marks.
        file_paths=_capped(_FILE_PATH_RE, 3),
        cf_conventions=_capped(_CF_CONV_RE, 2),
        length_ok=length_score,
    )
|
||||
|
||||
|
||||
# ── Model client ───────────────────────────────────────────────────────────────
|
||||
|
||||
# Registry of named model targets (shorthand → {api_base, model_name})
|
||||
# Every shortcut is also the model name sent to the API and targets the default
# cf-text endpoint, so the table is generated from (shortcut, description) pairs.
MODEL_REGISTRY: dict[str, dict[str, str]] = {
    shortcut: {
        "api_base": CF_TEXT_BASE,
        "model": shortcut,
        "description": blurb,
    }
    for shortcut, blurb in (
        ("deepseek-r1-1.5b", "DeepSeek R1 1.5B distill (cf-orch catalog key)"),
        ("deepseek-r1-7b-4bit", "DeepSeek R1 7B distill, 4-bit (cf-orch catalog key)"),
        ("deepseek-coder-6.7b-4bit", "DeepSeek Coder 6.7B instruct, 4-bit (cf-orch catalog key)"),
        ("granite-4.1-8b", "IBM Granite 4.1 8B, 4-bit (cf-orch catalog key)"),
        ("qwen2.5-3b", "Qwen 2.5 3B Q4 GGUF (cf-orch catalog key, navi only)"),
        ("qwen2.5-7b", "Qwen 2.5 7B Q4 GGUF (cf-orch catalog key, navi only)"),
    )
}
|
||||
|
||||
|
||||
# ── cf-orch allocation ─────────────────────────────────────────────────────────
|
||||
|
||||
def _cforch_allocate(
    model_id: str,
    cforch_url: str,
    startup_timeout_s: float = 300.0,
) -> tuple[str, str] | None:
    """Request a cf-text instance for ``model_id`` from the cf-orch coordinator.

    Returns ``(service_url, allocation_id)`` on success and ``None`` on any
    failure (request error, missing ``url`` in the reply, service stopped, or
    cold-start timeout). ``service_url`` is the direct node URL exposing
    /v1/chat/completions. When the coordinator reports a started but not yet
    warm instance, this blocks until it is ready or ``startup_timeout_s``
    seconds have passed.
    """
    try:
        reply = httpx.post(
            f"{cforch_url}/api/services/cf-text/allocate",
            json={
                "model_candidates": [model_id],
                "caller": "avocet",
                "pipeline": "plans_benchmark",
            },
            timeout=120.0,
        )
        reply.raise_for_status()
        payload = reply.json()
        service_url: str = payload["url"]
        allocation_id: str = payload.get("allocation_id", "")
        node_id: str = payload.get("node_id", "")
        gpu_id: int | None = payload.get("gpu_id")

        needs_warmup = payload.get("started", False) and not payload.get("warm", True)
        if not needs_warmup:
            return service_url, allocation_id

        if _await_cold_start(model_id, cforch_url, service_url, node_id, gpu_id, startup_timeout_s):
            return service_url, allocation_id
        return None
    except Exception as exc:
        print(f"[warn] cf-orch allocation failed for {model_id!r}: {exc}", file=sys.stderr)
        return None


def _await_cold_start(
    model_id: str,
    cforch_url: str,
    service_url: str,
    node_id: str,
    gpu_id: int | None,
    startup_timeout_s: float,
) -> bool:
    """Poll the coordinator every 3s until the allocated instance is usable.

    Returns True once the instance reports ``running`` (or, after six polls
    in which the instance is absent from the status list, its /health endpoint
    answers successfully). Returns False when it reports ``stopped`` or the
    timeout expires. Progress lines are printed and flushed on every poll.
    """
    print(f" [cold start] loading {model_id!r} — polling every 3s…", flush=True)
    started_at = time.monotonic()
    deadline = started_at + startup_timeout_s
    misses = 0

    while time.monotonic() < deadline:
        elapsed = time.monotonic() - started_at
        try:
            status = httpx.get(f"{cforch_url}/api/services/cf-text/status", timeout=5.0)
            if not status.is_success:
                print(f" [cold start] status poll returned {status.status_code}, elapsed={elapsed:.0f}s", flush=True)
            else:
                instance = None
                for candidate in status.json().get("instances", []):
                    if candidate.get("node_id") == node_id and candidate.get("gpu_id") == gpu_id:
                        instance = candidate
                        break

                if instance:
                    misses = 0
                    state = instance.get("state", "")
                    if state == "running":
                        print(f" [cold start] ready in {elapsed:.0f}s", flush=True)
                        return True
                    if state == "stopped":
                        print(f" [cold start] failed — service stopped after {elapsed:.0f}s", flush=True)
                        return False
                    # Any other state means the instance is still coming up.
                    print(f" [cold start] state={state!r} elapsed={elapsed:.0f}s", flush=True)
                else:
                    misses += 1
                    print(f" [cold start] waiting… elapsed={elapsed:.0f}s", flush=True)
                    if misses >= 6:
                        # The instance keeps being absent from the status list:
                        # fall back to probing the node directly (best effort).
                        try:
                            if httpx.get(f"{service_url}/health", timeout=3.0).is_success:
                                print(f" [cold start] ready via health check in {elapsed:.0f}s", flush=True)
                                return True
                        except Exception:
                            pass
        except Exception as poll_exc:
            print(f" [cold start] poll error: {poll_exc} elapsed={elapsed:.0f}s", flush=True)
        time.sleep(3.0)

    print(f" [cold start] timed out after {time.monotonic()-started_at:.0f}s", flush=True)
    return False
|
||||
|
||||
|
||||
def _call_model_direct(service_url: str, model: str, prompt: str, timeout: int = 600) -> tuple[str, float]:
    """Call /v1/chat/completions on a direct service URL. Returns (text, latency_s).

    Any trailing slash on ``service_url`` is dropped before ``/v1`` is appended.
    """
    # The request body and response handling are the same as _call_model();
    # only the base URL (service root + /v1) and the default timeout differ.
    return _call_model(f"{service_url.rstrip('/')}/v1", model, prompt, timeout=timeout)
|
||||
|
||||
|
||||
def _call_model(api_base: str, model: str, prompt: str, timeout: int = 180) -> tuple[str, float]:
    """Call an OpenAI-compatible /chat/completions endpoint.

    Args:
        api_base: Base URL ending in the API version segment, e.g.
            ``http://localhost:8080/v1``. A trailing slash is tolerated.
        model: Model name placed in the request body.
        prompt: Sent as the single user message.
        timeout: Request timeout in seconds, passed through to httpx.

    Returns:
        ``(text, latency_s)`` — the first choice's message content and the
        wall-clock seconds until the response passed the status check.

    Raises:
        httpx.HTTPStatusError: on a non-success status (``raise_for_status``).
        Other httpx transport errors and KeyError/IndexError from an
        unexpected response body propagate to the caller.
    """
    # Strip a trailing slash so "--api-base http://host/v1/" does not produce
    # ".../v1//chat/completions" (matches what _call_model_direct does).
    endpoint = f"{api_base.rstrip('/')}/chat/completions"
    t0 = time.monotonic()
    resp = httpx.post(
        endpoint,
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 2048,
            "temperature": 0.2,
        },
        timeout=timeout,
    )
    resp.raise_for_status()
    latency = time.monotonic() - t0
    text = resp.json()["choices"][0]["message"]["content"]
    return text, latency
|
||||
|
||||
|
||||
# ── Benchmark runner ───────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class PromptResult:
    """Outcome of running one prompt through one model."""

    # Which prompt / model this row belongs to.
    prompt_id: str
    prompt_name: str
    model_key: str
    # Model output and how long the call took.
    response: str
    latency_s: float
    word_count: int
    # Per-rubric scores and their aggregate.
    scores: dict[str, float]
    total_score: float
    # Error message for a failed run; None when the prompt succeeded.
    error: str | None = None
|
||||
|
||||
|
||||
def run_benchmark(
    model_key: str,
    model_name: str,
    prompts: list[dict[str, Any]] | None = None,
    verbose: bool = False,
    # cf-orch path
    use_cforch: bool = False,
    cforch_url: str = CF_COORD_URL,
    # direct path (used when not cf-orch)
    api_base: str = CF_TEXT_BASE,
) -> list[PromptResult]:
    """Run every prompt through one model and score the responses.

    With ``use_cforch=True`` a single instance is allocated through the
    coordinator and reused for all prompts; if allocation fails, every prompt
    is returned as an error result. Otherwise requests go straight to
    ``api_base``. A failing prompt is recorded with ``error`` set and does not
    stop the remaining prompts. ``prompts`` defaults to HELD_OUT_PROMPTS.
    """
    prompt_set = HELD_OUT_PROMPTS if prompts is None else prompts

    def _failure(meta: dict[str, Any], reason: str) -> PromptResult:
        # Zero-score placeholder carrying the error message.
        return PromptResult(
            prompt_id=meta["id"],
            prompt_name=meta["name"],
            model_key=model_key,
            response="",
            latency_s=0.0,
            word_count=0,
            scores={},
            total_score=0.0,
            error=reason,
        )

    # Allocate once per model when using cf-orch.
    service_url: str | None = None
    if use_cforch:
        print(f" Allocating {model_name!r} via cf-orch…", flush=True)
        allocation = _cforch_allocate(model_name, cforch_url)
        if allocation is None:
            return [
                _failure(meta, f"cf-orch allocation failed for {model_name!r}")
                for meta in prompt_set
            ]
        service_url, _ = allocation

    collected: list[PromptResult] = []
    for meta in prompt_set:
        if verbose:
            print(f" [{meta['id']}] {meta['name']} … ", end="", flush=True)
        try:
            if service_url:
                reply, elapsed = _call_model_direct(service_url, model_name, meta["prompt"])
            else:
                reply, elapsed = _call_model(api_base, model_name, meta["prompt"])
            rubric = score_response(reply, meta)
            outcome = PromptResult(
                prompt_id=meta["id"],
                prompt_name=meta["name"],
                model_key=model_key,
                response=reply,
                latency_s=round(elapsed, 2),
                word_count=len(reply.split()),
                scores=rubric.as_dict(),
                total_score=round(rubric.total(), 3),
            )
            if verbose:
                print(f"score={outcome.total_score:.3f} ({outcome.word_count}w, {elapsed:.1f}s)")
        except Exception as exc:
            outcome = _failure(meta, str(exc))
            if verbose:
                print(f"ERROR: {exc}")
        collected.append(outcome)
    return collected
|
||||
|
||||
|
||||
# ── Reporting ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _print_single_report(results: list[PromptResult], model_key: str) -> None:
|
||||
ok = [r for r in results if not r.error]
|
||||
err = [r for r in results if r.error]
|
||||
if not ok:
|
||||
print(f"\n[{model_key}] All {len(err)} prompts failed.\n")
|
||||
return
|
||||
|
||||
avg_total = sum(r.total_score for r in ok) / len(ok)
|
||||
avg_latency = sum(r.latency_s for r in ok) / len(ok)
|
||||
|
||||
# Aggregate per-rubric averages
|
||||
rubric_keys = list(ok[0].scores.keys())
|
||||
rubric_avgs = {k: sum(r.scores.get(k, 0) for r in ok) / len(ok) for k in rubric_keys}
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f" Model : {model_key}")
|
||||
print(f" Prompts: {len(ok)}/{len(results)} passed ({len(err)} errors)")
|
||||
print(f" Overall score : {avg_total:.3f} (avg latency {avg_latency:.1f}s)")
|
||||
print(f"\n Rubric breakdown:")
|
||||
for k, v in sorted(rubric_avgs.items(), key=lambda x: -x[1]):
|
||||
bar = "█" * int(v * 20)
|
||||
print(f" {k:<22} {v:.3f} {bar}")
|
||||
print(f"\n Per-prompt scores:")
|
||||
for r in sorted(ok, key=lambda x: -x.total_score):
|
||||
flag = "⚠" if r.total_score < 0.3 else " "
|
||||
print(f" {flag} {r.prompt_id} {r.prompt_name:<35} {r.total_score:.3f} ({r.word_count}w)")
|
||||
if err:
|
||||
print(f"\n Errors:")
|
||||
for r in err:
|
||||
print(f" {r.prompt_id} {r.prompt_name}: {r.error}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
|
||||
def _print_comparison_table(all_results: dict[str, list[PromptResult]]) -> None:
|
||||
model_keys = list(all_results.keys())
|
||||
prompt_ids = [p["id"] for p in HELD_OUT_PROMPTS]
|
||||
|
||||
# Scores by (model, prompt_id)
|
||||
score_map: dict[tuple[str, str], float] = {}
|
||||
for mk, results in all_results.items():
|
||||
for r in results:
|
||||
score_map[(mk, r.prompt_id)] = r.total_score if not r.error else 0.0
|
||||
|
||||
col_w = 10
|
||||
header = f"{'Prompt':<35}" + "".join(f"{mk[:col_w-1]:<{col_w}}" for mk in model_keys)
|
||||
print(f"\n{'='*len(header)}")
|
||||
print(" COMPARISON TABLE")
|
||||
print(f"{'='*len(header)}")
|
||||
print(f" {header}")
|
||||
print(f" {'-'*len(header)}")
|
||||
|
||||
for pid in prompt_ids:
|
||||
pname = next(p["name"] for p in HELD_OUT_PROMPTS if p["id"] == pid)
|
||||
row = f" {pname:<35}"
|
||||
best = max(score_map.get((mk, pid), 0.0) for mk in model_keys)
|
||||
for mk in model_keys:
|
||||
v = score_map.get((mk, pid), 0.0)
|
||||
marker = "*" if v == best and len(model_keys) > 1 else " "
|
||||
row += f"{v:.3f}{marker} "
|
||||
print(row)
|
||||
|
||||
print(f" {'-'*len(header)}")
|
||||
avgs_row = f" {'AVERAGE':<35}"
|
||||
best_avg = -1.0
|
||||
avgs: dict[str, float] = {}
|
||||
for mk in model_keys:
|
||||
vals = [score_map.get((mk, pid), 0.0) for pid in prompt_ids]
|
||||
avgs[mk] = sum(vals) / len(vals)
|
||||
best_avg = max(best_avg, avgs[mk])
|
||||
for mk in model_keys:
|
||||
marker = "*" if avgs[mk] == best_avg and len(model_keys) > 1 else " "
|
||||
avgs_row += f"{avgs[mk]:.3f}{marker} "
|
||||
print(avgs_row)
|
||||
print(f"{'='*len(header)}\n")
|
||||
if len(model_keys) > 1:
|
||||
winner = max(avgs, key=lambda k: avgs[k])
|
||||
print(f" Winner: {winner} (avg {avgs[winner]:.3f})\n")
|
||||
|
||||
|
||||
# ── CLI ────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse arguments, run the benchmark(s), report, optionally export JSON.

    Exits with status 1 when ``--prompts`` matches nothing and with status 0
    (after printing help) when neither ``--model`` nor ``--compare`` is given.
    """
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--list-models", action="store_true",
                        help="Print registered model shortcuts and exit")
    parser.add_argument("--model", metavar="KEY",
                        help="Benchmark a single model (registry key or raw model name)")
    parser.add_argument("--compare", nargs="+", metavar="KEY",
                        help="Compare two or more models side-by-side")
    parser.add_argument("--cforch", action="store_true",
                        help="Route inference through cf-orch coordinator (allocate per model)")
    parser.add_argument("--cforch-url", default=CF_COORD_URL, metavar="URL",
                        help=f"cf-orch coordinator URL (default: {CF_COORD_URL})")
    parser.add_argument("--api-base", default=None,
                        help="Direct API base URL when not using cf-orch")
    parser.add_argument("--model-name", default=None,
                        help="Override model name sent to API (single-model runs only)")
    parser.add_argument("--prompts", nargs="+", metavar="ID",
                        help="Run only specific prompt IDs (e.g. ho_001 ho_003)")
    parser.add_argument("--output", type=Path, default=None,
                        help="Write detailed JSON results to this path")
    parser.add_argument("--workers", type=int, default=1, metavar="N",
                        help="Run N models concurrently (default 1). Set to number of available nodes.")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Print per-prompt progress")
    args = parser.parse_args()

    if args.list_models:
        # Widen the key column when a shortcut is longer than the default 20.
        key_w = max([20, *(len(k) for k in MODEL_REGISTRY)])
        print("\nRegistered model shortcuts:")
        for key, info in MODEL_REGISTRY.items():
            print(f" {key:<{key_w}} {info['description']}")
        print("\nDefault endpoints:")
        print(f" direct {CF_TEXT_BASE}")
        print(f" cf-orch {CF_COORD_URL}")
        return

    prompts = HELD_OUT_PROMPTS
    if args.prompts:
        ids = set(args.prompts)
        prompts = [p for p in HELD_OUT_PROMPTS if p["id"] in ids]
        if not prompts:
            print(f"No prompts matched IDs: {args.prompts}", file=sys.stderr)
            sys.exit(1)
        # A typo in one ID should not silently shrink the benchmark.
        unknown = sorted(ids - {p["id"] for p in prompts})
        if unknown:
            print(f"[warn] ignoring unknown prompt IDs: {unknown}", file=sys.stderr)

    if args.compare:
        requested = args.compare
    elif args.model:
        requested = [args.model]
    else:
        parser.print_help()
        sys.exit(0)
    # Drop repeated keys (order kept) so no model is benchmarked twice.
    model_keys: list[str] = list(dict.fromkeys(requested))

    # --model-name is documented as single-model only; applying one override
    # to every compared model would benchmark the same model repeatedly.
    name_override = args.model_name
    if name_override and len(model_keys) > 1:
        print("[warn] --model-name is ignored when comparing multiple models", file=sys.stderr)
        name_override = None

    print_lock = threading.Lock()

    def _run_one(mk: str) -> tuple[str, list[PromptResult]]:
        # Resolve the shortcut, run the benchmark, and print this model's
        # report under the lock so concurrent reports do not interleave.
        if mk in MODEL_REGISTRY:
            reg = MODEL_REGISTRY[mk]
            model_name = name_override or reg["model"]
            direct_base = args.api_base or reg["api_base"]
        else:
            model_name = name_override or mk
            direct_base = args.api_base or CF_TEXT_BASE

        if args.cforch:
            with print_lock:
                print(f"\nRunning [{mk}] via cf-orch ({args.cforch_url}) model={model_name}")
            results = run_benchmark(
                mk, model_name, prompts=prompts, verbose=args.verbose,
                use_cforch=True, cforch_url=args.cforch_url,
            )
        else:
            with print_lock:
                print(f"\nRunning [{mk}] → {direct_base} model={model_name}")
            results = run_benchmark(
                mk, model_name, prompts=prompts, verbose=args.verbose,
                api_base=direct_base,
            )

        with print_lock:
            _print_single_report(results, mk)
        return mk, results

    all_results: dict[str, list[PromptResult]] = {}
    workers = max(1, args.workers)
    if workers == 1 or len(model_keys) == 1:
        for mk in model_keys:
            mk_out, results = _run_one(mk)
            all_results[mk_out] = results
    else:
        with ThreadPoolExecutor(max_workers=workers) as pool:
            # map() yields in submission order, so the comparison columns and
            # the JSON keys follow the command line rather than whichever
            # model finished first. Per-model reports still print on completion.
            for mk_out, results in pool.map(_run_one, model_keys):
                all_results[mk_out] = results

    if len(model_keys) > 1:
        _print_comparison_table(all_results)

    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            mk: [asdict(r) for r in results]
            for mk, results in all_results.items()
        }
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        print(f"Wrote detailed results to {args.output}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
458
scripts/export_plans.py
Normal file
458
scripts/export_plans.py
Normal file
|
|
@ -0,0 +1,458 @@
|
|||
"""Export circuitforge-plans/ documents as instruction-tuning JSONL pairs.
|
||||
|
||||
Each record is a HuggingFace chat-format example:
|
||||
|
||||
{
|
||||
"id": "<sha256>",
|
||||
"messages": [
|
||||
{"role": "user", "content": "<reconstructed planning prompt>"},
|
||||
{"role": "assistant", "content": "<cleaned document content>"}
|
||||
],
|
||||
"meta": {
|
||||
"source": "peregrine/2026-03-03-feedback-button-design.md",
|
||||
"product": "peregrine",
|
||||
"doc_type": "design", # design | plan | spec | implementation | other
|
||||
"date": "2026-03-03",
|
||||
"paired_with": "...", # sibling path, or null
|
||||
"word_count": 1847,
|
||||
"pair_role": "context" # "context" | "target" | "standalone"
|
||||
}
|
||||
}
|
||||
|
||||
Pairing strategy
|
||||
----------------
|
||||
When a design doc and a plan doc share the same date + feature-name prefix,
|
||||
they are treated as a pair:
|
||||
- design → plan: instruction = "Given this design doc, write the implementation plan."
|
||||
context appended = full design doc content.
|
||||
- Solo docs get a synthetic instruction from the title + first overview section.
|
||||
|
||||
Usage
|
||||
-----
|
||||
# Preview stats and 5 sample records
|
||||
python scripts/export_plans.py --preview
|
||||
|
||||
# Write full output
|
||||
python scripts/export_plans.py --output data/plan_pairs.jsonl
|
||||
|
||||
# Restrict to specific products
|
||||
python scripts/export_plans.py --products peregrine,kiwi --output data/plan_pairs.jsonl
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
# ── Paths ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
_SCRIPT_DIR = Path(__file__).parent
|
||||
_AVOCET_ROOT = _SCRIPT_DIR.parent
|
||||
_DEFAULT_PLANS_DIR = Path("/Library/Development/CircuitForge/circuitforge-plans")
|
||||
_DEFAULT_OUTPUT = _AVOCET_ROOT / "data" / "plan_pairs.jsonl"
|
||||
|
||||
# ── Doc type detection ─────────────────────────────────────────────────────────
|
||||
|
||||
_TYPE_RE = re.compile(
|
||||
r"-(design|plan|spec|implementation|specs|plans)s?$",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
_SKIP_DIRS = {"__pycache__", ".git", "node_modules"}
|
||||
|
||||
# Boilerplate lines to strip from document content before using as output.
|
||||
_BOILERPLATE_RE = re.compile(
|
||||
r"""
|
||||
^\s*>\s*\*\*For\s+agentic\s+workers.* # superpowers agent hints
|
||||
|^\s*>\s*REQUIRED\s+SUB-SKILL.*
|
||||
|^\s*\*\*Date:\*\*.* # metadata header lines
|
||||
|\*\*Status:\*\*\s*Complete.* # completed-feature noise
|
||||
|\*\*Status:\*\*\s*Done.*
|
||||
|\*\*Product:\*\*.*
|
||||
|\*\*Repo:\*\*.*
|
||||
|\*\*Tech\s+Stack:\*\*.*
|
||||
|\*\*Candidate:\*\*.* # old synthetic personas
|
||||
|^Candidate:.*
|
||||
|^Team:.*
|
||||
""",
|
||||
re.VERBOSE | re.MULTILINE,
|
||||
)
|
||||
|
||||
# Old repo/path names to normalise to current equivalents.
|
||||
_PATH_NORMALIZATIONS: list[tuple[re.Pattern, str]] = [
|
||||
(re.compile(r"/devl/job-seeker", re.IGNORECASE), "/Library/Development/CircuitForge/peregrine"),
|
||||
(re.compile(r"\bjob-seeker\b", re.IGNORECASE), "peregrine"),
|
||||
(re.compile(r"Alex Rivera", re.IGNORECASE), "[user]"),
|
||||
]
|
||||
|
||||
# Instruction paraphrase templates per doc type.
|
||||
# Each entry is (user_prefix, paired_prefix).
|
||||
# {title}, {product}, {type_phrase}, {overview}, {design_context} are substituted.
|
||||
_DESIGN_INSTRUCTIONS = [
|
||||
"Write a design document for {product}: {title}.\n\nContext: {overview}",
|
||||
"You are a software architect working on {product}. Draft a design spec for: {title}.\n\n{overview}",
|
||||
"Produce a CircuitForge-style design document for the following {product} feature — {title}.\n\nBackground: {overview}",
|
||||
]
|
||||
|
||||
_PLAN_INSTRUCTIONS = [
|
||||
"Write an implementation plan for {product}: {title}.\n\nContext: {overview}",
|
||||
"Break the following {product} feature into a detailed implementation plan with file structure and task checkboxes — {title}.\n\n{overview}",
|
||||
"You are a senior engineer on {product}. Produce a step-by-step engineering plan for: {title}.\n\n{overview}",
|
||||
]
|
||||
|
||||
_PAIRED_INSTRUCTIONS = [
|
||||
(
|
||||
"You are a software architect working on {product}, a CircuitForge product. "
|
||||
"Given the following design document, write a detailed implementation plan "
|
||||
"(file structure, task breakdown with checkboxes, migration steps if needed).\n\n"
|
||||
"---\n{design_context}\n---"
|
||||
),
|
||||
(
|
||||
"The following is a design spec for a {product} feature. "
|
||||
"Produce a concrete implementation plan: file list, task checklist, any DB migrations needed.\n\n"
|
||||
"---\n{design_context}\n---"
|
||||
),
|
||||
(
|
||||
"Convert this {product} design document into an actionable implementation plan. "
|
||||
"Include all files to create/modify, step-by-step tasks with checkboxes, and migration steps.\n\n"
|
||||
"---\n{design_context}\n---"
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
def _doc_type(stem: str) -> str:
    """Classify a filename stem by document type.

    Searches ``stem`` with ``_TYPE_RE`` and normalises its first capture
    group: lower-cased, trailing "s" characters stripped, and
    "implementation" folded into "plan".  Returns "other" when the pattern
    does not match.
    """
    match = _TYPE_RE.search(stem)
    if match is None:
        return "other"
    kind = match.group(1).lower().rstrip("s")
    if kind == "implementation":
        return "plan"
    return kind
|
||||
|
||||
|
||||
def _date_feature(stem: str) -> tuple[str, str]:
    """Split a stem like '2026-03-03-feedback-button-design' into its parts.

    Returns ``(date, feature_slug)``; a trailing design/plan/spec/implementation
    suffix (optionally plural, case-insensitive) is left out of the slug.
    Stems that do not start with a ``YYYY-MM-DD-`` prefix yield ``("", stem)``.
    """
    parsed = re.match(
        r"^(\d{4}-\d{2}-\d{2})-(.+?)(?:-(design|plan|spec|implementation)s?)?$",
        stem,
        re.I,
    )
    if parsed is None:
        return "", stem
    date, feature = parsed.group(1, 2)
    return date, feature
|
||||
|
||||
|
||||
# ── Content extraction ─────────────────────────────────────────────────────────
|
||||
|
||||
def _extract_title(content: str) -> str:
    """Return the text of the first ``# `` heading line, or "" when there is none."""
    heading = re.search(r"^#\s+(.+)", content, re.MULTILINE)
    if heading is None:
        return ""
    return heading.group(1).strip()
|
||||
|
||||
|
||||
def _extract_overview(content: str) -> str:
    """Return a short overview of the document (at most 300 characters).

    An explicit ``**Goal:**`` line wins when present.  Otherwise the body of
    the first ``##`` section is used after removing fenced code blocks,
    inline-code backticks and bold markers and collapsing whitespace.
    Returns "" when neither is found.
    """
    goal = re.search(r"\*\*Goal:\*\*\s*(.+)", content)
    if goal is not None:
        return goal.group(1).strip()[:300]

    section = re.search(
        r"^##\s+\d*\.?\s*.+\n([\s\S]+?)(?=^##|\Z)",
        content,
        re.MULTILINE,
    )
    if section is None:
        return ""

    text = section.group(1).strip()
    # Applied in order: fenced code, inline code, bold markers, whitespace runs.
    for pattern, repl in (
        (r"```[\s\S]*?```", ""),
        (r"`[^`]+`", lambda hit: hit.group().strip("`")),
        (r"\*\*([^*]+)\*\*", r"\1"),
        (r"\s+", " "),
    ):
        text = re.sub(pattern, repl, text)
    return text.strip()[:300]
|
||||
|
||||
|
||||
def _clean_content(content: str) -> str:
    """Strip boilerplate, rewrite legacy paths/names, and squeeze blank lines."""
    text = _BOILERPLATE_RE.sub("", content)
    for regex, substitute in _PATH_NORMALIZATIONS:
        text = regex.sub(substitute, text)
    # Runs of three or more newlines become exactly two.
    squeezed = re.sub(r"\n{3,}", "\n\n", text)
    return squeezed.strip()
|
||||
|
||||
|
||||
def _quality_flags(content: str) -> list[str]:
    """List the quality-issue labels that apply to ``content``.

    "persona-residue" is reported when the text contains "Alex Rivera" or
    "[user]"; "completed-status" when it contains a ``Status:`` line marked
    Complete, Done or Merged.  Labels are returned in that order.
    """
    persona_hit = any(marker in content for marker in ("Alex Rivera", "[user]"))
    status_hit = (
        re.search(r"\bStatus:\s*(Complete|Done|Merged)\b", content) is not None
    )
    checks = (
        ("persona-residue", persona_hit),
        ("completed-status", status_hit),
    )
    return [label for label, hit in checks if hit]
|
||||
|
||||
|
||||
def _make_instruction(
    title: str,
    product: str,
    doc_type: str,
    overview: str,
    design_context: str | None = None,
    variant: int = 0,
) -> str:
    """Build the user-side prompt for one training example.

    ``variant`` is reduced modulo 3 and picks one of the three paraphrase
    templates, so a caller can cycle 0-2 to get several examples per document.
    When ``design_context`` is non-empty a paired (design -> plan) template is
    used and the context is truncated to 2500 characters; otherwise a plan or
    design template is chosen from ``doc_type``.
    """
    slot = variant % 3
    # Hyphenated product slugs become title-cased labels; empty falls back to the brand.
    label = product.replace("-", " ").title() if product else "CircuitForge"

    if design_context:
        return _PAIRED_INSTRUCTIONS[slot].format(
            product=label,
            design_context=design_context[:2500],
        )

    pool = _PLAN_INSTRUCTIONS if doc_type == "plan" else _DESIGN_INSTRUCTIONS
    return pool[slot].format(
        product=label,
        title=title,
        overview=overview or "",
        type_phrase="planning document",
    )
|
||||
|
||||
|
||||
def _record_id(content: str, source: str) -> str:
    """Return a 16-hex-character id: the truncated SHA-256 of ``"{source}:{content}"``."""
    payload = f"{source}:{content}".encode()
    digest = hashlib.sha256(payload).hexdigest()
    return digest[:16]
|
||||
|
||||
|
||||
# ── Pair discovery ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _find_pairs(plans_dir: Path) -> dict[str, list[tuple[str, Path]]]:
    """Group dated Markdown docs under ``plans_dir`` by directory + date + feature.

    Returns ``{prefix_key: [(doc_type, path), ...]}`` where ``prefix_key`` is
    ``str(path.parent / f"{date}-{feature}")``.  Files named ``README.md``,
    files without a leading date, and files below a directory listed in
    ``_SKIP_DIRS`` are ignored.
    """
    by_prefix: dict[str, list[tuple[str, Path]]] = {}
    # Sort so grouping (and which doc ends up first in a group) does not
    # depend on filesystem enumeration order.
    for path in sorted(plans_dir.rglob("*.md")):
        # Only components below plans_dir count: a skip-dir name in the
        # ancestors of plans_dir itself must not exclude every document.
        rel_parts = path.relative_to(plans_dir).parts
        if any(part in _SKIP_DIRS for part in rel_parts):
            continue
        if path.name == "README.md":
            continue
        stem = path.stem
        date, feature = _date_feature(stem)
        if not date:
            continue
        key = str(path.parent / f"{date}-{feature}")
        by_prefix.setdefault(key, []).append((_doc_type(stem), path))
    return by_prefix
|
||||
|
||||
|
||||
# ── Record generation ──────────────────────────────────────────────────────────
|
||||
|
||||
def _records_for_group(
    doc_type_paths: list[tuple[str, Path]],
    plans_dir: Path,
) -> Iterator[dict]:
    """Yield training records for one group of related docs.

    When the group has both a design/spec doc and a plan doc, the first of
    each is paired: the plan becomes the assistant target and the cleaned
    design is embedded in the prompt.  Every other doc in the group (the
    paired plan excluded) is emitted as a standalone record.  Each emitted
    doc yields three records, one per instruction variant.  Docs whose
    cleaned text has fewer than 80 words are skipped.
    """
    designs = [p for t, p in doc_type_paths if t in ("design", "spec")]
    plans_ = [p for t, p in doc_type_paths if t == "plan"]
    standalone = doc_type_paths

    if designs and plans_:
        design_path, plan_path = designs[0], plans_[0]
        design_content = design_path.read_text(encoding="utf-8")
        plan_content = plan_path.read_text(encoding="utf-8")
        cleaned = _clean_content(plan_content)

        if len(cleaned.split()) >= 80:
            yield from _variant_records(
                cleaned=cleaned,
                rel_src=str(plan_path.relative_to(plans_dir)),
                product=_product_from_path(plan_path, plans_dir),
                doc_type="plan",
                date=_date_feature(plan_path.stem)[0],
                title=_extract_title(plan_content) or plan_path.stem,
                overview=_extract_overview(design_content),
                design_context=_clean_content(design_content),
                paired_with=str(design_path.relative_to(plans_dir)),
                pair_role="target",
                flags=_quality_flags(cleaned),
            )

        # The paired plan is not repeated as a standalone record; the design
        # doc stays in the list and is emitted standalone below.
        standalone = [(t, p) for t, p in doc_type_paths if p != plan_path]

    for doc_type, path in standalone:
        content = path.read_text(encoding="utf-8")
        cleaned = _clean_content(content)
        if len(cleaned.split()) < 80:
            continue
        yield from _variant_records(
            cleaned=cleaned,
            rel_src=str(path.relative_to(plans_dir)),
            product=_product_from_path(path, plans_dir),
            doc_type=doc_type,
            date=_date_feature(path.stem)[0],
            title=_extract_title(content) or path.stem,
            overview=_extract_overview(content),
            design_context=None,
            paired_with=None,
            pair_role="standalone",
            flags=_quality_flags(cleaned),
        )


def _variant_records(
    *,
    cleaned: str,
    rel_src: str,
    product: str,
    doc_type: str,
    date: str,
    title: str,
    overview: str,
    design_context: str | None,
    paired_with: str | None,
    pair_role: str,
    flags: list[str],
) -> Iterator[dict]:
    """Yield the three instruction-variant records for one cleaned document."""
    word_count = len(cleaned.split())
    for variant in range(3):
        instruction = _make_instruction(
            title=title,
            product=product,
            doc_type=doc_type,
            overview=overview,
            design_context=design_context,
            variant=variant,
        )
        yield {
            # The variant number is mixed into the hash so the three records
            # for one document get distinct ids.
            "id": _record_id(f"v{variant}:{cleaned}", rel_src),
            "messages": [
                {"role": "user", "content": instruction},
                {"role": "assistant", "content": cleaned},
            ],
            "meta": {
                "source": rel_src,
                "product": product,
                "doc_type": doc_type,
                "date": date,
                "paired_with": paired_with,
                "word_count": word_count,
                "pair_role": pair_role,
                "variant": variant,
                "quality_flags": flags,
            },
        }
|
||||
|
||||
|
||||
def _product_from_path(path: Path, plans_dir: Path) -> str:
    """Return the first directory below ``plans_dir`` on the way to ``path``.

    Files sitting directly in ``plans_dir`` are attributed to "shared".
    """
    parts = path.relative_to(plans_dir).parts
    if len(parts) <= 1:
        return "shared"
    return parts[0]
|
||||
|
||||
|
||||
# ── Main export ────────────────────────────────────────────────────────────────
|
||||
|
||||
def export(
    plans_dir: Path,
    products: list[str] | None = None,
) -> list[dict]:
    """Collect training records for every doc group found under ``plans_dir``.

    When ``products`` is given, only groups containing at least one doc of a
    listed product are processed.  Records whose id was already seen are
    dropped, so the result holds each id once, in first-seen order.
    """
    records: list[dict] = []
    seen_ids: set[str] = set()

    for doc_type_paths in _find_pairs(plans_dir).values():
        if products:
            group_products = {
                _product_from_path(p, plans_dir) for _, p in doc_type_paths
            }
            if group_products.isdisjoint(products):
                continue

        for record in _records_for_group(doc_type_paths, plans_dir):
            rid = record["id"]
            if rid in seen_ids:
                continue
            seen_ids.add(rid)
            records.append(record)

    return records
|
||||
|
||||
|
||||
# ── CLI ────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _print_stats(records: list[dict]) -> None:
    """Print a summary of ``records`` to stdout.

    Shows the total count, min/median/max word count, and record counts by
    product, doc type and pair role (most common first).  With no records
    only the total is printed, because the word-count line needs at least
    one entry.
    """
    from collections import Counter

    rule = "=" * 55
    print(f"\n{rule}")
    print(f" Total records: {len(records)}")
    if not records:
        # Nothing to index into: the word-count line below would raise.
        print(f"{rule}\n")
        return

    wc = sorted(r["meta"]["word_count"] for r in records)
    print(f" Word counts : min={wc[0]}, median={wc[len(wc)//2]}, max={wc[-1]}")

    sections = (
        ("By product", "product"),
        ("By doc type", "doc_type"),
        ("Pair roles", "pair_role"),
    )
    for heading, field in sections:
        print(f"\n {heading}:")
        counts = Counter(r["meta"][field] for r in records)
        for value, count in counts.most_common():
            print(f" {value:<22} {count}")
    print(f"{rule}\n")
|
||||
|
||||
|
||||
def _print_sample(records: list[dict], n: int = 3) -> None:
    """Print up to ``n`` randomly chosen records to stdout for eyeballing.

    Each sample shows its product / doc type / pair role, its source, the
    first 500 characters of the user message and the first 400 of the
    assistant message.  The ``i/total`` counter uses the number of samples
    actually drawn, and a negative ``n`` is treated as zero.
    """
    import random

    # Clamp to [0, len(records)] so random.sample never gets an invalid size.
    count = max(0, min(n, len(records)))
    chosen = random.sample(records, count)
    rule = "─" * 55
    for i, rec in enumerate(chosen, 1):
        meta = rec["meta"]
        user_msg = rec["messages"][0]["content"]
        asst_msg = rec["messages"][1]["content"]
        print(f"\n{rule}")
        print(f"SAMPLE {i}/{count} [{meta['product']} / {meta['doc_type']} / {meta['pair_role']}]")
        print(f"source: {meta['source']}")
        print(f"\nUSER ({len(user_msg)} chars):\n{user_msg[:500]}{'...' if len(user_msg)>500 else ''}")
        print(f"\nASSISTANT ({meta['word_count']} words):\n{asst_msg[:400]}{'...' if len(asst_msg)>400 else ''}")
    print(f"\n{rule}\n")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: scan the plans directory, print stats, optionally write JSONL."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--plans-dir", type=Path, default=_DEFAULT_PLANS_DIR)
    parser.add_argument(
        "--output",
        type=Path,
        default=None,
        help="Write JSONL to this path (omit for preview-only)",
    )
    parser.add_argument(
        "--products",
        default=None,
        help="Comma-separated product filter, e.g. peregrine,kiwi",
    )
    parser.add_argument(
        "--preview",
        action="store_true",
        help="Print stats + sample records, don't write output",
    )
    parser.add_argument(
        "--samples",
        type=int,
        default=3,
        help="Number of sample records to show in preview (default 3)",
    )
    args = parser.parse_args()

    products = None
    if args.products:
        products = [name.strip() for name in args.products.split(",")]

    print(f"Scanning {args.plans_dir} …", file=sys.stderr)
    records = export(args.plans_dir, products=products)
    _print_stats(records)

    # Preview mode (explicit, or implied by a missing --output) shows samples
    # and stops before anything is written.
    preview_only = args.preview or args.output is None
    if preview_only:
        _print_sample(records, n=args.samples)
        if args.output is None:
            print("(Pass --output <path> to write JSONL)")
        return

    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(rec, ensure_ascii=False) + "\n" for rec in records
        )

    print(f"Wrote {len(records)} records to {args.output}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -14,7 +14,9 @@ from fastapi.testclient import TestClient
|
|||
|
||||
@pytest.fixture(autouse=True)
|
||||
def reset_cforch_globals(tmp_path):
|
||||
"""Redirect _CONFIG_DIR to tmp_path and reset running-state globals."""
|
||||
"""Redirect _CONFIG_DIR to tmp_path, reset running-state globals, and stub
|
||||
list_installed to return [] so real disk model directories don't bleed into
|
||||
tests that don't exercise the installed-model merge path."""
|
||||
from app import cforch as cforch_module
|
||||
|
||||
prev_config_dir = cforch_module._CONFIG_DIR
|
||||
|
|
@ -25,6 +27,7 @@ def reset_cforch_globals(tmp_path):
|
|||
cforch_module._BENCH_RUNNING = False
|
||||
cforch_module._bench_proc = None
|
||||
|
||||
with patch("app.models.list_installed", return_value=[]):
|
||||
yield tmp_path
|
||||
|
||||
cforch_module.set_config_dir(prev_config_dir)
|
||||
|
|
@ -141,6 +144,35 @@ def test_models_parses_bench_models_yaml(client, config_dir, tmp_path):
|
|||
assert m["vram_estimate_mb"] == 6000
|
||||
|
||||
|
||||
def test_models_merges_installed_generators(client, config_dir, tmp_path):
    """GET /api/cforch/models merges installed generator models into the
    bench_models.yaml list without duplicating ids already present there."""
    yaml_path = tmp_path / "bench_models.yaml"
    yaml_entries = [
        {"name": "llama3", "id": "llama3:8b", "service": "ollama", "tags": [], "vram_estimate_mb": 6000},
        {"name": "already-there", "id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "tags": [], "vram_estimate_mb": 8000},
    ]
    _write_models_yaml(yaml_path, yaml_entries)
    _write_config(config_dir, {"bench_models": str(yaml_path)})

    installed = [
        # Generator that is absent from the YAML: expected in the response.
        {"model_id": "meta-llama/Llama-3.1-8B", "service": "cf-text", "role": "generator", "vram_mb": 16000},
        # Same id as a YAML entry: expected exactly once in the response.
        {"model_id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "role": "generator", "vram_mb": 8000},
        # Role is "reranker", not "generator": expected to be left out.
        {"model_id": "cross-encoder/ms-marco-MiniLM-L6", "service": "avocet", "role": "reranker", "vram_mb": 500},
    ]
    with patch("app.models.list_installed", return_value=installed):
        response = client.get("/api/cforch/models")

    assert response.status_code == 200
    listed = [entry["id"] for entry in response.json()["models"]]
    assert "llama3:8b" in listed  # from YAML
    assert "ibm-granite/granite-4.1-8b" in listed  # from YAML
    assert "meta-llama/Llama-3.1-8B" in listed  # merged from installed
    assert "cross-encoder/ms-marco-MiniLM-L6" not in listed  # non-generator
    assert listed.count("ibm-granite/granite-4.1-8b") == 1  # no duplicate
|
||||
|
||||
|
||||
# ── GET /run ───────────────────────────────────────────────────────────────────
|
||||
|
||||
def test_run_returns_409_when_already_running(client):
|
||||
|
|
|
|||
|
|
@ -541,3 +541,84 @@ def test_delete_installed_name_with_slash_blocked(client):
|
|||
except _HTTPException as exc:
|
||||
assert exc.status_code in (400, 404)
|
||||
raise
|
||||
|
||||
|
||||
# ── Catalog registration ───────────────────────────────────────────────────────
|
||||
|
||||
_MINIMAL_YAML = """\
|
||||
services:
|
||||
cf-text:
|
||||
max_mb: {max_mb}
|
||||
catalog:
|
||||
existing-model:
|
||||
path: /some/path
|
||||
vram_mb: 1000
|
||||
description: "placeholder"
|
||||
"""
|
||||
|
||||
|
||||
def _make_node_yaml(tmp_path: Path, max_mb: int = 8192) -> Path:
    """Render ``_MINIMAL_YAML`` with ``max_mb`` into ``tmp_path/testnode.yaml`` and return that path."""
    target = tmp_path / "testnode.yaml"
    rendered = _MINIMAL_YAML.format(max_mb=max_mb)
    target.write_text(rendered, encoding="utf-8")
    return target
|
||||
|
||||
|
||||
def test_catalog_registration_fp16_no_env_block(tmp_path):
    """A model that fits at FP16 is registered without any env block."""
    from app import models as models_module

    profile = _make_node_yaml(tmp_path, max_mb=8192)
    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
        registered = models_module._register_in_node_catalogs(
            repo_id="org/SmallModel",
            local_path=tmp_path / "org--SmallModel",
            vram_mb_fp16=4000,
            role="generator",
        )

    assert "testnode" in registered
    text = profile.read_text()
    # Expected catalog key for "org/SmallModel" is "smallmodel".
    assert "smallmodel:" in text
    for absent in ("CF_TEXT_4BIT", "env:"):
        assert absent not in text
|
||||
|
||||
|
||||
def test_catalog_registration_needs_4bit_writes_env_block(tmp_path):
    """A model that only fits at 4-bit gets ``env: CF_TEXT_4BIT: "1"`` written."""
    from app import models as models_module

    profile = _make_node_yaml(tmp_path, max_mb=8192)
    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
        registered = models_module._register_in_node_catalogs(
            repo_id="org/BigModel",
            local_path=tmp_path / "org--BigModel",
            vram_mb_fp16=20000,  # larger than the 8192 MB node budget at FP16
            role="generator",
        )

    assert "testnode" in registered
    text = profile.read_text()
    # Expected catalog key for "org/BigModel" is "bigmodel".
    expected_fragments = (
        "bigmodel:",
        "env:",
        'CF_TEXT_4BIT: "1"',
        "CF_TEXT_4BIT=1 required",  # note added to the description
    )
    for fragment in expected_fragments:
        assert fragment in text
|
||||
|
||||
|
||||
def test_catalog_registration_too_large_skipped(tmp_path):
    """A model too large even at 4-bit is not registered on the node."""
    from app import models as models_module

    profile = _make_node_yaml(tmp_path, max_mb=8192)
    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
        registered = models_module._register_in_node_catalogs(
            repo_id="org/HugeModel",
            local_path=tmp_path / "org--HugeModel",
            vram_mb_fp16=80000,  # expected not to fit the 8192 MB node even quantised
            role="generator",
        )

    assert registered == []
    assert "hugemodel" not in profile.read_text()
|
||||
|
|
|
|||
|
|
@ -21,11 +21,17 @@
|
|||
:class="{ active: benchMode === 'style' }"
|
||||
@click="benchMode = 'style'"
|
||||
>✍️ Writing Style</button>
|
||||
<button
|
||||
class="mode-btn"
|
||||
:class="{ active: benchMode === 'plans' }"
|
||||
@click="benchMode = 'plans'"
|
||||
>📐 Planning</button>
|
||||
</div>
|
||||
|
||||
<ClassifierTab v-if="benchMode === 'classifier'" />
|
||||
<LlmEvalTab v-if="benchMode === 'llm'" />
|
||||
<StyleTab v-if="benchMode === 'style'" />
|
||||
<PlansBenchTab v-if="benchMode === 'plans'" />
|
||||
</div>
|
||||
</template>
|
||||
|
||||
|
|
@ -34,8 +40,9 @@ import { ref } from 'vue'
|
|||
import ClassifierTab from './ClassifierTab.vue'
|
||||
import LlmEvalTab from './LlmEvalTab.vue'
|
||||
import StyleTab from './StyleTab.vue'
|
||||
import PlansBenchTab from './PlansBenchTab.vue'
|
||||
|
||||
type BenchMode = 'classifier' | 'llm' | 'style'
|
||||
type BenchMode = 'classifier' | 'llm' | 'style' | 'plans'
|
||||
const benchMode = ref<BenchMode>('classifier')
|
||||
</script>
|
||||
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@
|
|||
<summary class="picker-summary">
|
||||
<span class="picker-title">📋 Task Selection</span>
|
||||
<span class="picker-badge">{{ llmTaskBadge }}</span>
|
||||
<button class="picker-bulk-btn" @click.stop.prevent="selectAllTasks()">All</button>
|
||||
<button class="picker-bulk-btn" @click.stop.prevent="clearAllTasks()">None</button>
|
||||
</summary>
|
||||
<div class="picker-body">
|
||||
<div v-if="llmTasksLoading" class="picker-loading">Loading tasks…</div>
|
||||
|
|
@ -44,6 +46,8 @@
|
|||
<summary class="picker-summary">
|
||||
<span class="picker-title">🎯 Model Selection</span>
|
||||
<span class="picker-badge">{{ llmModelBadge }}</span>
|
||||
<button class="picker-bulk-btn" @click.stop.prevent="selectAllModels()">All</button>
|
||||
<button class="picker-bulk-btn" @click.stop.prevent="clearAllModels()">None</button>
|
||||
</summary>
|
||||
<div class="picker-body">
|
||||
<div v-if="llmModelsLoading" class="picker-loading">Loading models…</div>
|
||||
|
|
@ -78,6 +82,33 @@
|
|||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Node Selection -->
|
||||
<div class="node-picker" v-if="llmNodes.length > 0">
|
||||
<span class="node-picker-label">Nodes:</span>
|
||||
<label
|
||||
v-for="node in llmNodes"
|
||||
:key="node.node_id"
|
||||
class="node-chip"
|
||||
:class="{ 'node-chip--off': !enabledNodes.has(node.node_id), 'node-chip--offline': !node.online }"
|
||||
:title="node.online ? `${node.node_id} — ${node.gpus.length} GPU(s)` : `${node.node_id} — offline`"
|
||||
>
|
||||
<input
|
||||
type="checkbox"
|
||||
class="node-chip-check"
|
||||
:checked="enabledNodes.has(node.node_id)"
|
||||
:disabled="!node.online || llmRunning"
|
||||
@change="toggleNode(node.node_id, ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
{{ node.node_id }}
|
||||
<span class="node-chip-status" v-if="!node.online">offline</span>
|
||||
</label>
|
||||
<span class="node-picker-hint">
|
||||
{{ enabledNodeIds.length === llmNodes.filter(n => n.online).length
|
||||
? 'auto-routing (all nodes)'
|
||||
: `restricted to: ${enabledNodeIds.join(', ')}` }}
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<!-- Run Controls -->
|
||||
<div class="run-controls">
|
||||
<button
|
||||
|
|
@ -88,6 +119,24 @@
|
|||
{{ llmRunning ? '⏳ Running…' : '▶ Run LLM Eval' }}
|
||||
</button>
|
||||
<button v-if="llmRunning" class="btn-cancel" @click="cancelLlmBenchmark">✕ Cancel</button>
|
||||
<input
|
||||
v-model="llmJudgeUrl"
|
||||
class="judge-url-input"
|
||||
placeholder="Judge URL — leave empty to skip LLM judge scoring"
|
||||
:disabled="llmRunning"
|
||||
title="Optional: URL of a running cf-text service (e.g. http://10.1.10.158:8008). When set, each LLM response gets a secondary score from the judge model — adds a 'judge' column to results. Empty = primary quality scoring only."
|
||||
/>
|
||||
<label class="workers-label" title="Run this many models concurrently (requires multiple GPUs)">
|
||||
<span class="workers-prefix">workers</span>
|
||||
<input
|
||||
v-model.number="llmWorkers"
|
||||
type="number"
|
||||
min="1"
|
||||
max="8"
|
||||
class="workers-input"
|
||||
:disabled="llmRunning"
|
||||
/>
|
||||
</label>
|
||||
<span v-if="selectedLlmTasks.size === 0 || selectedLlmModels.size === 0" class="run-hint">
|
||||
Select at least one task and one model to run.
|
||||
</span>
|
||||
|
|
@ -119,6 +168,7 @@
|
|||
<tr>
|
||||
<th class="hm-label-col">Model</th>
|
||||
<th class="hm-model-col">overall</th>
|
||||
<th v-if="llmHasJudge" class="hm-model-col hm-judge-col">judge</th>
|
||||
<th v-for="col in llmTaskTypeCols" :key="col" class="hm-model-col">{{ col }}</th>
|
||||
<th class="hm-model-col">tok/s</th>
|
||||
</tr>
|
||||
|
|
@ -130,6 +180,12 @@
|
|||
class="hm-value-cell"
|
||||
:class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }"
|
||||
>{{ pct(row.avg_quality_score) }}</td>
|
||||
<td
|
||||
v-if="llmHasJudge"
|
||||
class="hm-value-cell hm-judge-cell"
|
||||
:class="{ 'bt-best': llmBestByCol['judge'] === row.model_id }"
|
||||
title="LLM-as-judge secondary score"
|
||||
>{{ row.avg_judge_score != null ? pct(row.avg_judge_score) : '—' }}</td>
|
||||
<td
|
||||
v-for="col in llmTaskTypeCols"
|
||||
:key="col"
|
||||
|
|
@ -168,6 +224,12 @@ interface CfOrchModel {
|
|||
vram_estimate_mb?: number
|
||||
}
|
||||
|
||||
interface CfOrchNode {
|
||||
node_id: string
|
||||
online: boolean
|
||||
gpus: { gpu_id: number; name: string; vram_total_mb: number; vram_free_mb: number }[]
|
||||
}
|
||||
|
||||
interface LlmModelResult {
|
||||
model_name: string
|
||||
model_id: string
|
||||
|
|
@ -175,9 +237,11 @@ interface LlmModelResult {
|
|||
avg_tokens_per_sec: number
|
||||
avg_completion_ms: number
|
||||
avg_quality_score: number
|
||||
avg_judge_score: number | null
|
||||
finetune_candidates: number
|
||||
error_count: number
|
||||
quality_by_task_type: Record<string, number>
|
||||
judge_score_by_task_type?: Record<string, number>
|
||||
}
|
||||
|
||||
// ── State ───────────────────────────────────────────────────────────────────
|
||||
|
|
@ -195,6 +259,10 @@ const llmError = ref('')
|
|||
const llmResults = ref<LlmModelResult[]>([])
|
||||
const llmEventSource = ref<EventSource | null>(null)
|
||||
const llmLogEl = ref<HTMLElement | null>(null)
|
||||
const llmJudgeUrl = ref('')
|
||||
const llmWorkers = ref(1)
|
||||
const llmNodes = ref<CfOrchNode[]>([])
|
||||
const enabledNodes = ref<Set<string>>(new Set())
|
||||
|
||||
// ── Computed ────────────────────────────────────────────────────────────────
|
||||
const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
|
||||
|
|
@ -239,6 +307,14 @@ const llmTaskTypeCols = computed(() => {
|
|||
return [...types].sort()
|
||||
})
|
||||
|
||||
const llmHasJudge = computed(() =>
|
||||
llmResults.value.some(r => r.avg_judge_score != null)
|
||||
)
|
||||
|
||||
const enabledNodeIds = computed(() =>
|
||||
llmNodes.value.filter(n => n.online && enabledNodes.value.has(n.node_id)).map(n => n.node_id)
|
||||
)
|
||||
|
||||
const llmBestByCol = computed((): Record<string, string> => {
|
||||
const best: Record<string, string> = {}
|
||||
if (llmResults.value.length === 0) return best
|
||||
|
|
@ -249,6 +325,16 @@ const llmBestByCol = computed((): Record<string, string> => {
|
|||
}
|
||||
best['overall'] = bestId
|
||||
|
||||
if (llmHasJudge.value) {
|
||||
bestId = ''; bestVal = -Infinity
|
||||
for (const r of llmResults.value) {
|
||||
if (r.avg_judge_score != null && r.avg_judge_score > bestVal) {
|
||||
bestVal = r.avg_judge_score; bestId = r.model_id
|
||||
}
|
||||
}
|
||||
best['judge'] = bestId
|
||||
}
|
||||
|
||||
for (const col of llmTaskTypeCols.value) {
|
||||
bestId = ''; bestVal = -Infinity
|
||||
for (const r of llmResults.value) {
|
||||
|
|
@ -306,6 +392,15 @@ function toggleService(models: CfOrchModel[], checked: boolean) {
|
|||
}
|
||||
selectedLlmModels.value = next
|
||||
}
|
||||
// Bulk-selection helpers for the task and model pickers.
function selectAllTasks() {
  const ids = llmTasks.value.map(t => t.id)
  selectedLlmTasks.value = new Set(ids)
}
function clearAllTasks() {
  selectedLlmTasks.value = new Set()
}
function selectAllModels() {
  const ids = llmModels.value.map(m => m.id)
  selectedLlmModels.value = new Set(ids)
}
function clearAllModels() {
  selectedLlmModels.value = new Set()
}
|
||||
// Enable or disable one node; a fresh Set is assigned to the ref instead of mutating the current one.
function toggleNode(id: string, checked: boolean) {
  const updated = new Set(enabledNodes.value)
  if (checked) {
    updated.add(id)
  } else {
    updated.delete(id)
  }
  enabledNodes.value = updated
}
|
||||
|
||||
// ── Data loaders ─────────────────────────────────────────────────────────────
|
||||
async function loadLlmTasks() {
|
||||
|
|
@ -335,6 +430,21 @@ async function loadLlmResults() {
|
|||
}
|
||||
}
|
||||
|
||||
// Pre-fill the judge URL from /api/cforch/config, but only while the field is still empty.
async function loadLlmConfig() {
  const { data } = await useApiFetch<{ judge_url?: string }>('/api/cforch/config')
  const configured = data?.judge_url
  if (!configured || llmJudgeUrl.value) return
  llmJudgeUrl.value = configured
}
|
||||
|
||||
// Fetch the node list and start with every online node enabled.
async function loadLlmNodes() {
  const { data } = await useApiFetch<{ nodes: CfOrchNode[] }>('/api/cforch/nodes')
  const nodes = data?.nodes
  if (!nodes) return
  llmNodes.value = nodes
  const onlineIds = nodes.filter(n => n.online).map(n => n.node_id)
  enabledNodes.value = new Set(onlineIds)
}
|
||||
|
||||
// ── Run / cancel ──────────────────────────────────────────────────────────────
|
||||
function startLlmBenchmark() {
|
||||
llmRunning.value = true
|
||||
|
|
@ -344,6 +454,15 @@ function startLlmBenchmark() {
|
|||
const params = new URLSearchParams()
|
||||
const taskIds = [...selectedLlmTasks.value].join(',')
|
||||
if (taskIds) params.set('task_ids', taskIds)
|
||||
const modelIds = [...selectedLlmModels.value].join(',')
|
||||
if (modelIds) params.set('model_ids', modelIds)
|
||||
if (llmJudgeUrl.value.trim()) params.set('judge_url', llmJudgeUrl.value.trim())
|
||||
if (llmWorkers.value > 1) params.set('workers', String(llmWorkers.value))
|
||||
const onlineNodeIds = llmNodes.value.filter(n => n.online).map(n => n.node_id)
|
||||
const isRestricted = enabledNodeIds.value.length < onlineNodeIds.length
|
||||
if (isRestricted && enabledNodeIds.value.length > 0) {
|
||||
params.set('node_ids', enabledNodeIds.value.join(','))
|
||||
}
|
||||
|
||||
const es = new EventSource(`/api/cforch/run?${params}`)
|
||||
llmEventSource.value = es
|
||||
|
|
@ -387,6 +506,8 @@ onMounted(() => {
|
|||
loadLlmTasks()
|
||||
loadLlmModels()
|
||||
loadLlmResults()
|
||||
loadLlmConfig()
|
||||
loadLlmNodes()
|
||||
})
|
||||
</script>
|
||||
|
||||
|
|
@ -451,6 +572,43 @@ onMounted(() => {
|
|||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
|
||||
.judge-url-input {
|
||||
flex: 1;
|
||||
min-width: 14rem;
|
||||
max-width: 24rem;
|
||||
padding: 0.35rem 0.6rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.375rem;
|
||||
background: var(--color-surface, #fff);
|
||||
color: var(--color-text, #1a2338);
|
||||
font-size: 0.8rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
}
|
||||
.judge-url-input:disabled { opacity: 0.5; }
|
||||
.judge-url-input::placeholder { color: var(--color-text-secondary, #6b7a99); }
|
||||
|
||||
.workers-label {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.3rem;
|
||||
font-size: 0.8rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
white-space: nowrap;
|
||||
}
|
||||
.workers-prefix { font-family: var(--font-mono, monospace); }
|
||||
.workers-input {
|
||||
width: 3.2rem;
|
||||
padding: 0.35rem 0.4rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.375rem;
|
||||
background: var(--color-surface, #fff);
|
||||
color: var(--color-text, #1a2338);
|
||||
font-size: 0.8rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
text-align: center;
|
||||
}
|
||||
.workers-input:disabled { opacity: 0.5; }
|
||||
|
||||
/* ── Run log ────────────────────────────────────────────── */
|
||||
.run-log {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
|
|
@ -592,6 +750,15 @@ onMounted(() => {
|
|||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.hm-judge-col {
|
||||
background: color-mix(in srgb, var(--color-surface-raised, #e4ebf5) 80%, #c6d5f5);
|
||||
}
|
||||
.hm-judge-cell {
|
||||
background: color-mix(in srgb, var(--color-surface, #fff) 85%, #c6d5f5);
|
||||
font-style: italic;
|
||||
opacity: 0.9;
|
||||
}
|
||||
|
||||
/* ── Model Picker ───────────────────────────────────────── */
|
||||
.model-picker {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
|
|
@ -630,6 +797,24 @@ details[open] .picker-summary::before { content: '▼ '; }
|
|||
margin-left: auto;
|
||||
}
|
||||
|
||||
.picker-bulk-btn {
|
||||
padding: 0.1rem 0.45rem;
|
||||
font-size: 0.7rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
background: var(--color-surface, #fff);
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.25rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
cursor: pointer;
|
||||
transition: background 0.12s, color 0.12s;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.picker-bulk-btn:hover {
|
||||
background: var(--app-primary, #2A6080);
|
||||
color: #fff;
|
||||
border-color: var(--app-primary, #2A6080);
|
||||
}
|
||||
|
||||
.picker-body {
|
||||
padding: 0.75rem;
|
||||
border-top: 1px solid var(--color-border, #d0d7e8);
|
||||
|
|
@ -712,4 +897,61 @@ details[open] .picker-summary::before { content: '▼ '; }
|
|||
.picker-model-list { padding-left: 0; }
|
||||
.picker-model-name { max-width: 14ch; }
|
||||
}
|
||||
|
||||
/* ── Node picker ────────────────────────────────────── */
|
||||
/* Wrapping flex row, vertically centred, drawn as a bordered raised panel. */
.node-picker {
  /* Flex row */
  display: flex;
  flex-wrap: wrap;
  align-items: center;
  gap: 0.5rem;

  /* Panel */
  padding: 0.5rem 0.75rem;
  border-radius: 0.5rem;
  border: 1px solid var(--color-border, #d0d7e8);
  background: var(--color-surface-raised, #e4ebf5);
}
|
||||
|
||||
/* Small uppercase caption in the secondary text colour; never wraps. */
.node-picker-label {
  color: var(--color-text-secondary, #6b7a99);
  font-weight: 600;
  font-size: 0.78rem;
  letter-spacing: 0.04em;
  text-transform: uppercase;
  white-space: nowrap;
}
|
||||
|
||||
/* Pill-shaped, clickable chip laid out as an inline flex row. */
.node-chip {
  /* Inline flex row */
  display: inline-flex;
  align-items: center;
  gap: 0.3rem;

  /* Pill frame */
  padding: 0.2rem 0.55rem;
  border-radius: 1rem;
  border: 1px solid var(--color-border, #d0d7e8);
  background: var(--color-surface, #fff);

  /* Text */
  color: var(--color-text, #1a2338);
  font-family: var(--font-mono, monospace);
  font-size: 0.78rem;

  /* Interaction: background and opacity animate on state change */
  cursor: pointer;
  transition: background 0.12s, opacity 0.12s;
}
|
||||
/* "Off" modifier: drop the fill and dim the chip. */
.node-chip--off {
  background: transparent;
  opacity: 0.45;
}
|
||||
/* "Offline" modifier: dimmer than "off", italic, with a not-allowed cursor. */
.node-chip--offline {
  cursor: not-allowed;
  font-style: italic;
  opacity: 0.35;
}
|
||||
/* Tint the chip's form control with the primary colour. */
.node-chip-check {
  accent-color: var(--app-primary, #2A6080);
}
|
||||
/* Status text inside a chip: smaller and in the secondary text colour. */
.node-chip-status {
  color: var(--color-text-secondary, #6b7a99);
  font-size: 0.66rem;
}
|
||||
|
||||
/* Hint text: margin-left: auto pushes it to the far end of the flex row. */
.node-picker-hint {
  margin-left: auto;
  color: var(--color-text-secondary, #6b7a99);
  font-family: var(--font-mono, monospace);
  font-size: 0.72rem;
}
|
||||
</style>
|
||||
|
|
|
|||
|
|
@ -51,8 +51,31 @@
|
|||
<span v-if="lookupResult.adapter_recommendation" class="chip chip-adapter">
|
||||
{{ lookupResult.adapter_recommendation }}
|
||||
</span>
|
||||
<span v-if="lookupResult.size != null" class="preview-size">
|
||||
{{ humanBytes(lookupResult.size) }}
|
||||
<span v-if="selectedQuantSize > 0" class="preview-size">
|
||||
{{ humanBytes(selectedQuantSize) }}
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<!-- GGUF quantization picker — only shown for GGUF repos -->
|
||||
<div v-if="lookupResult.gguf_files?.length" class="quant-picker">
|
||||
<label class="quant-label" for="quant-select">Quantization</label>
|
||||
<select
|
||||
id="quant-select"
|
||||
v-model="selectedQuant"
|
||||
class="quant-select"
|
||||
aria-label="Select quantization variant"
|
||||
>
|
||||
<option :value="null" disabled>Select quantization…</option>
|
||||
<option
|
||||
v-for="f in lookupResult.gguf_files"
|
||||
:key="f.filename"
|
||||
:value="f.quant_name ?? f.filename"
|
||||
>
|
||||
{{ f.quant_name ?? f.filename }} — {{ humanBytes(f.size) }}
|
||||
</option>
|
||||
</select>
|
||||
<span class="quant-hint">
|
||||
Q5_K_M or Q6_K recommended for 8 GB GPUs. Q8_0 for max quality.
|
||||
</span>
|
||||
</div>
|
||||
|
||||
|
|
@ -67,7 +90,7 @@
|
|||
|
||||
<button
|
||||
class="btn-primary btn-add-queue"
|
||||
:disabled="lookupResult.already_installed || lookupResult.already_queued || addingToQueue"
|
||||
:disabled="!canAddToQueue"
|
||||
@click="addToQueue"
|
||||
>
|
||||
{{ addingToQueue ? 'Adding…' : 'Add to queue' }}
|
||||
|
|
@ -99,9 +122,39 @@
|
|||
<span v-if="model.role" class="chip chip-role">{{ model.role }}</span>
|
||||
<span v-if="model.service" class="chip" :class="serviceChipClass(model.service)">{{ model.service }}</span>
|
||||
<span v-if="model.adapter_recommendation" class="chip chip-adapter">{{ model.adapter_recommendation }}</span>
|
||||
<span v-if="model.quant_pattern" class="chip chip-quant">{{ model.quant_pattern }}</span>
|
||||
</div>
|
||||
<!-- Allow manual service/role assignment for unrecognized pipeline tags -->
|
||||
<div v-if="!model.service" class="classify-row queue-classify">
|
||||
<select
|
||||
class="classify-select"
|
||||
:value="classifyDraft[model.id]?.service ?? ''"
|
||||
@change="onServiceChange(model.id, ($event.target as HTMLSelectElement).value)"
|
||||
aria-label="Assign service"
|
||||
>
|
||||
<option value="" disabled>Service…</option>
|
||||
<option v-for="svc in CLASSIFIABLE_SERVICES" :key="svc.value" :value="svc.value">{{ svc.label }}</option>
|
||||
</select>
|
||||
<select
|
||||
class="classify-select"
|
||||
:value="classifyDraft[model.id]?.role ?? ''"
|
||||
:disabled="!classifyDraft[model.id]?.service"
|
||||
@change="(e) => setClassifyRole(model.id, (e.target as HTMLSelectElement).value)"
|
||||
aria-label="Assign role"
|
||||
>
|
||||
<option value="" disabled>Role…</option>
|
||||
<option
|
||||
v-for="role in rolesForService(classifyDraft[model.id]?.service ?? '')"
|
||||
:key="role"
|
||||
:value="role"
|
||||
>{{ role }}</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="model-card-actions">
|
||||
<button class="btn-primary btn-sm" @click="approveModel(model.id)">
|
||||
<button
|
||||
class="btn-primary btn-sm"
|
||||
@click="approveModel(model.id, classifyDraft[model.id])"
|
||||
>
|
||||
Approve download
|
||||
</button>
|
||||
</div>
|
||||
|
|
@ -252,6 +305,12 @@ import { ref, computed, onMounted, onUnmounted } from 'vue'
|
|||
|
||||
// ── Type definitions ──────────────────────────────────
|
||||
|
||||
/** One entry of a lookup result's `gguf_files` list. */
interface GgufFile {
  /** Quantization label; when `null`, the picker and size lookup fall back to `filename`. */
  quant_name: string | null
  /** File name; used as the `v-for` key for the picker options. */
  filename: string
  /** File size; rendered through `humanBytes`, so presumably a byte count. */
  size: number
}
|
||||
|
||||
interface LookupResult {
|
||||
repo_id: string
|
||||
pipeline_tag: string | null
|
||||
|
|
@ -260,7 +319,8 @@ interface LookupResult {
|
|||
service: string | null
|
||||
compatible: boolean
|
||||
warning: string | null
|
||||
size: number | null
|
||||
model_size_bytes: number
|
||||
gguf_files: GgufFile[] | null
|
||||
description: string | null
|
||||
already_installed: boolean
|
||||
already_queued: boolean
|
||||
|
|
@ -274,6 +334,7 @@ interface QueuedModel {
|
|||
adapter_recommendation: string | null
|
||||
role: string | null
|
||||
service: string | null
|
||||
quant_pattern: string | null
|
||||
}
|
||||
|
||||
interface InstalledModel {
|
||||
|
|
@ -302,6 +363,26 @@ const lookupLoading = ref(false)
|
|||
const lookupError = ref<string | null>(null)
|
||||
const lookupResult = ref<LookupResult | null>(null)
|
||||
const addingToQueue = ref(false)
|
||||
const selectedQuant = ref<string | null>(null)
|
||||
|
||||
// Size to show and to send when queueing:
//   - 0 while there is no lookup result;
//   - the chosen GGUF file's size when the repo lists GGUF files and a quant is selected;
//   - otherwise (no GGUF files, nothing selected, or no matching file) the
//     result's model_size_bytes.
const selectedQuantSize = computed<number>(() => {
  const result = lookupResult.value
  if (!result) return 0

  const files = result.gguf_files
  if (!files?.length) return result.model_size_bytes

  const chosen = selectedQuant.value
  if (!chosen) return result.model_size_bytes

  // Options are keyed by quant_name, falling back to filename when it is null.
  const match = files.find(file => (file.quant_name ?? file.filename) === chosen)
  return match?.size ?? result.model_size_bytes
})
|
||||
|
||||
// Whether "Add to queue" may be clicked. It is blocked when there is no lookup
// result, the model is already installed or queued, an add request is in
// flight, or the repo lists GGUF files and no quantization has been chosen yet.
const canAddToQueue = computed(() => {
  const result = lookupResult.value
  if (!result) return false

  const unavailable = result.already_installed || result.already_queued || addingToQueue.value
  if (unavailable) return false

  // Only GGUF repos require an explicit quant selection.
  const needsQuant = Boolean(result.gguf_files?.length)
  return !needsQuant || Boolean(selectedQuant.value)
})
|
||||
|
||||
const queuedModels = ref<QueuedModel[]>([])
|
||||
const installedModels = ref<InstalledModel[]>([])
|
||||
|
|
@ -411,6 +492,7 @@ async function doLookup() {
|
|||
lookupLoading.value = true
|
||||
lookupError.value = null
|
||||
lookupResult.value = null
|
||||
selectedQuant.value = null
|
||||
|
||||
try {
|
||||
const res = await fetch(`/api/models/lookup?repo_id=${encodeURIComponent(repoId)}`)
|
||||
|
|
@ -442,7 +524,15 @@ async function addToQueue() {
|
|||
const res = await fetch('/api/models/queue', {
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ repo_id, pipeline_tag, adapter_recommendation, role, service }),
|
||||
body: JSON.stringify({
|
||||
repo_id,
|
||||
pipeline_tag,
|
||||
adapter_recommendation,
|
||||
role,
|
||||
service,
|
||||
model_size_bytes: selectedQuantSize.value,
|
||||
quant_pattern: selectedQuant.value,
|
||||
}),
|
||||
})
|
||||
if (res.ok) {
|
||||
lookupResult.value = { ...lookupResult.value, already_queued: true }
|
||||
|
|
@ -454,8 +544,16 @@ async function addToQueue() {
|
|||
}
|
||||
}
|
||||
|
||||
async function approveModel(id: string) {
|
||||
async function approveModel(id: string, draft?: { service: string; role: string }) {
|
||||
try {
|
||||
// If the user picked a service/role for an unrecognized model, patch it first.
|
||||
if (draft?.service && draft?.role) {
|
||||
await fetch(`/api/models/queue/${encodeURIComponent(id)}`, {
|
||||
method: 'PATCH',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
body: JSON.stringify({ service: draft.service, role: draft.role }),
|
||||
})
|
||||
}
|
||||
const res = await fetch(`/api/models/queue/${encodeURIComponent(id)}/approve`, { method: 'POST' })
|
||||
if (res.ok) {
|
||||
await loadQueue()
|
||||
|
|
@ -774,6 +872,44 @@ onUnmounted(() => {
|
|||
align-self: flex-start;
|
||||
}
|
||||
|
||||
/* ── Quant picker ── */
|
||||
/* Stack the picker's children vertically with a small gap. */
.quant-picker {
  display: flex;
  gap: 0.35rem;
  flex-direction: column;
}
|
||||
|
||||
/* Small, bold, uppercase caption in the muted text colour. */
.quant-label {
  color: var(--color-text-muted, #4a5c7a);
  font-weight: 600;
  font-size: 0.8rem;
  letter-spacing: 0.04em;
  text-transform: uppercase;
}
|
||||
|
||||
/* Monospace dropdown with a themed border and surface. */
.quant-select {
  /* Frame */
  padding: 0.4rem 0.6rem;
  border-radius: var(--radius-md, 0.5rem);
  border: 1px solid var(--color-border, #a8b8d0);
  background: var(--color-surface, #f0f4fb);

  /* Text */
  color: var(--color-text, #1a2338);
  font-family: var(--font-mono, monospace);
  font-size: 0.9rem;

  cursor: pointer;
}
|
||||
|
||||
/* Hint text under the select: small and in the muted text colour. */
.quant-hint {
  color: var(--color-text-muted, #4a5c7a);
  font-size: 0.78rem;
}
|
||||
|
||||
/* Quant chip: primary-coloured monospace text on a 15% primary tint over transparent. */
.chip-quant {
  color: var(--color-primary, #2A6080);
  background: color-mix(in srgb, var(--color-primary, #2A6080) 15%, transparent);
  font-size: 0.75rem;
  font-family: var(--font-mono, monospace);
}
|
||||
|
||||
/* ── Model cards (queue + downloads) ── */
|
||||
.model-card {
|
||||
border: 1px solid var(--color-border, #a8b8d0);
|
||||
|
|
|
|||
1043
web/src/views/PlansBenchTab.vue
Normal file
1043
web/src/views/PlansBenchTab.vue
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue