feat: cf-orch LLM benchmark integration (Phase 1)
Backend (app/cforch.py — new APIRouter at /api/cforch): - GET /tasks — reads bench_tasks.yaml, returns tasks + deduplicated types - GET /models — reads bench_models.yaml, returns model list with service/tags - GET /run — SSE endpoint; spawns cf-orch benchmark.py subprocess with --filter-tasks, --filter-tags, --coordinator, --ollama-url; strips ANSI codes; emits progress/result/complete/error events; 409 guard on concurrency - GET /results — returns latest bench_results/*/summary.json; 404 if none - POST /cancel — terminates running benchmark subprocess - All paths configurable via label_tool.yaml cforch: section - 13 tests; follows sft.py/models.py testability seam pattern Frontend: - BenchmarkView: mode toggle (Classifier / LLM Eval); LLM Eval panel with task picker (by type, select-all + indeterminate), model picker (by service), SSE run log, results table with best-per-column highlighting - StatsView: LLM Benchmark section showing quality_by_task_type table across models; hidden when no results; fetches /api/cforch/results on mount SFT candidate pipeline: cf-orch runs that produce sft_candidates.jsonl are auto-discovered by the existing bench_results_dir config in sft.py — no additional wiring needed.
This commit is contained in:
parent
ce12b29c94
commit
dffb1d0d7a
5 changed files with 1191 additions and 8 deletions
|
|
@ -149,6 +149,9 @@ from app.models import router as models_router
|
||||||
import app.models as _models_module
|
import app.models as _models_module
|
||||||
app.include_router(models_router, prefix="/api/models")
|
app.include_router(models_router, prefix="/api/models")
|
||||||
|
|
||||||
|
from app.cforch import router as cforch_router
|
||||||
|
app.include_router(cforch_router, prefix="/api/cforch")
|
||||||
|
|
||||||
# In-memory last-action store (single user, local tool — in-memory is fine)
|
# In-memory last-action store (single user, local tool — in-memory is fine)
|
||||||
_last_action: dict | None = None
|
_last_action: dict | None = None
|
||||||
|
|
||||||
|
|
|
||||||
291
app/cforch.py
Normal file
291
app/cforch.py
Normal file
|
|
@ -0,0 +1,291 @@
|
||||||
|
"""Avocet — cf-orch benchmark integration API.
|
||||||
|
|
||||||
|
Wraps cf-orch's benchmark.py script and exposes it via the Avocet API.
|
||||||
|
Config is read from label_tool.yaml under the `cforch:` key.
|
||||||
|
|
||||||
|
All endpoints are registered on `router` (a FastAPI APIRouter).
|
||||||
|
api.py includes this router with prefix="/api/cforch".
|
||||||
|
|
||||||
|
Module-level globals (_CONFIG_DIR, _BENCH_RUNNING, _bench_proc) follow the
|
||||||
|
same testability pattern as sft.py — override _CONFIG_DIR via set_config_dir()
|
||||||
|
in test fixtures.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import subprocess as _subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from fastapi import APIRouter, HTTPException
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_ROOT = Path(__file__).parent.parent
|
||||||
|
_CONFIG_DIR: Path | None = None # override in tests
|
||||||
|
_BENCH_RUNNING: bool = False
|
||||||
|
_bench_proc: Any = None # live Popen object while benchmark runs
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Testability seams ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def set_config_dir(path: Path | None) -> None:
    """Override (or reset) the directory searched for label_tool.yaml.

    Passing None restores the default location under the repo root.
    Test fixtures use this to point the module at a temporary directory.
    """
    global _CONFIG_DIR
    _CONFIG_DIR = path
||||||
|
|
||||||
|
|
||||||
|
# ── Internal helpers ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _config_file() -> Path:
    """Resolve the label_tool.yaml path, honouring the test override."""
    base = _CONFIG_DIR if _CONFIG_DIR is not None else _ROOT / "config"
    return base / "label_tool.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
def _load_cforch_config() -> dict:
    """Return the `cforch:` sub-dict of label_tool.yaml, or {} if absent/malformed."""
    path = _config_file()
    if not path.exists():
        return {}
    try:
        data = yaml.safe_load(path.read_text(encoding="utf-8")) or {}
    except yaml.YAMLError as exc:
        # A broken config file degrades to "not configured" rather than a 500.
        logger.warning("Failed to parse cforch config %s: %s", path, exc)
        return {}
    return data.get("cforch", {}) or {}
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_ansi(text: str) -> str:
|
||||||
|
"""Remove ANSI escape codes from a string."""
|
||||||
|
return re.sub(r'\x1b\[[0-9;]*m', '', text)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_latest_summary(results_dir: str | None) -> Path | None:
|
||||||
|
"""Find the newest summary.json under results_dir, or None if not found."""
|
||||||
|
if not results_dir:
|
||||||
|
return None
|
||||||
|
rdir = Path(results_dir)
|
||||||
|
if not rdir.exists():
|
||||||
|
return None
|
||||||
|
# Subdirs are named YYYY-MM-DD-HHMMSS; sort lexicographically for chronological order
|
||||||
|
subdirs = sorted(
|
||||||
|
[d for d in rdir.iterdir() if d.is_dir()],
|
||||||
|
key=lambda d: d.name,
|
||||||
|
)
|
||||||
|
for subdir in reversed(subdirs):
|
||||||
|
summary = subdir / "summary.json"
|
||||||
|
if summary.exists():
|
||||||
|
return summary
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# ── GET /tasks ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@router.get("/tasks")
|
||||||
|
def get_tasks() -> dict:
|
||||||
|
"""Return task list from bench_tasks.yaml."""
|
||||||
|
cfg = _load_cforch_config()
|
||||||
|
tasks_path = cfg.get("bench_tasks", "")
|
||||||
|
if not tasks_path:
|
||||||
|
return {"tasks": [], "types": []}
|
||||||
|
|
||||||
|
p = Path(tasks_path)
|
||||||
|
if not p.exists():
|
||||||
|
return {"tasks": [], "types": []}
|
||||||
|
|
||||||
|
try:
|
||||||
|
raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
|
||||||
|
except yaml.YAMLError as exc:
|
||||||
|
logger.warning("Failed to parse bench_tasks.yaml %s: %s", p, exc)
|
||||||
|
return {"tasks": [], "types": []}
|
||||||
|
|
||||||
|
tasks_raw = raw.get("tasks", []) or []
|
||||||
|
tasks: list[dict] = []
|
||||||
|
seen_types: list[str] = []
|
||||||
|
types_set: set[str] = set()
|
||||||
|
|
||||||
|
for t in tasks_raw:
|
||||||
|
if not isinstance(t, dict):
|
||||||
|
continue
|
||||||
|
tasks.append({
|
||||||
|
"id": t.get("id", ""),
|
||||||
|
"name": t.get("name", ""),
|
||||||
|
"type": t.get("type", ""),
|
||||||
|
})
|
||||||
|
task_type = t.get("type", "")
|
||||||
|
if task_type and task_type not in types_set:
|
||||||
|
seen_types.append(task_type)
|
||||||
|
types_set.add(task_type)
|
||||||
|
|
||||||
|
return {"tasks": tasks, "types": seen_types}
|
||||||
|
|
||||||
|
|
||||||
|
# ── GET /models ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@router.get("/models")
|
||||||
|
def get_models() -> dict:
|
||||||
|
"""Return model list from bench_models.yaml."""
|
||||||
|
cfg = _load_cforch_config()
|
||||||
|
models_path = cfg.get("bench_models", "")
|
||||||
|
if not models_path:
|
||||||
|
return {"models": []}
|
||||||
|
|
||||||
|
p = Path(models_path)
|
||||||
|
if not p.exists():
|
||||||
|
return {"models": []}
|
||||||
|
|
||||||
|
try:
|
||||||
|
raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
|
||||||
|
except yaml.YAMLError as exc:
|
||||||
|
logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
|
||||||
|
return {"models": []}
|
||||||
|
|
||||||
|
models_raw = raw.get("models", []) or []
|
||||||
|
models: list[dict] = []
|
||||||
|
for m in models_raw:
|
||||||
|
if not isinstance(m, dict):
|
||||||
|
continue
|
||||||
|
models.append({
|
||||||
|
"name": m.get("name", ""),
|
||||||
|
"id": m.get("id", ""),
|
||||||
|
"service": m.get("service", "ollama"),
|
||||||
|
"tags": m.get("tags", []) or [],
|
||||||
|
"vram_estimate_mb": m.get("vram_estimate_mb", 0),
|
||||||
|
})
|
||||||
|
|
||||||
|
return {"models": models}
|
||||||
|
|
||||||
|
|
||||||
|
# ── GET /run ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@router.get("/run")
|
||||||
|
def run_benchmark(
|
||||||
|
task_ids: str = "",
|
||||||
|
model_tags: str = "",
|
||||||
|
coordinator_url: str = "",
|
||||||
|
ollama_url: str = "",
|
||||||
|
) -> StreamingResponse:
|
||||||
|
"""Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
|
||||||
|
global _BENCH_RUNNING, _bench_proc
|
||||||
|
|
||||||
|
if _BENCH_RUNNING:
|
||||||
|
raise HTTPException(409, "A benchmark is already running")
|
||||||
|
|
||||||
|
cfg = _load_cforch_config()
|
||||||
|
bench_script = cfg.get("bench_script", "")
|
||||||
|
bench_tasks = cfg.get("bench_tasks", "")
|
||||||
|
bench_models = cfg.get("bench_models", "")
|
||||||
|
results_dir = cfg.get("results_dir", "")
|
||||||
|
python_bin = cfg.get("python_bin", "/devl/miniconda3/envs/cf/bin/python")
|
||||||
|
cfg_coordinator = cfg.get("coordinator_url", "")
|
||||||
|
cfg_ollama = cfg.get("ollama_url", "")
|
||||||
|
|
||||||
|
def generate():
|
||||||
|
global _BENCH_RUNNING, _bench_proc
|
||||||
|
|
||||||
|
if not bench_script or not Path(bench_script).exists():
|
||||||
|
yield f"data: {json.dumps({'type': 'error', 'message': 'bench_script not configured or not found'})}\n\n"
|
||||||
|
return
|
||||||
|
|
||||||
|
cmd = [
|
||||||
|
python_bin,
|
||||||
|
bench_script,
|
||||||
|
"--tasks", bench_tasks,
|
||||||
|
"--models", bench_models,
|
||||||
|
"--output", results_dir,
|
||||||
|
]
|
||||||
|
|
||||||
|
if task_ids:
|
||||||
|
cmd.extend(["--filter-tasks"] + task_ids.split(","))
|
||||||
|
if model_tags:
|
||||||
|
cmd.extend(["--filter-tags"] + model_tags.split(","))
|
||||||
|
|
||||||
|
effective_coordinator = coordinator_url if coordinator_url else cfg_coordinator
|
||||||
|
effective_ollama = ollama_url if ollama_url else cfg_ollama
|
||||||
|
if effective_coordinator:
|
||||||
|
cmd.extend(["--coordinator", effective_coordinator])
|
||||||
|
if effective_ollama:
|
||||||
|
cmd.extend(["--ollama-url", effective_ollama])
|
||||||
|
|
||||||
|
_BENCH_RUNNING = True
|
||||||
|
try:
|
||||||
|
proc = _subprocess.Popen(
|
||||||
|
cmd,
|
||||||
|
stdout=_subprocess.PIPE,
|
||||||
|
stderr=_subprocess.STDOUT,
|
||||||
|
text=True,
|
||||||
|
bufsize=1,
|
||||||
|
)
|
||||||
|
_bench_proc = proc
|
||||||
|
try:
|
||||||
|
for line in proc.stdout:
|
||||||
|
line = _strip_ansi(line.rstrip())
|
||||||
|
if line:
|
||||||
|
yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n"
|
||||||
|
proc.wait()
|
||||||
|
if proc.returncode == 0:
|
||||||
|
summary_path = _find_latest_summary(results_dir)
|
||||||
|
if summary_path is not None:
|
||||||
|
try:
|
||||||
|
summary = json.loads(summary_path.read_text(encoding="utf-8"))
|
||||||
|
yield f"data: {json.dumps({'type': 'result', 'summary': summary})}\n\n"
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Failed to read summary.json: %s", exc)
|
||||||
|
yield f"data: {json.dumps({'type': 'complete'})}\n\n"
|
||||||
|
else:
|
||||||
|
yield f"data: {json.dumps({'type': 'error', 'message': f'Process exited with code {proc.returncode}'})}\n\n"
|
||||||
|
finally:
|
||||||
|
_bench_proc = None
|
||||||
|
except Exception as exc:
|
||||||
|
yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
|
||||||
|
finally:
|
||||||
|
_BENCH_RUNNING = False
|
||||||
|
|
||||||
|
return StreamingResponse(
|
||||||
|
generate(),
|
||||||
|
media_type="text/event-stream",
|
||||||
|
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── GET /results ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@router.get("/results")
|
||||||
|
def get_results() -> dict:
|
||||||
|
"""Return the latest benchmark summary.json from results_dir."""
|
||||||
|
cfg = _load_cforch_config()
|
||||||
|
results_dir = cfg.get("results_dir", "")
|
||||||
|
summary_path = _find_latest_summary(results_dir)
|
||||||
|
if summary_path is None:
|
||||||
|
raise HTTPException(404, "No benchmark results found")
|
||||||
|
try:
|
||||||
|
return json.loads(summary_path.read_text(encoding="utf-8"))
|
||||||
|
except Exception as exc:
|
||||||
|
raise HTTPException(500, f"Failed to read summary.json: {exc}") from exc
|
||||||
|
|
||||||
|
|
||||||
|
# ── POST /cancel ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@router.post("/cancel")
|
||||||
|
def cancel_benchmark() -> dict:
|
||||||
|
"""Kill the running benchmark subprocess."""
|
||||||
|
global _BENCH_RUNNING, _bench_proc
|
||||||
|
|
||||||
|
if not _BENCH_RUNNING:
|
||||||
|
raise HTTPException(404, "No benchmark is currently running")
|
||||||
|
|
||||||
|
if _bench_proc is not None:
|
||||||
|
try:
|
||||||
|
_bench_proc.terminate()
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Failed to terminate benchmark process: %s", exc)
|
||||||
|
|
||||||
|
_BENCH_RUNNING = False
|
||||||
|
_bench_proc = None
|
||||||
|
return {"status": "cancelled"}
|
||||||
282
tests/test_cforch.py
Normal file
282
tests/test_cforch.py
Normal file
|
|
@ -0,0 +1,282 @@
|
||||||
|
"""Tests for app/cforch.py — /api/cforch/* endpoints."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import yaml
|
||||||
|
from fastapi.testclient import TestClient
|
||||||
|
|
||||||
|
|
||||||
|
# ── Fixtures ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def reset_cforch_globals(tmp_path):
    """Point _CONFIG_DIR at tmp_path and restore every module global afterwards."""
    from app import cforch as cforch_module

    saved = (
        cforch_module._CONFIG_DIR,
        cforch_module._BENCH_RUNNING,
        cforch_module._bench_proc,
    )

    cforch_module.set_config_dir(tmp_path)
    cforch_module._BENCH_RUNNING = False
    cforch_module._bench_proc = None

    yield tmp_path

    cforch_module.set_config_dir(saved[0])
    cforch_module._BENCH_RUNNING = saved[1]
    cforch_module._bench_proc = saved[2]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def client():
    """TestClient bound to the full Avocet app (routers already included)."""
    from app.api import app

    return TestClient(app)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def config_dir(reset_cforch_globals):
    """Alias for the tmp config dir installed by reset_cforch_globals."""
    return reset_cforch_globals
|
||||||
|
|
||||||
|
|
||||||
|
def _write_config(config_dir: Path, cforch_cfg: dict) -> None:
    """Drop a label_tool.yaml containing only the given cforch block."""
    target = config_dir / "label_tool.yaml"
    target.write_text(yaml.dump({"cforch": cforch_cfg}), encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def _write_tasks_yaml(path: Path, tasks: list[dict]) -> None:
    """Serialize a bench_tasks.yaml containing the given task list."""
    payload = {"tasks": tasks}
    path.write_text(yaml.dump(payload), encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
def _write_models_yaml(path: Path, models: list[dict]) -> None:
    """Serialize a bench_models.yaml containing the given model list."""
    payload = {"models": models}
    path.write_text(yaml.dump(payload), encoding="utf-8")
|
||||||
|
|
||||||
|
|
||||||
|
# ── GET /tasks ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_tasks_returns_empty_when_not_configured(client):
    """With no label_tool.yaml present the endpoint returns empty lists."""
    resp = client.get("/api/cforch/tasks")
    assert resp.status_code == 200
    assert resp.json() == {"tasks": [], "types": []}
|
||||||
|
|
||||||
|
|
||||||
|
def test_tasks_parses_yaml(client, config_dir, tmp_path):
    """Tasks and their types round-trip through bench_tasks.yaml."""
    expected = [
        {"id": "t1", "name": "Task One", "type": "instruction"},
        {"id": "t2", "name": "Task Two", "type": "reasoning"},
    ]
    tasks_file = tmp_path / "bench_tasks.yaml"
    _write_tasks_yaml(tasks_file, expected)
    _write_config(config_dir, {"bench_tasks": str(tasks_file)})

    resp = client.get("/api/cforch/tasks")
    assert resp.status_code == 200
    body = resp.json()
    assert body["tasks"] == expected
    assert "instruction" in body["types"]
    assert "reasoning" in body["types"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_tasks_returns_types_deduplicated(client, config_dir, tmp_path):
    """Repeated task types must appear exactly once in the types list."""
    tasks_file = tmp_path / "bench_tasks.yaml"
    _write_tasks_yaml(
        tasks_file,
        [
            {"id": "t1", "name": "A", "type": "instruction"},
            {"id": "t2", "name": "B", "type": "instruction"},
            {"id": "t3", "name": "C", "type": "reasoning"},
        ],
    )
    _write_config(config_dir, {"bench_tasks": str(tasks_file)})

    types = client.get("/api/cforch/tasks").json()["types"]
    assert types.count("instruction") == 1
    assert len(types) == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ── GET /models ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_models_returns_empty_when_not_configured(client):
    """With no config file the endpoint returns an empty model list."""
    resp = client.get("/api/cforch/models")
    assert resp.status_code == 200
    assert resp.json() == {"models": []}
|
||||||
|
|
||||||
|
|
||||||
|
def test_models_parses_bench_models_yaml(client, config_dir, tmp_path):
    """Every model field survives the round-trip through bench_models.yaml."""
    model = {
        "name": "llama3",
        "id": "llama3:8b",
        "service": "ollama",
        "tags": ["fast", "small"],
        "vram_estimate_mb": 6000,
    }
    models_file = tmp_path / "bench_models.yaml"
    _write_models_yaml(models_file, [model])
    _write_config(config_dir, {"bench_models": str(models_file)})

    resp = client.get("/api/cforch/models")
    assert resp.status_code == 200
    models = resp.json()["models"]
    assert len(models) == 1
    assert models[0] == model
|
||||||
|
|
||||||
|
|
||||||
|
# ── GET /run ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_run_returns_409_when_already_running(client):
    """GET /run is rejected with 409 while _BENCH_RUNNING is set."""
    from app import cforch as cforch_module

    cforch_module._BENCH_RUNNING = True

    assert client.get("/api/cforch/run").status_code == 409
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_returns_error_when_bench_script_not_configured(client):
    """Without any config the SSE stream carries an error event."""
    resp = client.get("/api/cforch/run")
    assert resp.status_code == 200
    assert '"type": "error"' in resp.text
    assert "bench_script not configured" in resp.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_streams_progress_events(client, config_dir, tmp_path):
    """Stdout lines of the (mocked) subprocess become SSE progress events."""
    bench_script = tmp_path / "fake_benchmark.py"
    bench_script.write_text("# fake", encoding="utf-8")

    tasks_file = tmp_path / "bench_tasks.yaml"
    tasks_file.write_text(yaml.dump({"tasks": []}), encoding="utf-8")
    models_file = tmp_path / "bench_models.yaml"
    models_file.write_text(yaml.dump({"models": []}), encoding="utf-8")
    results_dir = tmp_path / "results"
    results_dir.mkdir()

    _write_config(
        config_dir,
        {
            "bench_script": str(bench_script),
            "bench_tasks": str(tasks_file),
            "bench_models": str(models_file),
            "results_dir": str(results_dir),
            "python_bin": "/usr/bin/python3",
        },
    )

    fake_proc = MagicMock()
    fake_proc.stdout = iter(["Running task 1\n", "Running task 2\n"])
    fake_proc.returncode = 1  # non-zero so no summary.json is required
    fake_proc.wait = lambda: None

    with patch("app.cforch._subprocess.Popen", return_value=fake_proc):
        resp = client.get("/api/cforch/run")

    assert resp.status_code == 200
    assert '"type": "progress"' in resp.text
    assert "Running task 1" in resp.text
    assert "Running task 2" in resp.text
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_emits_result_on_success(client, config_dir, tmp_path):
    """Exit code 0 plus an existing summary.json yields result + complete events."""
    bench_script = tmp_path / "fake_benchmark.py"
    bench_script.write_text("# fake", encoding="utf-8")

    tasks_file = tmp_path / "bench_tasks.yaml"
    tasks_file.write_text(yaml.dump({"tasks": []}), encoding="utf-8")
    models_file = tmp_path / "bench_models.yaml"
    models_file.write_text(yaml.dump({"models": []}), encoding="utf-8")

    results_dir = tmp_path / "results"
    run_dir = results_dir / "2026-04-08-120000"
    run_dir.mkdir(parents=True)
    summary = {"score": 0.92, "models_evaluated": 3}
    (run_dir / "summary.json").write_text(json.dumps(summary), encoding="utf-8")

    _write_config(
        config_dir,
        {
            "bench_script": str(bench_script),
            "bench_tasks": str(tasks_file),
            "bench_models": str(models_file),
            "results_dir": str(results_dir),
            "python_bin": "/usr/bin/python3",
        },
    )

    fake_proc = MagicMock()
    fake_proc.stdout = iter([])
    fake_proc.returncode = 0
    fake_proc.wait = MagicMock()

    with patch("app.cforch._subprocess.Popen", return_value=fake_proc):
        resp = client.get("/api/cforch/run")

    assert resp.status_code == 200
    assert '"type": "result"' in resp.text
    assert '"score": 0.92' in resp.text
    assert '"type": "complete"' in resp.text
|
||||||
|
|
||||||
|
|
||||||
|
# ── GET /results ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_results_returns_404_when_no_results(client):
    """An unconfigured results_dir yields a 404."""
    assert client.get("/api/cforch/results").status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_results_returns_latest_summary(client, config_dir, tmp_path):
    """The summary.json of the newest run dir is returned verbatim."""
    results_dir = tmp_path / "results"
    run_dir = results_dir / "2026-04-08-150000"
    run_dir.mkdir(parents=True)
    payload = {"score": 0.88, "run": "test"}
    (run_dir / "summary.json").write_text(json.dumps(payload), encoding="utf-8")

    _write_config(config_dir, {"results_dir": str(results_dir)})

    resp = client.get("/api/cforch/results")
    assert resp.status_code == 200
    body = resp.json()
    assert body["score"] == 0.88
    assert body["run"] == "test"
|
||||||
|
|
||||||
|
|
||||||
|
# ── POST /cancel ───────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_cancel_returns_404_when_not_running(client):
    """POST /cancel with no active benchmark yields a 404."""
    assert client.post("/api/cforch/cancel").status_code == 404
|
||||||
|
|
||||||
|
|
||||||
|
def test_cancel_terminates_running_benchmark(client):
    """POST /cancel terminates the live proc and resets the run state."""
    from app import cforch as cforch_module

    fake_proc = MagicMock()
    cforch_module._BENCH_RUNNING = True
    cforch_module._bench_proc = fake_proc

    resp = client.post("/api/cforch/cancel")

    assert resp.status_code == 200
    assert resp.json() == {"status": "cancelled"}
    fake_proc.terminate.assert_called_once()
    assert cforch_module._BENCH_RUNNING is False
    assert cforch_module._bench_proc is None
|
||||||
|
|
@ -3,10 +3,11 @@
|
||||||
<header class="bench-header">
|
<header class="bench-header">
|
||||||
<h1 class="page-title">🏁 Benchmark</h1>
|
<h1 class="page-title">🏁 Benchmark</h1>
|
||||||
<div class="header-actions">
|
<div class="header-actions">
|
||||||
<label class="slow-toggle" :class="{ disabled: running }">
|
<label class="slow-toggle" :class="{ disabled: running }" v-if="benchMode === 'classifier'">
|
||||||
<input type="checkbox" v-model="includeSlow" :disabled="running" />
|
<input type="checkbox" v-model="includeSlow" :disabled="running" />
|
||||||
Include slow models
|
Include slow models
|
||||||
</label>
|
</label>
|
||||||
|
<template v-if="benchMode === 'classifier'">
|
||||||
<button
|
<button
|
||||||
class="btn-run"
|
class="btn-run"
|
||||||
:disabled="running"
|
:disabled="running"
|
||||||
|
|
@ -21,9 +22,201 @@
|
||||||
>
|
>
|
||||||
✕ Cancel
|
✕ Cancel
|
||||||
</button>
|
</button>
|
||||||
|
</template>
|
||||||
</div>
|
</div>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
|
<!-- Mode toggle -->
|
||||||
|
<div class="mode-toggle" role="group" aria-label="Benchmark mode">
|
||||||
|
<button
|
||||||
|
class="mode-btn"
|
||||||
|
:class="{ active: benchMode === 'classifier' }"
|
||||||
|
@click="benchMode = 'classifier'"
|
||||||
|
>Classifier</button>
|
||||||
|
<button
|
||||||
|
class="mode-btn"
|
||||||
|
:class="{ active: benchMode === 'llm' }"
|
||||||
|
@click="benchMode = 'llm'"
|
||||||
|
>🤖 LLM Eval</button>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- ── LLM Eval panel ─────────────────────────────────────── -->
|
||||||
|
<template v-if="benchMode === 'llm'">
|
||||||
|
|
||||||
|
<!-- Task Selection -->
|
||||||
|
<details class="model-picker" open>
|
||||||
|
<summary class="picker-summary">
|
||||||
|
<span class="picker-title">📋 Task Selection</span>
|
||||||
|
<span class="picker-badge">{{ llmTaskBadge }}</span>
|
||||||
|
</summary>
|
||||||
|
<div class="picker-body">
|
||||||
|
<div v-if="llmTasksLoading" class="picker-loading">Loading tasks…</div>
|
||||||
|
<div v-else-if="Object.keys(llmTasksByType).length === 0" class="picker-empty">
|
||||||
|
No tasks found — check API connection.
|
||||||
|
</div>
|
||||||
|
<template v-else>
|
||||||
|
<div
|
||||||
|
v-for="(tasks, type) in llmTasksByType"
|
||||||
|
:key="type"
|
||||||
|
class="picker-category"
|
||||||
|
>
|
||||||
|
<label class="picker-cat-header">
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
:checked="isTaskTypeAllSelected(tasks)"
|
||||||
|
:indeterminate="isTaskTypeIndeterminate(tasks)"
|
||||||
|
@change="toggleTaskType(tasks, ($event.target as HTMLInputElement).checked)"
|
||||||
|
/>
|
||||||
|
<span class="picker-cat-name">{{ type }}</span>
|
||||||
|
<span class="picker-cat-count">({{ tasks.length }})</span>
|
||||||
|
</label>
|
||||||
|
<div class="picker-model-list">
|
||||||
|
<label
|
||||||
|
v-for="t in tasks"
|
||||||
|
:key="t.id"
|
||||||
|
class="picker-model-row"
|
||||||
|
>
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
:checked="selectedLlmTasks.has(t.id)"
|
||||||
|
@change="toggleLlmTask(t.id, ($event.target as HTMLInputElement).checked)"
|
||||||
|
/>
|
||||||
|
<span class="picker-model-name" :title="t.name">{{ t.name }}</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Model Selection -->
|
||||||
|
<details class="model-picker" open>
|
||||||
|
<summary class="picker-summary">
|
||||||
|
<span class="picker-title">🎯 Model Selection</span>
|
||||||
|
<span class="picker-badge">{{ llmModelBadge }}</span>
|
||||||
|
</summary>
|
||||||
|
<div class="picker-body">
|
||||||
|
<div v-if="llmModelsLoading" class="picker-loading">Loading models…</div>
|
||||||
|
<div v-else-if="Object.keys(llmModelsByService).length === 0" class="picker-empty">
|
||||||
|
No models found — check cf-orch connection.
|
||||||
|
</div>
|
||||||
|
<template v-else>
|
||||||
|
<div
|
||||||
|
v-for="(models, service) in llmModelsByService"
|
||||||
|
:key="service"
|
||||||
|
class="picker-category"
|
||||||
|
>
|
||||||
|
<label class="picker-cat-header">
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
:checked="isServiceAllSelected(models)"
|
||||||
|
:indeterminate="isServiceIndeterminate(models)"
|
||||||
|
@change="toggleService(models, ($event.target as HTMLInputElement).checked)"
|
||||||
|
/>
|
||||||
|
<span class="picker-cat-name">{{ service }}</span>
|
||||||
|
<span class="picker-cat-count">({{ models.length }})</span>
|
||||||
|
</label>
|
||||||
|
<div class="picker-model-list">
|
||||||
|
<label
|
||||||
|
v-for="m in models"
|
||||||
|
:key="m.id"
|
||||||
|
class="picker-model-row"
|
||||||
|
>
|
||||||
|
<input
|
||||||
|
type="checkbox"
|
||||||
|
:checked="selectedLlmModels.has(m.id)"
|
||||||
|
@change="toggleLlmModel(m.id, ($event.target as HTMLInputElement).checked)"
|
||||||
|
/>
|
||||||
|
<span class="picker-model-name" :title="m.name">{{ m.name }}</span>
|
||||||
|
<span class="picker-adapter-type" v-if="m.tags.length">{{ m.tags.join(', ') }}</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</template>
|
||||||
|
</div>
|
||||||
|
</details>
|
||||||
|
|
||||||
|
<!-- Run Controls -->
|
||||||
|
<div class="llm-run-controls">
|
||||||
|
<button
|
||||||
|
class="btn-run"
|
||||||
|
:disabled="llmRunning || selectedLlmTasks.size === 0 || selectedLlmModels.size === 0"
|
||||||
|
@click="startLlmBenchmark"
|
||||||
|
>
|
||||||
|
{{ llmRunning ? '⏳ Running…' : '▶ Run LLM Eval' }}
|
||||||
|
</button>
|
||||||
|
<button
|
||||||
|
v-if="llmRunning"
|
||||||
|
class="btn-cancel"
|
||||||
|
@click="cancelLlmBenchmark"
|
||||||
|
>
|
||||||
|
✕ Cancel
|
||||||
|
</button>
|
||||||
|
<span v-if="selectedLlmTasks.size === 0 || selectedLlmModels.size === 0" class="llm-run-hint">
|
||||||
|
Select at least one task and one model to run.
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- Progress log -->
|
||||||
|
<div v-if="llmRunning || llmRunLog.length" class="run-log">
|
||||||
|
<div class="run-log-title">
|
||||||
|
<span>{{ llmRunning ? '⏳ Running LLM eval…' : llmError ? '❌ Failed' : '✅ Done' }}</span>
|
||||||
|
<button class="btn-ghost" @click="llmRunLog = []; llmError = ''">Clear</button>
|
||||||
|
</div>
|
||||||
|
<div class="log-lines" ref="llmLogEl">
|
||||||
|
<div
|
||||||
|
v-for="(line, i) in llmRunLog"
|
||||||
|
:key="i"
|
||||||
|
class="log-line"
|
||||||
|
:class="{ 'log-error': line.startsWith('ERROR') || line.startsWith('[error]') }"
|
||||||
|
>{{ line }}</div>
|
||||||
|
</div>
|
||||||
|
<p v-if="llmError" class="run-error">{{ llmError }}</p>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- LLM Results table -->
|
||||||
|
<template v-if="llmResults.length > 0">
|
||||||
|
<h2 class="chart-title">LLM Eval Results</h2>
|
||||||
|
<div class="heatmap-scroll">
|
||||||
|
<table class="heatmap llm-results-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th class="hm-label-col">Model</th>
|
||||||
|
<th class="hm-model-col">overall</th>
|
||||||
|
<th
|
||||||
|
v-for="col in llmTaskTypeCols"
|
||||||
|
:key="col"
|
||||||
|
class="hm-model-col"
|
||||||
|
>{{ col }}</th>
|
||||||
|
<th class="hm-model-col">tok/s</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr v-for="row in llmResults" :key="row.model_id">
|
||||||
|
<td class="hm-label-cell llm-model-name-cell" :title="row.model_id">{{ row.model_name }}</td>
|
||||||
|
<td
|
||||||
|
class="hm-value-cell"
|
||||||
|
:class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }"
|
||||||
|
>{{ pct(row.avg_quality_score) }}</td>
|
||||||
|
<td
|
||||||
|
v-for="col in llmTaskTypeCols"
|
||||||
|
:key="col"
|
||||||
|
class="hm-value-cell"
|
||||||
|
:class="{ 'bt-best': llmBestByCol[col] === row.model_id }"
|
||||||
|
>{{ row.quality_by_task_type[col] != null ? pct(row.quality_by_task_type[col]) : '—' }}</td>
|
||||||
|
<td class="hm-value-cell llm-tps-cell">{{ row.avg_tokens_per_sec.toFixed(1) }}</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="heatmap-hint">Run LLM Eval on the Benchmark tab to refresh. Green = best per column.</p>
|
||||||
|
</template>
|
||||||
|
|
||||||
|
</template>
|
||||||
|
|
||||||
|
<!-- ── Classifier panel ──────────────────────────────────── -->
|
||||||
|
<template v-if="benchMode === 'classifier'">
|
||||||
|
|
||||||
<!-- Model Picker -->
|
<!-- Model Picker -->
|
||||||
<details class="model-picker" ref="pickerEl">
|
<details class="model-picker" ref="pickerEl">
|
||||||
<summary class="picker-summary">
|
<summary class="picker-summary">
|
||||||
|
|
@ -250,6 +443,10 @@
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
|
</template>
|
||||||
|
<!-- ── /Classifier panel ─────────────────────────────────── -->
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
</template>
|
</template>
|
||||||
|
|
||||||
|
|
@ -278,6 +475,33 @@ interface AvailableModel {
|
||||||
adapter_type: string
|
adapter_type: string
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// cf-orch types
|
||||||
|
interface CfOrchTask {
|
||||||
|
id: string
|
||||||
|
name: string
|
||||||
|
type: string
|
||||||
|
}
|
||||||
|
|
||||||
|
interface CfOrchModel {
|
||||||
|
name: string
|
||||||
|
id: string
|
||||||
|
service: string
|
||||||
|
tags: string[]
|
||||||
|
vram_estimate_mb?: number
|
||||||
|
}
|
||||||
|
|
||||||
|
interface LlmModelResult {
|
||||||
|
model_name: string
|
||||||
|
model_id: string
|
||||||
|
node_id: string
|
||||||
|
avg_tokens_per_sec: number
|
||||||
|
avg_completion_ms: number
|
||||||
|
avg_quality_score: number
|
||||||
|
finetune_candidates: number
|
||||||
|
error_count: number
|
||||||
|
quality_by_task_type: Record<string, number>
|
||||||
|
}
|
||||||
|
|
||||||
interface ModelCategoriesResponse {
|
interface ModelCategoriesResponse {
|
||||||
categories: Record<string, AvailableModel[]>
|
categories: Record<string, AvailableModel[]>
|
||||||
}
|
}
|
||||||
|
|
@ -329,6 +553,25 @@ const ftError = ref('')
|
||||||
const ftLogEl = ref<HTMLElement | null>(null)
|
const ftLogEl = ref<HTMLElement | null>(null)
|
||||||
|
|
||||||
const runCancelled = ref(false)
|
const runCancelled = ref(false)
|
||||||
|
|
||||||
|
// ── Mode toggle ───────────────────────────────────────────────────────────────
|
||||||
|
const benchMode = ref<'classifier' | 'llm'>('classifier')
|
||||||
|
|
||||||
|
// ── LLM Eval state ───────────────────────────────────────────────────────────
|
||||||
|
const llmTasks = ref<CfOrchTask[]>([])
|
||||||
|
const llmTasksLoading = ref(false)
|
||||||
|
const llmModels = ref<CfOrchModel[]>([])
|
||||||
|
const llmModelsLoading = ref(false)
|
||||||
|
|
||||||
|
const selectedLlmTasks = ref<Set<string>>(new Set())
|
||||||
|
const selectedLlmModels = ref<Set<string>>(new Set())
|
||||||
|
|
||||||
|
const llmRunning = ref(false)
|
||||||
|
const llmRunLog = ref<string[]>([])
|
||||||
|
const llmError = ref('')
|
||||||
|
const llmResults = ref<LlmModelResult[]>([])
|
||||||
|
const llmEventSource = ref<EventSource | null>(null)
|
||||||
|
const llmLogEl = ref<HTMLElement | null>(null)
|
||||||
const ftCancelled = ref(false)
|
const ftCancelled = ref(false)
|
||||||
|
|
||||||
async function cancelBenchmark() {
|
async function cancelBenchmark() {
|
||||||
|
|
@ -339,6 +582,197 @@ async function cancelFinetune() {
|
||||||
await fetch('/api/finetune/cancel', { method: 'POST' }).catch(() => {})
|
await fetch('/api/finetune/cancel', { method: 'POST' }).catch(() => {})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── LLM Eval computed ─────────────────────────────────────────────────────────
|
||||||
|
const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
|
||||||
|
const groups: Record<string, CfOrchTask[]> = {}
|
||||||
|
for (const t of llmTasks.value) {
|
||||||
|
if (!groups[t.type]) groups[t.type] = []
|
||||||
|
groups[t.type].push(t)
|
||||||
|
}
|
||||||
|
return groups
|
||||||
|
})
|
||||||
|
|
||||||
|
const llmModelsByService = computed((): Record<string, CfOrchModel[]> => {
|
||||||
|
const groups: Record<string, CfOrchModel[]> = {}
|
||||||
|
for (const m of llmModels.value) {
|
||||||
|
if (!groups[m.service]) groups[m.service] = []
|
||||||
|
groups[m.service].push(m)
|
||||||
|
}
|
||||||
|
return groups
|
||||||
|
})
|
||||||
|
|
||||||
|
const llmTaskBadge = computed(() => {
|
||||||
|
const total = llmTasks.value.length
|
||||||
|
if (total === 0) return 'No tasks available'
|
||||||
|
const sel = selectedLlmTasks.value.size
|
||||||
|
if (sel === total) return `All tasks (${total})`
|
||||||
|
return `${sel} of ${total} tasks selected`
|
||||||
|
})
|
||||||
|
|
||||||
|
const llmModelBadge = computed(() => {
|
||||||
|
const total = llmModels.value.length
|
||||||
|
if (total === 0) return 'No models available'
|
||||||
|
const sel = selectedLlmModels.value.size
|
||||||
|
if (sel === total) return `All models (${total})`
|
||||||
|
return `${sel} of ${total} selected`
|
||||||
|
})
|
||||||
|
|
||||||
|
// All task type columns present in any result row
|
||||||
|
const llmTaskTypeCols = computed(() => {
|
||||||
|
const types = new Set<string>()
|
||||||
|
for (const r of llmResults.value) {
|
||||||
|
for (const k of Object.keys(r.quality_by_task_type)) types.add(k)
|
||||||
|
}
|
||||||
|
return [...types].sort()
|
||||||
|
})
|
||||||
|
|
||||||
|
// Best model id per column (overall + each task type col)
|
||||||
|
const llmBestByCol = computed((): Record<string, string> => {
|
||||||
|
const best: Record<string, string> = {}
|
||||||
|
if (llmResults.value.length === 0) return best
|
||||||
|
|
||||||
|
// overall
|
||||||
|
let bestId = '', bestVal = -Infinity
|
||||||
|
for (const r of llmResults.value) {
|
||||||
|
if (r.avg_quality_score > bestVal) { bestVal = r.avg_quality_score; bestId = r.model_id }
|
||||||
|
}
|
||||||
|
best['overall'] = bestId
|
||||||
|
|
||||||
|
for (const col of llmTaskTypeCols.value) {
|
||||||
|
bestId = ''; bestVal = -Infinity
|
||||||
|
for (const r of llmResults.value) {
|
||||||
|
const v = r.quality_by_task_type[col]
|
||||||
|
if (v != null && v > bestVal) { bestVal = v; bestId = r.model_id }
|
||||||
|
}
|
||||||
|
best[col] = bestId
|
||||||
|
}
|
||||||
|
return best
|
||||||
|
})
|
||||||
|
|
||||||
|
function pct(v: number): string {
|
||||||
|
return `${(v * 100).toFixed(1)}%`
|
||||||
|
}
|
||||||
|
|
||||||
|
// Task picker helpers
|
||||||
|
function isTaskTypeAllSelected(tasks: CfOrchTask[]): boolean {
|
||||||
|
return tasks.length > 0 && tasks.every(t => selectedLlmTasks.value.has(t.id))
|
||||||
|
}
|
||||||
|
function isTaskTypeIndeterminate(tasks: CfOrchTask[]): boolean {
|
||||||
|
const some = tasks.some(t => selectedLlmTasks.value.has(t.id))
|
||||||
|
return some && !isTaskTypeAllSelected(tasks)
|
||||||
|
}
|
||||||
|
function toggleLlmTask(id: string, checked: boolean) {
|
||||||
|
const next = new Set(selectedLlmTasks.value)
|
||||||
|
if (checked) next.add(id)
|
||||||
|
else next.delete(id)
|
||||||
|
selectedLlmTasks.value = next
|
||||||
|
}
|
||||||
|
function toggleTaskType(tasks: CfOrchTask[], checked: boolean) {
|
||||||
|
const next = new Set(selectedLlmTasks.value)
|
||||||
|
for (const t of tasks) {
|
||||||
|
if (checked) next.add(t.id)
|
||||||
|
else next.delete(t.id)
|
||||||
|
}
|
||||||
|
selectedLlmTasks.value = next
|
||||||
|
}
|
||||||
|
|
||||||
|
// Model picker helpers
|
||||||
|
function isServiceAllSelected(models: CfOrchModel[]): boolean {
|
||||||
|
return models.length > 0 && models.every(m => selectedLlmModels.value.has(m.id))
|
||||||
|
}
|
||||||
|
function isServiceIndeterminate(models: CfOrchModel[]): boolean {
|
||||||
|
const some = models.some(m => selectedLlmModels.value.has(m.id))
|
||||||
|
return some && !isServiceAllSelected(models)
|
||||||
|
}
|
||||||
|
function toggleLlmModel(id: string, checked: boolean) {
|
||||||
|
const next = new Set(selectedLlmModels.value)
|
||||||
|
if (checked) next.add(id)
|
||||||
|
else next.delete(id)
|
||||||
|
selectedLlmModels.value = next
|
||||||
|
}
|
||||||
|
function toggleService(models: CfOrchModel[], checked: boolean) {
|
||||||
|
const next = new Set(selectedLlmModels.value)
|
||||||
|
for (const m of models) {
|
||||||
|
if (checked) next.add(m.id)
|
||||||
|
else next.delete(m.id)
|
||||||
|
}
|
||||||
|
selectedLlmModels.value = next
|
||||||
|
}
|
||||||
|
|
||||||
|
// Data loaders
|
||||||
|
async function loadLlmTasks() {
|
||||||
|
llmTasksLoading.value = true
|
||||||
|
const { data } = await useApiFetch<{ tasks: CfOrchTask[]; types: string[] }>('/api/cforch/tasks')
|
||||||
|
llmTasksLoading.value = false
|
||||||
|
if (data?.tasks) {
|
||||||
|
llmTasks.value = data.tasks
|
||||||
|
selectedLlmTasks.value = new Set(data.tasks.map(t => t.id))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function loadLlmModels() {
|
||||||
|
llmModelsLoading.value = true
|
||||||
|
const { data } = await useApiFetch<{ models: CfOrchModel[] }>('/api/cforch/models')
|
||||||
|
llmModelsLoading.value = false
|
||||||
|
if (data?.models) {
|
||||||
|
llmModels.value = data.models
|
||||||
|
selectedLlmModels.value = new Set(data.models.map(m => m.id))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function loadLlmResults() {
|
||||||
|
const { data } = await useApiFetch<LlmModelResult[]>('/api/cforch/results')
|
||||||
|
if (Array.isArray(data) && data.length > 0) {
|
||||||
|
llmResults.value = data
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async function cancelLlmBenchmark() {
|
||||||
|
llmEventSource.value?.close()
|
||||||
|
llmEventSource.value = null
|
||||||
|
llmRunning.value = false
|
||||||
|
await fetch('/api/cforch/cancel', { method: 'POST' }).catch(() => {})
|
||||||
|
}
|
||||||
|
|
||||||
|
function startLlmBenchmark() {
|
||||||
|
llmRunning.value = true
|
||||||
|
llmRunLog.value = []
|
||||||
|
llmError.value = ''
|
||||||
|
|
||||||
|
const params = new URLSearchParams()
|
||||||
|
const taskIds = [...selectedLlmTasks.value].join(',')
|
||||||
|
if (taskIds) params.set('task_ids', taskIds)
|
||||||
|
|
||||||
|
const es = new EventSource(`/api/cforch/run?${params}`)
|
||||||
|
llmEventSource.value = es
|
||||||
|
|
||||||
|
es.onmessage = async (e: MessageEvent) => {
|
||||||
|
const msg = JSON.parse(e.data)
|
||||||
|
if (msg.type === 'progress' && typeof msg.message === 'string') {
|
||||||
|
llmRunLog.value.push(msg.message)
|
||||||
|
await nextTick()
|
||||||
|
llmLogEl.value?.scrollTo({ top: llmLogEl.value.scrollHeight, behavior: 'smooth' })
|
||||||
|
} else if (msg.type === 'result' && Array.isArray(msg.summary)) {
|
||||||
|
llmResults.value = msg.summary
|
||||||
|
} else if (msg.type === 'complete') {
|
||||||
|
llmRunning.value = false
|
||||||
|
es.close()
|
||||||
|
llmEventSource.value = null
|
||||||
|
} else if (msg.type === 'error' && typeof msg.message === 'string') {
|
||||||
|
llmError.value = msg.message
|
||||||
|
llmRunning.value = false
|
||||||
|
es.close()
|
||||||
|
llmEventSource.value = null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
es.onerror = () => {
|
||||||
|
if (llmRunning.value) llmError.value = 'Connection lost'
|
||||||
|
llmRunning.value = false
|
||||||
|
es.close()
|
||||||
|
llmEventSource.value = null
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── Model picker computed ─────────────────────────────────────────────────────
|
// ── Model picker computed ─────────────────────────────────────────────────────
|
||||||
const pickerSummaryText = computed(() => {
|
const pickerSummaryText = computed(() => {
|
||||||
const total = allModels.value.length
|
const total = allModels.value.length
|
||||||
|
|
@ -548,6 +982,9 @@ onMounted(() => {
|
||||||
loadResults()
|
loadResults()
|
||||||
loadFineTunedModels()
|
loadFineTunedModels()
|
||||||
loadModelCategories()
|
loadModelCategories()
|
||||||
|
loadLlmTasks()
|
||||||
|
loadLlmModels()
|
||||||
|
loadLlmResults()
|
||||||
})
|
})
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
|
|
@ -1092,4 +1529,78 @@ details[open] .ft-summary::before { content: '▼ '; }
|
||||||
.ft-controls { flex-direction: column; align-items: stretch; }
|
.ft-controls { flex-direction: column; align-items: stretch; }
|
||||||
.ft-select { min-width: 0; width: 100%; }
|
.ft-select { min-width: 0; width: 100%; }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* ── Mode toggle (segmented control / pill) ─────── */
|
||||||
|
.mode-toggle {
|
||||||
|
display: inline-flex;
|
||||||
|
border: 1px solid var(--color-border, #d0d7e8);
|
||||||
|
border-radius: 0.5rem;
|
||||||
|
overflow: hidden;
|
||||||
|
align-self: flex-start;
|
||||||
|
}
|
||||||
|
|
||||||
|
.mode-btn {
|
||||||
|
padding: 0.4rem 1.1rem;
|
||||||
|
font-size: 0.85rem;
|
||||||
|
font-family: var(--font-body, sans-serif);
|
||||||
|
font-weight: 500;
|
||||||
|
border: none;
|
||||||
|
background: var(--color-surface, #fff);
|
||||||
|
color: var(--color-text-secondary, #6b7a99);
|
||||||
|
cursor: pointer;
|
||||||
|
transition: background 0.15s, color 0.15s;
|
||||||
|
}
|
||||||
|
|
||||||
|
.mode-btn:not(:last-child) {
|
||||||
|
border-right: 1px solid var(--color-border, #d0d7e8);
|
||||||
|
}
|
||||||
|
|
||||||
|
.mode-btn.active {
|
||||||
|
background: var(--app-primary, #2A6080);
|
||||||
|
color: #fff;
|
||||||
|
}
|
||||||
|
|
||||||
|
.mode-btn:not(.active):hover {
|
||||||
|
background: var(--color-surface-raised, #e4ebf5);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── LLM run controls ───────────────────────────── */
|
||||||
|
.llm-run-controls {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.75rem;
|
||||||
|
flex-wrap: wrap;
|
||||||
|
}
|
||||||
|
|
||||||
|
.llm-run-hint {
|
||||||
|
font-size: 0.8rem;
|
||||||
|
color: var(--color-text-secondary, #6b7a99);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* ── LLM results table tweaks ───────────────────── */
|
||||||
|
.llm-results-table .bt-best {
|
||||||
|
color: var(--color-success, #3a7a32);
|
||||||
|
font-weight: 700;
|
||||||
|
background: color-mix(in srgb, var(--color-success, #3a7a32) 8%, transparent);
|
||||||
|
}
|
||||||
|
|
||||||
|
.llm-model-name-cell {
|
||||||
|
font-family: var(--font-mono, monospace);
|
||||||
|
font-size: 0.75rem;
|
||||||
|
white-space: nowrap;
|
||||||
|
max-width: 16rem;
|
||||||
|
overflow: hidden;
|
||||||
|
text-overflow: ellipsis;
|
||||||
|
background: var(--color-surface, #fff);
|
||||||
|
border-top: 1px solid var(--color-border, #d0d7e8);
|
||||||
|
padding: 0.35rem 0.6rem;
|
||||||
|
position: sticky;
|
||||||
|
left: 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
.llm-tps-cell {
|
||||||
|
font-family: var(--font-mono, monospace);
|
||||||
|
font-variant-numeric: tabular-nums;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
</style>
|
</style>
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,44 @@
|
||||||
<p class="bench-hint">Highlighted cells are the best-scoring model per metric.</p>
|
<p class="bench-hint">Highlighted cells are the best-scoring model per metric.</p>
|
||||||
</template>
|
</template>
|
||||||
|
|
||||||
|
<!-- LLM Benchmark Results -->
|
||||||
|
<template v-if="llmResults.length > 0">
|
||||||
|
<h2 class="section-title">🤖 LLM Benchmark</h2>
|
||||||
|
<div class="bench-table-wrap">
|
||||||
|
<table class="bench-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th class="bt-model-col">Model</th>
|
||||||
|
<th class="bt-metric-col">overall</th>
|
||||||
|
<th
|
||||||
|
v-for="col in llmTaskTypeCols"
|
||||||
|
:key="col"
|
||||||
|
class="bt-metric-col"
|
||||||
|
>{{ col }}</th>
|
||||||
|
<th class="bt-metric-col">tok/s</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr v-for="row in llmResults" :key="row.model_id">
|
||||||
|
<td class="bt-model-cell" :title="row.model_id">{{ row.model_name }}</td>
|
||||||
|
<td
|
||||||
|
class="bt-metric-cell"
|
||||||
|
:class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }"
|
||||||
|
>{{ llmPct(row.avg_quality_score) }}</td>
|
||||||
|
<td
|
||||||
|
v-for="col in llmTaskTypeCols"
|
||||||
|
:key="col"
|
||||||
|
class="bt-metric-cell"
|
||||||
|
:class="{ 'bt-best': llmBestByCol[col] === row.model_id }"
|
||||||
|
>{{ row.quality_by_task_type[col] != null ? llmPct(row.quality_by_task_type[col]) : '—' }}</td>
|
||||||
|
<td class="bt-metric-cell">{{ row.avg_tokens_per_sec.toFixed(1) }}</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
<p class="bench-hint">Run LLM Eval on the Benchmark tab to refresh. Highlighted = best per column.</p>
|
||||||
|
</template>
|
||||||
|
|
||||||
<div class="file-info">
|
<div class="file-info">
|
||||||
<span class="file-path">Score file: <code>data/email_score.jsonl</code></span>
|
<span class="file-path">Score file: <code>data/email_score.jsonl</code></span>
|
||||||
<span class="file-size">{{ fileSizeLabel }}</span>
|
<span class="file-size">{{ fileSizeLabel }}</span>
|
||||||
|
|
@ -94,6 +132,18 @@ interface BenchmarkModelResult {
|
||||||
[key: string]: number | undefined
|
[key: string]: number | undefined
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface LlmModelResult {
|
||||||
|
model_name: string
|
||||||
|
model_id: string
|
||||||
|
node_id: string
|
||||||
|
avg_tokens_per_sec: number
|
||||||
|
avg_completion_ms: number
|
||||||
|
avg_quality_score: number
|
||||||
|
finetune_candidates: number
|
||||||
|
error_count: number
|
||||||
|
quality_by_task_type: Record<string, number>
|
||||||
|
}
|
||||||
|
|
||||||
interface StatsResponse {
|
interface StatsResponse {
|
||||||
total: number
|
total: number
|
||||||
counts: Record<string, number>
|
counts: Record<string, number>
|
||||||
|
|
@ -185,6 +235,49 @@ function formatMetric(v: number | undefined): string {
|
||||||
return `${v.toFixed(1)}%`
|
return `${v.toFixed(1)}%`
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── LLM Benchmark results ────────────────────────────────────────────────────
|
||||||
|
const llmResults = ref<LlmModelResult[]>([])
|
||||||
|
|
||||||
|
const llmTaskTypeCols = computed(() => {
|
||||||
|
const types = new Set<string>()
|
||||||
|
for (const r of llmResults.value) {
|
||||||
|
for (const k of Object.keys(r.quality_by_task_type)) types.add(k)
|
||||||
|
}
|
||||||
|
return [...types].sort()
|
||||||
|
})
|
||||||
|
|
||||||
|
const llmBestByCol = computed((): Record<string, string> => {
|
||||||
|
const best: Record<string, string> = {}
|
||||||
|
if (llmResults.value.length === 0) return best
|
||||||
|
|
||||||
|
let bestId = '', bestVal = -Infinity
|
||||||
|
for (const r of llmResults.value) {
|
||||||
|
if (r.avg_quality_score > bestVal) { bestVal = r.avg_quality_score; bestId = r.model_id }
|
||||||
|
}
|
||||||
|
best['overall'] = bestId
|
||||||
|
|
||||||
|
for (const col of llmTaskTypeCols.value) {
|
||||||
|
bestId = ''; bestVal = -Infinity
|
||||||
|
for (const r of llmResults.value) {
|
||||||
|
const v = r.quality_by_task_type[col]
|
||||||
|
if (v != null && v > bestVal) { bestVal = v; bestId = r.model_id }
|
||||||
|
}
|
||||||
|
best[col] = bestId
|
||||||
|
}
|
||||||
|
return best
|
||||||
|
})
|
||||||
|
|
||||||
|
function llmPct(v: number): string {
|
||||||
|
return `${(v * 100).toFixed(1)}%`
|
||||||
|
}
|
||||||
|
|
||||||
|
async function loadLlmResults() {
|
||||||
|
const { data } = await useApiFetch<LlmModelResult[]>('/api/cforch/results')
|
||||||
|
if (Array.isArray(data) && data.length > 0) {
|
||||||
|
llmResults.value = data
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async function load() {
|
async function load() {
|
||||||
loading.value = true
|
loading.value = true
|
||||||
error.value = ''
|
error.value = ''
|
||||||
|
|
@ -197,7 +290,10 @@ async function load() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
onMounted(load)
|
onMounted(() => {
|
||||||
|
load()
|
||||||
|
loadLlmResults()
|
||||||
|
})
|
||||||
</script>
|
</script>
|
||||||
|
|
||||||
<style scoped>
|
<style scoped>
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue