refactor(bench): extract benchmark tabs — classifier, compare, llm-eval, style, voice
- BenchmarkView.vue: convert from monolithic view to tabbed shell; each tab is now its own component (ClassifierTab, CompareTab, LlmEvalTab, StyleTab, VoiceTab) - StyleTab + VoiceTab: new benchmark modes for style and voice model evaluation - app/style.py: FastAPI router for style imitation benchmarks - app/voice.py: FastAPI router for voice benchmark endpoints - scripts/benchmark_style.py + benchmark_voice.py: headless runner scripts
This commit is contained in:
parent
cc24cd0d7d
commit
ddb56efb89
10 changed files with 7023 additions and 1837 deletions
427
app/style.py
Normal file
427
app/style.py
Normal file
|
|
@ -0,0 +1,427 @@
|
|||
"""Avocet — Writing style benchmark integration API.
|
||||
|
||||
Wraps scripts/benchmark_style.py and exposes it via the Avocet API.
|
||||
Connection config (coordinator_url, ollama_url, python_bin) is read
|
||||
from label_tool.yaml under the `cforch:` key — the same block used
|
||||
by cforch.py, so no new config section is needed.
|
||||
|
||||
All endpoints are registered on `router` (a FastAPI APIRouter).
|
||||
api.py includes this router with prefix="/api/style".
|
||||
|
||||
Module-level globals (_BENCH_RUNNING, _bench_proc) follow the same
|
||||
testability pattern as cforch.py.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import subprocess as _subprocess
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
import yaml
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Module logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Parent of the directory that contains this file.
_ROOT = Path(__file__).parent.parent

# Fixed locations derived from _ROOT.
_BENCH_SCRIPT = _ROOT.joinpath("scripts", "benchmark_style.py")
_RESULTS_DIR = _ROOT.joinpath("benchmark_results")

# Mutable module-level state. Tests may rebind these directly.
_CONFIG_DIR: Path | None = None  # override in tests via set_config_dir()
_BENCH_RUNNING: bool = False     # True while the /run generator owns a subprocess
_bench_proc: Any = None          # handle of the running subprocess, if any

# All endpoints in this module are registered on this router.
router = APIRouter()
|
||||
|
||||
|
||||
# ── Testability seams ──────────────────────────────────────────────────────────
|
||||
|
||||
def set_config_dir(path: Path | None) -> None:
    """Rebind the module-level ``_CONFIG_DIR`` to *path*.

    Passing ``None`` clears the override. The value is stored as given;
    nothing is validated or read from disk here.
    """
    global _CONFIG_DIR
    _CONFIG_DIR = path
|
||||
|
||||
|
||||
# ── Internal helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
def _config_file() -> Path:
    """Return the path of ``label_tool.yaml``.

    The file is looked up in ``_CONFIG_DIR`` when that override is set,
    otherwise in ``<_ROOT>/config``.
    """
    base_dir = _ROOT / "config" if _CONFIG_DIR is None else _CONFIG_DIR
    return base_dir / "label_tool.yaml"
|
||||
|
||||
|
||||
def _load_config() -> dict:
    """Read the ``cforch`` section of label_tool.yaml.

    Returns:
        A dict with exactly the keys ``coordinator_url``, ``ollama_url`` and
        ``python_bin``. A key missing from the file takes the built-in
        default below; a key present in the file is returned as-is.

    This function never raises for a bad config file. A missing, unreadable
    or unparsable file, or one whose top level or ``cforch`` value is not a
    mapping, is logged as a warning and treated as empty.
    """
    defaults = {
        "coordinator_url": "http://10.1.10.71:7700",
        "ollama_url": "http://localhost:11434",
        "python_bin": "/devl/miniconda3/envs/cf/bin/python",
    }

    f = _config_file()
    file_cfg: dict = {}
    if f.exists():
        try:
            raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
        except yaml.YAMLError as exc:
            logger.warning("Failed to parse style config %s: %s", f, exc)
        except (OSError, UnicodeDecodeError) as exc:
            logger.warning("Failed to read style config %s: %s", f, exc)
        else:
            # Valid YAML can still be a list or scalar; calling .get() on that
            # would raise AttributeError out of every endpoint.
            section = raw.get("cforch") if isinstance(raw, dict) else raw
            if isinstance(section, dict):
                file_cfg = section
            elif section:
                logger.warning(
                    "Ignoring style config %s: `cforch` section is not a mapping", f
                )

    return {key: file_cfg.get(key, default) for key, default in defaults.items()}
|
||||
|
||||
|
||||
# ── GET /models ────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/models")
def get_models() -> dict:
    """Return the models that can be benchmarked, grouped by source.

    - ``ollama``: queried live from ``<ollama_url>/api/tags``, so newly
      downloaded models show up without a restart.
    - ``cf_text``: queried from the coordinator's
      ``/api/services/cf-text/catalog`` endpoint.

    Each lookup is best-effort. Any failure is logged as a warning and that
    group keeps whatever entries were collected before the failure.
    """
    cfg = _load_config()
    bytes_per_mib = 1024 * 1024

    # Ollama: one entry per model that has a non-empty name.
    ollama_models: list[dict] = []
    try:
        tags_resp = httpx.get(f"{cfg['ollama_url']}/api/tags", timeout=5.0)
        tags_resp.raise_for_status()
        for item in tags_resp.json().get("models", []):
            model_name = item.get("name", "")
            if not model_name:
                continue
            raw_size = item.get("size", 0)
            size_mb = round(raw_size / bytes_per_mib) if raw_size else None
            ollama_models.append({
                "id": model_name,
                "name": model_name,
                "source": "ollama",
                "size_mb": size_mb,
                "vram_mb": None,
            })
    except Exception as exc:
        logger.warning("Failed to fetch ollama models: %s", exc)

    # cf-text: catalog values that are not dicts are ignored.
    cftext_models: list[dict] = []
    try:
        catalog_url = f"{cfg['coordinator_url']}/api/services/cf-text/catalog"
        catalog_resp = httpx.get(catalog_url, timeout=5.0)
        catalog_resp.raise_for_status()
        for catalog_id, info in catalog_resp.json().items():
            if not isinstance(info, dict):
                continue
            cftext_models.append({
                "id": catalog_id,
                "name": catalog_id,
                "source": "cf-text",
                "vram_mb": info.get("vram_mb"),
                "description": info.get("description", ""),
            })
    except Exception as exc:
        logger.warning("Failed to fetch cf-text catalog: %s", exc)

    return {"ollama": ollama_models, "cf_text": cftext_models}
|
||||
|
||||
|
||||
# ── GET /run ───────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/run")
def run_style_benchmark(
    models: str = Query("", description="Comma-separated model IDs (empty = all)"),
    use_cforch: bool = Query(False),
    max_vram: int = Query(7200, description="Max VRAM MB for cf-orch OOM filter"),
    include_large: bool = Query(False, description="Include large (30B+) ollama models"),
    workers: int = Query(1, description="Parallel workers — run N models simultaneously"),
) -> StreamingResponse:
    """Spawn benchmark_style.py and stream its output as SSE events.

    Every non-empty line of the subprocess's combined stdout/stderr becomes a
    ``type: progress`` event. On exit code 0 the last ``style_*.json`` file
    in name order is parsed and sent as a ``type: result`` event, followed by
    ``type: complete``. Any other exit code, or an exception while running,
    yields a ``type: error`` event.

    Raises:
        HTTPException: 409 if ``_BENCH_RUNNING`` is already set.
    """
    global _BENCH_RUNNING, _bench_proc

    if _BENCH_RUNNING:
        raise HTTPException(409, "A writing style benchmark is already running")

    cfg = _load_config()
    python_bin = cfg["python_bin"]

    def _sse(payload: dict) -> str:
        """Frame *payload* as one SSE ``data:`` event."""
        return f"data: {json.dumps(payload)}\n\n"

    def generate():
        global _BENCH_RUNNING, _bench_proc

        if not _BENCH_SCRIPT.exists():
            yield _sse({
                "type": "error",
                "message": f"benchmark_style.py not found at {_BENCH_SCRIPT}",
            })
            return

        cmd = [python_bin, str(_BENCH_SCRIPT), "run"]

        # Drop blank entries so input such as "," does not pass `--models ""`.
        model_ids = [m.strip() for m in models.split(",") if m.strip()]
        if model_ids:
            cmd.extend(["--models", ",".join(model_ids)])
        if use_cforch:
            cmd.extend(["--cforch", "--cforch-url", cfg["coordinator_url"],
                        "--max-vram", str(max_vram)])
        if include_large:
            cmd.append("--include-large")
        if workers > 1:
            cmd.extend(["--workers", str(workers)])

        _BENCH_RUNNING = True
        try:
            proc = _subprocess.Popen(
                cmd,
                stdout=_subprocess.PIPE,
                stderr=_subprocess.STDOUT,
                text=True,
                bufsize=1,
                cwd=str(_ROOT),
            )
            _bench_proc = proc
            try:
                for line in proc.stdout:
                    line = line.rstrip()
                    if line:
                        yield _sse({"type": "progress", "message": line})
                proc.wait()
                if proc.returncode == 0:
                    result_files = sorted(_RESULTS_DIR.glob("style_*.json"))
                    if result_files:
                        newest = result_files[-1]
                        try:
                            results = json.loads(newest.read_text(encoding="utf-8"))
                            yield _sse({
                                "type": "result",
                                "results": results,
                                "filename": newest.name,
                            })
                        except Exception as exc:
                            logger.warning("Failed to read style results: %s", exc)
                    yield _sse({"type": "complete"})
                else:
                    yield _sse({
                        "type": "error",
                        "message": f"Process exited with code {proc.returncode}",
                    })
            finally:
                _bench_proc = None
                # If the generator is closed before the child exits, a
                # GeneratorExit is raised at a yield and only this finally
                # runs. Stop and reap the child and close the pipe so nothing
                # outlives the request. Cleanup must not raise: an exception
                # here would reach the outer handler, which yields.
                try:
                    if proc.poll() is None:
                        proc.terminate()
                        try:
                            proc.wait(timeout=5)
                        except _subprocess.TimeoutExpired:
                            proc.kill()
                            proc.wait()
                    if proc.stdout is not None:
                        proc.stdout.close()
                except Exception as exc:
                    logger.warning("Failed to clean up style benchmark process: %s", exc)
        except Exception as exc:
            yield _sse({"type": "error", "message": str(exc)})
        finally:
            _BENCH_RUNNING = False

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
|
||||
|
||||
|
||||
# ── GET /results ───────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/results")
def list_results() -> list[dict]:
    """Summarise stored writing style benchmark runs, newest first.

    One entry is returned per ``style_*.json`` file in the results directory,
    in reverse filename order. Each entry has ``filename``, ``date``,
    ``model_count`` and ``top_score``. A file that cannot be read or
    summarised is still listed, with both numbers set to 0.
    """
    if not _RESULTS_DIR.exists():
        return []

    summaries: list[dict] = []
    newest_first = sorted(_RESULTS_DIR.glob("style_*.json"), reverse=True)
    for path in newest_first:
        # Stem looks like style_2026-04-22_1502; show it as "2026-04-22 15:02".
        stamp = path.stem.removeprefix("style_")
        try:
            day, clock = stamp.split("_")
            shown = f"{day} {clock[:2]}:{clock[2:]}"
        except Exception:
            shown = stamp  # unexpected name shape: show the raw stamp

        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
            best = max((row.get("avg_score", 0) for row in payload), default=0)
            count = len(payload)
        except Exception:
            best = 0
            count = 0

        summaries.append({
            "filename": path.name,
            "date": shown,
            "model_count": count,
            "top_score": round(best, 1),
        })

    return summaries
|
||||
|
||||
|
||||
@router.get("/results/latest")
def get_latest_results() -> list[dict]:
    """Return the parsed contents of the last ``style_*.json`` file by name.

    Raises:
        HTTPException: 404 when the results directory is missing or holds no
            matching file; 500 when the file cannot be read or parsed.
    """
    candidates = (
        sorted(_RESULTS_DIR.glob("style_*.json")) if _RESULTS_DIR.exists() else []
    )
    if not candidates:
        raise HTTPException(404, "No benchmark results found")

    newest = candidates[-1]
    try:
        return json.loads(newest.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc
|
||||
|
||||
|
||||
@router.get("/results/{filename}")
def get_results_by_filename(filename: str) -> list[dict]:
    """Return the parsed contents of one stored run file.

    Raises:
        HTTPException: 400 unless *filename* starts with ``style_`` and ends
            with ``.json``; 404 when no such file exists in the results
            directory; 500 when it cannot be read or parsed.
    """
    looks_valid = filename.startswith("style_") and filename.endswith(".json")
    if not looks_valid:
        raise HTTPException(400, "Invalid filename — expected style_*.json")

    target = _RESULTS_DIR / filename
    if not target.exists():
        raise HTTPException(404, f"Results file not found: {filename}")

    try:
        return json.loads(target.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc
|
||||
|
||||
|
||||
# ── POST /send-to-corrections ──────────────────────────────────────────────────
|
||||
|
||||
class SendToCorrectionsRequest(BaseModel):
    """Request body for POST /send-to-corrections."""

    # Source run file, e.g. style_YYYY-MM-DD_HHMM.json.
    filename: str
    # Models to import; an empty list selects every model in the run.
    model_ids: list[str] = []
|
||||
|
||||
|
||||
@router.post("/send-to-corrections")
def send_to_corrections(req: SendToCorrectionsRequest) -> dict:
    """Push writing style benchmark outputs into the SFT corrections queue.

    Each ``prompt_results`` entry of the selected models becomes one record
    with ``status='needs_review'``, appended to ``data/sft_candidates.jsonl``.
    A record's id is a UUIDv5 of the run id, model id and prompt tag, so
    sending the same run again does not create duplicates.

    Returns:
        ``{"imported": <records appended>, "skipped": <duplicates skipped>}``.

    Raises:
        HTTPException: 400 for a filename that is not a bare
            ``style_*.json`` name; 404 when the run file does not exist;
            500 when it cannot be read or parsed.
    """
    name = req.filename
    # The name comes from the request body, so also require a single path
    # component; that keeps the lookup inside the results directory.
    if (
        not name.startswith("style_")
        or not name.endswith(".json")
        or Path(name).name != name
    ):
        raise HTTPException(400, "Invalid filename")

    src = _RESULTS_DIR / name
    if not src.exists():
        raise HTTPException(404, f"Results file not found: {req.filename}")

    try:
        run_results: list[dict] = json.loads(src.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc

    # The queue file is resolved relative to _ROOT.
    sft_data_dir = _ROOT / "data"
    sft_file = sft_data_dir / "sft_candidates.jsonl"

    # Collect ids already queued. Lines that are not JSON objects with an
    # "id" are ignored on purpose; they must not block an import.
    existing_ids: set[str] = set()
    if sft_file.exists():
        with open(sft_file, encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    existing_ids.add(json.loads(line)["id"])
                except (ValueError, KeyError, TypeError):
                    continue

    run_id = name.removesuffix(".json")  # e.g. style_2026-04-22_1502
    timestamp = datetime.now(tz=timezone.utc).isoformat()

    new_candidates: list[dict] = []
    skipped = 0
    for model_result in run_results:
        model_id = model_result.get("model_id", "")
        if req.model_ids and model_id not in req.model_ids:
            continue
        for pr in model_result.get("prompt_results", []):
            tag = pr.get("tag", "")
            # Stable id: deterministic hash of run + model + prompt tag.
            candidate_id = str(uuid.uuid5(
                uuid.NAMESPACE_URL,
                f"style-benchmark/{run_id}/{model_id}/{tag}",
            ))
            if candidate_id in existing_ids:
                skipped += 1
                continue

            # Scores are stored as 0-100; a null score counts as 0.
            score_pct = (pr.get("score") or 0.0) / 100.0
            signals = pr.get("signals") or {}

            # System prompt plus the user prompt; the tag stands in when the
            # run file has no "user_prompt" for this entry.
            prompt_messages = [
                {"role": "system", "content": _STYLE_SYSTEM_PROMPT},
                {"role": "user", "content": pr.get("user_prompt", tag)},
            ]

            new_candidates.append({
                "id": candidate_id,
                "source": "style-benchmark",
                "benchmark_run_id": run_id,
                "timestamp": timestamp,
                "status": "needs_review",
                "prompt_messages": prompt_messages,
                "model_response": pr.get("output", ""),
                "corrected_response": None,
                "quality_score": round(score_pct, 4),
                "failure_reason": _build_failure_reason(pr, signals),
                "failure_category": None,
                "task_id": f"style/{tag}",
                "task_type": "style-match",
                "task_name": tag.replace("_", " ").title(),
                "model_id": model_id,
                "model_name": model_id,
                "node_id": "",
                "gpu_id": 0,
                "tokens_per_sec": 0,
            })
            # Also guards against repeated model/tag pairs within this run.
            existing_ids.add(candidate_id)

    if new_candidates:
        sft_data_dir.mkdir(parents=True, exist_ok=True)
        with open(sft_file, "a", encoding="utf-8") as fh:
            fh.writelines(json.dumps(c) + "\n" for c in new_candidates)

    return {"imported": len(new_candidates), "skipped": skipped}
|
||||
|
||||
|
||||
# Excerpt of the system prompt used in benchmark_style.py, reproduced here
# so the SFT candidate captures the full generation context.
_STYLE_SYSTEM_PROMPT = "\n".join([
    "You are a writing assistant. Your job is to write a Reddit reply that matches "
    "the voice, tone, and style of the provided samples exactly.",
    "",
    "Voice characteristics:",
    "- Casual engineer tone. Short punchy sentences.",
    "- No em dashes. No semicolons. No filler phrases.",
    "- Direct. Opinionated. Community-first.",
])
|
||||
|
||||
|
||||
def _build_failure_reason(pr: dict, signals: dict) -> str | None:
    """Describe the style violations recorded for one prompt result.

    Args:
        pr: One prompt result; only its ``output`` value is read.
        signals: Style signals for that result; reads ``em_dash_count``,
            ``semicolon_count`` and ``filler_hits``.

    Returns:
        The violations joined with ``"; "``, or ``None`` when there are none.
        Missing and null values are treated as "no violation", except that a
        missing, null or whitespace-only ``output`` is reported as
        ``"empty output"``.
    """
    signals = signals or {}
    reasons: list[str] = []

    # `or 0` covers keys that are present but null in the run file.
    em_dashes = signals.get("em_dash_count") or 0
    if em_dashes > 0:
        reasons.append(f"{em_dashes} em dash(es)")

    semicolons = signals.get("semicolon_count") or 0
    if semicolons > 0:
        reasons.append(f"{semicolons} semicolon(s)")

    filler_hits = signals.get("filler_hits")
    if filler_hits:
        # str() keeps join() from failing on a non-string hit.
        reasons.append(f"filler phrases: {', '.join(str(hit) for hit in filler_hits)}")

    if not (pr.get("output") or "").strip():
        reasons.append("empty output")

    return "; ".join(reasons) if reasons else None
|
||||
|
||||
|
||||
# ── POST /cancel ───────────────────────────────────────────────────────────────
|
||||
|
||||
@router.post("/cancel")
def cancel_style_benchmark() -> dict:
    """Stop the running writing style benchmark.

    Calls ``terminate()`` on the tracked subprocess, if there is one, then
    clears the module's running flag and process handle. A failure to
    terminate is logged as a warning and does not fail the request.

    Raises:
        HTTPException: 404 when no benchmark is marked as running.
    """
    global _BENCH_RUNNING, _bench_proc

    if not _BENCH_RUNNING:
        raise HTTPException(404, "No writing style benchmark is currently running")

    active = _bench_proc
    if active is not None:
        try:
            active.terminate()
        except Exception as exc:
            logger.warning("Failed to terminate style benchmark: %s", exc)

    _bench_proc = None
    _BENCH_RUNNING = False
    return {"status": "cancelled"}
|
||||
427
app/voice.py
Normal file
427
app/voice.py
Normal file
|
|
@ -0,0 +1,427 @@
|
|||
"""Avocet — Voice benchmark integration API.
|
||||
|
||||
Wraps scripts/benchmark_voice.py and exposes it via the Avocet API.
|
||||
Connection config (coordinator_url, ollama_url, python_bin) is read
|
||||
from label_tool.yaml under the `cforch:` key — the same block used
|
||||
by cforch.py, so no new config section is needed.
|
||||
|
||||
All endpoints are registered on `router` (a FastAPI APIRouter).
|
||||
api.py includes this router with prefix="/api/voice".
|
||||
|
||||
Module-level globals (_BENCH_RUNNING, _bench_proc) follow the same
|
||||
testability pattern as cforch.py.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import subprocess as _subprocess
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
import yaml
|
||||
from fastapi import APIRouter, HTTPException, Query
|
||||
from fastapi.responses import StreamingResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
# Module logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Parent of the directory that contains this file.
_ROOT = Path(__file__).parent.parent

# Fixed locations derived from _ROOT.
_BENCH_SCRIPT = _ROOT.joinpath("scripts", "benchmark_voice.py")
_RESULTS_DIR = _ROOT.joinpath("benchmark_results")

# Mutable module-level state. Tests may rebind these directly.
_CONFIG_DIR: Path | None = None  # override in tests via set_config_dir()
_BENCH_RUNNING: bool = False     # True while the /run generator owns a subprocess
_bench_proc: Any = None          # handle of the running subprocess, if any

# All endpoints in this module are registered on this router.
router = APIRouter()
|
||||
|
||||
|
||||
# ── Testability seams ──────────────────────────────────────────────────────────
|
||||
|
||||
def set_config_dir(path: Path | None) -> None:
    """Rebind the module-level ``_CONFIG_DIR`` to *path*.

    Passing ``None`` clears the override. The value is stored as given;
    nothing is validated or read from disk here.
    """
    global _CONFIG_DIR
    _CONFIG_DIR = path
|
||||
|
||||
|
||||
# ── Internal helpers ───────────────────────────────────────────────────────────
|
||||
|
||||
def _config_file() -> Path:
    """Return the path of ``label_tool.yaml``.

    The file is looked up in ``_CONFIG_DIR`` when that override is set,
    otherwise in ``<_ROOT>/config``.
    """
    base_dir = _ROOT / "config" if _CONFIG_DIR is None else _CONFIG_DIR
    return base_dir / "label_tool.yaml"
|
||||
|
||||
|
||||
def _load_config() -> dict:
    """Read the ``cforch`` section of label_tool.yaml.

    Returns:
        A dict with exactly the keys ``coordinator_url``, ``ollama_url`` and
        ``python_bin``. A key missing from the file takes the built-in
        default below; a key present in the file is returned as-is.

    This function never raises for a bad config file. A missing, unreadable
    or unparsable file, or one whose top level or ``cforch`` value is not a
    mapping, is logged as a warning and treated as empty.
    """
    defaults = {
        "coordinator_url": "http://10.1.10.71:7700",
        "ollama_url": "http://localhost:11434",
        "python_bin": "/devl/miniconda3/envs/cf/bin/python",
    }

    f = _config_file()
    file_cfg: dict = {}
    if f.exists():
        try:
            raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
        except yaml.YAMLError as exc:
            logger.warning("Failed to parse voice config %s: %s", f, exc)
        except (OSError, UnicodeDecodeError) as exc:
            logger.warning("Failed to read voice config %s: %s", f, exc)
        else:
            # Valid YAML can still be a list or scalar; calling .get() on that
            # would raise AttributeError out of every endpoint.
            section = raw.get("cforch") if isinstance(raw, dict) else raw
            if isinstance(section, dict):
                file_cfg = section
            elif section:
                logger.warning(
                    "Ignoring voice config %s: `cforch` section is not a mapping", f
                )

    return {key: file_cfg.get(key, default) for key, default in defaults.items()}
|
||||
|
||||
|
||||
# ── GET /models ────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/models")
def get_models() -> dict:
    """Return the models that can be benchmarked, grouped by source.

    - ``ollama``: queried live from ``<ollama_url>/api/tags``, so newly
      downloaded models show up without a restart.
    - ``cf_text``: queried from the coordinator's
      ``/api/services/cf-text/catalog`` endpoint.

    Each lookup is best-effort. Any failure is logged as a warning and that
    group keeps whatever entries were collected before the failure.
    """
    cfg = _load_config()
    bytes_per_mib = 1024 * 1024

    # Ollama: one entry per model that has a non-empty name.
    ollama_models: list[dict] = []
    try:
        tags_resp = httpx.get(f"{cfg['ollama_url']}/api/tags", timeout=5.0)
        tags_resp.raise_for_status()
        for item in tags_resp.json().get("models", []):
            model_name = item.get("name", "")
            if not model_name:
                continue
            raw_size = item.get("size", 0)
            size_mb = round(raw_size / bytes_per_mib) if raw_size else None
            ollama_models.append({
                "id": model_name,
                "name": model_name,
                "source": "ollama",
                "size_mb": size_mb,
                "vram_mb": None,
            })
    except Exception as exc:
        logger.warning("Failed to fetch ollama models: %s", exc)

    # cf-text: catalog values that are not dicts are ignored.
    cftext_models: list[dict] = []
    try:
        catalog_url = f"{cfg['coordinator_url']}/api/services/cf-text/catalog"
        catalog_resp = httpx.get(catalog_url, timeout=5.0)
        catalog_resp.raise_for_status()
        for catalog_id, info in catalog_resp.json().items():
            if not isinstance(info, dict):
                continue
            cftext_models.append({
                "id": catalog_id,
                "name": catalog_id,
                "source": "cf-text",
                "vram_mb": info.get("vram_mb"),
                "description": info.get("description", ""),
            })
    except Exception as exc:
        logger.warning("Failed to fetch cf-text catalog: %s", exc)

    return {"ollama": ollama_models, "cf_text": cftext_models}
|
||||
|
||||
|
||||
# ── GET /run ───────────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/run")
def run_voice_benchmark(
    models: str = Query("", description="Comma-separated model IDs (empty = all)"),
    use_cforch: bool = Query(False),
    max_vram: int = Query(7200, description="Max VRAM MB for cf-orch OOM filter"),
    include_large: bool = Query(False, description="Include large (30B+) ollama models"),
    workers: int = Query(1, description="Parallel workers — run N models simultaneously"),
) -> StreamingResponse:
    """Spawn benchmark_voice.py and stream its output as SSE events.

    Every non-empty line of the subprocess's combined stdout/stderr becomes a
    ``type: progress`` event. On exit code 0 the last ``voice_*.json`` file
    in name order is parsed and sent as a ``type: result`` event, followed by
    ``type: complete``. Any other exit code, or an exception while running,
    yields a ``type: error`` event.

    Raises:
        HTTPException: 409 if ``_BENCH_RUNNING`` is already set.
    """
    global _BENCH_RUNNING, _bench_proc

    if _BENCH_RUNNING:
        raise HTTPException(409, "A voice benchmark is already running")

    cfg = _load_config()
    python_bin = cfg["python_bin"]

    def _sse(payload: dict) -> str:
        """Frame *payload* as one SSE ``data:`` event."""
        return f"data: {json.dumps(payload)}\n\n"

    def generate():
        global _BENCH_RUNNING, _bench_proc

        if not _BENCH_SCRIPT.exists():
            yield _sse({
                "type": "error",
                "message": f"benchmark_voice.py not found at {_BENCH_SCRIPT}",
            })
            return

        cmd = [python_bin, str(_BENCH_SCRIPT), "run"]

        # Drop blank entries so input such as "," does not pass `--models ""`.
        model_ids = [m.strip() for m in models.split(",") if m.strip()]
        if model_ids:
            cmd.extend(["--models", ",".join(model_ids)])
        if use_cforch:
            cmd.extend(["--cforch", "--cforch-url", cfg["coordinator_url"],
                        "--max-vram", str(max_vram)])
        if include_large:
            cmd.append("--include-large")
        if workers > 1:
            cmd.extend(["--workers", str(workers)])

        _BENCH_RUNNING = True
        try:
            proc = _subprocess.Popen(
                cmd,
                stdout=_subprocess.PIPE,
                stderr=_subprocess.STDOUT,
                text=True,
                bufsize=1,
                cwd=str(_ROOT),
            )
            _bench_proc = proc
            try:
                for line in proc.stdout:
                    line = line.rstrip()
                    if line:
                        yield _sse({"type": "progress", "message": line})
                proc.wait()
                if proc.returncode == 0:
                    result_files = sorted(_RESULTS_DIR.glob("voice_*.json"))
                    if result_files:
                        newest = result_files[-1]
                        try:
                            results = json.loads(newest.read_text(encoding="utf-8"))
                            yield _sse({
                                "type": "result",
                                "results": results,
                                "filename": newest.name,
                            })
                        except Exception as exc:
                            logger.warning("Failed to read voice results: %s", exc)
                    yield _sse({"type": "complete"})
                else:
                    yield _sse({
                        "type": "error",
                        "message": f"Process exited with code {proc.returncode}",
                    })
            finally:
                _bench_proc = None
                # If the generator is closed before the child exits, a
                # GeneratorExit is raised at a yield and only this finally
                # runs. Stop and reap the child and close the pipe so nothing
                # outlives the request. Cleanup must not raise: an exception
                # here would reach the outer handler, which yields.
                try:
                    if proc.poll() is None:
                        proc.terminate()
                        try:
                            proc.wait(timeout=5)
                        except _subprocess.TimeoutExpired:
                            proc.kill()
                            proc.wait()
                    if proc.stdout is not None:
                        proc.stdout.close()
                except Exception as exc:
                    logger.warning("Failed to clean up voice benchmark process: %s", exc)
        except Exception as exc:
            yield _sse({"type": "error", "message": str(exc)})
        finally:
            _BENCH_RUNNING = False

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
|
||||
|
||||
|
||||
# ── GET /results ───────────────────────────────────────────────────────────────
|
||||
|
||||
@router.get("/results")
def list_results() -> list[dict]:
    """Summarise stored voice benchmark runs, newest first.

    One entry is returned per ``voice_*.json`` file in the results directory,
    in reverse filename order. Each entry has ``filename``, ``date``,
    ``model_count`` and ``top_score``. A file that cannot be read or
    summarised is still listed, with both numbers set to 0.
    """
    if not _RESULTS_DIR.exists():
        return []

    summaries: list[dict] = []
    newest_first = sorted(_RESULTS_DIR.glob("voice_*.json"), reverse=True)
    for path in newest_first:
        # Stem looks like voice_2026-04-22_1502; show it as "2026-04-22 15:02".
        stamp = path.stem.removeprefix("voice_")
        try:
            day, clock = stamp.split("_")
            shown = f"{day} {clock[:2]}:{clock[2:]}"
        except Exception:
            shown = stamp  # unexpected name shape: show the raw stamp

        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
            best = max((row.get("avg_score", 0) for row in payload), default=0)
            count = len(payload)
        except Exception:
            best = 0
            count = 0

        summaries.append({
            "filename": path.name,
            "date": shown,
            "model_count": count,
            "top_score": round(best, 1),
        })

    return summaries
|
||||
|
||||
|
||||
@router.get("/results/latest")
def get_latest_results() -> list[dict]:
    """Return the parsed contents of the last ``voice_*.json`` file by name.

    Raises:
        HTTPException: 404 when the results directory is missing or holds no
            matching file; 500 when the file cannot be read or parsed.
    """
    candidates = (
        sorted(_RESULTS_DIR.glob("voice_*.json")) if _RESULTS_DIR.exists() else []
    )
    if not candidates:
        raise HTTPException(404, "No benchmark results found")

    newest = candidates[-1]
    try:
        return json.loads(newest.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc
|
||||
|
||||
|
||||
@router.get("/results/{filename}")
def get_results_by_filename(filename: str) -> list[dict]:
    """Return the parsed contents of one stored run file.

    Raises:
        HTTPException: 400 unless *filename* starts with ``voice_`` and ends
            with ``.json``; 404 when no such file exists in the results
            directory; 500 when it cannot be read or parsed.
    """
    looks_valid = filename.startswith("voice_") and filename.endswith(".json")
    if not looks_valid:
        raise HTTPException(400, "Invalid filename — expected voice_*.json")

    target = _RESULTS_DIR / filename
    if not target.exists():
        raise HTTPException(404, f"Results file not found: {filename}")

    try:
        return json.loads(target.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc
|
||||
|
||||
|
||||
# ── POST /send-to-corrections ──────────────────────────────────────────────────
|
||||
|
||||
class SendToCorrectionsRequest(BaseModel):
    """Request body for POST /send-to-corrections."""

    # Source run file, e.g. voice_YYYY-MM-DD_HHMM.json.
    filename: str
    # Models to import; an empty list selects every model in the run.
    model_ids: list[str] = []
|
||||
|
||||
|
||||
@router.post("/send-to-corrections")
def send_to_corrections(req: SendToCorrectionsRequest) -> dict:
    """Push voice benchmark outputs into the SFT corrections queue.

    Each ``prompt_results`` entry of the selected models becomes one record
    with ``status='needs_review'``, appended to ``data/sft_candidates.jsonl``.
    A record's id is a UUIDv5 of the run id, model id and prompt tag, so
    sending the same run again does not create duplicates.

    Returns:
        ``{"imported": <records appended>, "skipped": <duplicates skipped>}``.

    Raises:
        HTTPException: 400 for a filename that is not a bare
            ``voice_*.json`` name; 404 when the run file does not exist;
            500 when it cannot be read or parsed.
    """
    name = req.filename
    # The name comes from the request body, so also require a single path
    # component; that keeps the lookup inside the results directory.
    if (
        not name.startswith("voice_")
        or not name.endswith(".json")
        or Path(name).name != name
    ):
        raise HTTPException(400, "Invalid filename")

    src = _RESULTS_DIR / name
    if not src.exists():
        raise HTTPException(404, f"Results file not found: {req.filename}")

    try:
        run_results: list[dict] = json.loads(src.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc

    # The queue file is resolved relative to _ROOT.
    sft_data_dir = _ROOT / "data"
    sft_file = sft_data_dir / "sft_candidates.jsonl"

    # Collect ids already queued. Lines that are not JSON objects with an
    # "id" are ignored on purpose; they must not block an import.
    existing_ids: set[str] = set()
    if sft_file.exists():
        with open(sft_file, encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    existing_ids.add(json.loads(line)["id"])
                except (ValueError, KeyError, TypeError):
                    continue

    run_id = name.removesuffix(".json")  # e.g. voice_2026-04-22_1502
    timestamp = datetime.now(tz=timezone.utc).isoformat()

    new_candidates: list[dict] = []
    skipped = 0
    for model_result in run_results:
        model_id = model_result.get("model_id", "")
        if req.model_ids and model_id not in req.model_ids:
            continue
        for pr in model_result.get("prompt_results", []):
            tag = pr.get("tag", "")
            # Stable id: deterministic hash of run + model + prompt tag.
            candidate_id = str(uuid.uuid5(
                uuid.NAMESPACE_URL,
                f"voice-benchmark/{run_id}/{model_id}/{tag}",
            ))
            if candidate_id in existing_ids:
                skipped += 1
                continue

            # Scores are stored as 0-100; a null score counts as 0.
            score_pct = (pr.get("score") or 0.0) / 100.0
            signals = pr.get("signals") or {}

            # System prompt plus the user prompt; the tag stands in when the
            # run file has no "user_prompt" for this entry.
            prompt_messages = [
                {"role": "system", "content": _VOICE_SYSTEM_PROMPT},
                {"role": "user", "content": pr.get("user_prompt", tag)},
            ]

            new_candidates.append({
                "id": candidate_id,
                "source": "voice-benchmark",
                "benchmark_run_id": run_id,
                "timestamp": timestamp,
                "status": "needs_review",
                "prompt_messages": prompt_messages,
                "model_response": pr.get("output", ""),
                "corrected_response": None,
                "quality_score": round(score_pct, 4),
                "failure_reason": _build_failure_reason(pr, signals),
                "failure_category": None,
                "task_id": f"voice/{tag}",
                "task_type": "voice-match",
                "task_name": tag.replace("_", " ").title(),
                "model_id": model_id,
                "model_name": model_id,
                "node_id": "",
                "gpu_id": 0,
                "tokens_per_sec": 0,
            })
            # Also guards against repeated model/tag pairs within this run.
            existing_ids.add(candidate_id)

    if new_candidates:
        sft_data_dir.mkdir(parents=True, exist_ok=True)
        with open(sft_file, "a", encoding="utf-8") as fh:
            fh.writelines(json.dumps(c) + "\n" for c in new_candidates)

    return {"imported": len(new_candidates), "skipped": skipped}
|
||||
|
||||
|
||||
# Excerpt of the system prompt used in benchmark_voice.py, reproduced here
# so the SFT candidate captures the full generation context.
_VOICE_SYSTEM_PROMPT = "\n".join([
    "You are a writing assistant. Your job is to write a Reddit reply that matches "
    "the voice, tone, and style of the provided samples exactly.",
    "",
    "Voice characteristics:",
    "- Casual engineer tone. Short punchy sentences.",
    "- No em dashes. No semicolons. No filler phrases.",
    "- Direct. Opinionated. Community-first.",
])
|
||||
|
||||
|
||||
def _build_failure_reason(pr: dict, signals: dict) -> str | None:
    """Summarise the style violations found in one prompt result.

    Args:
        pr: A single prompt-result record; only its ``"output"`` key is read.
        signals: The record's signals mapping; ``em_dash_count``,
            ``semicolon_count`` and ``filler_hits`` are read when present.

    Returns:
        The violations joined with ``"; "``, or ``None`` when there are none.
    """
    reasons: list[str] = []

    # ``or 0`` / ``or ""``: a key stored as JSON null must behave like a
    # missing key instead of raising on the comparison or on ``.strip()``.
    em_dashes = signals.get("em_dash_count") or 0
    if em_dashes > 0:
        reasons.append(f"{em_dashes} em dash(es)")

    semicolons = signals.get("semicolon_count") or 0
    if semicolons > 0:
        reasons.append(f"{semicolons} semicolon(s)")

    filler_hits = signals.get("filler_hits")
    if filler_hits:
        reasons.append(f"filler phrases: {', '.join(str(hit) for hit in filler_hits)}")

    if not (pr.get("output") or "").strip():
        reasons.append("empty output")

    return "; ".join(reasons) if reasons else None
|
||||
|
||||
|
||||
# ── POST /cancel ───────────────────────────────────────────────────────────────
|
||||
|
||||
@router.post("/cancel")
def cancel_voice_benchmark() -> dict:
    """Cancel the voice benchmark that is currently marked as running.

    Calls ``terminate()`` on the tracked subprocess (when one is tracked) and
    then clears the module-level run state.

    Returns:
        ``{"status": "cancelled"}``.

    Raises:
        HTTPException: 404 when no voice benchmark is marked as running.
    """
    global _BENCH_RUNNING, _bench_proc

    if not _BENCH_RUNNING:
        raise HTTPException(404, "No voice benchmark is currently running")

    running_proc = _bench_proc
    if running_proc is not None:
        try:
            running_proc.terminate()
        except Exception as exc:
            # A failed terminate is logged only; the run state is still reset.
            logger.warning("Failed to terminate voice benchmark: %s", exc)

    _BENCH_RUNNING, _bench_proc = False, None
    return {"status": "cancelled"}
|
||||
952
scripts/benchmark_style.py
Normal file
952
scripts/benchmark_style.py
Normal file
|
|
@ -0,0 +1,952 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Writing style benchmark harness -- score local text-gen models for writing style match.
|
||||
|
||||
Runs each model against a set of test prompts, extracts style signals from the
|
||||
outputs, compares them to a style corpus, and produces a ranked markdown table.
|
||||
|
||||
Usage:
|
||||
# List available ollama models
|
||||
conda run -n cf python scripts/benchmark_style.py --list-models
|
||||
|
||||
# Run against all models with default test prompts
|
||||
conda run -n cf python scripts/benchmark_style.py --run
|
||||
|
||||
# Run specific models only
|
||||
conda run -n cf python scripts/benchmark_style.py --run --models mistral:7b,llama3.1:8b
|
||||
|
||||
# Use a custom corpus directory
|
||||
conda run -n cf python scripts/benchmark_style.py --run --samples data/style_corpus/
|
||||
|
||||
# Print last results table
|
||||
conda run -n cf python scripts/benchmark_style.py --show-last
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
# Filesystem locations, all derived from the directory two levels above this file.
_ROOT = Path(__file__).parent.parent
_CORPUS_DIR = _ROOT.joinpath("data", "style_corpus")
_RESULTS_DIR = _ROOT.joinpath("benchmark_results")

# Default service endpoints.
_OLLAMA_URL = "http://localhost:11434"
_CFORCH_URL = "http://localhost:7700"

# Subdirectories under the --scan-disk root that may contain GGUFs.
_SCAN_SUBDIRS = [
    "textgen/models",
    "llama.cpp/models",
    "cf-text/models",
    "vllm/models",
]
|
||||
|
||||
# ── Filler phrases ────────────────────────────────────────────────────────────
# Phrases that should not appear in good style-match output. extract_signals()
# reports each one found (case-insensitive substring match).
FILLER_PHRASES: list[str] = [
    "delve",
    "certainly",
    "absolutely",
    "i apologize",
    "i'd be happy to",
    "of course",
    "great question",
    "i understand",
    "let me know if",
    "feel free to",
    "it's important to note",
    "it's worth noting",
    "in conclusion",
    "to summarize",
    "in summary",
]
|
||||
|
||||
# ── Test prompts ──────────────────────────────────────────────────────────────
# Each entry has the keys "tag", "thread_title" and "thread_body".
# These are representative threads that Magpie might reply to.
# Extend this list with real examples as the corpus grows.
TEST_PROMPTS: list[dict[str, str]] = [
    dict(
        tag="selfhosted_ai_fatigue",
        thread_title="Anyone else getting tired of re-explaining their setup every time an AI model forgets?",
        thread_body=(
            "Every session I start over. My whole hardware setup, what tools I use, "
            "what I've already tried. It's exhausting. There has to be a better way."
        ),
    ),
    dict(
        tag="privacy_local_llm",
        thread_title="What's the point of running local LLMs if the apps still phone home?",
        thread_body=(
            "I went through all the trouble of setting up ollama and now I find out "
            "the frontend I'm using is sending telemetry. Kind of defeats the purpose."
        ),
    ),
    dict(
        tag="solarpunk_tech",
        thread_title="What does solarpunk computing actually look like in practice?",
        thread_body=(
            "I keep seeing the aesthetic but not a lot of concrete examples of "
            "people living it out with their tech choices. What does it mean day to day?"
        ),
    ),
    dict(
        tag="nd_tools",
        thread_title="Tools that actually help with executive function vs ones that just add friction",
        thread_body=(
            "I've tried a dozen productivity apps and most of them require more "
            "executive function to maintain than they save. What actually sticks for you?"
        ),
    ),
    dict(
        tag="data_ownership",
        thread_title="Who actually owns your data when you use a 'free' AI tool?",
        thread_body=(
            "Read the ToS on three different AI assistants today. In all three cases "
            "your inputs can be used for training, shared with partners, and retained "
            "indefinitely. At what point does 'free' just mean you're the product?"
        ),
    ),
    dict(
        tag="digital_culture",
        thread_title="The internet used to feel like it belonged to everyone. What happened?",
        thread_body=(
            "I grew up on forums, IRC, personal homepages. Now everything is a platform "
            "owned by someone trying to extract value from the community that built it. "
            "Is the fediverse / self-hosting movement actually reversing this or just "
            "a niche hobby?"
        ),
    ),
]
|
||||
|
||||
# Sampling settings shared by both generation paths: generate() sends the dict
# as ollama "options"; generate_cftext() reads the individual values from it.
GENERATION_PARAMS: dict[str, Any] = dict(
    temperature=0.7,
    top_p=0.9,
    num_predict=300,
)
|
||||
|
||||
# System prompt sent with every benchmark request (see _bench_one_model).
SYSTEM_PROMPT = "\n".join([
    "You are a writing assistant. Your job is to write a Reddit reply that matches "
    "the voice, tone, and style of the provided samples exactly.",
    "",
    "Voice characteristics:",
    "- Casual engineer tone. Short punchy sentences.",
    "- No hype, no buzzwords, no em dashes, no semicolons.",
    "- Community-first perspective. Solarpunk values.",
    "- Direct and opinionated. No throat-clearing or filler.",
    "- When relevant, mention personal experience with real tools.",
    "",
    "Write ONLY the reply. No preamble, no 'Here is a reply:', no meta-commentary.",
])
|
||||
|
||||
|
||||
# ── Style signal extraction ───────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class StyleSignals:
    """Numeric style measurements for one piece of text.

    Every field defaults to zero / empty, so ``StyleSignals()`` is the
    "nothing measured" value.
    """

    # Size of the sample.
    sentence_count: int = 0
    word_count: int = 0
    avg_sentence_length: float = 0.0  # words per sentence

    # Hard style violations.
    em_dash_count: int = 0
    semicolon_count: int = 0
    filler_hits: list[str] = field(default_factory=list)

    # Shape of the prose.
    question_ratio: float = 0.0  # share of sentences that end in '?'
    first_person_ratio: float = 0.0  # share of sentences that start with 'I'
    avg_word_length: float = 0.0
|
||||
|
||||
|
||||
def extract_signals(text: str) -> StyleSignals:
    """Extract quantitative style signals from a text sample.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``;
    words are whitespace-separated tokens.

    Args:
        text: A model output or a corpus sample.

    Returns:
        The populated ``StyleSignals``. An all-default ``StyleSignals()`` is
        returned for blank text and for the ``"[ERROR: ...]"`` strings that
        ``generate()`` / ``generate_cftext()`` return on failure; callers
        must detect those cases themselves, since all-zero signals carry no
        violation penalties.
    """
    text = text.strip()
    if text.startswith("[ERROR:"):
        return StyleSignals()

    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    if not sentences:
        return StyleSignals()

    words = text.split()
    sentence_total = len(sentences)

    avg_sentence_length = len(words) / sentence_total
    # Surrounding punctuation is not counted towards a word's length.
    avg_word_length = (
        sum(len(w.strip('.,!?;:"\'')) for w in words) / len(words)
    ) if words else 0.0

    # A real em dash or an ASCII "--" stand-in. "--" already matches the
    # spaced form " -- ", so the spaced form must not be counted again.
    em_dash_count = text.count("\u2014") + text.count("--")
    semicolon_count = text.count(";")

    lowered = text.lower()  # lowered once, not once per phrase
    filler_hits = [p for p in FILLER_PHRASES if p.lower() in lowered]

    question_ratio = sum(1 for s in sentences if s.endswith("?")) / sentence_total
    first_person_ratio = sum(1 for s in sentences if re.match(r"^I\b", s)) / sentence_total

    return StyleSignals(
        sentence_count=sentence_total,
        word_count=len(words),
        avg_sentence_length=avg_sentence_length,
        em_dash_count=em_dash_count,
        semicolon_count=semicolon_count,
        filler_hits=filler_hits,
        question_ratio=question_ratio,
        first_person_ratio=first_person_ratio,
        avg_word_length=avg_word_length,
    )
|
||||
|
||||
|
||||
def build_corpus_profile(corpus_dir: Path) -> StyleSignals | None:
    """Average the style signals of every ``*.txt`` sample in ``corpus_dir``.

    Args:
        corpus_dir: Directory whose direct ``*.txt`` children form the corpus.

    Returns:
        A ``StyleSignals`` holding the per-sample means (integer fields are
        truncated with ``int``), or ``None`` when the directory has no
        ``*.txt`` files. ``filler_hits`` is not aggregated and stays empty.
    """
    sample_paths = list(corpus_dir.glob("*.txt"))
    if not sample_paths:
        return None

    per_sample = [
        extract_signals(path.read_text(encoding="utf-8")) for path in sample_paths
    ]
    sample_total = len(per_sample)

    def mean(attr: str) -> float:
        # Arithmetic mean of one signal across all samples.
        return sum(getattr(sig, attr) for sig in per_sample) / sample_total

    return StyleSignals(
        sentence_count=int(mean("sentence_count")),
        word_count=int(mean("word_count")),
        avg_sentence_length=mean("avg_sentence_length"),
        em_dash_count=int(mean("em_dash_count")),
        semicolon_count=int(mean("semicolon_count")),
        question_ratio=mean("question_ratio"),
        first_person_ratio=mean("first_person_ratio"),
        avg_word_length=mean("avg_word_length"),
    )
|
||||
|
||||
|
||||
def score_against_profile(output_signals: StyleSignals, profile: StyleSignals | None) -> float:
    """Score one model output against the corpus profile on a 0-100 scale.

    Starts at 100 and subtracts:
      - 5 per em dash, 3 per semicolon, 8 per filler phrase (always applied);
      - with a profile: 2 per word of average-sentence-length difference,
        capped at 20;
      - with a profile: 10 times the question-ratio difference, capped at 10.

    Args:
        output_signals: Signals extracted from the model output.
        profile: Corpus target profile, or ``None`` to score on the hard
            violations only.

    Returns:
        The score, floored at 0.0.
    """
    hard_penalty = (
        output_signals.em_dash_count * 5
        + output_signals.semicolon_count * 3
        + len(output_signals.filler_hits) * 8
    )
    score = 100.0 - hard_penalty

    if profile is None:
        return max(0.0, score)

    # Distance from the corpus' average sentence length.
    sentence_gap = abs(output_signals.avg_sentence_length - profile.avg_sentence_length)
    score -= min(sentence_gap * 2, 20)

    # Distance from the corpus' share of question sentences.
    question_gap = abs(output_signals.question_ratio - profile.question_ratio)
    score -= min(question_gap * 10, 10)

    return max(0.0, score)
|
||||
|
||||
|
||||
# ── Generation backends: cf-orch (cf-text / vllm) and ollama ──────────────────
|
||||
|
||||
# Default node whose cf-text catalog cforch_list_catalog() requests.
_CFORCH_NODE_ID = "heimdall"
|
||||
|
||||
|
||||
def cforch_list_catalog(
    cforch_url: str = _CFORCH_URL,
    node_id: str = _CFORCH_NODE_ID,
) -> dict[str, int]:
    """Fetch the cf-text model catalog from cf-orch as ``{model_id: vram_mb}``.

    The ``node_id`` query parameter asks for one specific node's catalog, which
    avoids cross-node catalog shadowing when several nodes define a catalog for
    the same service.

    Args:
        cforch_url: Base URL of the cf-orch coordinator.
        node_id: Node to query; a falsy value sends no ``node_id`` parameter.

    Returns:
        Mapping of model id to ``vram_mb`` (0 when an entry is not a dict or
        has no ``vram_mb``). Empty when the request or its parsing fails; a
        warning is then printed to stderr.
    """
    query = {"node_id": node_id} if node_id else {}
    try:
        resp = httpx.get(
            f"{cforch_url}/api/services/cf-text/catalog",
            params=query,
            timeout=10.0,
        )
        resp.raise_for_status()
        catalog: dict[str, int] = {}
        for model_id, entry in resp.json().items():
            catalog[model_id] = entry.get("vram_mb", 0) if isinstance(entry, dict) else 0
        return catalog
    except Exception as exc:
        print(f"[warn] Could not reach cf-orch catalog at {cforch_url}: {exc}", file=sys.stderr)
        return {}
|
||||
|
||||
|
||||
def _cforch_allocate_service(
    service: str,
    model_id: str,
    cforch_url: str,
    startup_timeout_s: float,
    health_path: str,
) -> tuple[str, str] | None:
    """Allocate ``service`` for ``model_id`` via cf-orch and wait until it is usable.

    When the coordinator reports a cold start (``started`` true, ``warm``
    false), its status endpoint is polled every 3 seconds until this
    allocation's instance (matched on node id + gpu id) is ``running``. A
    ``stopped`` state fails immediately instead of waiting out the timeout.
    After 6 consecutive polls without a matching instance, the service's own
    health endpoint is polled directly as a fallback. A status request that
    fails or returns a non-success code counts as such a poll, so a
    coordinator without a usable status endpoint still reaches the fallback.

    Args:
        service: cf-orch service name, used in the request paths.
        model_id: Model to load; sent as the only entry of ``model_candidates``.
        cforch_url: Base URL of the cf-orch coordinator.
        startup_timeout_s: Maximum time to wait for a cold start, in seconds.
        health_path: Path appended to the service URL for the fallback probe.

    Returns:
        ``(service_url, allocation_id)``, or ``None`` when the allocation
        request fails, the model load fails, or the wait times out. In the
        last two cases the allocation is released first, because the caller
        receives ``None`` and has no id to release it with.
    """
    try:
        resp = httpx.post(
            f"{cforch_url}/api/services/{service}/allocate",
            json={
                "model_candidates": [model_id],
                "caller": "avocet",
                "pipeline": "style_benchmark",
            },
            timeout=120.0,
        )
        resp.raise_for_status()
        data = resp.json()
        service_url: str = data["url"]
        allocation_id: str = data.get("allocation_id", "")
        node_id: str = data.get("node_id", "")
        gpu_id: int | None = data.get("gpu_id")
        cold_start = data.get("started", False) and not data.get("warm", True)
    except Exception as exc:
        print(f"[warn] cf-orch allocation failed for {model_id!r} ({service}): {exc}", file=sys.stderr)
        return None

    if not cold_start:
        return service_url, allocation_id

    print(f" [cold start] waiting for {service} to load {model_id!r}...", end=" ", flush=True)
    t0 = time.monotonic()
    deadline = t0 + startup_timeout_s
    probe_misses = 0  # consecutive polls that yielded no matching instance

    while time.monotonic() < deadline:
        match = None
        try:
            status = httpx.get(f"{cforch_url}/api/services/{service}/status", timeout=5.0)
            if status.is_success:
                instances = status.json().get("instances", [])
                # Find our specific instance by node + gpu.
                match = next(
                    (i for i in instances
                     if i.get("node_id") == node_id and i.get("gpu_id") == gpu_id),
                    None,
                )
        except Exception:
            match = None  # unreachable or malformed status: counted as a miss below

        if match:
            probe_misses = 0
            state = match.get("state", "")
            if state == "running":
                elapsed = time.monotonic() - t0
                print(f"ready ({elapsed:.0f}s)", flush=True)
                return service_url, allocation_id
            if state == "stopped":
                print("failed (service stopped — model load error)", flush=True)
                cforch_release(allocation_id, cforch_url)
                return None
            # "starting" or an unknown state: keep waiting.
        else:
            probe_misses += 1
            # After a grace period with no instance visible, fall back to a
            # direct health poll (the coordinator may not have probed yet).
            if probe_misses >= 6:
                try:
                    health = httpx.get(f"{service_url}{health_path}", timeout=3.0)
                    if health.is_success:
                        elapsed = time.monotonic() - t0
                        print(f"ready via health ({elapsed:.0f}s)", flush=True)
                        return service_url, allocation_id
                except Exception:
                    pass  # service not answering yet; keep polling

        time.sleep(3.0)

    elapsed = time.monotonic() - t0
    print(f"timed out after {elapsed:.0f}s", flush=True)
    cforch_release(allocation_id, cforch_url)
    return None
|
||||
|
||||
|
||||
def cforch_allocate(
    model_id: str,
    cforch_url: str = _CFORCH_URL,
    startup_timeout_s: float = 180.0,
) -> tuple[str, str] | None:
    """Allocate a cf-text instance for ``model_id``.

    Returns:
        ``(service_url, allocation_id)``, or ``None`` when allocation fails.
    """
    return _cforch_allocate_service(
        service="cf-text",
        model_id=model_id,
        cforch_url=cforch_url,
        startup_timeout_s=startup_timeout_s,
        health_path="/health",
    )
|
||||
|
||||
|
||||
def cforch_allocate_vllm(
    model_id: str,
    cforch_url: str = _CFORCH_URL,
    startup_timeout_s: float = 300.0,
) -> tuple[str, str] | None:
    """Allocate a vllm instance for ``model_id``.

    vllm exposes an OpenAI-compatible API, so ``generate_cftext()`` can be
    pointed at the returned service URL unchanged. The default startup timeout
    is longer than for cf-text (300s) because vllm loads large model weights
    from disk before it becomes ready.

    Returns:
        ``(service_url, allocation_id)``, or ``None`` when allocation fails.
    """
    return _cforch_allocate_service(
        service="vllm",
        model_id=model_id,
        cforch_url=cforch_url,
        startup_timeout_s=startup_timeout_s,
        health_path="/health",
    )
|
||||
|
||||
|
||||
def cforch_release(
    allocation_id: str,
    cforch_url: str = _CFORCH_URL,
    service: str = "cf-text",
) -> None:
    """Release a cf-orch allocation (best effort).

    Args:
        allocation_id: Id returned by the allocate call; an empty id is a no-op.
        cforch_url: Base URL of the cf-orch coordinator.
        service: Service segment of the release path. Defaults to
            ``"cf-text"``, the path this function has always used.
            NOTE(review): vllm allocations are currently released through
            this default as well; confirm against the coordinator API
            whether they need ``service="vllm"``.

    A failed release never raises. It is reported on stderr so that a leaked
    allocation is visible rather than silently ignored.
    """
    if not allocation_id:
        return
    try:
        resp = httpx.delete(
            f"{cforch_url}/api/services/{service}/allocations/{allocation_id}",
            timeout=10.0,
        )
        resp.raise_for_status()
    except Exception as exc:
        print(
            f"[warn] Could not release cf-orch allocation {allocation_id!r}: {exc}",
            file=sys.stderr,
        )
|
||||
|
||||
|
||||
def generate_cftext(
    service_url: str,
    model_id: str,
    prompt: str,
    system: str = "",
) -> tuple[str, float]:
    """Generate a reply through an OpenAI-compatible ``/v1/chat/completions`` endpoint.

    Args:
        service_url: Base URL of the allocated service.
        model_id: Model name placed in the request.
        prompt: User message content.
        system: Optional system message; omitted from the request when empty.

    Returns:
        ``(text, elapsed_ms)``. On any failure ``text`` is ``"[ERROR: <exc>]"``
        and ``elapsed_ms`` is the time spent until the failure.
    """
    chat: list[dict[str, str]] = [{"role": "user", "content": prompt}]
    if system:
        chat.insert(0, {"role": "system", "content": system})

    request_body: dict[str, Any] = {
        "model": model_id,
        "messages": chat,
        "max_tokens": GENERATION_PARAMS.get("num_predict", 300),
        "temperature": GENERATION_PARAMS.get("temperature", 0.7),
        "top_p": GENERATION_PARAMS.get("top_p", 0.9),
        "stream": False,
    }

    started = time.monotonic()
    try:
        resp = httpx.post(
            f"{service_url.rstrip('/')}/v1/chat/completions",
            json=request_body,
            timeout=180.0,
        )
        resp.raise_for_status()
        # Latency is taken before the response body is parsed.
        elapsed_ms = (time.monotonic() - started) * 1000
        reply = resp.json()["choices"][0]["message"]["content"]
        return reply.strip(), elapsed_ms
    except Exception as exc:
        return f"[ERROR: {exc}]", (time.monotonic() - started) * 1000
|
||||
|
||||
|
||||
def generate(model_id: str, prompt: str, system: str = "") -> tuple[str, float]:
    """Generate a reply through ollama's ``/api/generate`` endpoint.

    Args:
        model_id: ollama model name.
        prompt: Prompt text.
        system: Optional system prompt; omitted from the request when empty.

    Returns:
        ``(text, elapsed_ms)``. On any failure ``text`` is ``"[ERROR: <exc>]"``
        and ``elapsed_ms`` is the time spent until the failure.
    """
    request_body: dict[str, Any] = {
        "model": model_id,
        "prompt": prompt,
        "stream": False,
        "options": GENERATION_PARAMS,
        **({"system": system} if system else {}),
    }

    started = time.monotonic()
    try:
        resp = httpx.post(f"{_OLLAMA_URL}/api/generate", json=request_body, timeout=120.0)
        resp.raise_for_status()
        elapsed_ms = (time.monotonic() - started) * 1000
        return resp.json().get("response", "").strip(), elapsed_ms
    except Exception as exc:
        return f"[ERROR: {exc}]", (time.monotonic() - started) * 1000
|
||||
|
||||
|
||||
def find_disk_ggufs(llm_root: Path) -> list[Path]:
    """Collect ``.gguf`` model files below ``llm_root``.

    Each directory in ``_SCAN_SUBDIRS`` is searched recursively, followed by
    ``llm_root`` itself; a path found more than once is reported once.
    Vocab-only GGUFs (``ggml-vocab-*``) are skipped because they are not
    standalone models.

    Returns:
        The matching paths in sorted order.
    """
    roots = [llm_root.joinpath(sub) for sub in _SCAN_SUBDIRS]
    roots.append(llm_root)

    visited: set[Path] = set()
    models: list[Path] = []
    for root in roots:
        if not root.exists():
            continue
        for candidate in root.rglob("*.gguf"):
            if candidate in visited:
                continue
            visited.add(candidate)
            if not candidate.name.startswith("ggml-vocab-"):
                models.append(candidate)

    models.sort()
    return models
|
||||
|
||||
|
||||
def gguf_to_ollama_tag(gguf_path: Path) -> str:
    """Derive a stable ollama tag from a GGUF path.

    The slug is built from the lower-cased parent directory name plus the file
    stem to avoid collisions. When the stem (ignoring ``-`` and ``_``) is
    already contained in the parent name, the parent name alone is used.
    Every run of characters outside ``a-z0-9`` becomes a single ``-``.

    Example:
        claude-3.7-sonnet-reasoning-gemma3-12B/foo.Q8_0.gguf
          -> bench-claude-3-7-sonnet-reasoning-gemma3-12b-foo-q8-0:latest
    """
    def squash(name: str) -> str:
        # Drop the separators that are ignored by the containment check.
        return name.replace("-", "").replace("_", "")

    parent_name = gguf_path.parent.name.lower()
    stem_name = gguf_path.stem.lower()

    if squash(stem_name) in squash(parent_name):
        raw_slug = parent_name
    else:
        raw_slug = f"{parent_name}-{stem_name}"

    slug = re.sub(r"[^a-z0-9]+", "-", raw_slug).strip("-")
    return f"bench-{slug}:latest"
|
||||
|
||||
|
||||
def register_gguf(gguf_path: Path, tag: str) -> bool:
    """Create a temporary ollama model entry from a GGUF file.

    Writes a one-line Modelfile (``FROM <resolved path>``) to a temporary file,
    runs ``ollama create <tag> -f <modelfile>`` and always removes the
    temporary file afterwards.

    Args:
        gguf_path: GGUF file to register.
        tag: ollama tag to create.

    Returns:
        ``True`` when ``ollama create`` exits with status 0. ``False`` when it
        exits non-zero or cannot be run; the reason is printed to stderr.
    """
    import subprocess
    import tempfile

    # Explicit UTF-8 so a non-ASCII model path does not depend on the locale.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".Modelfile", delete=False, encoding="utf-8"
    ) as f:
        f.write(f"FROM {gguf_path.resolve()}\n")
        modelfile_path = f.name

    try:
        result = subprocess.run(
            ["ollama", "create", tag, "-f", modelfile_path],
            capture_output=True, text=True, timeout=60,
        )
        if result.returncode != 0:
            # Surface ollama's own error text; without it a failed
            # registration cannot be diagnosed.
            detail = (result.stderr or "").strip()
            if detail:
                print(f"[warn] ollama create failed for {gguf_path.name}: {detail}", file=sys.stderr)
            return False
        return True
    except Exception as exc:
        print(f"[warn] Could not register {gguf_path.name}: {exc}", file=sys.stderr)
        return False
    finally:
        Path(modelfile_path).unlink(missing_ok=True)
|
||||
|
||||
|
||||
def deregister_gguf(tag: str) -> None:
    """Remove a temporary ollama model entry via ``ollama rm <tag>``.

    Best effort: any error while running the command is ignored.
    """
    import subprocess

    remove_cmd = ["ollama", "rm", tag]
    try:
        subprocess.run(remove_cmd, capture_output=True, timeout=30)
    except Exception:
        return
|
||||
|
||||
|
||||
def backfill_disk_models(
    llm_root: Path,
    existing_tags: set[str],
    max_vram_mb: int = 0,
) -> list[str]:
    """Register on-disk GGUFs that ollama does not know yet.

    Args:
        llm_root: Directory handed to ``find_disk_ggufs()``.
        existing_tags: Tags already present in ollama; files that map to one of
            them are skipped.
        max_vram_mb: Skip files larger than this many MB (0 = no limit). GGUF
            file size is used as a VRAM proxy -- quantized weights load ~1:1.

    Returns:
        The tags that were newly registered.
    """
    candidates = find_disk_ggufs(llm_root)
    if not candidates:
        print(f"No .gguf files found under {llm_root}", file=sys.stderr)
        return []

    registered: list[str] = []
    too_large = 0
    for path in candidates:
        size_mb = path.stat().st_size // (1024 * 1024)

        if max_vram_mb and size_mb > max_vram_mb:
            print(f" [skip-oom] {path.name} ({size_mb} MB > {max_vram_mb} MB limit)")
            too_large += 1
            continue

        tag = gguf_to_ollama_tag(path)
        if tag in existing_tags:
            print(f" [skip] {path.name} already registered as {tag}")
            continue

        print(f" [register] {path.name} ({size_mb} MB) → {tag} ...", end=" ", flush=True)
        succeeded = register_gguf(path, tag)
        print("ok" if succeeded else "failed")
        if succeeded:
            registered.append(tag)

    if too_large:
        print(f" [info] {too_large} GGUF(s) skipped (exceed {max_vram_mb} MB VRAM limit)")
    return registered
|
||||
|
||||
|
||||
def list_ollama_models() -> list[str]:
    """List the model names reported by ollama's ``/api/tags`` endpoint.

    Names containing a known embedding-only model name are left out.

    Returns:
        The remaining names in the order ollama reported them, or an empty
        list (with a warning on stderr) when the request fails.
    """
    embedding_markers = {"mxbai-embed-large", "nomic-embed-text", "all-minilm"}
    try:
        resp = httpx.get(f"{_OLLAMA_URL}/api/tags", timeout=10.0)
        resp.raise_for_status()
        names: list[str] = []
        for entry in resp.json().get("models", []):
            is_embedding = any(marker in entry["name"].lower() for marker in embedding_markers)
            if not is_embedding:
                names.append(entry["name"])
        return names
    except Exception as exc:
        print(f"[warn] Could not reach ollama: {exc}", file=sys.stderr)
        return []
|
||||
|
||||
|
||||
# ── Run benchmark ─────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class ModelResult:
    """Benchmark outcome for one model.

    ``prompt_results`` holds one record per prompt; the remaining fields are
    aggregates that ``_bench_one_model()`` fills in after all prompts ran.
    """

    model_id: str
    prompt_results: list[dict[str, Any]] = field(default_factory=list)

    # Means over prompt_results.
    avg_score: float = 0.0
    avg_latency_ms: float = 0.0

    # Violation totals over prompt_results.
    total_filler_hits: int = 0
    total_em_dashes: int = 0
    total_semicolons: int = 0
|
||||
|
||||
|
||||
def _bench_one_model(
    model_id: str,
    prompts: list[dict[str, str]],
    profile: Any,
    use_cforch: bool,
    cforch_url: str,
    use_vllm: bool = False,
) -> "ModelResult | None":
    """Run every prompt against one model and aggregate the scores.

    All progress output is prefixed with the model id so that output from
    concurrent workers stays attributable.

    Dispatch priority:
        use_vllm=True   -> allocate vllm via cf-orch, then generate_cftext()
        use_cforch=True -> allocate cf-text via cf-orch, then generate_cftext()
        else            -> direct ollama generate()
    Both vllm and cf-text expose /v1/chat/completions, so generate_cftext()
    serves both.

    Args:
        model_id: Model to benchmark.
        prompts: Prompt definitions with ``tag``, ``thread_title`` and
            ``thread_body`` keys.
        profile: Corpus profile passed to ``score_against_profile()``; may be
            ``None``.
        use_cforch: Generate through a cf-orch allocated cf-text instance.
        cforch_url: Base URL of the cf-orch coordinator.
        use_vllm: Generate through a cf-orch allocated vllm instance.

    Returns:
        The populated ``ModelResult``, or ``None`` when allocation fails or no
        prompt produced a result. A cf-orch allocation is always released,
        even when a prompt raises.
    """
    prefix = f"[{model_id}]"
    result = ModelResult(model_id=model_id)

    service_url: str | None = None
    allocation_id: str = ""
    if use_vllm:
        alloc = cforch_allocate_vllm(model_id, cforch_url)
        if alloc is None:
            print(f"{prefix} [skip] vllm allocation failed", flush=True)
            return None
        service_url, allocation_id = alloc
        print(f"{prefix} vllm allocated: {service_url}", flush=True)
    elif use_cforch:
        alloc = cforch_allocate(model_id, cforch_url)
        if alloc is None:
            print(f"{prefix} [skip] cf-orch allocation failed", flush=True)
            return None
        service_url, allocation_id = alloc
        print(f"{prefix} allocated: {service_url}", flush=True)

    try:
        for prompt_def in prompts:
            tag = prompt_def["tag"]
            user_prompt = (
                f"Thread: {prompt_def['thread_title']}\n\n"
                f"{prompt_def['thread_body']}\n\n"
                f"Write a reply:"
            )
            print(f"{prefix} [{tag}] generating...", flush=True)

            if (use_cforch or use_vllm) and service_url:
                # Both cf-text and vllm expose /v1/chat/completions: same call.
                output, elapsed_ms = generate_cftext(service_url, model_id, user_prompt, system=SYSTEM_PROMPT)
            else:
                output, elapsed_ms = generate(model_id, user_prompt, system=SYSTEM_PROMPT)

            signals = extract_signals(output)

            # A failed or empty generation yields all-zero signals, which carry
            # no violation penalties and would otherwise score close to 100.
            # Score it 0 so a broken model cannot outrank a working one.
            stripped = output.strip()
            if not stripped or stripped.startswith("[ERROR:"):
                score = 0.0
                print(f"{prefix} ⚠ generation failed or empty, scored 0", flush=True)
            else:
                score = score_against_profile(signals, profile)

            print(f"{prefix} [{tag}] {score:.0f}/100 ({elapsed_ms:.0f}ms)", flush=True)
            if signals.filler_hits:
                print(f"{prefix} ⚠ filler: {signals.filler_hits}", flush=True)
            if signals.em_dash_count:
                print(f"{prefix} ⚠ em-dashes: {signals.em_dash_count}", flush=True)

            result.prompt_results.append({
                "tag": tag,
                "user_prompt": user_prompt,
                "output": output,
                "signals": {
                    "avg_sentence_length": signals.avg_sentence_length,
                    "em_dash_count": signals.em_dash_count,
                    "semicolon_count": signals.semicolon_count,
                    "filler_hits": signals.filler_hits,
                    "question_ratio": signals.question_ratio,
                    "word_count": signals.word_count,
                },
                "score": score,
                "latency_ms": elapsed_ms,
            })
    finally:
        if (use_cforch or use_vllm) and allocation_id:
            cforch_release(allocation_id, cforch_url)

    if not result.prompt_results:
        return None

    scores = [r["score"] for r in result.prompt_results]
    latencies = [r["latency_ms"] for r in result.prompt_results]
    result.avg_score = sum(scores) / len(scores)
    result.avg_latency_ms = sum(latencies) / len(latencies)
    result.total_filler_hits = sum(len(r["signals"]["filler_hits"]) for r in result.prompt_results)
    result.total_em_dashes = sum(r["signals"]["em_dash_count"] for r in result.prompt_results)
    result.total_semicolons = sum(r["signals"]["semicolon_count"] for r in result.prompt_results)

    print(f"{prefix} done — avg score {result.avg_score:.0f}/100", flush=True)
    return result
|
||||
|
||||
|
||||
def run_benchmark(
    model_ids: list[str],
    corpus_dir: Path,
    prompts: list[dict[str, str]],
    use_cforch: bool = False,
    use_vllm: bool = False,
    cforch_url: str = _CFORCH_URL,
    workers: int = 1,
) -> list[ModelResult]:
    """Benchmark every model and return the results ranked by average score.

    Args:
        model_ids: Models to benchmark.
        corpus_dir: Directory the corpus profile is built from.
        prompts: Prompt definitions handed to each model.
        use_cforch: Generate through cf-text allocated via cf-orch.
        use_vllm: Generate through vllm allocated via cf-orch.
        cforch_url: Base URL of the cf-orch coordinator.
        workers: Maximum number of models benchmarked concurrently; the
            effective value is capped at the number of models.

    Returns:
        One ``ModelResult`` per model that produced results, best score first.
    """
    profile = build_corpus_profile(corpus_dir)
    if profile:
        print(f"Corpus profile loaded from {corpus_dir} ({len(list(corpus_dir.glob('*.txt')))} samples)")
        print(f" Target avg sentence length: {profile.avg_sentence_length:.1f} words")
    else:
        print(f"[warn] No corpus samples found in {corpus_dir} -- scoring on hard violations only")

    if use_vllm:
        backend = "vllm via cf-orch"
    elif use_cforch:
        backend = "cf-text via cf-orch"
    else:
        backend = "ollama"
    print(f" Backend: {backend}")

    effective_workers = min(workers, len(model_ids)) if model_ids else 1
    print(f" Workers: {effective_workers} (of {len(model_ids)} models)", flush=True)

    ranked: list[ModelResult] = []

    if effective_workers > 1:
        from concurrent.futures import ThreadPoolExecutor, as_completed

        print(f" Fanning out {len(model_ids)} models across {effective_workers} workers...", flush=True)
        with ThreadPoolExecutor(max_workers=effective_workers) as pool:
            pending = [
                pool.submit(_bench_one_model, mid, prompts, profile, use_cforch, cforch_url, use_vllm)
                for mid in model_ids
            ]
            for finished in as_completed(pending):
                outcome = finished.result()
                if outcome:
                    ranked.append(outcome)
    else:
        # Sequential path: simpler output, easier to follow for single-model runs.
        for model_id in model_ids:
            print(f"\n{'='*60}\nModel: {model_id}", flush=True)
            outcome = _bench_one_model(model_id, prompts, profile, use_cforch, cforch_url, use_vllm)
            if outcome:
                ranked.append(outcome)

    ranked.sort(key=lambda res: res.avg_score, reverse=True)
    return ranked
|
||||
|
||||
|
||||
# ── Markdown report ───────────────────────────────────────────────────────────
|
||||
|
||||
def render_report(results: list[ModelResult], corpus_dir: Path) -> str:
    """Render the benchmark results as a Markdown report.

    The report holds a metadata header, a ranking table covering every model,
    and the per-prompt outputs of the top three models.

    Args:
        results: Model results, expected best-first (ranks and medals follow
            list order).
        corpus_dir: Corpus directory, shown in the header.

    Returns:
        The Markdown document as a single string.
    """
    # Markdown needs two trailing spaces for a hard line break. With a single
    # space the metadata lines collapse into one paragraph.
    br = "  "
    date_str = datetime.now().strftime("%Y-%m-%d %H:%M")
    lines: list[str] = [
        "# Writing Style Benchmark Results",
        "",
        f"**Date:** {date_str}{br}",
        f"**Corpus:** `{corpus_dir}`{br}",
        f"**Models tested:** {len(results)}{br}",
        f"**Prompts per model:** {len(TEST_PROMPTS)}",
        "",
        "## Rankings",
        "",
        "| Rank | Model | Score | Latency | Em-dashes | Fillers | Semicolons |",
        "|------|-------|-------|---------|-----------|---------|------------|",
    ]

    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    for i, r in enumerate(results, 1):
        medal = medals.get(i, f"#{i}")
        lines.append(
            f"| {medal} | `{r.model_id}` | {r.avg_score:.0f}/100 "
            f"| {r.avg_latency_ms:.0f}ms "
            f"| {r.total_em_dashes} "
            f"| {r.total_filler_hits} "
            f"| {r.total_semicolons} |"
        )

    lines += ["", "## Sample Outputs", ""]

    for r in results[:3]:  # top 3 only, to keep the report readable
        lines += [f"### `{r.model_id}` (avg score: {r.avg_score:.0f})", ""]
        for pr in r.prompt_results:
            lines += [
                f"**Prompt:** {pr['tag']}{br}",
                f"**Score:** {pr['score']:.0f}/100{br}",
                "",
                "```",
                pr["output"],
                "```",
                "",
            ]

    return "\n".join(lines)
|
||||
|
||||
|
||||
def save_report(results: list[ModelResult], corpus_dir: Path) -> Path:
    """Write the Markdown report and a raw JSON dump into ``_RESULTS_DIR``.

    Both files share the name ``style_<YYYY-MM-DD_HHMM>`` and differ only in
    extension (``.md`` / ``.json``).

    Returns:
        Path of the Markdown report.
    """
    _RESULTS_DIR.mkdir(exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d_%H%M")

    markdown_path = _RESULTS_DIR / f"style_{stamp}.md"
    markdown_path.write_text(render_report(results, corpus_dir), encoding="utf-8")

    # Raw JSON next to the report, for programmatic use.
    raw_records = []
    for r in results:
        raw_records.append({
            "model_id": r.model_id,
            "avg_score": r.avg_score,
            "avg_latency_ms": r.avg_latency_ms,
            "total_filler_hits": r.total_filler_hits,
            "total_em_dashes": r.total_em_dashes,
            "total_semicolons": r.total_semicolons,
            "prompt_results": r.prompt_results,
        })
    raw_path = _RESULTS_DIR / f"style_{stamp}.json"
    raw_path.write_text(json.dumps(raw_records, indent=2), encoding="utf-8")

    return markdown_path
|
||||
|
||||
|
||||
# ── CLI commands ──────────────────────────────────────────────────────────────
|
||||
|
||||
def cmd_list_models(_args: argparse.Namespace) -> None:
    """Print the model names returned by ``list_ollama_models``.

    ``_args`` is accepted so every command shares one signature; it is not read.
    """
    available = list_ollama_models()
    if available:
        print(f"{len(available)} models available:\n")
        for name in available:
            print(f" {name}")
    else:
        print("No models found (is ollama running?)")
|
||||
|
||||
|
||||
def cmd_run(args: argparse.Namespace) -> None:
    """Resolve the model list, run the benchmark, then save and print the report.

    Model selection, in priority order:
      1. ``--models a,b`` -- the explicit comma-separated list.
      2. ``--cforch`` -- the cf-orch catalog, keeping entries whose ``vram_mb``
         is 0 (unknown) or within ``--max-vram``; falls back to the ollama
         list when the catalog is empty or unreachable.
      3. otherwise -- every model reported by ollama.

    ``--scan-disk`` then registers on-disk GGUFs as temporary ollama models
    and re-fetches the ollama list. The name-pattern size filter is applied
    last, to whatever list resulted (including an explicit ``--models`` list).

    Exits with status 1 when the corpus directory is missing or when no model
    is left to benchmark. Temporary ollama registrations are removed in a
    ``finally`` block, so they are cleaned up on failure as well.

    Args:
        args: Parsed CLI namespace. ``samples``, ``models``, ``include_large``
            and ``workers`` are read directly; the remaining options are read
            with ``getattr`` defaults.
    """
    corpus_dir = Path(args.samples)
    if not corpus_dir.exists():
        print(f"[error] Corpus directory not found: {corpus_dir}", file=sys.stderr)
        sys.exit(1)

    max_vram_mb: int = getattr(args, "max_vram", 7200)
    use_cforch: bool = getattr(args, "cforch", False)
    use_vllm: bool = getattr(args, "vllm", False)
    cforch_url: str = getattr(args, "cforch_url", _CFORCH_URL)
    registered_tags: list[str] = []

    def _filter_ollama_by_size(ids: list[str], include_large: bool) -> list[str]:
        """Apply name-pattern size filter to ollama model list."""
        if include_large:
            return ids
        skip_patterns = ["270b", "70b", "32b", "30b", "21b", "20b", "deepseek-r1"]
        filtered = [m for m in ids if not any(p in m.lower() for p in skip_patterns)]
        skipped = len(ids) - len(filtered)
        if skipped:
            print(f"[info] Skipped {skipped} large model(s) by name pattern. "
                  "Pass --include-large to include them.")
        return filtered

    if args.models and args.models != "all":
        model_ids = [m.strip() for m in args.models.split(",") if m.strip()]
    elif use_cforch:
        # cf-orch path: pull model list from catalog, filter by vram_mb
        catalog = cforch_list_catalog(cforch_url)
        if not catalog:
            print("[warn] cf-orch catalog empty or unreachable -- falling back to ollama models")
            use_cforch = False
            model_ids = _filter_ollama_by_size(list_ollama_models(), args.include_large)
            if not model_ids:
                print("[error] No models found. Pass --models explicitly or check ollama.", file=sys.stderr)
                sys.exit(1)
        else:
            before = list(catalog.items())
            # vram_mb == 0 means the catalog has no size for the model; keep it.
            allowed = {mid: mb for mid, mb in before if mb == 0 or mb <= max_vram_mb}
            skipped_oom = {mid: mb for mid, mb in before if mid not in allowed}
            model_ids = list(allowed.keys())
            print(f"[info] cf-orch catalog: {len(before)} model(s), "
                  f"{len(allowed)} within {max_vram_mb} MB VRAM limit")
            if skipped_oom:
                print(f"[info] Skipped (OOM risk): "
                      + ", ".join(f"{mid} ({mb} MB)" for mid, mb in sorted(skipped_oom.items())))
    else:
        # Ollama path
        model_ids = list_ollama_models()
        if not model_ids:
            print("[error] No models found. Pass --models explicitly or check ollama.", file=sys.stderr)
            sys.exit(1)

    # Backfill GGUFs from disk before filtering -- skips files that exceed VRAM limit
    if getattr(args, "scan_disk", None):
        llm_root = Path(args.scan_disk)
        print(f"\nScanning {llm_root} for unregistered GGUFs (limit: {max_vram_mb} MB)...")
        registered_tags = backfill_disk_models(llm_root, set(model_ids), max_vram_mb=max_vram_mb)
        # NOTE(review): this replaces any explicit --models / catalog selection
        # with the full ollama list -- confirm that is intended.
        model_ids = list_ollama_models()  # re-fetch with new registrations

    model_ids = _filter_ollama_by_size(model_ids, args.include_large)

    try:
        # Guard lives inside the try so temporary registrations are still
        # removed when we bail out; without it an empty list would run a
        # zero-model benchmark and write an empty report.
        if not model_ids:
            print("[error] No models left to benchmark after filtering. "
                  "Pass --models explicitly or --include-large.", file=sys.stderr)
            sys.exit(1)

        print(f"\nRunning writing style benchmark on {len(model_ids)} model(s)...")
        results = run_benchmark(model_ids, corpus_dir, TEST_PROMPTS, use_cforch=use_cforch, use_vllm=use_vllm, cforch_url=cforch_url, workers=args.workers)
        report_path = save_report(results, corpus_dir)
        print(f"\n{'='*60}")
        print(f"Results saved to: {report_path}")
        print(f"\n{render_report(results, corpus_dir)}")
    finally:
        if registered_tags:
            print(f"\nCleaning up {len(registered_tags)} temporary ollama registrations...")
            for tag in registered_tags:
                deregister_gguf(tag)
|
||||
|
||||
|
||||
def cmd_show_last(_args: argparse.Namespace) -> None:
    """Print the saved markdown report whose path sorts last.

    Report names embed a ``YYYY-MM-DD_HHMM`` stamp, so the greatest path is
    the most recent run. ``_args`` is unused.
    """
    latest = max(_RESULTS_DIR.glob("style_*.md"), default=None)
    if latest is None:
        print("No benchmark results found. Run --run first.")
    else:
        print(latest.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
# ── Entry point ───────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Two equivalent spellings are accepted: subcommands (``list-models``,
    ``run``, ``show-last``) and the legacy top-level flags (``--list-models``,
    ``--run``, ``--show-last``) kept for manage.sh compatibility. The run
    options are registered on both the ``run`` subparser (with help text) and
    the top-level parser (without). With no command selected, help is printed.
    """
    cli = argparse.ArgumentParser(
        description="Writing style benchmark harness for local text-gen models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    commands = cli.add_subparsers(dest="cmd")

    # (flag, add_argument keyword arguments, help shown on the `run` subcommand)
    run_options = (
        ("--models", {"default": "all"},
         "Comma-separated model IDs, or 'all'"),
        ("--samples", {"default": str(_CORPUS_DIR)},
         "Path to style corpus directory"),
        ("--include-large", {"action": "store_true"},
         "Include models >20B params"),
        ("--scan-disk", {"metavar": "LLM_ROOT"},
         "Scan directory for GGUFs not yet in ollama (e.g. /Library/Assets/LLM)"),
        ("--cforch", {"action": "store_true"},
         "Route generation through cf-orch/cf-text instead of direct ollama"),
        ("--vllm", {"action": "store_true"},
         "Route generation through cf-orch/vllm (OpenAI-compatible) instead of ollama"),
        ("--cforch-url", {"default": _CFORCH_URL},
         f"cf-orch coordinator URL (default: {_CFORCH_URL})"),
        ("--max-vram", {"type": int, "default": 7200, "metavar": "MB"},
         "Skip models whose VRAM footprint exceeds this limit in MB (default: 7200)"),
        ("--workers", {"type": int, "default": 1, "metavar": "N"},
         "Parallel workers — run N models simultaneously (default: 1; use 4+ with cf-orch)"),
    )

    commands.add_parser("list-models", help="List available ollama models")

    run_cmd = commands.add_parser("run", help="Run the benchmark")
    for flag, options, description in run_options:
        run_cmd.add_argument(flag, help=description, **options)

    commands.add_parser("show-last", help="Print the most recent benchmark report")

    # Also support legacy --list-models / --run / --show-last flags for manage.sh compat
    for legacy_flag in ("--list-models", "--run", "--show-last"):
        cli.add_argument(legacy_flag, action="store_true")
    for flag, options, _description in run_options:
        cli.add_argument(flag, **options)

    args = cli.parse_args()

    # First matching entry wins, in the same order as the original if/elif chain.
    dispatch = (
        (args.cmd == "list-models" or args.list_models, cmd_list_models),
        (args.cmd == "run" or args.run, cmd_run),
        (args.cmd == "show-last" or args.show_last, cmd_show_last),
    )
    for selected, handler in dispatch:
        if selected:
            handler(args)
            return
    cli.print_help()
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
909
scripts/benchmark_voice.py
Normal file
909
scripts/benchmark_voice.py
Normal file
|
|
@ -0,0 +1,909 @@
|
|||
#!/usr/bin/env python
|
||||
"""
|
||||
Voice benchmark harness -- score local text-gen models for writing style match.
|
||||
|
||||
Runs each model against a set of test prompts, extracts style signals from the
|
||||
outputs, compares them to a voice corpus, and produces a ranked markdown table.
|
||||
|
||||
Usage:
|
||||
# List available ollama models
|
||||
conda run -n cf python scripts/benchmark_voice.py --list-models
|
||||
|
||||
# Run against all models with default test prompts
|
||||
conda run -n cf python scripts/benchmark_voice.py --run
|
||||
|
||||
# Run specific models only
|
||||
conda run -n cf python scripts/benchmark_voice.py --run --models mistral:7b,llama3.1:8b
|
||||
|
||||
# Use a custom corpus directory
|
||||
conda run -n cf python scripts/benchmark_voice.py --run --samples data/voice_corpus/
|
||||
|
||||
# Print last results table
|
||||
conda run -n cf python scripts/benchmark_voice.py --show-last
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import httpx
|
||||
|
||||
# Directory two levels above this file (the parent of scripts/).
_ROOT = Path(__file__).parent.parent
# Default corpus location; used as the default for --samples.
_CORPUS_DIR = _ROOT / "data" / "voice_corpus"
# Output directory for benchmark reports.
_RESULTS_DIR = _ROOT / "benchmark_results"
# Base URL of the ollama HTTP API (used for /api/tags and /api/generate).
_OLLAMA_URL = "http://localhost:11434"
# Default base URL of the cf-orch coordinator.
_CFORCH_URL = "http://localhost:7700"

# Subdirectories under --scan-disk root that may contain GGUFs
_SCAN_SUBDIRS = ["textgen/models", "llama.cpp/models", "cf-text/models", "vllm/models"]

# ── Filler phrases that should be absent from good voice-match output ─────────
# extract_signals() matches these case-insensitively as plain substrings, and
# score_against_profile() subtracts 8 points per phrase found.
FILLER_PHRASES: list[str] = [
    "delve", "certainly", "absolutely", "i apologize", "i'd be happy to",
    "of course", "great question", "i understand", "let me know if",
    "feel free to", "it's important to note", "it's worth noting",
    "in conclusion", "to summarize", "in summary",
]
|
||||
|
||||
# ── Test prompts ──────────────────────────────────────────────────────────────
# Each entry is a dict with three string keys:
#   "tag"          -- short identifier used in progress output and results
#   "thread_title" -- title of the thread being replied to
#   "thread_body"  -- body text of the thread
# These are representative threads that Magpie might reply to.
# Extend this list with real examples as the corpus grows.
TEST_PROMPTS: list[dict[str, str]] = [
    {
        "tag": "selfhosted_ai_fatigue",
        "thread_title": "Anyone else getting tired of re-explaining their setup every time an AI model forgets?",
        "thread_body": (
            "Every session I start over. My whole hardware setup, what tools I use, "
            "what I've already tried. It's exhausting. There has to be a better way."
        ),
    },
    {
        "tag": "privacy_local_llm",
        "thread_title": "What's the point of running local LLMs if the apps still phone home?",
        "thread_body": (
            "I went through all the trouble of setting up ollama and now I find out "
            "the frontend I'm using is sending telemetry. Kind of defeats the purpose."
        ),
    },
    {
        "tag": "solarpunk_tech",
        "thread_title": "What does solarpunk computing actually look like in practice?",
        "thread_body": (
            "I keep seeing the aesthetic but not a lot of concrete examples of "
            "people living it out with their tech choices. What does it mean day to day?"
        ),
    },
    {
        "tag": "nd_tools",
        "thread_title": "Tools that actually help with executive function vs ones that just add friction",
        "thread_body": (
            "I've tried a dozen productivity apps and most of them require more "
            "executive function to maintain than they save. What actually sticks for you?"
        ),
    },
    {
        "tag": "data_ownership",
        "thread_title": "Who actually owns your data when you use a 'free' AI tool?",
        "thread_body": (
            "Read the ToS on three different AI assistants today. In all three cases "
            "your inputs can be used for training, shared with partners, and retained "
            "indefinitely. At what point does 'free' just mean you're the product?"
        ),
    },
    {
        "tag": "digital_culture",
        "thread_title": "The internet used to feel like it belonged to everyone. What happened?",
        "thread_body": (
            "I grew up on forums, IRC, personal homepages. Now everything is a platform "
            "owned by someone trying to extract value from the community that built it. "
            "Is the fediverse / self-hosting movement actually reversing this or just "
            "a niche hobby?"
        ),
    },
]
|
||||
|
||||
# Sampling settings shared by both backends: generate() sends this dict as the
# ollama "options" field, and generate_cftext() maps num_predict/temperature/
# top_p onto max_tokens/temperature/top_p of the chat-completions payload.
GENERATION_PARAMS: dict[str, Any] = {
    "temperature": 0.7,
    "top_p": 0.9,
    "num_predict": 300,
}

# System prompt sent with every test prompt by _bench_one_model(). The rules
# it states (no em dashes, no semicolons, no filler) are the same violations
# score_against_profile() penalises.
SYSTEM_PROMPT = (
    "You are a writing assistant. Your job is to write a Reddit reply that matches "
    "the voice, tone, and style of the provided samples exactly.\n\n"
    "Voice characteristics:\n"
    "- Casual engineer tone. Short punchy sentences.\n"
    "- No hype, no buzzwords, no em dashes, no semicolons.\n"
    "- Community-first perspective. Solarpunk values.\n"
    "- Direct and opinionated. No throat-clearing or filler.\n"
    "- When relevant, mention personal experience with real tools.\n\n"
    "Write ONLY the reply. No preamble, no 'Here is a reply:', no meta-commentary."
)
|
||||
|
||||
|
||||
# ── Style signal extraction ───────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class StyleSignals:
    """Quantitative style signals extracted from a text sample.

    An all-default instance is what extract_signals() returns for empty text
    and for "[ERROR: ...]" generator output.
    """
    sentence_count: int = 0  # number of sentences found by the splitter
    word_count: int = 0  # number of whitespace-separated tokens
    avg_sentence_length: float = 0.0  # words per sentence
    em_dash_count: int = 0  # U+2014 characters plus ASCII "--" runs
    semicolon_count: int = 0  # occurrences of ';'
    filler_hits: list[str] = field(default_factory=list)  # FILLER_PHRASES entries present in the text
    question_ratio: float = 0.0  # fraction of sentences ending in '?'
    first_person_ratio: float = 0.0  # fraction of sentences starting with 'I'
    avg_word_length: float = 0.0  # mean token length after stripping edge punctuation
|
||||
|
||||
|
||||
def extract_signals(text: str) -> StyleSignals:
    """Extract style signals from a text sample.

    Sentences are split on whitespace that follows ``.``, ``!`` or ``?``;
    words are whitespace-separated tokens. Filler phrases are matched
    case-insensitively as substrings.

    Args:
        text: Raw model output or corpus sample.

    Returns:
        The measured signals, or an all-default ``StyleSignals`` when the
        text is empty or is an ``[ERROR: ...]`` marker from a generator.
    """
    text = text.strip()
    if text.startswith("[ERROR:"):
        return StyleSignals()  # zero-signal sentinel for failed generations

    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    if not sentences:
        return StyleSignals()

    # Non-empty sentences imply non-empty text, so `words` is non-empty too.
    words = text.split()
    avg_sentence_length = len(words) / len(sentences)
    avg_word_length = sum(len(w.strip('.,!?;:"\'')) for w in words) / len(words)

    # Count the real em dash plus its ASCII stand-in. ' -- ' already contains
    # '--', so counting both patterns charged every spaced double hyphen twice.
    em_dash_count = text.count('\u2014') + text.count('--')
    semicolon_count = text.count(';')

    lowered = text.lower()
    filler_hits = [p for p in FILLER_PHRASES if p.lower() in lowered]

    question_ratio = sum(1 for s in sentences if s.endswith('?')) / len(sentences)
    first_person_ratio = sum(1 for s in sentences if re.match(r"^I\b", s)) / len(sentences)

    return StyleSignals(
        sentence_count=len(sentences),
        word_count=len(words),
        avg_sentence_length=avg_sentence_length,
        em_dash_count=em_dash_count,
        semicolon_count=semicolon_count,
        filler_hits=filler_hits,
        question_ratio=question_ratio,
        first_person_ratio=first_person_ratio,
        avg_word_length=avg_word_length,
    )
|
||||
|
||||
|
||||
def build_corpus_profile(corpus_dir: Path) -> StyleSignals | None:
    """Average the style signals of every ``*.txt`` file in ``corpus_dir``.

    Count-type fields are averaged and then truncated to ``int``; ratio and
    length fields stay floats. ``filler_hits`` is left at its default.

    Returns:
        The averaged target profile, or ``None`` when the directory holds no
        ``*.txt`` samples.
    """
    sample_paths = list(corpus_dir.glob("*.txt"))
    if not sample_paths:
        return None

    per_sample = [
        extract_signals(path.read_text(encoding="utf-8")) for path in sample_paths
    ]
    total = len(per_sample)

    def mean(attribute: str) -> float:
        """Arithmetic mean of one signal across all samples."""
        return sum(getattr(signals, attribute) for signals in per_sample) / total

    return StyleSignals(
        sentence_count=int(mean("sentence_count")),
        word_count=int(mean("word_count")),
        avg_sentence_length=mean("avg_sentence_length"),
        em_dash_count=int(mean("em_dash_count")),
        semicolon_count=int(mean("semicolon_count")),
        question_ratio=mean("question_ratio"),
        first_person_ratio=mean("first_person_ratio"),
        avg_word_length=mean("avg_word_length"),
    )
|
||||
|
||||
|
||||
def score_against_profile(output_signals: StyleSignals, profile: StyleSignals | None) -> float:
    """Score a model output against the corpus profile. Returns 0-100.

    Starts from 100 and subtracts:
      - 5 per em dash, 3 per semicolon, 8 per filler phrase (always applied)
      - 2 per word of difference in average sentence length, capped at 20
      - 10 times the difference in question ratio, capped at 10

    The last two penalties need a corpus profile; when ``profile`` is ``None``
    only the fixed per-occurrence penalties apply. The result is clamped so
    it never drops below 0.
    """
    fixed_penalty = (
        output_signals.em_dash_count * 5
        + output_signals.semicolon_count * 3
        + len(output_signals.filler_hits) * 8
    )
    score = 100.0 - fixed_penalty

    if profile is None:
        return max(0.0, score)

    # Distance from the corpus average sentence length.
    sentence_gap = abs(output_signals.avg_sentence_length - profile.avg_sentence_length)
    score -= min(sentence_gap * 2, 20)

    # Distance from the corpus share of question sentences.
    question_gap = abs(output_signals.question_ratio - profile.question_ratio)
    score -= min(question_gap * 10, 10)

    return max(0.0, score)
|
||||
|
||||
|
||||
# ── Generation backends (cf-orch and ollama) ──────────────────────────────────
|
||||
|
||||
# Default node whose catalog cforch_list_catalog() requests via ?node_id=.
_CFORCH_NODE_ID = "heimdall"
|
||||
|
||||
|
||||
def cforch_list_catalog(
    cforch_url: str = _CFORCH_URL,
    node_id: str = _CFORCH_NODE_ID,
) -> dict[str, int]:
    """Fetch the cf-text catalog from cf-orch as ``{model_id: vram_mb}``.

    A non-empty ``node_id`` is sent as the ``node_id`` query parameter so the
    catalog comes from that node's profile, which avoids cross-node catalog
    shadowing when several nodes define catalogs for the same service.

    Entries that are not dicts, or that lack ``vram_mb``, are reported as 0.

    Returns:
        The catalog mapping, or ``{}`` (after a warning on stderr) when the
        request or the response handling fails.
    """
    query = {"node_id": node_id} if node_id else {}
    try:
        response = httpx.get(
            f"{cforch_url}/api/services/cf-text/catalog",
            params=query,
            timeout=10.0,
        )
        response.raise_for_status()
        catalog: dict[str, int] = {}
        for name, entry in response.json().items():
            catalog[name] = entry.get("vram_mb", 0) if isinstance(entry, dict) else 0
        return catalog
    except Exception as exc:
        print(f"[warn] Could not reach cf-orch catalog at {cforch_url}: {exc}", file=sys.stderr)
        return {}
|
||||
|
||||
|
||||
def _cforch_allocate_service(
    service: str,
    model_id: str,
    cforch_url: str,
    startup_timeout_s: float,
    health_path: str,
) -> tuple[str, str] | None:
    """Generic cf-orch allocate + health-poll.

    Posts an allocation request for ``model_id`` on ``service``. When the
    response says the instance was just started and is not warm, polls
    ``<service_url><health_path>`` every 2 seconds until it answers with a
    success status or ``startup_timeout_s`` elapses.

    Args:
        service: cf-orch service name used in the allocate URL.
        model_id: Model to request (sent as the only model candidate).
        cforch_url: Base URL of the cf-orch coordinator.
        startup_timeout_s: Maximum seconds to wait for a cold start.
        health_path: Path appended to the service URL for health checks.

    Returns:
        ``(service_url, allocation_id)`` on success, or ``None`` when the
        allocation fails or the cold start times out. On timeout the
        allocation is released first so the lease is not leaked.
    """
    try:
        resp = httpx.post(
            f"{cforch_url}/api/services/{service}/allocate",
            json={
                "model_candidates": [model_id],
                "caller": "avocet",
                "pipeline": "voice_benchmark",
            },
            timeout=120.0,
        )
        resp.raise_for_status()
        data = resp.json()
        service_url: str = data["url"]
        allocation_id: str = data.get("allocation_id", "")

        if data.get("started", False) and not data.get("warm", True):
            print(f" [cold start] waiting for {service} to load {model_id!r}...", end=" ", flush=True)
            started_at = time.monotonic()
            deadline = started_at + startup_timeout_s
            while time.monotonic() < deadline:
                try:
                    health = httpx.get(f"{service_url}{health_path}", timeout=3.0)
                    if health.is_success:
                        print(f"ready ({time.monotonic() - started_at:.0f}s)", flush=True)
                        break
                except Exception:
                    # Service is not accepting connections yet; keep polling.
                    pass
                time.sleep(2.0)
            else:
                print(f"timed out after {startup_timeout_s:.0f}s", flush=True)
                # The caller never sees this allocation id, so release it here.
                cforch_release(allocation_id, cforch_url)
                return None

        return service_url, allocation_id
    except Exception as exc:
        print(f"[warn] cf-orch allocation failed for {model_id!r} ({service}): {exc}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def cforch_allocate(
    model_id: str,
    cforch_url: str = _CFORCH_URL,
    startup_timeout_s: float = 180.0,
) -> tuple[str, str] | None:
    """Request a cf-text instance serving ``model_id`` from cf-orch.

    Returns ``(service_url, allocation_id)``, or ``None`` if allocation fails
    or the instance is not healthy within ``startup_timeout_s`` seconds.
    """
    return _cforch_allocate_service(
        service="cf-text",
        model_id=model_id,
        cforch_url=cforch_url,
        startup_timeout_s=startup_timeout_s,
        health_path="/health",
    )
|
||||
|
||||
|
||||
def cforch_allocate_vllm(
    model_id: str,
    cforch_url: str = _CFORCH_URL,
    startup_timeout_s: float = 300.0,
) -> tuple[str, str] | None:
    """Request a vllm instance serving ``model_id`` from cf-orch.

    vllm speaks the OpenAI-compatible API, so generate_cftext() can be pointed
    at the returned service URL unchanged. The default startup timeout is
    longer than for cf-text (300s) because vllm loads large model weights
    from disk before it reports ready.

    Returns ``(service_url, allocation_id)``, or ``None`` on failure/timeout.
    """
    return _cforch_allocate_service(
        service="vllm",
        model_id=model_id,
        cforch_url=cforch_url,
        startup_timeout_s=startup_timeout_s,
        health_path="/health",
    )
|
||||
|
||||
|
||||
def cforch_release(allocation_id: str, cforch_url: str = _CFORCH_URL) -> None:
    """Release a cf-orch allocation (best effort).

    Does nothing for an empty ``allocation_id``. Never raises: a transport
    error or an HTTP error status is reported as a warning on stderr, so a
    lease that could not be released is visible instead of silently leaked.

    Args:
        allocation_id: Lease id returned by the allocate call.
        cforch_url: Base URL of the cf-orch coordinator.
    """
    if not allocation_id:
        return
    try:
        resp = httpx.post(f"{cforch_url}/api/leases/{allocation_id}/release", timeout=10.0)
        resp.raise_for_status()
    except Exception as exc:
        print(f"[warn] Could not release cf-orch allocation {allocation_id!r}: {exc}", file=sys.stderr)
|
||||
|
||||
|
||||
def generate_cftext(
    service_url: str,
    model_id: str,
    prompt: str,
    system: str = "",
) -> tuple[str, float]:
    """Generate a reply through an OpenAI-compatible /v1/chat/completions endpoint.

    Sampling settings come from ``GENERATION_PARAMS`` (``num_predict`` is sent
    as ``max_tokens``). A non-empty ``system`` becomes a leading system message.

    Returns:
        ``(text, elapsed_ms)``. On any failure ``text`` is ``"[ERROR: ...]"``
        and ``elapsed_ms`` is the time spent until the failure.
    """
    system_turn = [{"role": "system", "content": system}] if system else []
    messages: list[dict[str, str]] = [*system_turn, {"role": "user", "content": prompt}]

    request_body: dict[str, Any] = {
        "model": model_id,
        "messages": messages,
        "max_tokens": GENERATION_PARAMS.get("num_predict", 300),
        "temperature": GENERATION_PARAMS.get("temperature", 0.7),
        "top_p": GENERATION_PARAMS.get("top_p", 0.9),
        "stream": False,
    }

    started = time.monotonic()
    try:
        reply = httpx.post(
            f"{service_url.rstrip('/')}/v1/chat/completions",
            json=request_body,
            timeout=180.0,
        )
        reply.raise_for_status()
        # Latency covers the HTTP round trip only, not the JSON parsing below.
        elapsed_ms = (time.monotonic() - started) * 1000
        first_choice = reply.json()["choices"][0]
        return first_choice["message"]["content"].strip(), elapsed_ms
    except Exception as exc:
        return f"[ERROR: {exc}]", (time.monotonic() - started) * 1000
|
||||
|
||||
|
||||
def generate(model_id: str, prompt: str, system: str = "") -> tuple[str, float]:
    """Generate a reply through ollama's /api/generate endpoint.

    ``GENERATION_PARAMS`` is sent as the request's ``options``; ``system`` is
    included only when non-empty.

    Returns:
        ``(text, elapsed_ms)``. On any failure ``text`` is ``"[ERROR: ...]"``
        and ``elapsed_ms`` is the time spent until the failure.
    """
    request_body: dict[str, Any] = {
        "model": model_id,
        "prompt": prompt,
        "stream": False,
        "options": GENERATION_PARAMS,
        **({"system": system} if system else {}),
    }

    started = time.monotonic()
    try:
        reply = httpx.post(
            f"{_OLLAMA_URL}/api/generate",
            json=request_body,
            timeout=120.0,
        )
        reply.raise_for_status()
        # Latency covers the HTTP round trip only, not the JSON parsing below.
        elapsed_ms = (time.monotonic() - started) * 1000
        text = reply.json().get("response", "").strip()
        return text, elapsed_ms
    except Exception as exc:
        return f"[ERROR: {exc}]", (time.monotonic() - started) * 1000
|
||||
|
||||
|
||||
def find_disk_ggufs(llm_root: Path) -> list[Path]:
    """Collect every ``.gguf`` file under ``llm_root`` and its known subdirs.

    Each directory in ``_SCAN_SUBDIRS`` and then ``llm_root`` itself is walked
    recursively; a path reached through more than one walk is reported once.
    Vocab-only files (``ggml-vocab-*``) are left out because they are not
    standalone models.

    Returns:
        The matching paths in sorted order.
    """
    roots = [llm_root / sub for sub in _SCAN_SUBDIRS]
    roots.append(llm_root)

    unique: set[Path] = set()
    for root in roots:
        if root.exists():
            unique.update(
                candidate
                for candidate in root.rglob("*.gguf")
                if not candidate.name.startswith("ggml-vocab-")
            )
    return sorted(unique)
|
||||
|
||||
|
||||
def gguf_to_ollama_tag(gguf_path: Path) -> str:
    """Derive a stable ollama tag from a GGUF path.

    The tag is built from the lowercased parent directory name and file stem.
    When the stem (ignoring ``-`` and ``_``) already appears inside the parent
    name, the parent name alone is used. Every run of characters outside
    ``[a-z0-9]`` becomes a single ``-``, e.g.:

        claude-3.7-sonnet-reasoning-gemma3-12B/foo.Q8_0.gguf
          -> bench-claude-3-7-sonnet-reasoning-gemma3-12b-foo-q8-0:latest
    """
    def squash(name: str) -> str:
        """Drop the separators that commonly differ between dir and file names."""
        return name.replace("-", "").replace("_", "")

    folder = gguf_path.parent.name.lower()
    base = gguf_path.stem.lower()

    if squash(base) in squash(folder):
        raw = folder
    else:
        raw = f"{folder}-{base}"

    cleaned = re.sub(r"[^a-z0-9]+", "-", raw).strip("-")
    return f"bench-{cleaned}:latest"
|
||||
|
||||
|
||||
def register_gguf(gguf_path: Path, tag: str) -> bool:
    """Register a GGUF file with ollama under ``tag``.

    Writes a one-line Modelfile (``FROM <absolute path>``) to a temporary
    file, runs ``ollama create <tag> -f <modelfile>``, and always deletes the
    temporary file afterwards.

    Returns:
        ``True`` when the command exits with status 0; ``False`` when it
        exits non-zero or cannot be run (a warning is printed in that case).
    """
    import subprocess
    import tempfile

    modelfile_text = f"FROM {gguf_path.resolve()}\n"
    with tempfile.NamedTemporaryFile(mode="w", suffix=".Modelfile", delete=False) as handle:
        handle.write(modelfile_text)
        modelfile_path = handle.name

    command = ["ollama", "create", tag, "-f", modelfile_path]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, timeout=60)
    except Exception as exc:
        print(f"[warn] Could not register {gguf_path.name}: {exc}", file=sys.stderr)
        return False
    finally:
        Path(modelfile_path).unlink(missing_ok=True)
    return completed.returncode == 0
|
||||
|
||||
|
||||
def deregister_gguf(tag: str) -> None:
    """Remove a temporary ollama model entry (best effort).

    Runs ``ollama rm <tag>``. Never raises: if the command cannot be run or
    exits non-zero, a warning is printed to stderr so a leftover registration
    is visible instead of being silently ignored.
    """
    import subprocess

    try:
        completed = subprocess.run(
            ["ollama", "rm", tag], capture_output=True, text=True, timeout=30
        )
    except Exception as exc:
        print(f"[warn] Could not deregister {tag}: {exc}", file=sys.stderr)
        return

    if completed.returncode != 0:
        detail = completed.stderr.strip() or f"exit code {completed.returncode}"
        print(f"[warn] ollama rm {tag} failed: {detail}", file=sys.stderr)
|
||||
|
||||
|
||||
def backfill_disk_models(
    llm_root: Path,
    existing_tags: set[str],
    max_vram_mb: int = 0,
) -> list[str]:
    """Register on-disk GGUFs that ollama does not know about yet.

    Args:
        llm_root: Directory handed to ``find_disk_ggufs``.
        existing_tags: Tags already present; files mapping to one are skipped.
        max_vram_mb: Skip files larger than this many MB (0 = no limit).
            File size stands in for VRAM need -- quantized weights load ~1:1.

    Returns:
        The tags that were newly registered, in discovery order.
    """
    candidates = find_disk_ggufs(llm_root)
    if not candidates:
        print(f"No .gguf files found under {llm_root}", file=sys.stderr)
        return []

    created: list[str] = []
    too_large = 0
    for path in candidates:
        size_mb = path.stat().st_size // (1024 * 1024)
        if max_vram_mb and size_mb > max_vram_mb:
            print(f" [skip-oom] {path.name} ({size_mb} MB > {max_vram_mb} MB limit)")
            too_large += 1
            continue

        tag = gguf_to_ollama_tag(path)
        if tag in existing_tags:
            print(f" [skip] {path.name} already registered as {tag}")
            continue

        print(f" [register] {path.name} ({size_mb} MB) → {tag} ...", end=" ", flush=True)
        succeeded = register_gguf(path, tag)
        print("ok" if succeeded else "failed")
        if succeeded:
            created.append(tag)

    if too_large:
        print(f" [info] {too_large} GGUF(s) skipped (exceed {max_vram_mb} MB VRAM limit)")
    return created
|
||||
|
||||
|
||||
def list_ollama_models() -> list[str]:
    """List the model names from ollama's /api/tags, minus embedding models.

    A model is dropped when its lowercased name contains one of the known
    embedding-only model names.

    Returns:
        The remaining names in the order ollama reported them, or ``[]``
        (after a warning on stderr) when the request fails.
    """
    embedding_only = ("mxbai-embed-large", "nomic-embed-text", "all-minilm")
    try:
        response = httpx.get(f"{_OLLAMA_URL}/api/tags", timeout=10.0)
        response.raise_for_status()
        names: list[str] = []
        for entry in response.json().get("models", []):
            lowered = entry["name"].lower()
            if any(marker in lowered for marker in embedding_only):
                continue
            names.append(entry["name"])
        return names
    except Exception as exc:
        print(f"[warn] Could not reach ollama: {exc}", file=sys.stderr)
        return []
|
||||
|
||||
|
||||
# ── Run benchmark ─────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
class ModelResult:
    # Aggregated benchmark outcome for one model; filled in by _bench_one_model().
    model_id: str  # model name/tag the prompts were run against
    # One dict per prompt: tag, user_prompt, output, signals, score, latency_ms.
    prompt_results: list[dict[str, Any]] = field(default_factory=list)
    avg_score: float = 0.0  # mean of the per-prompt scores (0-100)
    avg_latency_ms: float = 0.0  # mean per-prompt generation time in milliseconds
    total_filler_hits: int = 0  # filler phrases found, summed over all prompts
    total_em_dashes: int = 0  # em dashes found, summed over all prompts
    total_semicolons: int = 0  # semicolons found, summed over all prompts
|
||||
|
||||
|
||||
def _bench_one_model(
    model_id: str,
    prompts: list[dict[str, str]],
    profile: Any,
    use_cforch: bool,
    cforch_url: str,
    use_vllm: bool = False,
) -> "ModelResult | None":
    """Run all prompts for a single model and aggregate the scores.

    Intended to be callable from worker threads (run_benchmark fans it out via
    a thread pool); every progress line is prefixed with the model id so
    interleaved output stays attributable.

    Dispatch priority:
      use_vllm=True   → allocate vllm via cf-orch, then generate_cftext() (OpenAI-compatible)
      use_cforch=True → allocate cf-text via cf-orch, then generate_cftext()
      else            → direct ollama generate()
    Both vllm and cf-text expose /v1/chat/completions so generate_cftext() works for both.

    A generation that fails (``[ERROR: ...]`` output) or returns no text is
    scored 0 rather than being scored from its empty signal set, which would
    otherwise look like a near-perfect, violation-free reply.

    Args:
        model_id: Model to benchmark.
        prompts: Prompt dicts with ``tag``, ``thread_title``, ``thread_body``.
        profile: Corpus profile passed to score_against_profile(), or None.
        use_cforch: Route generation through a cf-orch cf-text allocation.
        cforch_url: Base URL of the cf-orch coordinator.
        use_vllm: Route generation through a cf-orch vllm allocation.

    Returns:
        The populated ModelResult, or None when allocation fails or no prompt
        produced a result.
    """
    prefix = f"[{model_id}]"
    result = ModelResult(model_id=model_id)

    service_url: str | None = None
    allocation_id: str = ""
    if use_vllm:
        alloc = cforch_allocate_vllm(model_id, cforch_url)
        if alloc is None:
            print(f"{prefix} [skip] vllm allocation failed", flush=True)
            return None
        service_url, allocation_id = alloc
        print(f"{prefix} vllm allocated: {service_url}", flush=True)
    elif use_cforch:
        alloc = cforch_allocate(model_id, cforch_url)
        if alloc is None:
            print(f"{prefix} [skip] cf-orch allocation failed", flush=True)
            return None
        service_url, allocation_id = alloc
        print(f"{prefix} allocated: {service_url}", flush=True)

    try:
        for prompt_def in prompts:
            tag = prompt_def["tag"]
            user_prompt = (
                f"Thread: {prompt_def['thread_title']}\n\n"
                f"{prompt_def['thread_body']}\n\n"
                f"Write a reply:"
            )
            print(f"{prefix} [{tag}] generating...", flush=True)

            if (use_cforch or use_vllm) and service_url:
                # Both cf-text and vllm expose /v1/chat/completions — same call
                output, elapsed_ms = generate_cftext(service_url, model_id, user_prompt, system=SYSTEM_PROMPT)
            else:
                output, elapsed_ms = generate(model_id, user_prompt, system=SYSTEM_PROMPT)

            signals = extract_signals(output)
            # extract_signals() returns an all-zero sentinel for errors and
            # empty text; such output must not earn a score.
            generation_failed = not output.strip() or output.startswith("[ERROR:")
            score = 0.0 if generation_failed else score_against_profile(signals, profile)

            print(f"{prefix} [{tag}] {score:.0f}/100 ({elapsed_ms:.0f}ms)", flush=True)
            if generation_failed:
                print(f"{prefix} ⚠ generation failed or returned no text", flush=True)
            if signals.filler_hits:
                print(f"{prefix} ⚠ filler: {signals.filler_hits}", flush=True)
            if signals.em_dash_count:
                print(f"{prefix} ⚠ em-dashes: {signals.em_dash_count}", flush=True)

            result.prompt_results.append({
                "tag": tag,
                "user_prompt": user_prompt,
                "output": output,
                "signals": {
                    "avg_sentence_length": signals.avg_sentence_length,
                    "em_dash_count": signals.em_dash_count,
                    "semicolon_count": signals.semicolon_count,
                    "filler_hits": signals.filler_hits,
                    "question_ratio": signals.question_ratio,
                    "word_count": signals.word_count,
                },
                "score": score,
                "latency_ms": elapsed_ms,
            })
    finally:
        # Release whichever allocation was made. Checking use_cforch here
        # leaked every vllm allocation when only --vllm was passed.
        if allocation_id:
            cforch_release(allocation_id, cforch_url)

    if not result.prompt_results:
        return None

    scores = [r["score"] for r in result.prompt_results]
    latencies = [r["latency_ms"] for r in result.prompt_results]
    result.avg_score = sum(scores) / len(scores)
    result.avg_latency_ms = sum(latencies) / len(latencies)
    result.total_filler_hits = sum(len(r["signals"]["filler_hits"]) for r in result.prompt_results)
    result.total_em_dashes = sum(r["signals"]["em_dash_count"] for r in result.prompt_results)
    result.total_semicolons = sum(r["signals"]["semicolon_count"] for r in result.prompt_results)

    print(f"{prefix} done — avg score {result.avg_score:.0f}/100", flush=True)
    return result
|
||||
|
||||
|
||||
def run_benchmark(
    model_ids: list[str],
    corpus_dir: Path,
    prompts: list[dict[str, str]],
    use_cforch: bool = False,
    use_vllm: bool = False,
    cforch_url: str = _CFORCH_URL,
    workers: int = 1,
) -> list[ModelResult]:
    """Benchmark every model in *model_ids* and return the results ranked by score.

    Args:
        model_ids: Models to benchmark; each is handed to ``_bench_one_model``.
        corpus_dir: Directory the corpus profile is built from.
        prompts: Prompt definitions forwarded unchanged to ``_bench_one_model``.
        use_cforch: Forwarded to ``_bench_one_model``; also selects the backend label.
        use_vllm: Forwarded to ``_bench_one_model``; also selects the backend label.
        cforch_url: Coordinator URL forwarded to ``_bench_one_model``.
        workers: Requested parallelism. Clamped to the number of models; values
            of 1 or less run the models one after another.

    Returns:
        One ``ModelResult`` per model that produced a result, sorted by
        ``avg_score`` in descending order. Models for which
        ``_bench_one_model`` returned ``None`` or raised are omitted.
    """
    profile = build_corpus_profile(corpus_dir)
    if profile:
        print(f"Corpus profile loaded from {corpus_dir} ({len(list(corpus_dir.glob('*.txt')))} samples)")
        print(f" Target avg sentence length: {profile.avg_sentence_length:.1f} words")
    else:
        print(f"[warn] No corpus samples found in {corpus_dir} -- scoring on hard violations only")

    backend = "vllm via cf-orch" if use_vllm else ("cf-text via cf-orch" if use_cforch else "ollama")
    print(f" Backend: {backend}")

    effective_workers = min(workers, len(model_ids)) if model_ids else 1
    print(f" Workers: {effective_workers} (of {len(model_ids)} models)", flush=True)

    results: list[ModelResult] = []

    def _record(model_id: str, fetch) -> None:
        """Store the result produced by *fetch*; report and skip the model if it raised."""
        try:
            outcome = fetch()
        except Exception as exc:
            # A single broken model must not throw away the results that the
            # other models already produced (KeyboardInterrupt still aborts).
            print(f"[error] {model_id}: benchmark failed: {exc!r}", file=sys.stderr, flush=True)
            return
        if outcome is not None:
            results.append(outcome)

    if effective_workers <= 1:
        # Sequential path — simpler output, easier to follow for single-model runs
        for model_id in model_ids:
            print(f"\n{'='*60}\nModel: {model_id}", flush=True)
            _record(
                model_id,
                lambda: _bench_one_model(model_id, prompts, profile, use_cforch, cforch_url, use_vllm),
            )
    else:
        from concurrent.futures import ThreadPoolExecutor, as_completed

        print(f" Fanning out {len(model_ids)} models across {effective_workers} workers...", flush=True)
        with ThreadPoolExecutor(max_workers=effective_workers) as pool:
            futures = {
                pool.submit(_bench_one_model, mid, prompts, profile, use_cforch, cforch_url, use_vllm): mid
                for mid in model_ids
            }
            # Collect in completion order; the final sort makes the order irrelevant.
            for future in as_completed(futures):
                _record(futures[future], future.result)

    return sorted(results, key=lambda r: r.avg_score, reverse=True)
|
||||
|
||||
|
||||
# ── Markdown report ───────────────────────────────────────────────────────────
|
||||
|
||||
def _fence_for(text: str) -> str:
    """Return a backtick fence strictly longer than any backtick run in *text*."""
    longest = current = 0
    for ch in text:
        current = current + 1 if ch == "`" else 0
        longest = max(longest, current)
    # A fenced block is only closed by a run at least as long as the opener,
    # so one more backtick than the longest run keeps the output inside it.
    return "`" * max(3, longest + 1)


def render_report(results: list[ModelResult], corpus_dir: Path) -> str:
    """Render benchmark results as a Markdown report.

    The report has a metadata header, a rankings table with one row per entry
    in *results* (in the order given), and the per-prompt outputs of the first
    three entries.

    Args:
        results: Benchmark results; callers pass them already ranked.
        corpus_dir: Corpus directory, shown in the header only.

    Returns:
        The complete Markdown document as a single string.
    """
    date_str = datetime.now().strftime("%Y-%m-%d %H:%M")
    br = "  "  # two trailing spaces = Markdown hard line break
    lines: list[str] = [
        "# Voice Benchmark Results",
        "",
        f"**Date:** {date_str}{br}",
        f"**Corpus:** `{corpus_dir}`{br}",
        f"**Models tested:** {len(results)}{br}",
        f"**Prompts per model:** {len(TEST_PROMPTS)}",
        "",
        "## Rankings",
        "",
        "| Rank | Model | Score | Latency | Em-dashes | Fillers | Semicolons |",
        "|------|-------|-------|---------|-----------|---------|------------|",
    ]

    medals = {1: "🥇", 2: "🥈", 3: "🥉"}
    for i, r in enumerate(results, 1):
        medal = medals.get(i, f"#{i}")
        lines.append(
            f"| {medal} | `{r.model_id}` | {r.avg_score:.0f}/100 "
            f"| {r.avg_latency_ms:.0f}ms "
            f"| {r.total_em_dashes} "
            f"| {r.total_filler_hits} "
            f"| {r.total_semicolons} |"
        )

    lines += ["", "## Sample Outputs", ""]

    for r in results[:3]:  # top 3 only to keep report readable
        lines += [f"### `{r.model_id}` (avg score: {r.avg_score:.0f})", ""]
        for pr in r.prompt_results:
            # Model output may itself contain ``` fences; size ours to fit.
            fence = _fence_for(pr["output"])
            lines += [
                f"**Prompt:** {pr['tag']}{br}",
                f"**Score:** {pr['score']:.0f}/100{br}",
                "",
                fence,
                pr["output"],
                fence,
                "",
            ]

    return "\n".join(lines)
|
||||
|
||||
|
||||
def save_report(results: list[ModelResult], corpus_dir: Path) -> Path:
    """Write the Markdown report and a JSON dump of *results* to ``_RESULTS_DIR``.

    Both files share the stem ``voice_<YYYY-MM-DD_HHMM>``; a second run within
    the same minute therefore overwrites the first.

    Args:
        results: Benchmark results to persist.
        corpus_dir: Corpus directory, forwarded to ``render_report``.

    Returns:
        Path of the Markdown report (the JSON file sits next to it).
    """
    # parents=True: a fresh checkout may not have the parent directory yet.
    _RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    date_str = datetime.now().strftime("%Y-%m-%d_%H%M")
    report_path = _RESULTS_DIR / f"voice_{date_str}.md"
    report_path.write_text(render_report(results, corpus_dir), encoding="utf-8")

    # Also save raw JSON for programmatic use
    payload = [
        {
            "model_id": r.model_id,
            "avg_score": r.avg_score,
            "avg_latency_ms": r.avg_latency_ms,
            "total_filler_hits": r.total_filler_hits,
            "total_em_dashes": r.total_em_dashes,
            "total_semicolons": r.total_semicolons,
            "prompt_results": r.prompt_results,
        }
        for r in results
    ]
    json_path = _RESULTS_DIR / f"voice_{date_str}.json"
    json_path.write_text(json.dumps(payload, indent=2), encoding="utf-8")

    return report_path
|
||||
|
||||
|
||||
# ── CLI commands ──────────────────────────────────────────────────────────────
|
||||
|
||||
def cmd_list_models(_args: argparse.Namespace) -> None:
    """Print the models returned by ``list_ollama_models``, or a hint if there are none."""
    available = list_ollama_models()
    if available:
        print(f"{len(available)} models available:\n")
        for name in available:
            print(f" {name}")
    else:
        print("No models found (is ollama running?)")
|
||||
|
||||
|
||||
def cmd_run(args: argparse.Namespace) -> None:
    """Resolve the model list from the CLI arguments, run the benchmark and report.

    Model selection, in priority order:

    1. ``--models a,b,c`` (anything other than ``all``): used verbatim.
    2. ``--cforch``: models from the cf-orch catalog whose VRAM figure is 0 or
       within ``--max-vram``; if the catalog is empty, falls back to the
       size-filtered ollama list.
    3. Otherwise: the ollama list, optionally extended by ``--scan-disk`` and
       then size-filtered by name pattern.

    Exits with status 1 when the corpus directory is missing or no model is
    left to benchmark. Temporary ollama registrations created by the disk
    scan are removed on every exit path.

    NOTE(review): on cf-orch fallback ``use_cforch`` is cleared but
    ``use_vllm`` is left as passed — confirm ``_bench_one_model`` copes.
    """
    corpus_dir = Path(args.samples)
    if not corpus_dir.exists():
        print(f"[error] Corpus directory not found: {corpus_dir}", file=sys.stderr)
        sys.exit(1)

    max_vram_mb: int = getattr(args, "max_vram", 7200)
    use_cforch: bool = getattr(args, "cforch", False)
    use_vllm: bool = getattr(args, "vllm", False)
    cforch_url: str = getattr(args, "cforch_url", _CFORCH_URL)
    registered_tags: list[str] = []

    def _filter_ollama_by_size(ids: list[str], include_large: bool) -> list[str]:
        """Apply name-pattern size filter to ollama model list."""
        if include_large:
            return ids
        skip_patterns = ["270b", "70b", "32b", "30b", "21b", "20b", "deepseek-r1"]
        filtered = [m for m in ids if not any(p in m.lower() for p in skip_patterns)]
        skipped = len(ids) - len(filtered)
        if skipped:
            print(f"[info] Skipped {skipped} large model(s) by name pattern. "
                  "Pass --include-large to include them.")
        return filtered

    if args.models and args.models != "all":
        model_ids = [m.strip() for m in args.models.split(",") if m.strip()]
    elif use_cforch:
        # cf-orch path: pull model list from catalog, filter by vram_mb
        catalog = cforch_list_catalog(cforch_url)
        if not catalog:
            print("[warn] cf-orch catalog empty or unreachable -- falling back to ollama models")
            use_cforch = False
            model_ids = _filter_ollama_by_size(list_ollama_models(), args.include_large)
            if not model_ids:
                print("[error] No models found. Pass --models explicitly or check ollama.", file=sys.stderr)
                sys.exit(1)
        else:
            before = list(catalog.items())
            # A VRAM figure of 0 is always allowed through (presumably "unknown" — TODO confirm).
            allowed = {mid: mb for mid, mb in before if mb == 0 or mb <= max_vram_mb}
            skipped_oom = {mid: mb for mid, mb in before if mid not in allowed}
            model_ids = list(allowed.keys())
            print(f"[info] cf-orch catalog: {len(before)} model(s), "
                  f"{len(allowed)} within {max_vram_mb} MB VRAM limit")
            if skipped_oom:
                print("[info] Skipped (OOM risk): "
                      + ", ".join(f"{mid} ({mb} MB)" for mid, mb in sorted(skipped_oom.items())))
    else:
        # Ollama path
        model_ids = list_ollama_models()
        if not model_ids:
            print("[error] No models found. Pass --models explicitly or check ollama.", file=sys.stderr)
            sys.exit(1)

        # Backfill GGUFs from disk before filtering -- skips files that exceed VRAM limit
        if getattr(args, "scan_disk", None):
            llm_root = Path(args.scan_disk)
            print(f"\nScanning {llm_root} for unregistered GGUFs (limit: {max_vram_mb} MB)...")
            registered_tags = backfill_disk_models(llm_root, set(model_ids), max_vram_mb=max_vram_mb)
            model_ids = list_ollama_models()  # re-fetch with new registrations

        model_ids = _filter_ollama_by_size(model_ids, args.include_large)

    try:
        # Checked inside the try so temporary registrations are still cleaned
        # up; without it an empty selection would write an empty report.
        if not model_ids:
            print("[error] No models left to benchmark after filtering. "
                  "Pass --models explicitly, --include-large, or raise --max-vram.",
                  file=sys.stderr)
            sys.exit(1)

        print(f"\nRunning voice benchmark on {len(model_ids)} model(s)...")
        results = run_benchmark(
            model_ids,
            corpus_dir,
            TEST_PROMPTS,
            use_cforch=use_cforch,
            use_vllm=use_vllm,
            cforch_url=cforch_url,
            workers=args.workers,
        )
        report_path = save_report(results, corpus_dir)
        print(f"\n{'='*60}")
        print(f"Results saved to: {report_path}")
        print(f"\n{render_report(results, corpus_dir)}")
    finally:
        if registered_tags:
            print(f"\nCleaning up {len(registered_tags)} temporary ollama registrations...")
            for tag in registered_tags:
                deregister_gguf(tag)
|
||||
|
||||
|
||||
def cmd_show_last(_args: argparse.Namespace) -> None:
    """Print the saved ``voice_*.md`` report that sorts last by path, or a hint if none exist."""
    # File names embed a sortable timestamp, so the greatest path is the newest.
    latest = max(_RESULTS_DIR.glob("voice_*.md"), default=None)
    if latest is None:
        print("No benchmark results found. Run --run first.")
    else:
        print(latest.read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
# ── Entry point ───────────────────────────────────────────────────────────────
|
||||
|
||||
def main() -> None:
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Two invocation styles are accepted: subcommands (``list-models``, ``run``,
    ``show-last``) and the legacy top-level flags (``--list-models``,
    ``--run``, ``--show-last``) kept for manage.sh. With neither, the help
    text is printed.
    """

    def _add_run_options(target: argparse.ArgumentParser) -> None:
        """Register the benchmark options; shared so both styles stay in sync."""
        target.add_argument("--models", default="all", help="Comma-separated model IDs, or 'all'")
        target.add_argument("--samples", default=str(_CORPUS_DIR), help="Path to voice corpus directory")
        target.add_argument("--include-large", action="store_true", help="Include models >20B params")
        target.add_argument("--scan-disk", metavar="LLM_ROOT",
                            help="Scan directory for GGUFs not yet in ollama (e.g. /Library/Assets/LLM)")
        target.add_argument("--cforch", action="store_true",
                            help="Route generation through cf-orch/cf-text instead of direct ollama")
        target.add_argument("--vllm", action="store_true",
                            help="Route generation through cf-orch/vllm (OpenAI-compatible) instead of ollama")
        target.add_argument("--cforch-url", default=_CFORCH_URL,
                            help=f"cf-orch coordinator URL (default: {_CFORCH_URL})")
        target.add_argument("--max-vram", type=int, default=7200, metavar="MB",
                            help="Skip models whose VRAM footprint exceeds this limit in MB (default: 7200)")
        target.add_argument("--workers", type=int, default=1, metavar="N",
                            help="Parallel workers — run N models simultaneously (default: 1; use 4+ with cf-orch)")

    parser = argparse.ArgumentParser(
        description="Voice benchmark harness for local text-gen models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = parser.add_subparsers(dest="cmd")

    sub.add_parser("list-models", help="List available ollama models")

    run_p = sub.add_parser("run", help="Run the benchmark")
    _add_run_options(run_p)

    sub.add_parser("show-last", help="Print the most recent benchmark report")

    # Also support legacy --list-models / --run / --show-last flags for manage.sh compat
    parser.add_argument("--list-models", action="store_true")
    parser.add_argument("--run", action="store_true")
    parser.add_argument("--show-last", action="store_true")
    _add_run_options(parser)

    args = parser.parse_args()

    if args.cmd == "list-models" or args.list_models:
        cmd_list_models(args)
    elif args.cmd == "run" or args.run:
        cmd_run(args)
    elif args.cmd == "show-last" or args.show_last:
        cmd_show_last(args)
    else:
        parser.print_help()
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
File diff suppressed because it is too large
Load diff
1026
web/src/views/ClassifierTab.vue
Normal file
1026
web/src/views/ClassifierTab.vue
Normal file
File diff suppressed because it is too large
Load diff
708
web/src/views/CompareTab.vue
Normal file
708
web/src/views/CompareTab.vue
Normal file
|
|
@ -0,0 +1,708 @@
|
|||
<template>
|
||||
<div class="compare-tab">
|
||||
|
||||
<!-- Source toggle -->
|
||||
<div class="source-toggle" role="group" aria-label="Prompt source">
|
||||
<button class="source-btn" :class="{ active: promptSource === 'tasks' }" @click="promptSource = 'tasks'">
|
||||
📋 cf-orch Tasks
|
||||
</button>
|
||||
<button class="source-btn" :class="{ active: promptSource === 'style' }" @click="promptSource = 'style'">
|
||||
✍️ Writing Style Prompts
|
||||
</button>
|
||||
</div>
|
||||
|
||||
<!-- Task selector (cf-orch tasks) -->
|
||||
<details v-if="promptSource === 'tasks'" class="model-picker" open>
|
||||
<summary class="picker-summary">
|
||||
<span class="picker-title">📋 Pick a Task</span>
|
||||
<span class="picker-badge">{{ cmpSelectedTask ? cmpSelectedTask.name : 'None selected' }}</span>
|
||||
</summary>
|
||||
<div class="picker-body">
|
||||
<div v-if="llmTasksLoading" class="picker-loading">Loading tasks…</div>
|
||||
<div v-else-if="llmTasks.length === 0" class="picker-empty">No tasks found — check cforch config.</div>
|
||||
<template v-else>
|
||||
<div v-for="(tasks, type) in llmTasksByType" :key="type" class="picker-category">
|
||||
<span class="picker-cat-name picker-cat-section">{{ type }}</span>
|
||||
<div class="picker-model-list">
|
||||
<label v-for="t in tasks" :key="t.id" class="picker-model-row">
|
||||
<input
|
||||
type="radio"
|
||||
name="cmp-task"
|
||||
:checked="cmpSelectedTask?.id === t.id"
|
||||
@change="selectCmpTask(t)"
|
||||
/>
|
||||
<span class="picker-model-name" :title="t.name">{{ t.name }}</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Writing style prompt selector -->
|
||||
<details v-if="promptSource === 'style'" class="model-picker" open>
|
||||
<summary class="picker-summary">
|
||||
<span class="picker-title">✍️ Pick a Writing Style Prompt</span>
|
||||
<span class="picker-badge">{{ selectedVoicePrompt ? selectedVoicePrompt.tag : 'None selected' }}</span>
|
||||
</summary>
|
||||
<div class="picker-body">
|
||||
<div class="picker-model-list style-prompt-list">
|
||||
<label v-for="vp in STYLE_PROMPTS" :key="vp.tag" class="picker-model-row style-prompt-row">
|
||||
<input
|
||||
type="radio"
|
||||
name="cmp-style-prompt"
|
||||
:checked="selectedVoicePrompt?.tag === vp.tag"
|
||||
@change="selectVoicePrompt(vp)"
|
||||
/>
|
||||
<span class="style-prompt-tag">{{ vp.tag }}</span>
|
||||
<span class="style-prompt-title">{{ vp.thread_title }}</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Prompt editor + model picker (shown once a prompt source is ready) -->
|
||||
<template v-if="promptSource === 'tasks' ? !!cmpSelectedTask : !!selectedVoicePrompt">
|
||||
<label class="prompt-label" for="cmp-prompt">Prompt</label>
|
||||
<textarea
|
||||
id="cmp-prompt"
|
||||
class="cmp-prompt-editor"
|
||||
v-model="cmpPrompt"
|
||||
rows="6"
|
||||
/>
|
||||
|
||||
<!-- Ollama model picker -->
|
||||
<details class="model-picker" open>
|
||||
<summary class="picker-summary">
|
||||
<span class="picker-title">🤖 Ollama Models</span>
|
||||
<span class="picker-badge">{{ cmpSelectedModels.size }} / {{ ollamaLlmModels.length }}</span>
|
||||
</summary>
|
||||
<div class="picker-body">
|
||||
<label class="picker-cat-header">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="cmpSelectedModels.size === ollamaLlmModels.length"
|
||||
:indeterminate="cmpSelectedModels.size > 0 && cmpSelectedModels.size < ollamaLlmModels.length"
|
||||
@change="toggleAllCmpModels(($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="picker-cat-name">All ollama models</span>
|
||||
</label>
|
||||
<div class="picker-model-list">
|
||||
<label v-for="m in ollamaLlmModels" :key="m.id" class="picker-model-row">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="cmpSelectedModels.has(m.id)"
|
||||
@change="toggleCmpModel(m.id, ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="picker-model-name">{{ m.name }}</span>
|
||||
<span class="picker-adapter-type">{{ m.tags.slice(0, 3).join(', ') }}</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Run controls -->
|
||||
<div class="run-controls">
|
||||
<button
|
||||
class="btn-run"
|
||||
:disabled="cmpRunning || cmpSelectedModels.size === 0"
|
||||
@click="startCompare"
|
||||
>{{ cmpRunning ? '⏳ Running…' : '⚖️ Compare Models' }}</button>
|
||||
<button v-if="cmpRunning" class="btn-cancel" @click="cancelCompare">✕ Cancel</button>
|
||||
</div>
|
||||
|
||||
<!-- Progress log -->
|
||||
<div v-if="cmpLog.length > 0" class="run-log">
|
||||
<div class="log-lines">
|
||||
<div v-for="(line, i) in cmpLog" :key="i" class="log-line">{{ line }}</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Side-by-side results -->
|
||||
<template v-if="cmpResults.length > 0">
|
||||
<h2 class="chart-title">Side-by-Side Responses</h2>
|
||||
<div class="cmp-results-grid">
|
||||
<div
|
||||
v-for="r in cmpResults"
|
||||
:key="r.model"
|
||||
class="cmp-result-card"
|
||||
:class="{ 'cmp-error': !!r.error }"
|
||||
>
|
||||
<div class="cmp-result-header">
|
||||
<span class="cmp-model-name">{{ r.model }}</span>
|
||||
<span class="cmp-meta">
|
||||
<template v-if="r.error"><span class="err-badge">error</span></template>
|
||||
<template v-else>{{ (r.elapsed_ms / 1000).toFixed(1) }}s</template>
|
||||
</span>
|
||||
</div>
|
||||
<pre v-if="r.error" class="cmp-error-text">{{ r.error }}</pre>
|
||||
<pre v-else class="cmp-response">{{ r.response }}</pre>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
</template>
|
||||
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import { ref, computed, onMounted, onUnmounted } from 'vue'
import { useApiFetch } from '../composables/useApi'
|
||||
|
||||
// ── Types ───────────────────────────────────────────────────────────────────
|
||||
/** A task entry as returned in the `tasks` array of `/api/cforch/tasks`. */
interface CfOrchTask {
  id: string
  name: string
  // Grouping key for the task picker (see llmTasksByType).
  type: string
  // Copied into the prompt editor when the task is selected.
  prompt: string
  // Copied into the system prompt when the task is selected.
  system: string
}
|
||||
|
||||
/** A model entry as returned in the `models` array of `/api/cforch/models`. */
interface CfOrchModel {
  name: string
  id: string
  // Backend serving the model; this tab only lists entries equal to 'ollama'.
  service: string
  // The first three tags are shown next to the model name.
  tags: string[]
  vram_estimate_mb?: number
}
|
||||
|
||||
/** One model's outcome, built from a `model_done` event of the compare stream. */
interface CmpResult {
  model: string
  response: string
  elapsed_ms: number
  // null when the event carried no error.
  error: string | null
}
|
||||
|
||||
/** A built-in writing-style prompt: a thread to reply to. */
interface VoicePrompt {
  // Identifier; used as the list key and shown in the picker badge.
  tag: string
  thread_title: string
  thread_body: string
}
|
||||
|
||||
// ── Writing style prompts (mirrors TEST_PROMPTS in benchmark_style.py) ──────
|
||||
const STYLE_SYSTEM = "You are a writing assistant. Your job is to write a Reddit reply that matches the user's voice — casual, direct, community-first. No em dashes. No filler phrases. No semicolons. Short punchy sentences."
|
||||
|
||||
const STYLE_PROMPTS: VoicePrompt[] = [
|
||||
{
|
||||
tag: 'selfhosted_ai_fatigue',
|
||||
thread_title: "Anyone else getting tired of re-explaining their setup every time an AI model forgets?",
|
||||
thread_body: "Every session I start over. My whole hardware setup, what tools I use, what I've already tried. It's exhausting. There has to be a better way.",
|
||||
},
|
||||
{
|
||||
tag: 'privacy_local_llm',
|
||||
thread_title: "What's the point of running local LLMs if the apps still phone home?",
|
||||
thread_body: "I went through all the trouble of setting up ollama and now I find out the frontend I'm using is sending telemetry. Kind of defeats the purpose.",
|
||||
},
|
||||
{
|
||||
tag: 'solarpunk_tech',
|
||||
thread_title: "What does solarpunk computing actually look like in practice?",
|
||||
thread_body: "I keep seeing the aesthetic but not a lot of concrete examples of people living it out with their tech choices. What does it mean day to day?",
|
||||
},
|
||||
{
|
||||
tag: 'nd_tools',
|
||||
thread_title: "Tools that actually help with executive function vs ones that just add friction",
|
||||
thread_body: "I've tried a dozen productivity apps and most of them require more executive function to maintain than they save. What actually sticks for you?",
|
||||
},
|
||||
{
|
||||
tag: 'data_ownership',
|
||||
thread_title: "Who actually owns your data when you use a 'free' AI tool?",
|
||||
thread_body: "Read the ToS on three different AI assistants today. In all three cases your inputs can be used for training, shared with partners, and retained indefinitely. Is this just accepted now?",
|
||||
},
|
||||
{
|
||||
tag: 'digital_culture',
|
||||
thread_title: "The internet used to feel like it belonged to everyone. What happened?",
|
||||
thread_body: "I grew up on forums, IRC, personal homepages. Now everything is a platform owned by someone trying to extract value from the community that built it.",
|
||||
},
|
||||
]
|
||||
|
||||
// ── State ───────────────────────────────────────────────────────────────────
|
||||
// Data loaded from the cf-orch endpoints on mount.
const llmTasks = ref<CfOrchTask[]>([])
const llmTasksLoading = ref(false)
const llmModels = ref<CfOrchModel[]>([])

// Which picker feeds the prompt editor, and the current pick from each.
const promptSource = ref<'tasks' | 'style'>('tasks')
const cmpSelectedTask = ref<CfOrchTask | null>(null)
const selectedVoicePrompt = ref<VoicePrompt | null>(null)
// System prompt sent with the run; set by the pickers, not editable in the template.
const cmpSystemPrompt = ref('')
// User-editable prompt text (bound to the textarea).
const cmpPrompt = ref('')
// Ids of the models ticked for comparison; replaced wholesale on every change.
const cmpSelectedModels = ref<Set<string>>(new Set())
// Run state: progress flag, log lines, per-model results, open SSE stream.
const cmpRunning = ref(false)
const cmpLog = ref<string[]>([])
const cmpResults = ref<CmpResult[]>([])
const cmpEventSource = ref<EventSource | null>(null)
|
||||
|
||||
// ── Computed ────────────────────────────────────────────────────────────────
|
||||
// Subset of the loaded models whose service is 'ollama'.
const ollamaLlmModels = computed(() => {
  const loaded = llmModels.value
  return loaded.filter(({ service }) => service === 'ollama')
})
|
||||
|
||||
// Loaded tasks bucketed by their `type`, keeping first-seen order of types and tasks.
const llmTasksByType = computed((): Record<string, CfOrchTask[]> =>
  llmTasks.value.reduce<Record<string, CfOrchTask[]>>((buckets, task) => {
    const bucket = buckets[task.type] || (buckets[task.type] = [])
    bucket.push(task)
    return buckets
  }, {})
)
|
||||
|
||||
// ── Helpers ─────────────────────────────────────────────────────────────────
|
||||
/** Select a cf-orch task: load its prompts into the editor and clear the last run. */
function selectCmpTask(t: CfOrchTask) {
  // Output of a previous run no longer matches the prompt.
  cmpLog.value = []
  cmpResults.value = []

  cmpSelectedTask.value = t
  cmpSystemPrompt.value = t.system || ''
  cmpPrompt.value = t.prompt || ''
}
|
||||
|
||||
/** Select a writing-style prompt: build the thread prompt and clear the last run. */
function selectVoicePrompt(vp: VoicePrompt) {
  // Output of a previous run no longer matches the prompt.
  cmpLog.value = []
  cmpResults.value = []

  selectedVoicePrompt.value = vp
  cmpSystemPrompt.value = STYLE_SYSTEM
  // Title line, thread body and instruction, separated by blank lines.
  const sections = ['Thread: ' + vp.thread_title, vp.thread_body, 'Write a reply:']
  cmpPrompt.value = sections.join('\n\n')
}
|
||||
|
||||
/** Add or remove one model id from the selection. */
function toggleCmpModel(id: string, checked: boolean) {
  // Work on a copy and reassign so the ref sees a new Set.
  const updated = new Set(cmpSelectedModels.value)
  if (checked) {
    updated.add(id)
  } else {
    updated.delete(id)
  }
  cmpSelectedModels.value = updated
}
|
||||
|
||||
/** Select every ollama model, or clear the selection. */
function toggleAllCmpModels(checked: boolean) {
  const ids = checked ? ollamaLlmModels.value.map(({ id }) => id) : []
  cmpSelectedModels.value = new Set<string>(ids)
}
|
||||
|
||||
// ── Data loaders ──────────────────────────────────────────────────────────────
|
||||
/**
 * Fetch the cf-orch task list into `llmTasks`.
 * The loading flag is cleared in `finally`, so a rejected fetch cannot leave
 * the picker stuck on "Loading tasks…". A missing `tasks` field leaves the
 * current list untouched.
 */
async function loadLlmTasks() {
  llmTasksLoading.value = true
  try {
    const { data } = await useApiFetch<{ tasks: CfOrchTask[]; types: string[] }>('/api/cforch/tasks')
    if (data?.tasks) {
      llmTasks.value = data.tasks
    }
  } finally {
    llmTasksLoading.value = false
  }
}
|
||||
|
||||
/** Fetch the cf-orch model list and pre-select every ollama model. */
async function loadLlmModels() {
  const { data } = await useApiFetch<{ models: CfOrchModel[] }>('/api/cforch/models')
  const models = data?.models
  if (!models) return

  llmModels.value = models
  // Pre-select all ollama models
  const ollamaIds = models.filter(m => m.service === 'ollama').map(m => m.id)
  cmpSelectedModels.value = new Set(ollamaIds)
}
|
||||
|
||||
// ── Run / cancel ──────────────────────────────────────────────────────────────
|
||||
/**
 * Start a comparison run: open a server-sent-event stream for the current
 * prompt and selected models, and fold its events into the log and results.
 *
 * Does nothing when the prompt is blank or no model is selected.
 * Handled event types: `start`, `model_start`, `model_done`, `complete`.
 *
 * NOTE(review): the stream URL is `/api/imitate/run` — confirm this is the
 * intended backend route for this tab.
 */
function startCompare() {
  if (!cmpPrompt.value.trim() || cmpSelectedModels.value.size === 0) return

  // Never leave an earlier stream running alongside the new one.
  cmpEventSource.value?.close()

  cmpRunning.value = true
  cmpResults.value = []
  cmpLog.value = []

  const params = new URLSearchParams({
    prompt: cmpPrompt.value,
    model_ids: [...cmpSelectedModels.value].join(','),
    system: cmpSystemPrompt.value,
  })

  const es = new EventSource(`/api/imitate/run?${params}`)
  cmpEventSource.value = es

  // Shared teardown so the success and error paths leave identical state.
  const finish = () => {
    cmpRunning.value = false
    es.close()
    cmpEventSource.value = null
  }

  es.onmessage = (event: MessageEvent) => {
    try {
      const msg = JSON.parse(event.data)
      if (msg.type === 'start') {
        cmpLog.value.push(`Comparing ${msg.total_models} models…`)
      } else if (msg.type === 'model_start') {
        cmpLog.value.push(`→ ${msg.model}…`)
      } else if (msg.type === 'model_done') {
        const status = msg.error
          ? `✕ ${msg.error}`
          : `✓ ${(msg.elapsed_ms / 1000).toFixed(1)}s`
        cmpLog.value.push(` ${msg.model}: ${status}`)
        cmpResults.value.push({
          model: msg.model,
          response: msg.response,
          elapsed_ms: msg.elapsed_ms,
          error: msg.error ?? null,
        })
      } else if (msg.type === 'complete') {
        finish()
      }
    } catch { /* ignore malformed frames */ }
  }

  es.onerror = () => {
    cmpLog.value.push('Connection error.')
    // Closing here also stops EventSource from reconnecting automatically.
    finish()
  }
}
|
||||
|
||||
/** Abort the running comparison: close the stream and note it in the log. */
function cancelCompare() {
  const stream = cmpEventSource.value
  if (stream) {
    stream.close()
  }
  cmpEventSource.value = null
  cmpRunning.value = false
  cmpLog.value.push('Cancelled.')
}
|
||||
|
||||
// Load the task and model pickers as soon as the tab is mounted.
onMounted(() => {
  loadLlmTasks()
  loadLlmModels()
})

// Close an in-flight comparison stream when the tab is torn down; otherwise
// the connection stays open with nothing left to display its events.
onUnmounted(() => {
  cmpEventSource.value?.close()
  cmpEventSource.value = null
})
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.compare-tab {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1.75rem;
|
||||
}
|
||||
|
||||
/* ── Source toggle ──────────────────────────────────────── */
|
||||
.source-toggle {
|
||||
display: inline-flex;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
align-self: flex-start;
|
||||
}
|
||||
|
||||
.source-btn {
|
||||
padding: 0.4rem 1rem;
|
||||
font-size: 0.83rem;
|
||||
font-family: var(--font-body, sans-serif);
|
||||
font-weight: 500;
|
||||
border: none;
|
||||
background: var(--color-surface, #fff);
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
cursor: pointer;
|
||||
transition: background 0.15s, color 0.15s;
|
||||
}
|
||||
.source-btn:not(:last-child) { border-right: 1px solid var(--color-border, #d0d7e8); }
|
||||
.source-btn.active { background: var(--app-primary, #2A6080); color: #fff; }
|
||||
.source-btn:not(.active):hover { background: var(--color-surface-raised, #e4ebf5); }
|
||||
|
||||
/* ── Voice prompt list ──────────────────────────────────── */
|
||||
.style-prompt-list { flex-direction: column !important; flex-wrap: nowrap !important; padding-left: 0 !important; gap: 0.4rem !important; }
|
||||
|
||||
.style-prompt-row {
|
||||
flex-direction: column !important;
|
||||
align-items: flex-start !important;
|
||||
gap: 0.15rem !important;
|
||||
padding: 0.5rem 0.6rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.35rem;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
cursor: pointer;
|
||||
transition: background 0.1s;
|
||||
}
|
||||
.style-prompt-row:hover { background: var(--color-surface-raised, #e4ebf5); }
|
||||
.style-prompt-row:has(input:checked) {
|
||||
background: color-mix(in srgb, var(--app-primary, #2A6080) 10%, transparent);
|
||||
border-color: var(--app-primary, #2A6080);
|
||||
}
|
||||
.style-prompt-row input { display: none; }
|
||||
|
||||
.style-prompt-tag {
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.72rem;
|
||||
color: var(--app-primary, #2A6080);
|
||||
font-weight: 700;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
}
|
||||
|
||||
.style-prompt-title {
|
||||
font-size: 0.83rem;
|
||||
color: var(--color-text, #1a2338);
|
||||
line-height: 1.4;
|
||||
}
|
||||
|
||||
/* ── Buttons ────────────────────────────────────────────── */
|
||||
.btn-run {
|
||||
padding: 0.45rem 1.1rem;
|
||||
border-radius: 0.375rem;
|
||||
border: none;
|
||||
background: var(--app-primary, #2A6080);
|
||||
color: #fff;
|
||||
font-size: 0.88rem;
|
||||
font-family: var(--font-body, sans-serif);
|
||||
cursor: pointer;
|
||||
transition: opacity 0.15s;
|
||||
}
|
||||
.btn-run:disabled { opacity: 0.5; cursor: not-allowed; }
|
||||
.btn-run:not(:disabled):hover { opacity: 0.85; }
|
||||
|
||||
.btn-cancel {
|
||||
padding: 0.45rem 0.9rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--color-text-secondary, #6b7a99);
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
border-radius: 0.4rem;
|
||||
font-size: 0.85rem;
|
||||
font-weight: 500;
|
||||
cursor: pointer;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.btn-cancel:hover {
|
||||
background: color-mix(in srgb, var(--color-text-secondary, #6b7a99) 12%, transparent);
|
||||
}
|
||||
|
||||
/* ── Run controls row ───────────────────────────────────── */
|
||||
.run-controls {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.75rem;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
/* ── Run log ────────────────────────────────────────────── */
|
||||
.run-log {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.78rem;
|
||||
}
|
||||
|
||||
.log-lines {
|
||||
max-height: 160px;
|
||||
overflow-y: auto;
|
||||
padding: 0.5rem 0.75rem;
|
||||
background: var(--color-surface, #fff);
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.1rem;
|
||||
}
|
||||
|
||||
.log-line { color: var(--color-text, #1a2338); line-height: 1.5; }
|
||||
|
||||
/* ── Chart title ────────────────────────────────────────── */
|
||||
.chart-title {
|
||||
font-size: 0.95rem;
|
||||
font-weight: 600;
|
||||
color: var(--color-text, #1a2338);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
/* ── Model Picker ───────────────────────────────────────── */
|
||||
.model-picker {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.picker-summary {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.6rem;
|
||||
padding: 0.65rem 0.9rem;
|
||||
cursor: pointer;
|
||||
user-select: none;
|
||||
list-style: none;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
}
|
||||
.picker-summary::-webkit-details-marker { display: none; }
|
||||
.picker-summary::before { content: '▶ '; font-size: 0.65rem; color: var(--color-text-secondary, #6b7a99); }
|
||||
details[open] .picker-summary::before { content: '▼ '; }
|
||||
|
||||
.picker-title {
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.picker-badge {
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
background: var(--color-surface, #fff);
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
padding: 0.15rem 0.5rem;
|
||||
border-radius: 1rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
margin-left: auto;
|
||||
}
|
||||
|
||||
.picker-body {
|
||||
padding: 0.75rem;
|
||||
border-top: 1px solid var(--color-border, #d0d7e8);
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.picker-loading, .picker-empty {
|
||||
font-size: 0.85rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
padding: 0.5rem 0;
|
||||
}
|
||||
|
||||
.picker-category {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.3rem;
|
||||
}
|
||||
|
||||
.picker-cat-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.45rem;
|
||||
font-size: 0.82rem;
|
||||
font-weight: 700;
|
||||
color: var(--color-text, #1a2338);
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.picker-cat-name { /* inherits from cat-header or section */ }
|
||||
|
||||
.picker-cat-section {
|
||||
font-weight: 600;
|
||||
font-size: 0.82rem;
|
||||
padding: 0.35rem 0;
|
||||
display: block;
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.picker-model-list {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 0.35rem 0.75rem;
|
||||
padding-left: 1.4rem;
|
||||
}
|
||||
|
||||
.picker-model-row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.35rem;
|
||||
font-size: 0.82rem;
|
||||
cursor: pointer;
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.picker-model-name {
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.78rem;
|
||||
white-space: nowrap;
|
||||
max-width: 18ch;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
|
||||
.picker-adapter-type {
|
||||
font-size: 0.68rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.25rem;
|
||||
padding: 0.05rem 0.3rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
}
|
||||
|
||||
/* ── Prompt editor ──────────────────────────────────────── */
|
||||
.prompt-label {
|
||||
font-size: 0.85rem;
|
||||
font-weight: 600;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
margin-top: 0.5rem;
|
||||
}
|
||||
|
||||
.cmp-prompt-editor {
|
||||
width: 100%;
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.85rem;
|
||||
padding: 0.75rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.375rem;
|
||||
background: var(--color-surface, #f0f4fc);
|
||||
color: var(--color-text, #1a2338);
|
||||
resize: vertical;
|
||||
line-height: 1.5;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
.cmp-prompt-editor:focus {
|
||||
outline: 2px solid var(--app-primary, #2A6080);
|
||||
outline-offset: -1px;
|
||||
}
|
||||
|
||||
/* ── Results grid ───────────────────────────────────────── */
|
||||
.cmp-results-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
|
||||
gap: 1rem;
|
||||
margin-top: 0.5rem;
|
||||
}
|
||||
|
||||
.cmp-result-card {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
background: var(--color-surface, #f0f4fc);
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.cmp-result-card.cmp-error {
|
||||
border-color: #fca5a5;
|
||||
}
|
||||
|
||||
.cmp-result-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 0.5rem 0.75rem;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
|
||||
.cmp-model-name {
|
||||
font-size: 0.82rem;
|
||||
font-weight: 600;
|
||||
color: var(--color-text, #1a2338);
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.cmp-meta {
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
flex-shrink: 0;
|
||||
margin-left: 0.5rem;
|
||||
}
|
||||
|
||||
.err-badge {
|
||||
background: #fee2e2;
|
||||
color: #991b1b;
|
||||
border-radius: 9999px;
|
||||
padding: 0.1rem 0.45rem;
|
||||
font-size: 0.7rem;
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.cmp-response, .cmp-error-text {
|
||||
padding: 0.75rem;
|
||||
font-size: 0.82rem;
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
max-height: 300px;
|
||||
overflow-y: auto;
|
||||
margin: 0;
|
||||
flex: 1;
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.cmp-error-text { color: #b91c1c; }
|
||||
|
||||
@media (max-width: 600px) {
|
||||
.picker-model-list { padding-left: 0; }
|
||||
.picker-model-name { max-width: 14ch; }
|
||||
}
|
||||
</style>
|
||||
715
web/src/views/LlmEvalTab.vue
Normal file
715
web/src/views/LlmEvalTab.vue
Normal file
|
|
@ -0,0 +1,715 @@
|
|||
<template>
|
||||
<div class="llm-eval-tab">
|
||||
|
||||
<!-- Task Selection -->
|
||||
<details class="model-picker" open>
|
||||
<summary class="picker-summary">
|
||||
<span class="picker-title">📋 Task Selection</span>
|
||||
<span class="picker-badge">{{ llmTaskBadge }}</span>
|
||||
</summary>
|
||||
<div class="picker-body">
|
||||
<div v-if="llmTasksLoading" class="picker-loading">Loading tasks…</div>
|
||||
<div v-else-if="Object.keys(llmTasksByType).length === 0" class="picker-empty">
|
||||
No tasks found — check API connection.
|
||||
</div>
|
||||
<template v-else>
|
||||
<div v-for="(tasks, type) in llmTasksByType" :key="type" class="picker-category">
|
||||
<label class="picker-cat-header">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="isTaskTypeAllSelected(tasks)"
|
||||
:indeterminate="isTaskTypeIndeterminate(tasks)"
|
||||
@change="toggleTaskType(tasks, ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="picker-cat-name">{{ type }}</span>
|
||||
<span class="picker-cat-count">({{ tasks.length }})</span>
|
||||
</label>
|
||||
<div class="picker-model-list">
|
||||
<label v-for="t in tasks" :key="t.id" class="picker-model-row">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="selectedLlmTasks.has(t.id)"
|
||||
@change="toggleLlmTask(t.id, ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="picker-model-name" :title="t.name">{{ t.name }}</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Model Selection -->
|
||||
<details class="model-picker" open>
|
||||
<summary class="picker-summary">
|
||||
<span class="picker-title">🎯 Model Selection</span>
|
||||
<span class="picker-badge">{{ llmModelBadge }}</span>
|
||||
</summary>
|
||||
<div class="picker-body">
|
||||
<div v-if="llmModelsLoading" class="picker-loading">Loading models…</div>
|
||||
<div v-else-if="Object.keys(llmModelsByService).length === 0" class="picker-empty">
|
||||
No models found — check cf-orch connection.
|
||||
</div>
|
||||
<template v-else>
|
||||
<div v-for="(models, service) in llmModelsByService" :key="service" class="picker-category">
|
||||
<label class="picker-cat-header">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="isServiceAllSelected(models)"
|
||||
:indeterminate="isServiceIndeterminate(models)"
|
||||
@change="toggleService(models, ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="picker-cat-name">{{ service }}</span>
|
||||
<span class="picker-cat-count">({{ models.length }})</span>
|
||||
</label>
|
||||
<div class="picker-model-list">
|
||||
<label v-for="m in models" :key="m.id" class="picker-model-row">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="selectedLlmModels.has(m.id)"
|
||||
@change="toggleLlmModel(m.id, ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="picker-model-name" :title="m.name">{{ m.name }}</span>
|
||||
<span class="picker-adapter-type" v-if="m.tags.length">{{ m.tags.join(', ') }}</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Run Controls -->
|
||||
<div class="run-controls">
|
||||
<button
|
||||
class="btn-run"
|
||||
:disabled="llmRunning || selectedLlmTasks.size === 0 || selectedLlmModels.size === 0"
|
||||
@click="startLlmBenchmark"
|
||||
>
|
||||
{{ llmRunning ? '⏳ Running…' : '▶ Run LLM Eval' }}
|
||||
</button>
|
||||
<button v-if="llmRunning" class="btn-cancel" @click="cancelLlmBenchmark">✕ Cancel</button>
|
||||
<span v-if="selectedLlmTasks.size === 0 || selectedLlmModels.size === 0" class="run-hint">
|
||||
Select at least one task and one model to run.
|
||||
</span>
|
||||
</div>
|
||||
|
||||
<!-- Progress log -->
|
||||
<div v-if="llmRunning || llmRunLog.length" class="run-log">
|
||||
<div class="run-log-title">
|
||||
<span>{{ llmRunning ? '⏳ Running LLM eval…' : llmError ? '❌ Failed' : '✅ Done' }}</span>
|
||||
<button class="btn-ghost" @click="llmRunLog = []; llmError = ''">Clear</button>
|
||||
</div>
|
||||
<div class="log-lines" ref="llmLogEl">
|
||||
<div
|
||||
v-for="(line, i) in llmRunLog"
|
||||
:key="i"
|
||||
class="log-line"
|
||||
:class="{ 'log-error': line.startsWith('ERROR') || line.startsWith('[error]') }"
|
||||
>{{ line }}</div>
|
||||
</div>
|
||||
<p v-if="llmError" class="run-error">{{ llmError }}</p>
|
||||
</div>
|
||||
|
||||
<!-- Results table -->
|
||||
<template v-if="llmResults.length > 0">
|
||||
<h2 class="chart-title">LLM Eval Results</h2>
|
||||
<div class="heatmap-scroll">
|
||||
<table class="heatmap llm-results-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="hm-label-col">Model</th>
|
||||
<th class="hm-model-col">overall</th>
|
||||
<th v-for="col in llmTaskTypeCols" :key="col" class="hm-model-col">{{ col }}</th>
|
||||
<th class="hm-model-col">tok/s</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr v-for="row in llmResults" :key="row.model_id">
|
||||
<td class="hm-label-cell llm-model-name-cell" :title="row.model_id">{{ row.model_name }}</td>
|
||||
<td
|
||||
class="hm-value-cell"
|
||||
:class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }"
|
||||
>{{ pct(row.avg_quality_score) }}</td>
|
||||
<td
|
||||
v-for="col in llmTaskTypeCols"
|
||||
:key="col"
|
||||
class="hm-value-cell"
|
||||
:class="{ 'bt-best': llmBestByCol[col] === row.model_id }"
|
||||
>{{ row.quality_by_task_type[col] != null ? pct(row.quality_by_task_type[col]) : '—' }}</td>
|
||||
<td class="hm-value-cell llm-tps-cell">{{ row.avg_tokens_per_sec.toFixed(1) }}</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<p class="heatmap-hint">Run LLM Eval to refresh. Green = best per column.</p>
|
||||
</template>
|
||||
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import { ref, computed, onMounted, onBeforeUnmount, nextTick } from 'vue'
import { useApiFetch } from '../composables/useApi'
|
||||
|
||||
// ── Types ───────────────────────────────────────────────────────────────────
|
||||
/** One entry of the `tasks` array returned by `/api/cforch/tasks`. */
interface CfOrchTask {
  /** Key used for selection state; joined into the `task_ids` query parameter on run. */
  id: string
  /** Label rendered next to the task's checkbox. */
  name: string
  /** Category the task picker groups by. */
  type: string
  // prompt and system are carried on the payload but not read by this view.
  prompt: string
  system: string
}
|
||||
|
||||
/** One entry of the `models` array returned by `/api/cforch/models`. */
interface CfOrchModel {
  /** Label rendered next to the model's checkbox. */
  name: string
  /** Key used for selection state. */
  id: string
  /** Category the model picker groups by. */
  service: string
  /** Shown comma-joined beside the model name when non-empty. */
  tags: string[]
  // Optional; not read by this view.
  vram_estimate_mb?: number
}
|
||||
|
||||
/** One row of the results table (from `/api/cforch/results` or a streamed `result` event). */
interface LlmModelResult {
  /** Text shown in the first column. */
  model_name: string
  /** Row key; also the value compared when marking the best cell per column. */
  model_id: string
  node_id: string
  /** Rendered with one decimal place in the tok/s column. */
  avg_tokens_per_sec: number
  avg_completion_ms: number
  /** Rendered through `pct()`, i.e. multiplied by 100 for display. */
  avg_quality_score: number
  finetune_candidates: number
  error_count: number
  /** Per-task-type scores; the union of keys across rows defines the extra columns. */
  quality_by_task_type: Record<string, number>
}
|
||||
|
||||
// ── State ───────────────────────────────────────────────────────────────────
|
||||
// Catalogues shown in the two pickers, each with its own loading flag.
const llmTasks = ref<CfOrchTask[]>([])
const llmTasksLoading = ref(false)
const llmModels = ref<CfOrchModel[]>([])
const llmModelsLoading = ref(false)

// Selections are replaced wholesale (never mutated in place) by the toggle helpers.
const selectedLlmTasks = ref(new Set<string>())
const selectedLlmModels = ref(new Set<string>())

// Run lifecycle: running flag, streamed log lines, last error text, result rows.
const llmRunning = ref(false)
const llmRunLog = ref<string[]>([])
const llmError = ref('')
const llmResults = ref<LlmModelResult[]>([])

// Handle to the open stream (null when idle) and the scrollable log container.
const llmEventSource = ref<EventSource | null>(null)
const llmLogEl = ref<HTMLElement | null>(null)
|
||||
|
||||
// ── Computed ────────────────────────────────────────────────────────────────
|
||||
/** Tasks bucketed by their `type`, preserving the order in which they were loaded. */
const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
  const byType: Record<string, CfOrchTask[]> = {}
  llmTasks.value.forEach(task => {
    // Create the bucket the first time a type is seen.
    const bucket = byType[task.type] ?? (byType[task.type] = [])
    bucket.push(task)
  })
  return byType
})
|
||||
|
||||
/** Models bucketed by their `service`, preserving the order in which they were loaded. */
const llmModelsByService = computed((): Record<string, CfOrchModel[]> => {
  const byService: Record<string, CfOrchModel[]> = {}
  llmModels.value.forEach(model => {
    // Create the bucket the first time a service is seen.
    const bucket = byService[model.service] ?? (byService[model.service] = [])
    bucket.push(model)
  })
  return byService
})
|
||||
|
||||
/** Summary text for the task picker header: none available, all selected, or "x of y". */
const llmTaskBadge = computed(() => {
  const available = llmTasks.value.length
  if (available === 0) return 'No tasks available'

  const chosen = selectedLlmTasks.value.size
  return chosen === available
    ? `All tasks (${available})`
    : `${chosen} of ${available} tasks selected`
})
|
||||
|
||||
/** Summary text for the model picker header: none available, all selected, or "x of y". */
const llmModelBadge = computed(() => {
  const available = llmModels.value.length
  if (available === 0) return 'No models available'

  const chosen = selectedLlmModels.value.size
  return chosen === available
    ? `All models (${available})`
    : `${chosen} of ${available} selected`
})
|
||||
|
||||
/** Sorted, de-duplicated task-type keys found across all result rows (the extra table columns). */
const llmTaskTypeCols = computed(() => {
  const seen = new Set<string>()
  llmResults.value.forEach(row => {
    Object.keys(row.quality_by_task_type).forEach(key => seen.add(key))
  })
  return Array.from(seen).sort()
})
|
||||
|
||||
/**
 * Map each results-table column ('overall' plus every task-type column) to the
 * `model_id` of the row with the highest score in that column.
 *
 * Returns an empty map when there are no results. A column whose rows carry no
 * comparable score maps to ''.
 */
const llmBestByCol = computed((): Record<string, string> => {
  const rows = llmResults.value
  const winners: Record<string, string> = {}
  if (rows.length === 0) return winners

  // Strict `>` means the earliest row wins a tie.
  const leader = (scoreOf: (row: LlmModelResult) => number): string => {
    let topId = ''
    let topScore = -Infinity
    for (const row of rows) {
      const score = scoreOf(row)
      if (score > topScore) {
        topScore = score
        topId = row.model_id
      }
    }
    return topId
  }

  winners['overall'] = leader(row => row.avg_quality_score)

  for (const col of llmTaskTypeCols.value) {
    // A missing per-type score becomes NaN, which never compares greater, so that row is skipped.
    winners[col] = leader(row => row.quality_by_task_type[col] ?? NaN)
  }
  return winners
})
|
||||
|
||||
// ── Helpers ─────────────────────────────────────────────────────────────────
|
||||
/**
 * Format a fractional score as a percentage with one decimal place.
 *
 * @param v Fraction to format (0.5 -> "50.0%"). Missing or non-finite values
 *          (null, undefined, NaN, ±Infinity) are accepted.
 * @returns The formatted percentage, or "—" when there is no finite value to
 *          show, the same placeholder the results table uses for absent
 *          per-task scores, instead of a misleading "0.0%" or "NaN%".
 */
function pct(v: number | null | undefined): string {
  if (v == null || !Number.isFinite(v)) return '—'
  return `${(v * 100).toFixed(1)}%`
}
|
||||
|
||||
// Task picker helpers
|
||||
/** True when the group is non-empty and no task in it is unselected. */
function isTaskTypeAllSelected(tasks: CfOrchTask[]): boolean {
  if (tasks.length === 0) return false
  const selected = selectedLlmTasks.value
  return !tasks.some(task => !selected.has(task.id))
}
|
||||
/** True when some, but not all, tasks of the group are selected. */
function isTaskTypeIndeterminate(tasks: CfOrchTask[]): boolean {
  const selected = selectedLlmTasks.value
  const picked = tasks.filter(task => selected.has(task.id)).length
  return picked > 0 && picked < tasks.length
}
|
||||
/** Add or remove a single task id, replacing the selection set so the change is reactive. */
function toggleLlmTask(id: string, checked: boolean) {
  const updated = new Set(selectedLlmTasks.value)
  if (checked) {
    updated.add(id)
  } else {
    updated.delete(id)
  }
  selectedLlmTasks.value = updated
}
|
||||
/** Select or deselect every task of one group in a single selection update. */
function toggleTaskType(tasks: CfOrchTask[], checked: boolean) {
  const updated = new Set(selectedLlmTasks.value)
  tasks.forEach(task => {
    if (checked) updated.add(task.id)
    else updated.delete(task.id)
  })
  selectedLlmTasks.value = updated
}
|
||||
|
||||
// Model picker helpers
|
||||
/** True when the group is non-empty and no model in it is unselected. */
function isServiceAllSelected(models: CfOrchModel[]): boolean {
  if (models.length === 0) return false
  const selected = selectedLlmModels.value
  return !models.some(model => !selected.has(model.id))
}
|
||||
/** True when some, but not all, models of the group are selected. */
function isServiceIndeterminate(models: CfOrchModel[]): boolean {
  const selected = selectedLlmModels.value
  const picked = models.filter(model => selected.has(model.id)).length
  return picked > 0 && picked < models.length
}
|
||||
/** Add or remove a single model id, replacing the selection set so the change is reactive. */
function toggleLlmModel(id: string, checked: boolean) {
  const updated = new Set(selectedLlmModels.value)
  if (checked) {
    updated.add(id)
  } else {
    updated.delete(id)
  }
  selectedLlmModels.value = updated
}
|
||||
/** Select or deselect every model of one group in a single selection update. */
function toggleService(models: CfOrchModel[], checked: boolean) {
  const updated = new Set(selectedLlmModels.value)
  models.forEach(model => {
    if (checked) updated.add(model.id)
    else updated.delete(model.id)
  })
  selectedLlmModels.value = updated
}
|
||||
|
||||
// ── Data loaders ─────────────────────────────────────────────────────────────
|
||||
/**
 * Fetch the task catalogue from `/api/cforch/tasks` and pre-select every task.
 *
 * The loading flag is cleared in `finally`, so the picker cannot stay stuck on
 * "Loading tasks…" if the request helper rejects. When the response carries no
 * `tasks`, the current list and selection are left untouched.
 */
async function loadLlmTasks() {
  llmTasksLoading.value = true
  try {
    const { data } = await useApiFetch<{ tasks: CfOrchTask[]; types: string[] }>('/api/cforch/tasks')
    if (data?.tasks) {
      llmTasks.value = data.tasks
      selectedLlmTasks.value = new Set(data.tasks.map(t => t.id))
    }
  } finally {
    llmTasksLoading.value = false
  }
}
|
||||
|
||||
/**
 * Fetch the model catalogue from `/api/cforch/models` and pre-select every model.
 *
 * The loading flag is cleared in `finally`, so the picker cannot stay stuck on
 * "Loading models…" if the request helper rejects. When the response carries no
 * `models`, the current list and selection are left untouched.
 */
async function loadLlmModels() {
  llmModelsLoading.value = true
  try {
    const { data } = await useApiFetch<{ models: CfOrchModel[] }>('/api/cforch/models')
    if (data?.models) {
      llmModels.value = data.models
      selectedLlmModels.value = new Set(data.models.map(m => m.id))
    }
  } finally {
    llmModelsLoading.value = false
  }
}
|
||||
|
||||
/** Fetch existing result rows; anything other than a non-empty array leaves the table as it is. */
async function loadLlmResults() {
  const response = await useApiFetch<LlmModelResult[]>('/api/cforch/results')
  const rows = response.data
  if (!Array.isArray(rows) || rows.length === 0) return
  llmResults.value = rows
}
|
||||
|
||||
// ── Run / cancel ──────────────────────────────────────────────────────────────
|
||||
/**
 * Start an LLM eval run and stream its progress over Server-Sent Events.
 *
 * Resets the log and error state, opens an EventSource on `/api/cforch/run`
 * with the selected task ids, and dispatches on each frame's `type`:
 *   - `progress`: append `message` to the log and scroll it into view
 *   - `result`:   replace the results table with `summary`
 *   - `complete`: end the run
 *   - `error`:    record `message` and end the run
 * Frames that are not valid JSON objects are logged to the console and
 * skipped rather than throwing inside the event handler. A transport error
 * ends the run with "Connection lost".
 */
function startLlmBenchmark() {
  llmRunning.value = true
  llmRunLog.value = []
  llmError.value = ''

  const params = new URLSearchParams()
  const taskIds = [...selectedLlmTasks.value].join(',')
  if (taskIds) params.set('task_ids', taskIds)
  // NOTE(review): selectedLlmModels gates the Run button but is not included in
  // this request. Confirm whether /api/cforch/run should receive the model selection.

  const es = new EventSource(`/api/cforch/run?${params}`)
  llmEventSource.value = es

  // Single teardown path; closing explicitly also stops EventSource auto-reconnect.
  const finish = () => {
    llmRunning.value = false
    es.close()
    llmEventSource.value = null
  }

  es.onmessage = async (e: MessageEvent) => {
    let msg: any
    try {
      msg = JSON.parse(e.data)
    } catch {
      console.warn('LLM eval: ignoring non-JSON event', e.data)
      return
    }
    // JSON.parse can yield null or a primitive; only objects carry a `type`.
    if (msg === null || typeof msg !== 'object') return

    if (msg.type === 'progress' && typeof msg.message === 'string') {
      llmRunLog.value.push(msg.message)
      // Wait for the new line to render so scrollHeight includes it.
      await nextTick()
      llmLogEl.value?.scrollTo({ top: llmLogEl.value.scrollHeight, behavior: 'smooth' })
    } else if (msg.type === 'result' && Array.isArray(msg.summary)) {
      llmResults.value = msg.summary
    } else if (msg.type === 'complete') {
      finish()
    } else if (msg.type === 'error' && typeof msg.message === 'string') {
      llmError.value = msg.message
      finish()
    }
  }

  es.onerror = () => {
    // Only report a lost connection if the run had not already ended.
    if (llmRunning.value) llmError.value = 'Connection lost'
    finish()
  }
}
|
||||
|
||||
/** Stop listening to the current run, mark it finished, then ask the server to cancel it. */
async function cancelLlmBenchmark() {
  const stream = llmEventSource.value
  if (stream) stream.close()
  llmEventSource.value = null
  llmRunning.value = false

  try {
    await fetch('/api/cforch/cancel', { method: 'POST' })
  } catch {
    // Best-effort: a failed cancel request is deliberately ignored.
  }
}
|
||||
|
||||
// Kick off the three independent loads (tasks, models, existing results) on mount.
onMounted(() => {
  loadLlmTasks()
  loadLlmModels()
  loadLlmResults()
})

// Close any in-flight stream when the tab is torn down so its handlers stop
// writing to this instance's state. Only the client connection is closed here;
// no cancel request is sent to the server.
onBeforeUnmount(() => {
  llmEventSource.value?.close()
  llmEventSource.value = null
})
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.llm-eval-tab {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1.75rem;
|
||||
}
|
||||
|
||||
/* ── Buttons ────────────────────────────────────────────── */
|
||||
.btn-run {
|
||||
padding: 0.45rem 1.1rem;
|
||||
border-radius: 0.375rem;
|
||||
border: none;
|
||||
background: var(--app-primary, #2A6080);
|
||||
color: #fff;
|
||||
font-size: 0.88rem;
|
||||
font-family: var(--font-body, sans-serif);
|
||||
cursor: pointer;
|
||||
transition: opacity 0.15s;
|
||||
}
|
||||
.btn-run:disabled { opacity: 0.5; cursor: not-allowed; }
|
||||
.btn-run:not(:disabled):hover { opacity: 0.85; }
|
||||
|
||||
.btn-cancel {
|
||||
padding: 0.45rem 0.9rem;
|
||||
background: transparent;
|
||||
border: 1px solid var(--color-text-secondary, #6b7a99);
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
border-radius: 0.4rem;
|
||||
font-size: 0.85rem;
|
||||
font-weight: 500;
|
||||
cursor: pointer;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.btn-cancel:hover {
|
||||
background: color-mix(in srgb, var(--color-text-secondary, #6b7a99) 12%, transparent);
|
||||
}
|
||||
|
||||
.btn-ghost {
|
||||
background: none;
|
||||
border: none;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
cursor: pointer;
|
||||
font-size: 0.78rem;
|
||||
padding: 0.1rem 0.3rem;
|
||||
border-radius: 0.2rem;
|
||||
}
|
||||
.btn-ghost:hover { background: var(--color-border, #d0d7e8); }
|
||||
|
||||
/* ── Run controls row ───────────────────────────────────── */
|
||||
.run-controls {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.75rem;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.run-hint {
|
||||
font-size: 0.8rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
|
||||
/* ── Run log ────────────────────────────────────────────── */
|
||||
.run-log {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.78rem;
|
||||
}
|
||||
|
||||
.run-log-title {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 0.4rem 0.75rem;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
font-size: 0.8rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
|
||||
.log-lines {
|
||||
max-height: 200px;
|
||||
overflow-y: auto;
|
||||
padding: 0.5rem 0.75rem;
|
||||
background: var(--color-surface, #fff);
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.1rem;
|
||||
}
|
||||
|
||||
.log-line { color: var(--color-text, #1a2338); line-height: 1.5; }
|
||||
.log-line.log-error { color: var(--color-error, #ef4444); }
|
||||
|
||||
.run-error {
|
||||
margin: 0;
|
||||
padding: 0.4rem 0.75rem;
|
||||
background: color-mix(in srgb, var(--color-error, #ef4444) 10%, transparent);
|
||||
color: var(--color-error, #ef4444);
|
||||
font-size: 0.82rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
}
|
||||
|
||||
/* ── Chart title ────────────────────────────────────────── */
|
||||
.chart-title {
|
||||
font-size: 0.95rem;
|
||||
font-weight: 600;
|
||||
color: var(--color-text, #1a2338);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
/* ── Heatmap ────────────────────────────────────────────── */
|
||||
.heatmap-scroll {
|
||||
overflow-x: auto;
|
||||
border-radius: 0.5rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
|
||||
.heatmap {
|
||||
border-collapse: collapse;
|
||||
min-width: 100%;
|
||||
font-size: 0.78rem;
|
||||
}
|
||||
|
||||
.hm-label-col {
|
||||
text-align: left;
|
||||
min-width: 11rem;
|
||||
padding: 0.4rem 0.6rem;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
font-weight: 600;
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
position: sticky;
|
||||
left: 0;
|
||||
}
|
||||
|
||||
.hm-model-col {
|
||||
min-width: 5rem;
|
||||
max-width: 8rem;
|
||||
padding: 0.4rem 0.5rem;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.7rem;
|
||||
text-overflow: ellipsis;
|
||||
overflow: hidden;
|
||||
white-space: nowrap;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.hm-label-cell {
|
||||
padding: 0.35rem 0.6rem;
|
||||
background: var(--color-surface, #fff);
|
||||
border-top: 1px solid var(--color-border, #d0d7e8);
|
||||
white-space: nowrap;
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.74rem;
|
||||
position: sticky;
|
||||
left: 0;
|
||||
}
|
||||
|
||||
.hm-value-cell {
|
||||
padding: 0.35rem 0.5rem;
|
||||
text-align: center;
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-variant-numeric: tabular-nums;
|
||||
border-top: 1px solid var(--color-border, #d0d7e8);
|
||||
cursor: default;
|
||||
}
|
||||
|
||||
.heatmap-hint {
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
/* LLM-specific table styles */
|
||||
.llm-results-table .bt-best {
|
||||
color: var(--color-success, #3a7a32);
|
||||
font-weight: 700;
|
||||
background: color-mix(in srgb, var(--color-success, #3a7a32) 8%, transparent);
|
||||
}
|
||||
|
||||
.llm-model-name-cell {
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.75rem;
|
||||
white-space: nowrap;
|
||||
max-width: 16rem;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
background: var(--color-surface, #fff);
|
||||
border-top: 1px solid var(--color-border, #d0d7e8);
|
||||
padding: 0.35rem 0.6rem;
|
||||
position: sticky;
|
||||
left: 0;
|
||||
}
|
||||
|
||||
.llm-tps-cell {
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-variant-numeric: tabular-nums;
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
/* ── Model Picker ───────────────────────────────────────── */
|
||||
.model-picker {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.picker-summary {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.6rem;
|
||||
padding: 0.65rem 0.9rem;
|
||||
cursor: pointer;
|
||||
user-select: none;
|
||||
list-style: none;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
}
|
||||
.picker-summary::-webkit-details-marker { display: none; }
|
||||
.picker-summary::before { content: '▶ '; font-size: 0.65rem; color: var(--color-text-secondary, #6b7a99); }
|
||||
details[open] .picker-summary::before { content: '▼ '; }
|
||||
|
||||
.picker-title {
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.picker-badge {
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
background: var(--color-surface, #fff);
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
padding: 0.15rem 0.5rem;
|
||||
border-radius: 1rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
margin-left: auto;
|
||||
}
|
||||
|
||||
.picker-body {
|
||||
padding: 0.75rem;
|
||||
border-top: 1px solid var(--color-border, #d0d7e8);
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.picker-loading, .picker-empty {
|
||||
font-size: 0.85rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
padding: 0.5rem 0;
|
||||
}
|
||||
|
||||
.picker-category {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.3rem;
|
||||
}
|
||||
|
||||
.picker-cat-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.45rem;
|
||||
font-size: 0.82rem;
|
||||
font-weight: 700;
|
||||
color: var(--color-text, #1a2338);
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.picker-cat-name { /* inherits from cat-header */ }
|
||||
|
||||
.picker-cat-count {
|
||||
font-weight: 400;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.75rem;
|
||||
text-transform: none;
|
||||
letter-spacing: 0;
|
||||
}
|
||||
|
||||
.picker-model-list {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 0.35rem 0.75rem;
|
||||
padding-left: 1.4rem;
|
||||
}
|
||||
|
||||
.picker-model-row {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.35rem;
|
||||
font-size: 0.82rem;
|
||||
cursor: pointer;
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.picker-model-name {
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.78rem;
|
||||
white-space: nowrap;
|
||||
max-width: 18ch;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
}
|
||||
|
||||
.picker-adapter-type {
|
||||
font-size: 0.68rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.25rem;
|
||||
padding: 0.05rem 0.3rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
}
|
||||
|
||||
@media (max-width: 600px) {
|
||||
.picker-model-list { padding-left: 0; }
|
||||
.picker-model-name { max-width: 14ch; }
|
||||
}
|
||||
</style>
|
||||
919
web/src/views/StyleTab.vue
Normal file
919
web/src/views/StyleTab.vue
Normal file
|
|
@ -0,0 +1,919 @@
|
|||
<template>
|
||||
<div class="style-tab">
|
||||
|
||||
<!-- ── Controls row ──────────────────────────────────────────────────── -->
|
||||
<div class="style-controls">
|
||||
|
||||
<!-- Model picker -->
|
||||
<details class="model-picker" open>
|
||||
<summary class="picker-summary">
|
||||
<span class="picker-title">✍️ Models</span>
|
||||
<span class="picker-badge">{{ selectedCount }} selected</span>
|
||||
<button class="btn-refresh" :disabled="modelsLoading" @click.stop="loadModels" title="Refresh model list">
|
||||
{{ modelsLoading ? '⏳' : '🔄' }}
|
||||
</button>
|
||||
</summary>
|
||||
<div class="picker-body">
|
||||
<div v-if="modelsLoading" class="picker-loading">Loading models…</div>
|
||||
<div v-else-if="loadError" class="picker-error">{{ loadError }}</div>
|
||||
<template v-else>
|
||||
|
||||
<!-- Ollama group -->
|
||||
<div class="picker-group" v-if="ollamaModels.length">
|
||||
<div class="group-header">
|
||||
<label class="group-check">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="isGroupAllSelected('ollama')"
|
||||
:indeterminate="isGroupIndeterminate('ollama')"
|
||||
@change="toggleGroup('ollama', ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="group-label">Ollama</span>
|
||||
<span class="group-count">({{ ollamaModels.length }})</span>
|
||||
</label>
|
||||
<span class="group-note">auto-synced with Models view</span>
|
||||
</div>
|
||||
<div class="model-list">
|
||||
<label v-for="m in ollamaModels" :key="m.id" class="model-item">
|
||||
<input type="checkbox" :value="m.id" v-model="selectedModels" />
|
||||
<span class="model-name">{{ m.name }}</span>
|
||||
<span v-if="m.size_mb" class="model-meta">{{ formatMb(m.size_mb) }}</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- cf-text group -->
|
||||
<div class="picker-group" v-if="cftextModels.length">
|
||||
<div class="group-header">
|
||||
<label class="group-check">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="isGroupAllSelected('cf-text')"
|
||||
:indeterminate="isGroupIndeterminate('cf-text')"
|
||||
@change="toggleGroup('cf-text', ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="group-label">cf-text (cf-orch)</span>
|
||||
<span class="group-count">({{ cftextModels.length }})</span>
|
||||
</label>
|
||||
<span class="group-note">GGUFs via coordinator — enable cf-orch below</span>
|
||||
</div>
|
||||
<div class="model-list">
|
||||
<label v-for="m in cftextModels" :key="m.id" class="model-item">
|
||||
<input type="checkbox" :value="m.id" v-model="selectedModels" />
|
||||
<span class="model-name">{{ m.name }}</span>
|
||||
<span v-if="m.vram_mb" class="model-meta">{{ formatMb(m.vram_mb) }} VRAM</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div v-if="!ollamaModels.length && !cftextModels.length" class="picker-empty">
|
||||
No models available — check Ollama and cf-orch connections.
|
||||
</div>
|
||||
|
||||
</template>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Options panel -->
|
||||
<details class="options-panel">
|
||||
<summary class="picker-summary">
|
||||
<span class="picker-title">⚙️ Options</span>
|
||||
</summary>
|
||||
<div class="options-body">
|
||||
<label class="option-row">
|
||||
<input type="checkbox" v-model="useCforch" :disabled="running" />
|
||||
<span class="option-label">Use cf-orch backend</span>
|
||||
<span class="option-hint">Routes generation through cf-text instead of ollama</span>
|
||||
</label>
|
||||
<label class="option-row" :class="{ dimmed: !useCforch }">
|
||||
<span class="option-label">Max VRAM (MB)</span>
|
||||
<input
|
||||
type="number"
|
||||
v-model.number="maxVram"
|
||||
:disabled="running || !useCforch"
|
||||
min="1024"
|
||||
max="24576"
|
||||
step="512"
|
||||
class="option-number"
|
||||
/>
|
||||
<span class="option-hint">Skip models exceeding this VRAM limit</span>
|
||||
</label>
|
||||
<label class="option-row">
|
||||
<span class="option-label">Parallel workers</span>
|
||||
<input
|
||||
type="number"
|
||||
v-model.number="workers"
|
||||
:disabled="running"
|
||||
min="1"
|
||||
max="16"
|
||||
step="1"
|
||||
class="option-number"
|
||||
/>
|
||||
<span class="option-hint">Models to score simultaneously (1 = sequential)</span>
|
||||
</label>
|
||||
<label class="option-row">
|
||||
<input type="checkbox" v-model="includeLarge" :disabled="running" />
|
||||
<span class="option-label">Include large models (30B+)</span>
|
||||
<span class="option-hint">Off by default — these take much longer</span>
|
||||
</label>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
|
||||
<!-- ── Run controls ──────────────────────────────────────────────────── -->
|
||||
<div class="run-bar">
|
||||
<button class="btn-run" :disabled="running || selectedCount === 0" @click="startBenchmark">
|
||||
{{ running ? '⏳ Running…' : results.length ? '🔄 Re-run' : '▶ Run Benchmark' }}
|
||||
</button>
|
||||
<button v-if="running" class="btn-cancel" @click="cancelBenchmark">✕ Cancel</button>
|
||||
<span v-if="selectedCount === 0 && !running" class="run-hint">Select at least one model above</span>
|
||||
</div>
|
||||
|
||||
<!-- ── Progress log ──────────────────────────────────────────────────── -->
|
||||
<div v-if="runLog.length" class="run-log">
|
||||
<div class="run-log-header">
|
||||
<span class="run-log-title">Run log</span>
|
||||
<button class="btn-clear" @click="runLog = []">Clear</button>
|
||||
</div>
|
||||
<pre class="run-log-body" ref="logEl">{{ runLog.join('\n') }}</pre>
|
||||
</div>
|
||||
|
||||
<!-- ── Past runs picker ─────────────────────────────────────────────── -->
|
||||
<div class="history-bar" v-if="pastRuns.length">
|
||||
<label class="history-label">📂 Past runs:</label>
|
||||
<select class="history-select" v-model="selectedRun" @change="loadRun(selectedRun)">
|
||||
<option value="">— select a past run —</option>
|
||||
<option v-for="r in pastRuns" :key="r.filename" :value="r.filename">
|
||||
{{ r.date }} · {{ r.model_count }} model{{ r.model_count !== 1 ? 's' : '' }} · top {{ r.top_score }}/100
|
||||
</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<!-- ── Results table ─────────────────────────────────────────────────── -->
|
||||
<div v-if="results.length" class="results-section">
|
||||
<div class="results-header">
|
||||
<h2 class="results-title">Rankings</h2>
|
||||
<button
|
||||
class="btn-corrections"
|
||||
:disabled="sendingCorrections"
|
||||
@click="sendToCorrections"
|
||||
title="Push all outputs from this run into the Corrections review queue"
|
||||
>
|
||||
{{ sendingCorrections ? '⏳ Sending…' : correctionsMsg || '✍️ Send to Corrections' }}
|
||||
</button>
|
||||
</div>
|
||||
<div class="results-table-wrap">
|
||||
<table class="results-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Rank</th>
|
||||
<th>Model</th>
|
||||
<th>Score</th>
|
||||
<th>Latency</th>
|
||||
<th title="Em-dash count">—</th>
|
||||
<th title="Filler phrase hits">Fillers</th>
|
||||
<th title="Semicolons">;</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr
|
||||
v-for="(r, i) in results"
|
||||
:key="r.model_id"
|
||||
class="result-row"
|
||||
:class="{ 'top-row': i === 0 }"
|
||||
@click="toggleExpanded(r.model_id)"
|
||||
>
|
||||
<td class="rank-cell">{{ medal(i) }}</td>
|
||||
<td class="model-cell">
|
||||
<span class="model-name-text">{{ r.model_id }}</span>
|
||||
</td>
|
||||
<td class="score-cell">
|
||||
<span class="score-pill" :style="scorePillStyle(r.avg_score)">
|
||||
{{ r.avg_score.toFixed(0) }}
|
||||
</span>
|
||||
</td>
|
||||
<td class="latency-cell">{{ formatLatency(r.avg_latency_ms) }}</td>
|
||||
<td class="violation-cell" :class="{ 'has-violation': r.total_em_dashes > 0 }">
|
||||
{{ r.total_em_dashes }}
|
||||
</td>
|
||||
<td class="violation-cell" :class="{ 'has-violation': r.total_filler_hits > 0 }">
|
||||
{{ r.total_filler_hits }}
|
||||
</td>
|
||||
<td class="violation-cell" :class="{ 'has-violation': r.total_semicolons > 0 }">
|
||||
{{ r.total_semicolons }}
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<!-- Expandable sample outputs -->
|
||||
<div v-for="r in results" :key="'exp-' + r.model_id">
|
||||
<div v-if="expandedModels.has(r.model_id)" class="sample-outputs">
|
||||
<div class="sample-header">
|
||||
<strong>{{ r.model_id }}</strong>
|
||||
<button class="btn-collapse" @click="toggleExpanded(r.model_id)">✕ Close</button>
|
||||
</div>
|
||||
<div v-for="pr in r.prompt_results" :key="pr.tag" class="sample-prompt">
|
||||
<div class="sample-tag">
|
||||
<span class="tag-name">{{ pr.tag }}</span>
|
||||
<span class="tag-score">{{ pr.score.toFixed(0) }}/100</span>
|
||||
<span class="tag-latency">{{ formatLatency(pr.latency_ms) }}</span>
|
||||
</div>
|
||||
<pre class="sample-text">{{ pr.output || '(no output)' }}</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import { ref, computed, onMounted, nextTick, watch } from 'vue'
|
||||
|
||||
// ── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One selectable model entry as listed in the model picker. */
interface StyleModel {
  /** Value bound to the model's checkbox and stored in `selectedModels`. */
  id: string
  /** Label rendered next to the checkbox. */
  name: string
  /** Backend group the entry belongs to. */
  source: 'ollama' | 'cf-text'
  /** Shown through `formatMb` for Ollama entries when truthy. */
  size_mb?: number | null
  /** Shown through `formatMb` with a "VRAM" suffix for cf-text entries when truthy. */
  vram_mb?: number | null
  /** Not rendered by this component. */
  description?: string
}
|
||||
|
||||
/** Result of a single prompt for one model, shown in the expandable sample panel. */
interface PromptResult {
  /** Prompt label; also used as the `v-for` key of the sample entry. */
  tag: string
  /** Generated text; the template falls back to "(no output)" when empty. */
  output: string
  /** Rendered as `<score>/100`. */
  score: number
  /** Rendered through `formatLatency`. */
  latency_ms: number
  /** Not read by this component — NOTE(review): schema is defined by the backend; confirm there. */
  signals: Record<string, unknown>
}
|
||||
|
||||
/** One row of the rankings table: aggregate figures for a model plus its per-prompt results. */
interface ModelResult {
  /** Row key, displayed model name, and key into `expandedModels`. */
  model_id: string
  /** Drives the coloured score pill (`scorePillStyle`) and its rounded label. */
  avg_score: number
  /** Rendered through `formatLatency`. */
  avg_latency_ms: number
  // The three counters below are highlighted as violations when greater than zero.
  total_em_dashes: number
  total_filler_hits: number
  total_semicolons: number
  /** Entries listed when the row is expanded. */
  prompt_results: PromptResult[]
}
|
||||
|
||||
/** Summary of a saved benchmark run, as listed in the "Past runs" dropdown. */
interface PastRun {
  /** Option key/value; passed to `loadRun` and to the send-to-corrections request. */
  filename: string
  /** Shown verbatim in the option label. */
  date: string
  /** Shown as "<n> model(s)" in the option label. */
  model_count: number
  /** Shown as "top <n>/100" in the option label. */
  top_score: number
}
|
||||
|
||||
// ── State ───────────────────────────────────────────────────────────────────
|
||||
|
||||
// Model picker: the two backend groups, the checked ids, and load status.
const ollamaModels = ref<StyleModel[]>([])
const cftextModels = ref<StyleModel[]>([])
const selectedModels = ref<string[]>([])
const modelsLoading = ref<boolean>(false)
const loadError = ref<string>('')

// Options panel; values are forwarded as query parameters when a run starts.
const useCforch = ref<boolean>(false)
const maxVram = ref<number>(7200)
const workers = ref<number>(1)
const includeLarge = ref<boolean>(false)

// Run status and the streamed progress log (`logEl` is the <pre> that gets auto-scrolled).
const running = ref<boolean>(false)
const runLog = ref<string[]>([])
const logEl = ref<HTMLPreElement | null>(null)

// Displayed results, saved-run history, and the "Send to Corrections" button state.
const results = ref<ModelResult[]>([])
const pastRuns = ref<PastRun[]>([])
const selectedRun = ref<string>('')
const expandedModels = ref<Set<string>>(new Set<string>())
const sendingCorrections = ref<boolean>(false)
const correctionsMsg = ref<string>('')

// ── Computed ─────────────────────────────────────────────────────────────────

/** Number of checked models; drives the badge and the Run button's enabled state. */
const selectedCount = computed<number>(() => selectedModels.value.length)
|
||||
|
||||
/**
 * Tell whether every model of a picker group is currently checked.
 *
 * @param source 'ollama' selects the Ollama list; any other value selects the cf-text list.
 * @returns true only for a non-empty group whose ids are all in `selectedModels`.
 */
function isGroupAllSelected(source: string): boolean {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  // An empty group is never reported as fully selected.
  if (members.length === 0) return false
  return !members.some(model => !selectedModels.value.includes(model.id))
}
|
||||
|
||||
/**
 * Tell whether a picker group is partially checked (some, but not all, of its models).
 *
 * @param source 'ollama' selects the Ollama list; any other value selects the cf-text list.
 * @returns true when at least one and fewer than all group members are selected.
 */
function isGroupIndeterminate(source: string): boolean {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  let picked = 0
  for (const model of members) {
    if (selectedModels.value.includes(model.id)) picked += 1
  }
  return picked > 0 && picked < members.length
}
|
||||
|
||||
// ── Actions ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Fetch the model lists from `/api/style/models` and refresh both picker groups.
 *
 * While the request is in flight `modelsLoading` is true; on failure the message is
 * stored in `loadError` and the previous lists are left untouched.
 *
 * After a successful refresh, ids in `selectedModels` that no longer appear in either
 * list are dropped, so the "selected" badge and the next run request cannot refer to
 * models that have disappeared.
 */
async function loadModels() {
  modelsLoading.value = true
  loadError.value = ''
  try {
    const resp = await fetch('/api/style/models')
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    const data = await resp.json()

    // Anything that is not an array is treated as "no models" so the v-for lists stay valid.
    const ollama: StyleModel[] = Array.isArray(data?.ollama) ? data.ollama : []
    const cftext: StyleModel[] = Array.isArray(data?.cf_text) ? data.cf_text : []
    ollamaModels.value = ollama
    cftextModels.value = cftext

    // Prune selections that vanished with this refresh; only reassign when something
    // actually changed so watchers on `selectedModels` are not triggered needlessly.
    const knownIds = new Set([...ollama, ...cftext].map(m => m.id))
    const stillValid = selectedModels.value.filter(id => knownIds.has(id))
    if (stillValid.length !== selectedModels.value.length) {
      selectedModels.value = stillValid
    }
  } catch (e: unknown) {
    loadError.value = `Failed to load models: ${e instanceof Error ? e.message : String(e)}`
  } finally {
    modelsLoading.value = false
  }
}
|
||||
|
||||
/**
 * Refresh the "Past runs" list from `/api/style/results`.
 *
 * Best effort: a non-OK response or a thrown error leaves `pastRuns` as it was.
 */
async function loadPastRuns() {
  try {
    const resp = await fetch('/api/style/results')
    if (!resp.ok) return
    pastRuns.value = await resp.json()
  } catch {
    // Non-fatal: the history dropdown simply keeps its previous content.
  }
}
|
||||
|
||||
/**
 * Load a saved run into the rankings table.
 *
 * @param filename Name of the saved run; an empty string (the dropdown placeholder) is ignored.
 *
 * On success the results are replaced, all expanded sample panels are collapsed and any
 * leftover "Send to Corrections" status text is cleared, since it described another run.
 * On failure an `[error]` line is appended to the run log and the current results stay.
 */
async function loadRun(filename: string) {
  if (!filename) return
  try {
    // Encode the name so characters such as spaces, '#' or '?' cannot alter the request path.
    const resp = await fetch(`/api/style/results/${encodeURIComponent(filename)}`)
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    results.value = await resp.json()
    expandedModels.value.clear()
    correctionsMsg.value = ''
  } catch (e: unknown) {
    runLog.value.push(`[error] Failed to load ${filename}: ${e instanceof Error ? e.message : String(e)}`)
  }
}
|
||||
|
||||
/**
 * Check or uncheck every model of one picker group at once.
 *
 * @param source  'ollama' selects the Ollama list; any other value selects the cf-text list.
 * @param checked true adds all group ids to the selection (without duplicates),
 *                false removes them; selections from the other group are kept.
 */
function toggleGroup(source: string, checked: boolean) {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  const groupIds = members.map(model => model.id)

  if (!checked) {
    const toRemove = new Set(groupIds)
    selectedModels.value = selectedModels.value.filter(id => !toRemove.has(id))
    return
  }

  // Existing selections keep their order; missing group ids are appended in list order.
  const merged = new Set(selectedModels.value)
  groupIds.forEach(id => merged.add(id))
  selectedModels.value = Array.from(merged)
}
|
||||
|
||||
/**
 * Open the sample-output panel of a model if it is closed, close it otherwise.
 *
 * @param modelId `model_id` of the clicked result row.
 */
function toggleExpanded(modelId: string) {
  const next = new Set(expandedModels.value)
  // Set.delete reports whether the id was present; if it was not, this is an "open".
  if (!next.delete(modelId)) next.add(modelId)
  expandedModels.value = next
}
|
||||
|
||||
/**
 * Start a benchmark run for the selected models and stream its progress.
 *
 * Does nothing when a run is already active or no model is selected. Otherwise the log,
 * results, expanded panels, run selection and corrections status are reset, and an
 * EventSource is opened on `/api/style/run` with the current options as query parameters.
 *
 * Stream messages are JSON objects dispatched on `type`:
 *   - `progress`: `message` is appended to the log, which is scrolled to the bottom
 *   - `result`:   `results` replaces the rankings and the past-run list is reloaded
 *   - `complete`: the run is marked finished and the stream closed
 *   - `error`:    `message` is logged, the run is marked finished and the stream closed
 * Payloads that are not valid JSON objects are ignored.
 */
function startBenchmark() {
  if (running.value || selectedCount.value === 0) return
  running.value = true
  runLog.value = []
  results.value = []
  expandedModels.value.clear()
  // The table is about to show a different run, so the old dropdown selection and
  // the old "Send to Corrections" status no longer describe what is displayed.
  selectedRun.value = ''
  correctionsMsg.value = ''

  const params = new URLSearchParams({
    models: selectedModels.value.join(','),
    use_cforch: String(useCforch.value),
    max_vram: String(maxVram.value),
    workers: String(workers.value),
    include_large: String(includeLarge.value),
  })

  const es = new EventSource(`/api/style/run?${params}`)

  es.onmessage = async (ev) => {
    // `running` is switched off by cancelBenchmark, which has no handle on this
    // stream: drop anything that arrives afterwards and release the connection.
    if (!running.value) {
      es.close()
      return
    }

    let msg: { type?: string; message?: string; results?: ModelResult[] } | null
    try {
      msg = JSON.parse(ev.data)
    } catch {
      return // not JSON — ignore
    }
    if (msg === null || typeof msg !== 'object') return

    if (msg.type === 'progress') {
      runLog.value.push(msg.message ?? '')
      await nextTick()
      if (logEl.value) logEl.value.scrollTop = logEl.value.scrollHeight
    } else if (msg.type === 'result') {
      results.value = msg.results ?? []
      // Point "Send to Corrections" at the run that was just produced. The first
      // history entry is taken as the newest one (the mount hook makes the same
      // assumption); it is only selected when it was not in the list before the
      // reload, so a failed reload can never select an older, unrelated run.
      const knownBefore = new Set(pastRuns.value.map(r => r.filename))
      await loadPastRuns()
      const newest = pastRuns.value[0]
      if (newest && !knownBefore.has(newest.filename)) {
        selectedRun.value = newest.filename
      }
    } else if (msg.type === 'complete') {
      running.value = false
      es.close()
    } else if (msg.type === 'error') {
      runLog.value.push(`[error] ${msg.message}`)
      running.value = false
      es.close()
    }
  }

  es.onerror = () => {
    if (running.value) {
      runLog.value.push('[error] Connection lost')
      running.value = false
    }
    // Always close: an EventSource left open would reconnect and re-issue the request.
    es.close()
  }
}
|
||||
|
||||
/**
 * Ask the backend to cancel the active run (`POST /api/style/cancel`).
 *
 * The UI always leaves the running state and logs `[cancelled]`, whether or not the
 * request succeeded. A failed request (network error or non-OK status) is reported in
 * the run log instead of surfacing as an unhandled promise rejection.
 */
async function cancelBenchmark() {
  try {
    const resp = await fetch('/api/style/cancel', { method: 'POST' })
    if (!resp.ok) {
      runLog.value.push(`[error] Cancel request failed: HTTP ${resp.status}`)
    }
  } catch (e: unknown) {
    runLog.value.push(`[error] Cancel request failed: ${e instanceof Error ? e.message : String(e)}`)
  } finally {
    running.value = false
    runLog.value.push('[cancelled]')
  }
}
|
||||
|
||||
/**
 * Push the outputs of the currently selected saved run into the Corrections queue
 * (`POST /api/style/send-to-corrections`).
 *
 * The outcome is written to `correctionsMsg`, which the button shows as its label.
 * A request already in flight is not repeated. When no saved run is selected the user
 * is told so, rather than the click silently doing nothing.
 */
async function sendToCorrections() {
  if (sendingCorrections.value) return
  if (!selectedRun.value) {
    correctionsMsg.value = 'Select a past run first'
    return
  }

  sendingCorrections.value = true
  correctionsMsg.value = ''
  try {
    const resp = await fetch('/api/style/send-to-corrections', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      // NOTE(review): an empty model_ids list presumably means "all models" (the button
      // tooltip says all outputs are pushed) — confirm against the endpoint.
      body: JSON.stringify({ filename: selectedRun.value, model_ids: [] }),
    })
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    const data = await resp.json()
    // Fall back to 0 so a response without the field never renders "undefined".
    correctionsMsg.value = `✓ ${data?.imported ?? 0} added to Corrections`
  } catch (e: unknown) {
    correctionsMsg.value = `Error: ${e instanceof Error ? e.message : String(e)}`
  } finally {
    sendingCorrections.value = false
  }
}
|
||||
|
||||
// ── Formatting helpers ────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Format a size given in megabytes for display.
 *
 * @param mb Size in MB.
 * @returns "<x.y> GB" (one decimal) from 1024 MB upward, otherwise "<mb> MB".
 */
function formatMb(mb: number): string {
  if (mb >= 1024) {
    const gb = mb / 1024
    return `${gb.toFixed(1)} GB`
  }
  return `${mb} MB`
}
|
||||
|
||||
/**
 * Format a latency given in milliseconds for display.
 *
 * @param ms Latency in milliseconds.
 * @returns "<x.y>s" (one decimal) once the rounded value reaches 1000 ms,
 *          otherwise "<n>ms" rounded to a whole millisecond.
 */
function formatLatency(ms: number): string {
  // Decide on the rounded value: comparing the raw value first would print
  // inputs such as 999.6 as "1000ms" instead of "1.0s".
  const wholeMs = Math.round(ms)
  return wholeMs >= 1000 ? `${(ms / 1000).toFixed(1)}s` : `${wholeMs}ms`
}
|
||||
|
||||
/**
 * Rank label for a zero-based position in the rankings.
 *
 * @param index Zero-based row index.
 * @returns A medal emoji for the first three positions, otherwise "#<index + 1>".
 */
function medal(index: number): string {
  const podium = ['🥇', '🥈', '🥉']
  const icon = podium[index]
  return icon ?? `#${index + 1}`
}
|
||||
|
||||
/**
 * Inline style for the score pill: the hue runs from red (score 0) to green (score 100).
 *
 * @param score Score on a 0–100 scale. Values outside that range are clamped and
 *              non-finite values are treated as 0, so the result is always valid CSS.
 * @returns `background` and `color` declarations sharing the same hue.
 */
function scorePillStyle(score: number): Record<string, string> {
  const bounded = Number.isFinite(score) ? Math.min(100, Math.max(0, score)) : 0
  const hue = Math.round((bounded / 100) * 120) // 0=red, 120=green
  return {
    background: `hsl(${hue} 60% 88%)`,
    color: `hsl(${hue} 60% 28%)`,
  }
}
|
||||
|
||||
// ── Lifecycle ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Switch the cf-orch option on as soon as the selection contains a cf-text model.
// The option is only ever turned on here, never off again.
watch(selectedModels, (ids) => {
  const cftextIds = cftextModels.value.map(model => model.id)
  const usesCftext = ids.some(id => cftextIds.includes(id))
  if (usesCftext) {
    useCforch.value = true
  }
})
|
||||
|
||||
// On mount, fetch the model lists and the run history in parallel, then show the
// first history entry (treated as the latest run) if there is one.
onMounted(async () => {
  await Promise.all([loadModels(), loadPastRuns()])

  const latest = pastRuns.value[0]
  if (!latest) return

  selectedRun.value = latest.filename
  await loadRun(latest.filename)
})
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.style-tab {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1rem;
|
||||
padding: 1rem 0;
|
||||
}
|
||||
|
||||
/* ── Controls ─────────────────────────────────────────────────────────────── */
|
||||
|
||||
.style-controls {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 0.75rem;
|
||||
align-items: flex-start;
|
||||
}
|
||||
|
||||
.model-picker,
|
||||
.options-panel {
|
||||
flex: 1;
|
||||
min-width: 280px;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.picker-summary {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
padding: 0.65rem 0.85rem;
|
||||
cursor: pointer;
|
||||
user-select: none;
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
list-style: none;
|
||||
}
|
||||
|
||||
.picker-summary::-webkit-details-marker { display: none; }
|
||||
|
||||
.picker-title { flex: 1; color: var(--color-text, #1a2338); }
|
||||
.picker-badge {
|
||||
background: var(--app-primary, #2A6080);
|
||||
color: #fff;
|
||||
border-radius: 9999px;
|
||||
padding: 0.1rem 0.5rem;
|
||||
font-size: 0.72rem;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.btn-refresh {
|
||||
border: none;
|
||||
background: transparent;
|
||||
cursor: pointer;
|
||||
font-size: 0.85rem;
|
||||
padding: 0.1rem 0.25rem;
|
||||
border-radius: 0.25rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
.btn-refresh:hover { background: var(--color-border, #d0d7e8); }
|
||||
.btn-refresh:disabled { opacity: 0.5; cursor: not-allowed; }
|
||||
|
||||
.picker-body,
|
||||
.options-body {
|
||||
padding: 0.75rem;
|
||||
border-top: 1px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
|
||||
.picker-loading, .picker-empty {
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
font-size: 0.85rem;
|
||||
padding: 0.25rem 0;
|
||||
}
|
||||
|
||||
.picker-error {
|
||||
color: #b91c1c;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
/* ── Model groups ──────────────────────────────────────────────────────────── */
|
||||
|
||||
.picker-group {
|
||||
margin-bottom: 0.75rem;
|
||||
}
|
||||
.picker-group:last-child { margin-bottom: 0; }
|
||||
|
||||
.group-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
margin-bottom: 0.4rem;
|
||||
}
|
||||
|
||||
.group-check {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.35rem;
|
||||
font-size: 0.85rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.group-count {
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
font-weight: 400;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
|
||||
.group-note {
|
||||
margin-left: auto;
|
||||
font-size: 0.72rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
.model-list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.2rem;
|
||||
padding-left: 1.25rem;
|
||||
max-height: 220px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.model-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.4rem;
|
||||
font-size: 0.82rem;
|
||||
cursor: pointer;
|
||||
padding: 0.15rem 0;
|
||||
}
|
||||
|
||||
.model-name { flex: 1; font-family: var(--font-mono, monospace); }
|
||||
|
||||
.model-meta {
|
||||
font-size: 0.72rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
|
||||
/* ── Options ──────────────────────────────────────────────────────────────── */
|
||||
|
||||
.option-row {
|
||||
display: flex;
|
||||
align-items: flex-start;
|
||||
gap: 0.5rem;
|
||||
padding: 0.35rem 0;
|
||||
cursor: pointer;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.option-label { font-weight: 500; white-space: nowrap; }
|
||||
|
||||
.option-hint {
|
||||
flex: 1;
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
margin-left: auto;
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
.option-number {
|
||||
width: 90px;
|
||||
padding: 0.2rem 0.4rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.25rem;
|
||||
font-size: 0.85rem;
|
||||
background: var(--color-bg, #fff);
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.option-row.dimmed { opacity: 0.45; pointer-events: none; }
|
||||
|
||||
/* ── Run bar ──────────────────────────────────────────────────────────────── */
|
||||
|
||||
.run-bar {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.65rem;
|
||||
}
|
||||
|
||||
.btn-run {
|
||||
padding: 0.5rem 1.25rem;
|
||||
border: none;
|
||||
border-radius: 0.375rem;
|
||||
background: var(--app-primary, #2A6080);
|
||||
color: #fff;
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.btn-run:hover:not(:disabled) { background: color-mix(in srgb, var(--app-primary, #2A6080) 80%, #000); }
|
||||
.btn-run:disabled { opacity: 0.5; cursor: not-allowed; }
|
||||
|
||||
.btn-cancel {
|
||||
padding: 0.5rem 0.9rem;
|
||||
border: 1px solid #f85149;
|
||||
border-radius: 0.375rem;
|
||||
background: transparent;
|
||||
color: #b91c1c;
|
||||
font-size: 0.85rem;
|
||||
cursor: pointer;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.btn-cancel:hover { background: #fee2e2; }
|
||||
|
||||
.run-hint {
|
||||
font-size: 0.8rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
|
||||
/* ── Run log ──────────────────────────────────────────────────────────────── */
|
||||
|
||||
.run-log {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.run-log-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
padding: 0.4rem 0.75rem;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
font-size: 0.8rem;
|
||||
font-weight: 600;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
|
||||
.run-log-title { text-transform: uppercase; letter-spacing: 0.05em; }
|
||||
|
||||
.btn-clear {
|
||||
border: none;
|
||||
background: transparent;
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
cursor: pointer;
|
||||
padding: 0.1rem 0.3rem;
|
||||
border-radius: 0.25rem;
|
||||
}
|
||||
.btn-clear:hover { background: var(--color-border, #d0d7e8); }
|
||||
|
||||
.run-log-body {
|
||||
margin: 0;
|
||||
padding: 0.65rem 0.85rem;
|
||||
font-size: 0.78rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
white-space: pre-wrap;
|
||||
word-break: break-all;
|
||||
max-height: 260px;
|
||||
overflow-y: auto;
|
||||
background: var(--color-bg, #fff);
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
/* ── History bar ──────────────────────────────────────────────────────────── */
|
||||
|
||||
.history-bar {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.6rem;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.history-label { font-weight: 500; white-space: nowrap; }
|
||||
|
||||
.history-select {
|
||||
flex: 1;
|
||||
padding: 0.3rem 0.5rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.375rem;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
color: var(--color-text, #1a2338);
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
/* ── Results table ────────────────────────────────────────────────────────── */
|
||||
|
||||
.results-section { display: flex; flex-direction: column; gap: 0.75rem; }
|
||||
|
||||
.results-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.results-title {
|
||||
font-size: 1rem;
|
||||
font-weight: 700;
|
||||
color: var(--color-text, #1a2338);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.btn-corrections {
|
||||
padding: 0.4rem 0.9rem;
|
||||
border: 1px solid var(--app-primary, #2A6080);
|
||||
border-radius: 0.375rem;
|
||||
background: transparent;
|
||||
color: var(--app-primary, #2A6080);
|
||||
font-size: 0.83rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
white-space: nowrap;
|
||||
transition: background 0.15s, color 0.15s;
|
||||
}
|
||||
.btn-corrections:hover:not(:disabled) {
|
||||
background: var(--app-primary, #2A6080);
|
||||
color: #fff;
|
||||
}
|
||||
.btn-corrections:disabled { opacity: 0.55; cursor: not-allowed; }
|
||||
|
||||
.results-table-wrap {
|
||||
overflow-x: auto;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
}
|
||||
|
||||
.results-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.results-table th {
|
||||
padding: 0.5rem 0.75rem;
|
||||
text-align: left;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
font-size: 0.78rem;
|
||||
font-weight: 700;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.result-row {
|
||||
cursor: pointer;
|
||||
transition: background 0.1s;
|
||||
}
|
||||
.result-row:hover { background: color-mix(in srgb, var(--app-primary, #2A6080) 6%, transparent); }
|
||||
.result-row.top-row { font-weight: 600; }
|
||||
|
||||
.result-row td {
|
||||
padding: 0.5rem 0.75rem;
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
.result-row:last-child td { border-bottom: none; }
|
||||
|
||||
.rank-cell { width: 2.5rem; text-align: center; font-size: 1.1rem; }
|
||||
.model-cell { font-family: var(--font-mono, monospace); word-break: break-all; }
|
||||
.score-cell { width: 5rem; text-align: center; }
|
||||
.latency-cell { width: 5rem; text-align: right; color: var(--color-text-secondary, #6b7a99); }
|
||||
.violation-cell { width: 4rem; text-align: center; color: var(--color-text-secondary, #6b7a99); }
|
||||
.violation-cell.has-violation { color: #b91c1c; font-weight: 700; }
|
||||
|
||||
.score-pill {
|
||||
display: inline-block;
|
||||
padding: 0.15rem 0.55rem;
|
||||
border-radius: 9999px;
|
||||
font-weight: 700;
|
||||
font-size: 0.82rem;
|
||||
}
|
||||
|
||||
/* ── Sample outputs ───────────────────────────────────────────────────────── */
|
||||
|
||||
.sample-outputs {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.sample-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
padding: 0.5rem 0.85rem;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.btn-collapse {
|
||||
border: none;
|
||||
background: transparent;
|
||||
font-size: 0.78rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.sample-prompt {
|
||||
padding: 0.65rem 0.85rem;
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
.sample-prompt:last-child { border-bottom: none; }
|
||||
|
||||
.sample-tag {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
margin-bottom: 0.35rem;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
|
||||
.tag-name { font-weight: 600; color: var(--color-text, #1a2338); }
|
||||
.tag-score { color: var(--app-primary, #2A6080); font-weight: 700; }
|
||||
.tag-latency { color: var(--color-text-secondary, #6b7a99); margin-left: auto; }
|
||||
|
||||
.sample-text {
|
||||
margin: 0;
|
||||
font-size: 0.82rem;
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
max-height: 200px;
|
||||
overflow-y: auto;
|
||||
background: var(--color-bg, #fff);
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.35rem;
|
||||
padding: 0.5rem 0.65rem;
|
||||
color: var(--color-text, #1a2338);
|
||||
font-family: inherit;
|
||||
}
|
||||
|
||||
@media (max-width: 640px) {
|
||||
.style-controls { flex-direction: column; }
|
||||
.model-picker, .options-panel { min-width: 0; }
|
||||
.option-hint { display: none; }
|
||||
.group-note { display: none; }
|
||||
}
|
||||
</style>
|
||||
919
web/src/views/VoiceTab.vue
Normal file
919
web/src/views/VoiceTab.vue
Normal file
|
|
@ -0,0 +1,919 @@
|
|||
<template>
|
||||
<div class="voice-tab">
|
||||
|
||||
<!-- ── Controls row ──────────────────────────────────────────────────── -->
|
||||
<div class="voice-controls">
|
||||
|
||||
<!-- Model picker -->
|
||||
<details class="model-picker" open>
|
||||
<summary class="picker-summary">
|
||||
<span class="picker-title">🎙 Models</span>
|
||||
<span class="picker-badge">{{ selectedCount }} selected</span>
|
||||
<button class="btn-refresh" :disabled="modelsLoading" @click.stop="loadModels" title="Refresh model list">
|
||||
{{ modelsLoading ? '⏳' : '🔄' }}
|
||||
</button>
|
||||
</summary>
|
||||
<div class="picker-body">
|
||||
<div v-if="modelsLoading" class="picker-loading">Loading models…</div>
|
||||
<div v-else-if="loadError" class="picker-error">{{ loadError }}</div>
|
||||
<template v-else>
|
||||
|
||||
<!-- Ollama group -->
|
||||
<div class="picker-group" v-if="ollamaModels.length">
|
||||
<div class="group-header">
|
||||
<label class="group-check">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="isGroupAllSelected('ollama')"
|
||||
:indeterminate="isGroupIndeterminate('ollama')"
|
||||
@change="toggleGroup('ollama', ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="group-label">Ollama</span>
|
||||
<span class="group-count">({{ ollamaModels.length }})</span>
|
||||
</label>
|
||||
<span class="group-note">auto-synced with Models view</span>
|
||||
</div>
|
||||
<div class="model-list">
|
||||
<label v-for="m in ollamaModels" :key="m.id" class="model-item">
|
||||
<input type="checkbox" :value="m.id" v-model="selectedModels" />
|
||||
<span class="model-name">{{ m.name }}</span>
|
||||
<span v-if="m.size_mb" class="model-meta">{{ formatMb(m.size_mb) }}</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- cf-text group -->
|
||||
<div class="picker-group" v-if="cftextModels.length">
|
||||
<div class="group-header">
|
||||
<label class="group-check">
|
||||
<input
|
||||
type="checkbox"
|
||||
:checked="isGroupAllSelected('cf-text')"
|
||||
:indeterminate="isGroupIndeterminate('cf-text')"
|
||||
@change="toggleGroup('cf-text', ($event.target as HTMLInputElement).checked)"
|
||||
/>
|
||||
<span class="group-label">cf-text (cf-orch)</span>
|
||||
<span class="group-count">({{ cftextModels.length }})</span>
|
||||
</label>
|
||||
<span class="group-note">GGUFs via coordinator — enable cf-orch below</span>
|
||||
</div>
|
||||
<div class="model-list">
|
||||
<label v-for="m in cftextModels" :key="m.id" class="model-item">
|
||||
<input type="checkbox" :value="m.id" v-model="selectedModels" />
|
||||
<span class="model-name">{{ m.name }}</span>
|
||||
<span v-if="m.vram_mb" class="model-meta">{{ formatMb(m.vram_mb) }} VRAM</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div v-if="!ollamaModels.length && !cftextModels.length" class="picker-empty">
|
||||
No models available — check Ollama and cf-orch connections.
|
||||
</div>
|
||||
|
||||
</template>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
<!-- Options panel -->
|
||||
<details class="options-panel">
|
||||
<summary class="picker-summary">
|
||||
<span class="picker-title">⚙️ Options</span>
|
||||
</summary>
|
||||
<div class="options-body">
|
||||
<label class="option-row">
|
||||
<input type="checkbox" v-model="useCforch" :disabled="running" />
|
||||
<span class="option-label">Use cf-orch backend</span>
|
||||
<span class="option-hint">Routes generation through cf-text instead of ollama</span>
|
||||
</label>
|
||||
<label class="option-row" :class="{ dimmed: !useCforch }">
|
||||
<span class="option-label">Max VRAM (MB)</span>
|
||||
<input
|
||||
type="number"
|
||||
v-model.number="maxVram"
|
||||
:disabled="running || !useCforch"
|
||||
min="1024"
|
||||
max="24576"
|
||||
step="512"
|
||||
class="option-number"
|
||||
/>
|
||||
<span class="option-hint">Skip models exceeding this VRAM limit</span>
|
||||
</label>
|
||||
<label class="option-row">
|
||||
<span class="option-label">Parallel workers</span>
|
||||
<input
|
||||
type="number"
|
||||
v-model.number="workers"
|
||||
:disabled="running"
|
||||
min="1"
|
||||
max="16"
|
||||
step="1"
|
||||
class="option-number"
|
||||
/>
|
||||
<span class="option-hint">Models to score simultaneously (1 = sequential)</span>
|
||||
</label>
|
||||
<label class="option-row">
|
||||
<input type="checkbox" v-model="includeLarge" :disabled="running" />
|
||||
<span class="option-label">Include large models (30B+)</span>
|
||||
<span class="option-hint">Off by default — these take much longer</span>
|
||||
</label>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
|
||||
<!-- ── Run controls ──────────────────────────────────────────────────── -->
|
||||
<div class="run-bar">
|
||||
<button class="btn-run" :disabled="running || selectedCount === 0" @click="startBenchmark">
|
||||
{{ running ? '⏳ Running…' : results.length ? '🔄 Re-run' : '▶ Run Benchmark' }}
|
||||
</button>
|
||||
<button v-if="running" class="btn-cancel" @click="cancelBenchmark">✕ Cancel</button>
|
||||
<span v-if="selectedCount === 0 && !running" class="run-hint">Select at least one model above</span>
|
||||
</div>
|
||||
|
||||
<!-- ── Progress log ──────────────────────────────────────────────────── -->
|
||||
<div v-if="runLog.length" class="run-log">
|
||||
<div class="run-log-header">
|
||||
<span class="run-log-title">Run log</span>
|
||||
<button class="btn-clear" @click="runLog = []">Clear</button>
|
||||
</div>
|
||||
<pre class="run-log-body" ref="logEl">{{ runLog.join('\n') }}</pre>
|
||||
</div>
|
||||
|
||||
<!-- ── Past runs picker ─────────────────────────────────────────────── -->
|
||||
<div class="history-bar" v-if="pastRuns.length">
|
||||
<label class="history-label">📂 Past runs:</label>
|
||||
<select class="history-select" v-model="selectedRun" @change="loadRun(selectedRun)">
|
||||
<option value="">— select a past run —</option>
|
||||
<option v-for="r in pastRuns" :key="r.filename" :value="r.filename">
|
||||
{{ r.date }} · {{ r.model_count }} model{{ r.model_count !== 1 ? 's' : '' }} · top {{ r.top_score }}/100
|
||||
</option>
|
||||
</select>
|
||||
</div>
|
||||
|
||||
<!-- ── Results table ─────────────────────────────────────────────────── -->
|
||||
<div v-if="results.length" class="results-section">
|
||||
<div class="results-header">
|
||||
<h2 class="results-title">Rankings</h2>
|
||||
<button
|
||||
class="btn-corrections"
|
||||
:disabled="sendingCorrections"
|
||||
@click="sendToCorrections"
|
||||
title="Push all outputs from this run into the Corrections review queue"
|
||||
>
|
||||
{{ sendingCorrections ? '⏳ Sending…' : correctionsMsg || '✍️ Send to Corrections' }}
|
||||
</button>
|
||||
</div>
|
||||
<div class="results-table-wrap">
|
||||
<table class="results-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Rank</th>
|
||||
<th>Model</th>
|
||||
<th>Score</th>
|
||||
<th>Latency</th>
|
||||
<th title="Em-dash count">—</th>
|
||||
<th title="Filler phrase hits">Fillers</th>
|
||||
<th title="Semicolons">;</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr
|
||||
v-for="(r, i) in results"
|
||||
:key="r.model_id"
|
||||
class="result-row"
|
||||
:class="{ 'top-row': i === 0 }"
|
||||
@click="toggleExpanded(r.model_id)"
|
||||
>
|
||||
<td class="rank-cell">{{ medal(i) }}</td>
|
||||
<td class="model-cell">
|
||||
<span class="model-name-text">{{ r.model_id }}</span>
|
||||
</td>
|
||||
<td class="score-cell">
|
||||
<span class="score-pill" :style="scorePillStyle(r.avg_score)">
|
||||
{{ r.avg_score.toFixed(0) }}
|
||||
</span>
|
||||
</td>
|
||||
<td class="latency-cell">{{ formatLatency(r.avg_latency_ms) }}</td>
|
||||
<td class="violation-cell" :class="{ 'has-violation': r.total_em_dashes > 0 }">
|
||||
{{ r.total_em_dashes }}
|
||||
</td>
|
||||
<td class="violation-cell" :class="{ 'has-violation': r.total_filler_hits > 0 }">
|
||||
{{ r.total_filler_hits }}
|
||||
</td>
|
||||
<td class="violation-cell" :class="{ 'has-violation': r.total_semicolons > 0 }">
|
||||
{{ r.total_semicolons }}
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
|
||||
<!-- Expandable sample outputs -->
|
||||
<div v-for="r in results" :key="'exp-' + r.model_id">
|
||||
<div v-if="expandedModels.has(r.model_id)" class="sample-outputs">
|
||||
<div class="sample-header">
|
||||
<strong>{{ r.model_id }}</strong>
|
||||
<button class="btn-collapse" @click="toggleExpanded(r.model_id)">✕ Close</button>
|
||||
</div>
|
||||
<div v-for="pr in r.prompt_results" :key="pr.tag" class="sample-prompt">
|
||||
<div class="sample-tag">
|
||||
<span class="tag-name">{{ pr.tag }}</span>
|
||||
<span class="tag-score">{{ pr.score.toFixed(0) }}/100</span>
|
||||
<span class="tag-latency">{{ formatLatency(pr.latency_ms) }}</span>
|
||||
</div>
|
||||
<pre class="sample-text">{{ pr.output || '(no output)' }}</pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import { ref, computed, onMounted, nextTick, watch } from 'vue'
|
||||
|
||||
// ── Types ───────────────────────────────────────────────────────────────────
|
||||
|
||||
/** One selectable model entry, as listed by `GET /api/voice/models`. */
interface VoiceModel {
  /** Identifier used as the checkbox value and list key in the picker. */
  id: string
  /** Human-readable name rendered next to the checkbox. */
  name: string
  /** Backend the entry is listed under. */
  source: 'ollama' | 'cf-text'
  /** Size in MB; rendered for Ollama entries when present. */
  size_mb?: number | null
  /** VRAM figure in MB; rendered for cf-text entries when present. */
  vram_mb?: number | null
  /** Optional free-text description (not rendered by this component). */
  description?: string
}
|
||||
|
||||
/** Outcome of a single prompt for one model, shown in the expandable sample panel. */
interface PromptResult {
  /** Prompt label; also used as the list key. */
  tag: string
  /** Generated text; the template shows "(no output)" when it is empty. */
  output: string
  /** Score for this prompt, displayed as "N/100". */
  score: number
  /** Latency in milliseconds, rendered through formatLatency(). */
  latency_ms: number
  /** Extra scoring details; opaque to this component. */
  signals: Record<string, unknown>
}
|
||||
|
||||
/** Aggregated benchmark row for one model, as rendered in the rankings table. */
interface ModelResult {
  /** Model identifier; row key and the key used to expand sample outputs. */
  model_id: string
  /** Average score, displayed in the coloured pill. */
  avg_score: number
  /** Average latency in milliseconds, rendered through formatLatency(). */
  avg_latency_ms: number
  /** Filler-phrase hits; highlighted when greater than zero. */
  total_filler_hits: number
  /** Em-dash count; highlighted when greater than zero. */
  total_em_dashes: number
  /** Semicolon count; highlighted when greater than zero. */
  total_semicolons: number
  /** Per-prompt details listed when the row is expanded. */
  prompt_results: Array<PromptResult>
}
|
||||
|
||||
/** Summary of a saved run, as listed by `GET /api/voice/results`. */
interface PastRun {
  /** Results file name; option value and the argument passed to loadRun(). */
  filename: string
  /** Display date shown in the history dropdown. */
  date: string
  /** Number of models in the run. */
  model_count: number
  /** Best score in the run, shown as "top N/100". */
  top_score: number
}
|
||||
|
||||
// ── State ───────────────────────────────────────────────────────────────────
|
||||
|
||||
// Model catalogue (one list per backend) and the ids ticked in the picker.
const ollamaModels = ref<VoiceModel[]>([])
const cftextModels = ref<VoiceModel[]>([])
const selectedModels = ref<string[]>([])
const modelsLoading = ref<boolean>(false)
const loadError = ref<string>('')

// Run options bound to the Options panel; sent as query parameters on run.
const useCforch = ref<boolean>(false)
const maxVram = ref<number>(7200)
const workers = ref<number>(1)
const includeLarge = ref<boolean>(false)

// Live run state: in-progress flag, streamed log lines, and the <pre> element
// that is scrolled to the bottom as lines arrive.
const running = ref<boolean>(false)
const runLog = ref<string[]>([])
const logEl = ref<HTMLPreElement | null>(null)

// Displayed results, saved-run history, and per-row expansion state.
const results = ref<ModelResult[]>([])
const pastRuns = ref<PastRun[]>([])
const selectedRun = ref<string>('')
const expandedModels = ref<Set<string>>(new Set())

// "Send to Corrections" button state; the message replaces the button label.
const sendingCorrections = ref<boolean>(false)
const correctionsMsg = ref<string>('')
|
||||
|
||||
// ── Computed ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Number of ticked models; drives the picker badge and gates the Run button.
const selectedCount = computed<number>(() => {
  return selectedModels.value.length
})
|
||||
|
||||
/**
 * Report whether every model in a picker group is currently selected.
 *
 * @param source 'ollama' picks the Ollama list; any other value picks cf-text.
 * @returns true only for a non-empty group whose ids are all selected.
 */
function isGroupAllSelected(source: string): boolean {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  // An empty group is never reported as fully selected.
  if (members.length === 0) return false
  const chosen = new Set(selectedModels.value)
  return members.every(({ id }) => chosen.has(id))
}
|
||||
|
||||
/**
 * Report whether a picker group is partially selected (some, but not all).
 *
 * @param source 'ollama' picks the Ollama list; any other value picks cf-text.
 * @returns true when at least one but fewer than all group members are selected.
 */
function isGroupIndeterminate(source: string): boolean {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  const chosen = new Set(selectedModels.value)
  let hits = 0
  for (const { id } of members) {
    if (chosen.has(id)) hits += 1
  }
  return hits > 0 && hits < members.length
}
|
||||
|
||||
// ── Actions ──────────────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Fetch the model catalogue from `/api/voice/models` and refresh both groups.
 *
 * Sets `modelsLoading` for the duration of the request and reports failures
 * through `loadError` (the lists are left untouched on failure). After a
 * successful refresh, selected ids that no longer appear in either group are
 * dropped so the "N selected" badge and the next run only cover real models.
 */
async function loadModels() {
  modelsLoading.value = true
  loadError.value = ''
  try {
    const resp = await fetch('/api/voice/models')
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    const data = await resp.json()
    ollamaModels.value = data.ollama ?? []
    cftextModels.value = data.cf_text ?? []

    // Prune stale selections left over from a previous catalogue.
    const known = new Set(
      [...ollamaModels.value, ...cftextModels.value].map(m => m.id),
    )
    const kept = selectedModels.value.filter(id => known.has(id))
    // Only reassign when something was removed, to avoid a needless watcher run.
    if (kept.length !== selectedModels.value.length) {
      selectedModels.value = kept
    }
  } catch (e: unknown) {
    loadError.value = `Failed to load models: ${e instanceof Error ? e.message : String(e)}`
  } finally {
    modelsLoading.value = false
  }
}
|
||||
|
||||
/**
 * Refresh the saved-run history from `/api/voice/results`.
 *
 * Best-effort: a non-OK response or a network/parse error leaves the current
 * list unchanged and is not reported.
 */
async function loadPastRuns() {
  try {
    const resp = await fetch('/api/voice/results')
    if (!resp.ok) return
    pastRuns.value = await resp.json()
  } catch {
    // Non-fatal: the history dropdown simply keeps its previous contents.
  }
}
|
||||
|
||||
/**
 * Load a saved run into the results table.
 *
 * @param filename Results file name from the history list; an empty value is
 *   ignored (the dropdown's placeholder option has an empty value).
 *
 * On success the table is replaced and all expanded rows are collapsed. On
 * failure an "[error]" line is appended to the run log and the table is left
 * as it was.
 */
async function loadRun(filename: string) {
  if (!filename) return
  try {
    // Encode the name so characters such as '/', '?', '#' or spaces cannot
    // alter the request path or leak into the query/fragment.
    const resp = await fetch(`/api/voice/results/${encodeURIComponent(filename)}`)
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    results.value = await resp.json()
    expandedModels.value.clear()
  } catch (e: unknown) {
    runLog.value.push(`[error] Failed to load ${filename}: ${e instanceof Error ? e.message : String(e)}`)
  }
}
|
||||
|
||||
/**
 * Select or deselect every model in a picker group at once.
 *
 * @param source 'ollama' picks the Ollama list; any other value picks cf-text.
 * @param checked true adds the whole group to the selection, false removes it.
 */
function toggleGroup(source: string, checked: boolean) {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  const groupIds = members.map(({ id }) => id)

  if (!checked) {
    const drop = new Set(groupIds)
    selectedModels.value = selectedModels.value.filter(id => !drop.has(id))
    return
  }

  // Union of the current selection and the group, de-duplicated, with the
  // existing selection order kept and new ids appended.
  selectedModels.value = Array.from(new Set(selectedModels.value.concat(groupIds)))
}
|
||||
|
||||
/**
 * Flip the expanded state of one results row.
 *
 * @param modelId Model id of the row whose sample outputs are shown or hidden.
 */
function toggleExpanded(modelId: string) {
  const next = new Set(expandedModels.value)
  // delete() returns false when the id was absent, in which case it is added.
  if (!next.delete(modelId)) {
    next.add(modelId)
  }
  expandedModels.value = next
}
|
||||
|
||||
/**
 * Start a benchmark for the selected models and stream its progress.
 *
 * Does nothing when a run is already in progress or no model is selected.
 * Otherwise it clears the log, the results table, the row expansion state and
 * the saved-run selection, then opens a server-sent-event stream on
 * `/api/voice/run` with the current options as query parameters.
 *
 * Stream messages are JSON objects dispatched on `type`:
 *  - "progress": append `message` to the run log and scroll it to the bottom.
 *  - "result":   show `results`, reload the history, and select its first entry.
 *  - "complete": mark the run finished.
 *  - "error":    log `message` and mark the run finished.
 * Messages that fail to parse are ignored.
 *
 * The stream is closed as soon as `running` becomes false for any reason,
 * including cancelBenchmark(), so a cancelled stream cannot linger and later
 * report "Connection lost" against a newer run.
 */
function startBenchmark() {
  if (running.value || selectedCount.value === 0) return
  running.value = true
  runLog.value = []
  results.value = []
  expandedModels.value.clear()
  // The table is empty now, so no saved run is on display; clearing the
  // selection keeps "Send to Corrections" from targeting an unrelated file.
  selectedRun.value = ''

  const params = new URLSearchParams({
    models: selectedModels.value.join(','),
    use_cforch: String(useCforch.value),
    max_vram: String(maxVram.value),
    workers: String(workers.value),
    include_large: String(includeLarge.value),
  })

  const es = new EventSource(`/api/voice/run?${params}`)

  // Tie the stream's lifetime to `running`. flush: 'sync' makes the close
  // happen in the same tick as the state change, before a new run can start.
  const stopWatching = watch(
    running,
    (isRunning) => {
      if (isRunning) return
      es.close()
      stopWatching()
    },
    { flush: 'sync' },
  )

  es.onmessage = async (ev) => {
    try {
      const msg = JSON.parse(ev.data)
      if (msg.type === 'progress') {
        runLog.value.push(msg.message)
        // Wait for the DOM update so scrollHeight includes the new line.
        await nextTick()
        if (logEl.value) logEl.value.scrollTop = logEl.value.scrollHeight
      } else if (msg.type === 'result') {
        results.value = msg.results ?? []
        await loadPastRuns()
        // The history is treated as newest-first (onMounted loads index 0 as
        // the latest run), so point the selection at the run just shown.
        // NOTE(review): assumes the server has saved the run before emitting
        // "result" — confirm against the API.
        selectedRun.value = pastRuns.value[0]?.filename ?? ''
      } else if (msg.type === 'complete') {
        running.value = false
        es.close()
      } else if (msg.type === 'error') {
        runLog.value.push(`[error] ${msg.message}`)
        running.value = false
        es.close()
      }
    } catch { /* ignore parse errors */ }
  }

  es.onerror = () => {
    if (running.value) {
      runLog.value.push('[error] Connection lost')
      running.value = false
    }
    // Always close: an EventSource left open would reconnect automatically
    // and re-request the run endpoint.
    es.close()
  }
}
|
||||
|
||||
/**
 * Ask the server to cancel the running benchmark.
 *
 * Posts to `/api/voice/cancel`. The UI always leaves the running state
 * afterwards; the run log records "[cancelled]" when the request succeeded
 * and an "[error]" line when it failed, so a failed cancel is visible instead
 * of surfacing as an unhandled promise rejection.
 */
async function cancelBenchmark() {
  try {
    const resp = await fetch('/api/voice/cancel', { method: 'POST' })
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    runLog.value.push('[cancelled]')
  } catch (e: unknown) {
    runLog.value.push(`[error] Cancel request failed: ${e instanceof Error ? e.message : String(e)}`)
  } finally {
    // Unlock the controls whether or not the server acknowledged the cancel.
    running.value = false
  }
}
|
||||
|
||||
/**
 * Push the outputs of the selected saved run into the Corrections queue.
 *
 * Posts `{ filename, model_ids: [] }` to `/api/voice/send-to-corrections`
 * (NOTE(review): an empty `model_ids` presumably means "all models" — confirm
 * against the API). The outcome is written to `correctionsMsg`, which the
 * template shows as the button label.
 *
 * Ignored while a previous send is still in flight. When no saved run is
 * selected, a hint is shown instead of silently doing nothing.
 */
async function sendToCorrections() {
  if (sendingCorrections.value) return
  if (!selectedRun.value) {
    correctionsMsg.value = 'Select a saved run first'
    return
  }

  sendingCorrections.value = true
  correctionsMsg.value = ''
  try {
    const resp = await fetch('/api/voice/send-to-corrections', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ filename: selectedRun.value, model_ids: [] }),
    })
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    const data = await resp.json()
    correctionsMsg.value = `✓ ${data.imported} added to Corrections`
  } catch (e: unknown) {
    correctionsMsg.value = `Error: ${e instanceof Error ? e.message : String(e)}`
  } finally {
    sendingCorrections.value = false
  }
}
|
||||
|
||||
// ── Formatting helpers ────────────────────────────────────────────────────────
|
||||
|
||||
/**
 * Format a size given in megabytes for display.
 *
 * @param mb Size in MB.
 * @returns "<x.y> GB" (one decimal) from 1024 MB upwards, otherwise "<mb> MB".
 */
function formatMb(mb: number): string {
  if (mb >= 1024) {
    const gb = mb / 1024
    return `${gb.toFixed(1)} GB`
  }
  return `${mb} MB`
}
|
||||
|
||||
/**
 * Format a latency given in milliseconds for display.
 *
 * @param ms Latency in milliseconds.
 * @returns "<x.y>s" (one decimal) once the value rounds to 1000 ms or more,
 *   otherwise "<n>ms" with the value rounded to a whole millisecond.
 */
function formatLatency(ms: number): string {
  // Decide on the rounded value so that e.g. 999.6 becomes "1.0s" rather
  // than the awkward "1000ms".
  const wholeMs = Math.round(ms)
  return wholeMs >= 1000 ? `${(ms / 1000).toFixed(1)}s` : `${wholeMs}ms`
}
|
||||
|
||||
/**
 * Rank label for a zero-based position in the results table.
 *
 * @param index Zero-based row index.
 * @returns A medal emoji for the first three positions, otherwise "#<rank>"
 *   with a one-based rank.
 */
function medal(index: number): string {
  const podium = ['🥇', '🥈', '🥉']
  const icon = podium[index]
  return icon ?? `#${index + 1}`
}
|
||||
|
||||
/**
 * Inline style for a score pill, coloured from red (0) to green (100).
 *
 * @param score Score on a 0–100 scale. Values outside that range are clamped
 *   and non-finite values are treated as 0, so the hue always stays within
 *   0–120 and the result is always valid CSS.
 * @returns `background` and `color` declarations sharing the same hue.
 */
function scorePillStyle(score: number): Record<string, string> {
  const bounded = Number.isFinite(score) ? Math.min(100, Math.max(0, score)) : 0
  const hue = Math.round((bounded / 100) * 120) // 0=red, 120=green
  return {
    background: `hsl(${hue} 60% 88%)`,
    color: `hsl(${hue} 60% 28%)`,
  }
}
|
||||
|
||||
// ── Lifecycle ─────────────────────────────────────────────────────────────────
|
||||
|
||||
// Switch the cf-orch option on whenever the selection contains a cf-text
// model. The option is only ever turned on here, never off.
watch(selectedModels, (picked) => {
  const cftextIds = new Set(cftextModels.value.map(({ id }) => id))
  if (picked.some(id => cftextIds.has(id))) {
    useCforch.value = true
  }
})
|
||||
|
||||
// On mount: fetch the model catalogue and the run history in parallel, then
// show the first history entry (treated as the latest run) if there is one.
onMounted(async () => {
  await Promise.all([loadModels(), loadPastRuns()])

  const runs = pastRuns.value
  if (!runs.length) return

  const latest = runs[0].filename
  selectedRun.value = latest
  await loadRun(latest)
})
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.voice-tab {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1rem;
|
||||
padding: 1rem 0;
|
||||
}
|
||||
|
||||
/* ── Controls ─────────────────────────────────────────────────────────────── */
|
||||
|
||||
.voice-controls {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 0.75rem;
|
||||
align-items: flex-start;
|
||||
}
|
||||
|
||||
.model-picker,
|
||||
.options-panel {
|
||||
flex: 1;
|
||||
min-width: 280px;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.picker-summary {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
padding: 0.65rem 0.85rem;
|
||||
cursor: pointer;
|
||||
user-select: none;
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
list-style: none;
|
||||
}
|
||||
|
||||
.picker-summary::-webkit-details-marker { display: none; }
|
||||
|
||||
.picker-title { flex: 1; color: var(--color-text, #1a2338); }
|
||||
.picker-badge {
|
||||
background: var(--app-primary, #2A6080);
|
||||
color: #fff;
|
||||
border-radius: 9999px;
|
||||
padding: 0.1rem 0.5rem;
|
||||
font-size: 0.72rem;
|
||||
font-weight: 700;
|
||||
}
|
||||
|
||||
.btn-refresh {
|
||||
border: none;
|
||||
background: transparent;
|
||||
cursor: pointer;
|
||||
font-size: 0.85rem;
|
||||
padding: 0.1rem 0.25rem;
|
||||
border-radius: 0.25rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
.btn-refresh:hover { background: var(--color-border, #d0d7e8); }
|
||||
.btn-refresh:disabled { opacity: 0.5; cursor: not-allowed; }
|
||||
|
||||
.picker-body,
|
||||
.options-body {
|
||||
padding: 0.75rem;
|
||||
border-top: 1px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
|
||||
.picker-loading, .picker-empty {
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
font-size: 0.85rem;
|
||||
padding: 0.25rem 0;
|
||||
}
|
||||
|
||||
.picker-error {
|
||||
color: #b91c1c;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
/* ── Model groups ──────────────────────────────────────────────────────────── */
|
||||
|
||||
.picker-group {
|
||||
margin-bottom: 0.75rem;
|
||||
}
|
||||
.picker-group:last-child { margin-bottom: 0; }
|
||||
|
||||
.group-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
margin-bottom: 0.4rem;
|
||||
}
|
||||
|
||||
.group-check {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.35rem;
|
||||
font-size: 0.85rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.group-count {
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
font-weight: 400;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
|
||||
.group-note {
|
||||
margin-left: auto;
|
||||
font-size: 0.72rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
font-style: italic;
|
||||
}
|
||||
|
||||
.model-list {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.2rem;
|
||||
padding-left: 1.25rem;
|
||||
max-height: 220px;
|
||||
overflow-y: auto;
|
||||
}
|
||||
|
||||
.model-item {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.4rem;
|
||||
font-size: 0.82rem;
|
||||
cursor: pointer;
|
||||
padding: 0.15rem 0;
|
||||
}
|
||||
|
||||
.model-name { flex: 1; font-family: var(--font-mono, monospace); }
|
||||
|
||||
.model-meta {
|
||||
font-size: 0.72rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
|
||||
/* ── Options ──────────────────────────────────────────────────────────────── */
|
||||
|
||||
.option-row {
|
||||
display: flex;
|
||||
align-items: flex-start;
|
||||
gap: 0.5rem;
|
||||
padding: 0.35rem 0;
|
||||
cursor: pointer;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.option-label { font-weight: 500; white-space: nowrap; }
|
||||
|
||||
.option-hint {
|
||||
flex: 1;
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
margin-left: auto;
|
||||
text-align: right;
|
||||
}
|
||||
|
||||
.option-number {
|
||||
width: 90px;
|
||||
padding: 0.2rem 0.4rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.25rem;
|
||||
font-size: 0.85rem;
|
||||
background: var(--color-bg, #fff);
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.option-row.dimmed { opacity: 0.45; pointer-events: none; }
|
||||
|
||||
/* ── Run bar ──────────────────────────────────────────────────────────────── */
|
||||
|
||||
.run-bar {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.65rem;
|
||||
}
|
||||
|
||||
.btn-run {
|
||||
padding: 0.5rem 1.25rem;
|
||||
border: none;
|
||||
border-radius: 0.375rem;
|
||||
background: var(--app-primary, #2A6080);
|
||||
color: #fff;
|
||||
font-size: 0.9rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.btn-run:hover:not(:disabled) { background: color-mix(in srgb, var(--app-primary, #2A6080) 80%, #000); }
|
||||
.btn-run:disabled { opacity: 0.5; cursor: not-allowed; }
|
||||
|
||||
.btn-cancel {
|
||||
padding: 0.5rem 0.9rem;
|
||||
border: 1px solid #f85149;
|
||||
border-radius: 0.375rem;
|
||||
background: transparent;
|
||||
color: #b91c1c;
|
||||
font-size: 0.85rem;
|
||||
cursor: pointer;
|
||||
transition: background 0.15s;
|
||||
}
|
||||
.btn-cancel:hover { background: #fee2e2; }
|
||||
|
||||
.run-hint {
|
||||
font-size: 0.8rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
|
||||
/* ── Run log ──────────────────────────────────────────────────────────────── */
|
||||
|
||||
.run-log {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.run-log-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
padding: 0.4rem 0.75rem;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
font-size: 0.8rem;
|
||||
font-weight: 600;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
|
||||
.run-log-title { text-transform: uppercase; letter-spacing: 0.05em; }
|
||||
|
||||
.btn-clear {
|
||||
border: none;
|
||||
background: transparent;
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
cursor: pointer;
|
||||
padding: 0.1rem 0.3rem;
|
||||
border-radius: 0.25rem;
|
||||
}
|
||||
.btn-clear:hover { background: var(--color-border, #d0d7e8); }
|
||||
|
||||
.run-log-body {
|
||||
margin: 0;
|
||||
padding: 0.65rem 0.85rem;
|
||||
font-size: 0.78rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
white-space: pre-wrap;
|
||||
word-break: break-all;
|
||||
max-height: 260px;
|
||||
overflow-y: auto;
|
||||
background: var(--color-bg, #fff);
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
/* ── History bar ──────────────────────────────────────────────────────────── */
|
||||
|
||||
.history-bar {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.6rem;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.history-label { font-weight: 500; white-space: nowrap; }
|
||||
|
||||
.history-select {
|
||||
flex: 1;
|
||||
padding: 0.3rem 0.5rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.375rem;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
color: var(--color-text, #1a2338);
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
/* ── Results table ────────────────────────────────────────────────────────── */
|
||||
|
||||
.results-section { display: flex; flex-direction: column; gap: 0.75rem; }
|
||||
|
||||
.results-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.results-title {
|
||||
font-size: 1rem;
|
||||
font-weight: 700;
|
||||
color: var(--color-text, #1a2338);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.btn-corrections {
|
||||
padding: 0.4rem 0.9rem;
|
||||
border: 1px solid var(--app-primary, #2A6080);
|
||||
border-radius: 0.375rem;
|
||||
background: transparent;
|
||||
color: var(--app-primary, #2A6080);
|
||||
font-size: 0.83rem;
|
||||
font-weight: 600;
|
||||
cursor: pointer;
|
||||
white-space: nowrap;
|
||||
transition: background 0.15s, color 0.15s;
|
||||
}
|
||||
.btn-corrections:hover:not(:disabled) {
|
||||
background: var(--app-primary, #2A6080);
|
||||
color: #fff;
|
||||
}
|
||||
.btn-corrections:disabled { opacity: 0.55; cursor: not-allowed; }
|
||||
|
||||
.results-table-wrap {
|
||||
overflow-x: auto;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
}
|
||||
|
||||
.results-table {
|
||||
width: 100%;
|
||||
border-collapse: collapse;
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.results-table th {
|
||||
padding: 0.5rem 0.75rem;
|
||||
text-align: left;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
font-size: 0.78rem;
|
||||
font-weight: 700;
|
||||
text-transform: uppercase;
|
||||
letter-spacing: 0.04em;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
.result-row {
|
||||
cursor: pointer;
|
||||
transition: background 0.1s;
|
||||
}
|
||||
.result-row:hover { background: color-mix(in srgb, var(--app-primary, #2A6080) 6%, transparent); }
|
||||
.result-row.top-row { font-weight: 600; }
|
||||
|
||||
.result-row td {
|
||||
padding: 0.5rem 0.75rem;
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
.result-row:last-child td { border-bottom: none; }
|
||||
|
||||
.rank-cell { width: 2.5rem; text-align: center; font-size: 1.1rem; }
|
||||
.model-cell { font-family: var(--font-mono, monospace); word-break: break-all; }
|
||||
.score-cell { width: 5rem; text-align: center; }
|
||||
.latency-cell { width: 5rem; text-align: right; color: var(--color-text-secondary, #6b7a99); }
|
||||
.violation-cell { width: 4rem; text-align: center; color: var(--color-text-secondary, #6b7a99); }
|
||||
.violation-cell.has-violation { color: #b91c1c; font-weight: 700; }
|
||||
|
||||
.score-pill {
|
||||
display: inline-block;
|
||||
padding: 0.15rem 0.55rem;
|
||||
border-radius: 9999px;
|
||||
font-weight: 700;
|
||||
font-size: 0.82rem;
|
||||
}
|
||||
|
||||
/* ── Sample outputs ───────────────────────────────────────────────────────── */
|
||||
|
||||
.sample-outputs {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.sample-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
padding: 0.5rem 0.85rem;
|
||||
background: var(--color-surface, #f4f7fc);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
font-size: 0.85rem;
|
||||
}
|
||||
|
||||
.btn-collapse {
|
||||
border: none;
|
||||
background: transparent;
|
||||
font-size: 0.78rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
cursor: pointer;
|
||||
}
|
||||
|
||||
.sample-prompt {
|
||||
padding: 0.65rem 0.85rem;
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
.sample-prompt:last-child { border-bottom: none; }
|
||||
|
||||
.sample-tag {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
margin-bottom: 0.35rem;
|
||||
font-size: 0.8rem;
|
||||
}
|
||||
|
||||
.tag-name { font-weight: 600; color: var(--color-text, #1a2338); }
|
||||
.tag-score { color: var(--app-primary, #2A6080); font-weight: 700; }
|
||||
.tag-latency { color: var(--color-text-secondary, #6b7a99); margin-left: auto; }
|
||||
|
||||
.sample-text {
|
||||
margin: 0;
|
||||
font-size: 0.82rem;
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
max-height: 200px;
|
||||
overflow-y: auto;
|
||||
background: var(--color-bg, #fff);
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.35rem;
|
||||
padding: 0.5rem 0.65rem;
|
||||
color: var(--color-text, #1a2338);
|
||||
font-family: inherit;
|
||||
}
|
||||
|
||||
@media (max-width: 640px) {
|
||||
.voice-controls { flex-direction: column; }
|
||||
.model-picker, .options-panel { min-width: 0; }
|
||||
.option-hint { display: none; }
|
||||
.group-note { display: none; }
|
||||
}
|
||||
</style>
|
||||
Loading…
Reference in a new issue