refactor(bench): extract benchmark tabs — classifier, compare, llm-eval, style, voice

- BenchmarkView.vue: convert from monolithic view to tabbed shell; each tab is
  now its own component (ClassifierTab, CompareTab, LlmEvalTab, StyleTab, VoiceTab)
- StyleTab + VoiceTab: new benchmark modes for style and voice model evaluation
- app/style.py: FastAPI router for style imitation benchmarks
- app/voice.py: FastAPI router for voice benchmark endpoints
- scripts/benchmark_style.py + benchmark_voice.py: headless runner scripts
This commit is contained in:
pyr0ball 2026-04-24 14:56:17 -07:00
parent cc24cd0d7d
commit ddb56efb89
10 changed files with 7023 additions and 1837 deletions

427
app/style.py Normal file
View file

@ -0,0 +1,427 @@
"""Avocet — Writing style benchmark integration API.
Wraps scripts/benchmark_style.py and exposes it via the Avocet API.
Connection config (coordinator_url, ollama_url, python_bin) is read
from label_tool.yaml under the `cforch:` key — the same block used
by cforch.py, so no new config section is needed.
All endpoints are registered on `router` (a FastAPI APIRouter).
api.py includes this router with prefix="/api/style".
Module-level globals (_BENCH_RUNNING, _bench_proc) follow the same
testability pattern as cforch.py.
"""
from __future__ import annotations
import json
import logging
import subprocess as _subprocess
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import httpx
import yaml
from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
# Module logger, named after this module.
logger = logging.getLogger(__name__)
# Parent of the directory that contains this file.
_ROOT = Path(__file__).parent.parent
_CONFIG_DIR: Path | None = None # override in tests via set_config_dir()
# Set to True by the /run stream while it is active; /run answers 409 while set.
_BENCH_RUNNING: bool = False
# Popen handle of the running benchmark; /cancel calls terminate() on it.
_bench_proc: Any = None
# Script spawned by /run.
_BENCH_SCRIPT = _ROOT / "scripts" / "benchmark_style.py"
# Directory searched for style_*.json result files.
_RESULTS_DIR = _ROOT / "benchmark_results"
# All endpoints in this module are registered on this router.
router = APIRouter()
# ── Testability seams ──────────────────────────────────────────────────────────
def set_config_dir(path: Path | None) -> None:
    """Override the directory in which label_tool.yaml is looked up.

    Passing ``None`` restores the default ``<root>/config`` location used by
    ``_config_file``.
    """
    global _CONFIG_DIR
    _CONFIG_DIR = path
# ── Internal helpers ───────────────────────────────────────────────────────────
def _config_file() -> Path:
    """Return the path of label_tool.yaml, honouring the test override."""
    base_dir = _ROOT / "config" if _CONFIG_DIR is None else _CONFIG_DIR
    return base_dir / "label_tool.yaml"
def _load_config() -> dict:
    """Read the ``cforch`` section of label_tool.yaml.

    Returns:
        A dict with exactly three keys -- ``coordinator_url``, ``ollama_url``
        and ``python_bin`` -- each falling back to a built-in default when the
        file, the section, or the individual key is missing.

    A file that fails to parse, or whose root / ``cforch`` value is not a
    mapping, is logged and treated as empty instead of raising.
    """
    f = _config_file()
    file_cfg: dict = {}
    if f.exists():
        try:
            raw = yaml.safe_load(f.read_text(encoding="utf-8"))
        except yaml.YAMLError as exc:
            logger.warning("Failed to parse style config %s: %s", f, exc)
        else:
            # safe_load returns whatever the document root is (list, str, ...).
            # Only a mapping can carry a ``cforch`` section; any other truthy
            # value is reported and ignored rather than crashing on ``.get``.
            section = raw.get("cforch") if isinstance(raw, dict) else raw
            if isinstance(section, dict):
                file_cfg = section
            elif section:
                logger.warning(
                    "Ignoring style config %s: expected a mapping, got %s",
                    f, type(section).__name__,
                )
    return {
        "coordinator_url": file_cfg.get("coordinator_url", "http://10.1.10.71:7700"),
        "ollama_url": file_cfg.get("ollama_url", "http://localhost:11434"),
        "python_bin": file_cfg.get("python_bin", "/devl/miniconda3/envs/cf/bin/python"),
    }
# ── GET /models ────────────────────────────────────────────────────────────────
@router.get("/models")
def get_models() -> dict:
    """Return the models available for benchmarking, grouped by source.

    - ``ollama``: queried live from ``<ollama_url>/api/tags``.
    - ``cf_text``: queried from ``<coordinator_url>/api/services/cf-text/catalog``.

    A failure while querying either source is logged and leaves that group
    with whatever entries were collected before the failure.
    """
    settings = _load_config()

    # Ollama models: live query so newly downloaded models appear immediately.
    ollama_entries: list[dict] = []
    try:
        tags = httpx.get(f"{settings['ollama_url']}/api/tags", timeout=5.0)
        tags.raise_for_status()
        for item in tags.json().get("models", []):
            model_name = item.get("name", "")
            if not model_name:
                continue
            raw_size = item.get("size", 0)
            ollama_entries.append({
                "id": model_name,
                "name": model_name,
                "source": "ollama",
                "size_mb": round(raw_size / (1024 * 1024)) if raw_size else None,
                "vram_mb": None,
            })
    except Exception as exc:
        logger.warning("Failed to fetch ollama models: %s", exc)

    # cf-text catalog: served by the cf-orch coordinator.
    catalog_entries: list[dict] = []
    try:
        catalog = httpx.get(
            f"{settings['coordinator_url']}/api/services/cf-text/catalog",
            timeout=5.0,
        )
        catalog.raise_for_status()
        for model_id, meta in catalog.json().items():
            if not isinstance(meta, dict):
                continue
            catalog_entries.append({
                "id": model_id,
                "name": model_id,
                "source": "cf-text",
                "vram_mb": meta.get("vram_mb"),
                "description": meta.get("description", ""),
            })
    except Exception as exc:
        logger.warning("Failed to fetch cf-text catalog: %s", exc)

    return {"ollama": ollama_entries, "cf_text": catalog_entries}
# ── GET /run ───────────────────────────────────────────────────────────────────
@router.get("/run")
def run_style_benchmark(
    models: str = Query("", description="Comma-separated model IDs (empty = all)"),
    use_cforch: bool = Query(False),
    max_vram: int = Query(7200, description="Max VRAM MB for cf-orch OOM filter"),
    include_large: bool = Query(False, description="Include large (30B+) ollama models"),
    workers: int = Query(1, description="Parallel workers — run N models simultaneously"),
) -> StreamingResponse:
    """Spawn benchmark_style.py and stream its output as Server-Sent Events.

    Every non-empty line the script prints (stderr is merged into stdout) is
    sent as a ``type: progress`` event. When the script exits with code 0, the
    newest ``style_*.json`` in the results directory is sent as a
    ``type: result`` event (if it can be read), followed by ``type: complete``.
    A missing script, a failure to start the process, or a non-zero exit code
    produces a ``type: error`` event instead.

    Raises:
        HTTPException: 409 if a writing style benchmark is already running.
    """
    global _BENCH_RUNNING, _bench_proc
    if _BENCH_RUNNING:
        raise HTTPException(409, "A writing style benchmark is already running")
    cfg = _load_config()
    python_bin = cfg["python_bin"]

    def generate():
        global _BENCH_RUNNING, _bench_proc
        if not _BENCH_SCRIPT.exists():
            yield f"data: {json.dumps({'type': 'error', 'message': f'benchmark_style.py not found at {_BENCH_SCRIPT}'})}\n\n"
            return

        cmd = [python_bin, str(_BENCH_SCRIPT), "run"]
        # Drop blank entries first so "," or " , " means "all models" instead
        # of passing an empty --models value to the script.
        model_ids = [m.strip() for m in models.split(",") if m.strip()]
        if model_ids:
            cmd.extend(["--models", ",".join(model_ids)])
        if use_cforch:
            cmd.extend(["--cforch", "--cforch-url", cfg["coordinator_url"],
                        "--max-vram", str(max_vram)])
        if include_large:
            cmd.append("--include-large")
        if workers > 1:
            cmd.extend(["--workers", str(workers)])

        _BENCH_RUNNING = True
        proc = None
        try:
            proc = _subprocess.Popen(
                cmd,
                stdout=_subprocess.PIPE,
                stderr=_subprocess.STDOUT,
                text=True,
                bufsize=1,
                cwd=str(_ROOT),
            )
            _bench_proc = proc
            for line in proc.stdout:
                line = line.rstrip()
                if line:
                    yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n"
            proc.wait()
            if proc.returncode == 0:
                result_files = sorted(_RESULTS_DIR.glob("style_*.json"))
                if result_files:
                    try:
                        results = json.loads(result_files[-1].read_text(encoding="utf-8"))
                        yield f"data: {json.dumps({'type': 'result', 'results': results, 'filename': result_files[-1].name})}\n\n"
                    except Exception as exc:
                        logger.warning("Failed to read style results: %s", exc)
                yield f"data: {json.dumps({'type': 'complete'})}\n\n"
            else:
                yield f"data: {json.dumps({'type': 'error', 'message': f'Process exited with code {proc.returncode}'})}\n\n"
        except Exception as exc:
            yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
        finally:
            # Also reached when the generator is closed early (e.g. the client
            # disconnected mid-stream). Stop the child in that case so it does
            # not keep running after the "running" flag has been cleared.
            if proc is not None:
                if proc.poll() is None:
                    proc.terminate()
                    try:
                        proc.wait(timeout=5)
                    except _subprocess.TimeoutExpired:
                        proc.kill()
                        proc.wait()
                if proc.stdout is not None:
                    proc.stdout.close()
            _bench_proc = None
            _BENCH_RUNNING = False

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
# ── GET /results ───────────────────────────────────────────────────────────────
@router.get("/results")
def list_results() -> list[dict]:
    """List past writing style benchmark runs, newest first.

    Each entry is a lightweight summary: file name, display date derived from
    the file name, number of models, and the highest ``avg_score``. A file
    that cannot be read or summarised is listed with zero counts. Use
    /results/{filename} to fetch the full model-level detail.
    """
    if not _RESULTS_DIR.exists():
        return []

    summaries: list[dict] = []
    for path in sorted(_RESULTS_DIR.glob("style_*.json"), reverse=True):
        # style_2026-04-22_1502 -> 2026-04-22_1502
        stamp = path.stem.removeprefix("style_")
        try:
            day, clock = stamp.split("_")
            shown = f"{day} {clock[:2]}:{clock[2:]}"
        except Exception:
            shown = stamp

        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
            best = max((row.get("avg_score", 0) for row in payload), default=0)
            count = len(payload)
        except Exception:
            best, count = 0, 0

        summaries.append({
            "filename": path.name,
            "date": shown,
            "model_count": count,
            "top_score": round(best, 1),
        })
    return summaries
@router.get("/results/latest")
def get_latest_results() -> list[dict]:
    """Return the contents of the newest style_*.json result file.

    Raises:
        HTTPException: 404 when no result file exists, 500 when the newest
            file cannot be read or parsed.
    """
    candidates = sorted(_RESULTS_DIR.glob("style_*.json")) if _RESULTS_DIR.exists() else []
    if not candidates:
        raise HTTPException(404, "No benchmark results found")
    newest = candidates[-1]
    try:
        return json.loads(newest.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc
@router.get("/results/{filename}")
def get_results_by_filename(filename: str) -> list[dict]:
    """Return writing style benchmark results for a specific run file.

    Raises:
        HTTPException: 400 unless ``filename`` is a bare ``style_*.json`` name,
            404 if the file does not exist, 500 if it cannot be read or parsed.
    """
    # The name is joined onto the results directory, so refuse anything that
    # contains a path separator and could address a file outside it.
    is_bare_name = "/" not in filename and "\\" not in filename
    if not (filename.startswith("style_") and filename.endswith(".json") and is_bare_name):
        raise HTTPException(400, "Invalid filename — expected style_*.json")
    f = _RESULTS_DIR / filename
    if not f.exists():
        raise HTTPException(404, f"Results file not found: {filename}")
    try:
        return json.loads(f.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc
# ── POST /send-to-corrections ──────────────────────────────────────────────────
class SendToCorrectionsRequest(BaseModel):
    """Request body accepted by POST /send-to-corrections."""
    filename: str # style_YYYY-MM-DD_HHMM.json — the source run file
    model_ids: list[str] = [] # empty = all models in the run
@router.post("/send-to-corrections")
def send_to_corrections(req: SendToCorrectionsRequest) -> dict:
    """Push writing style benchmark outputs into the SFT corrections queue.

    Each prompt_result of the selected models becomes one SFT candidate with
    status='needs_review', appended to data/sft_candidates.jsonl. The
    candidate id is a deterministic UUIDv5 of run + model + prompt tag, so
    sending the same run again does not create duplicates.

    Returns:
        ``{"imported": <candidates written>, "skipped": <duplicates ignored>}``.

    Raises:
        HTTPException: 400 unless ``filename`` is a bare ``style_*.json`` name,
            404 if the run file does not exist, 500 if it cannot be read.
    """
    # The name is joined onto the results directory, so refuse anything that
    # contains a path separator and could address a file outside it.
    is_bare_name = "/" not in req.filename and "\\" not in req.filename
    if not (req.filename.startswith("style_") and req.filename.endswith(".json") and is_bare_name):
        raise HTTPException(400, "Invalid filename")
    src = _RESULTS_DIR / req.filename
    if not src.exists():
        raise HTTPException(404, f"Results file not found: {req.filename}")
    try:
        run_results: list[dict] = json.loads(src.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc

    # Resolve sft_candidates.jsonl path (same logic as sft.py)
    sft_data_dir = _ROOT / "data"
    sft_file = sft_data_dir / "sft_candidates.jsonl"

    # Load existing IDs to deduplicate
    existing_ids: set[str] = set()
    if sft_file.exists():
        with open(sft_file, encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    existing_ids.add(json.loads(line)["id"])
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Best effort: a malformed or id-less line cannot collide
                    # with a new candidate, so it is simply ignored.
                    continue

    run_id = req.filename.removesuffix(".json")  # style_2026-04-22_1502
    timestamp = datetime.now(tz=timezone.utc).isoformat()
    new_candidates: list[dict] = []
    skipped = 0
    for model_result in run_results:
        model_id = model_result.get("model_id", "")
        if req.model_ids and model_id not in req.model_ids:
            continue
        for pr in model_result.get("prompt_results", []):
            tag = pr.get("tag", "")
            # Stable id: deterministic hash of run + model + prompt tag
            candidate_id = str(uuid.uuid5(
                uuid.NAMESPACE_URL,
                f"style-benchmark/{run_id}/{model_id}/{tag}",
            ))
            if candidate_id in existing_ids:
                skipped += 1
                continue
            score_pct = pr.get("score", 0.0) / 100.0
            signals = pr.get("signals", {})
            # Build the prompt message list matching the benchmark's actual request
            prompt_messages = [
                {"role": "system", "content": _STYLE_SYSTEM_PROMPT},
                {"role": "user", "content": pr.get("user_prompt", tag)},
            ]
            new_candidates.append({
                "id": candidate_id,
                "source": "style-benchmark",
                "benchmark_run_id": run_id,
                "timestamp": timestamp,
                "status": "needs_review",
                "prompt_messages": prompt_messages,
                "model_response": pr.get("output", ""),
                "corrected_response": None,
                "quality_score": round(score_pct, 4),
                "failure_reason": _build_failure_reason(pr, signals),
                "failure_category": None,
                "task_id": f"style/{tag}",
                "task_type": "style-match",
                "task_name": tag.replace("_", " ").title(),
                "model_id": model_id,
                "model_name": model_id,
                "node_id": "",
                "gpu_id": 0,
                "tokens_per_sec": 0,
            })
            # Also guards against the same tag appearing twice within one run.
            existing_ids.add(candidate_id)

    if new_candidates:
        sft_data_dir.mkdir(parents=True, exist_ok=True)
        with open(sft_file, "a", encoding="utf-8") as fh:
            for c in new_candidates:
                fh.write(json.dumps(c) + "\n")
    return {"imported": len(new_candidates), "skipped": skipped}
# Excerpt of the system prompt used in benchmark_style.py — reproduced here
# so the SFT candidate captures the full generation context.
# NOTE(review): this is a shortened paraphrase, not a verbatim copy of
# SYSTEM_PROMPT in scripts/benchmark_style.py (that one has further voice
# bullets and a "Write ONLY the reply" instruction). Confirm whether SFT
# candidates should record the exact prompt instead.
_STYLE_SYSTEM_PROMPT = (
    "You are a writing assistant. Your job is to write a Reddit reply that matches "
    "the voice, tone, and style of the provided samples exactly.\n\n"
    "Voice characteristics:\n"
    "- Casual engineer tone. Short punchy sentences.\n"
    "- No em dashes. No semicolons. No filler phrases.\n"
    "- Direct. Opinionated. Community-first."
)
def _build_failure_reason(pr: dict, signals: dict) -> str | None:
"""Return a human-readable failure reason string if there are violations."""
reasons = []
if signals.get("em_dash_count", 0) > 0:
reasons.append(f"{signals['em_dash_count']} em dash(es)")
if signals.get("semicolon_count", 0) > 0:
reasons.append(f"{signals['semicolon_count']} semicolon(s)")
if signals.get("filler_hits"):
reasons.append(f"filler phrases: {', '.join(signals['filler_hits'])}")
if not pr.get("output", "").strip():
reasons.append("empty output")
return "; ".join(reasons) if reasons else None
# ── POST /cancel ───────────────────────────────────────────────────────────────
@router.post("/cancel")
def cancel_style_benchmark() -> dict:
    """Terminate the running writing style benchmark subprocess.

    Clears the module-level running flag and process handle whether or not
    the terminate call succeeds.

    Raises:
        HTTPException: 404 if no writing style benchmark is running.
    """
    global _BENCH_RUNNING, _bench_proc
    if not _BENCH_RUNNING:
        raise HTTPException(404, "No writing style benchmark is currently running")

    proc = _bench_proc
    if proc is not None:
        try:
            proc.terminate()
        except Exception as exc:
            logger.warning("Failed to terminate style benchmark: %s", exc)

    _BENCH_RUNNING, _bench_proc = False, None
    return {"status": "cancelled"}

427
app/voice.py Normal file
View file

@ -0,0 +1,427 @@
"""Avocet — Voice benchmark integration API.
Wraps scripts/benchmark_voice.py and exposes it via the Avocet API.
Connection config (coordinator_url, ollama_url, python_bin) is read
from label_tool.yaml under the `cforch:` key — the same block used
by cforch.py, so no new config section is needed.
All endpoints are registered on `router` (a FastAPI APIRouter).
api.py includes this router with prefix="/api/voice".
Module-level globals (_BENCH_RUNNING, _bench_proc) follow the same
testability pattern as cforch.py.
"""
from __future__ import annotations
import json
import logging
import subprocess as _subprocess
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import httpx
import yaml
from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
# Module logger, named after this module.
logger = logging.getLogger(__name__)
# Parent of the directory that contains this file.
_ROOT = Path(__file__).parent.parent
_CONFIG_DIR: Path | None = None # override in tests via set_config_dir()
# Set to True by the /run stream while it is active; /run answers 409 while set.
_BENCH_RUNNING: bool = False
# Popen handle of the running benchmark; /cancel calls terminate() on it.
_bench_proc: Any = None
# Script spawned by /run.
_BENCH_SCRIPT = _ROOT / "scripts" / "benchmark_voice.py"
# Directory searched for voice_*.json result files.
_RESULTS_DIR = _ROOT / "benchmark_results"
# All endpoints in this module are registered on this router.
router = APIRouter()
# ── Testability seams ──────────────────────────────────────────────────────────
def set_config_dir(path: Path | None) -> None:
    """Override the directory in which label_tool.yaml is looked up.

    Passing ``None`` restores the default ``<root>/config`` location used by
    ``_config_file``.
    """
    global _CONFIG_DIR
    _CONFIG_DIR = path
# ── Internal helpers ───────────────────────────────────────────────────────────
def _config_file() -> Path:
    """Return the path of label_tool.yaml, honouring the test override."""
    base_dir = _ROOT / "config" if _CONFIG_DIR is None else _CONFIG_DIR
    return base_dir / "label_tool.yaml"
def _load_config() -> dict:
    """Read the ``cforch`` section of label_tool.yaml.

    Returns:
        A dict with exactly three keys -- ``coordinator_url``, ``ollama_url``
        and ``python_bin`` -- each falling back to a built-in default when the
        file, the section, or the individual key is missing.

    A file that fails to parse, or whose root / ``cforch`` value is not a
    mapping, is logged and treated as empty instead of raising.
    """
    f = _config_file()
    file_cfg: dict = {}
    if f.exists():
        try:
            raw = yaml.safe_load(f.read_text(encoding="utf-8"))
        except yaml.YAMLError as exc:
            logger.warning("Failed to parse voice config %s: %s", f, exc)
        else:
            # safe_load returns whatever the document root is (list, str, ...).
            # Only a mapping can carry a ``cforch`` section; any other truthy
            # value is reported and ignored rather than crashing on ``.get``.
            section = raw.get("cforch") if isinstance(raw, dict) else raw
            if isinstance(section, dict):
                file_cfg = section
            elif section:
                logger.warning(
                    "Ignoring voice config %s: expected a mapping, got %s",
                    f, type(section).__name__,
                )
    return {
        "coordinator_url": file_cfg.get("coordinator_url", "http://10.1.10.71:7700"),
        "ollama_url": file_cfg.get("ollama_url", "http://localhost:11434"),
        "python_bin": file_cfg.get("python_bin", "/devl/miniconda3/envs/cf/bin/python"),
    }
# ── GET /models ────────────────────────────────────────────────────────────────
@router.get("/models")
def get_models() -> dict:
    """Return the models available for benchmarking, grouped by source.

    - ``ollama``: queried live from ``<ollama_url>/api/tags``.
    - ``cf_text``: queried from ``<coordinator_url>/api/services/cf-text/catalog``.

    A failure while querying either source is logged and leaves that group
    with whatever entries were collected before the failure.
    """
    settings = _load_config()

    # Ollama models: live query so newly downloaded models appear immediately.
    ollama_entries: list[dict] = []
    try:
        tags = httpx.get(f"{settings['ollama_url']}/api/tags", timeout=5.0)
        tags.raise_for_status()
        for item in tags.json().get("models", []):
            model_name = item.get("name", "")
            if not model_name:
                continue
            raw_size = item.get("size", 0)
            ollama_entries.append({
                "id": model_name,
                "name": model_name,
                "source": "ollama",
                "size_mb": round(raw_size / (1024 * 1024)) if raw_size else None,
                "vram_mb": None,
            })
    except Exception as exc:
        logger.warning("Failed to fetch ollama models: %s", exc)

    # cf-text catalog: served by the cf-orch coordinator.
    catalog_entries: list[dict] = []
    try:
        catalog = httpx.get(
            f"{settings['coordinator_url']}/api/services/cf-text/catalog",
            timeout=5.0,
        )
        catalog.raise_for_status()
        for model_id, meta in catalog.json().items():
            if not isinstance(meta, dict):
                continue
            catalog_entries.append({
                "id": model_id,
                "name": model_id,
                "source": "cf-text",
                "vram_mb": meta.get("vram_mb"),
                "description": meta.get("description", ""),
            })
    except Exception as exc:
        logger.warning("Failed to fetch cf-text catalog: %s", exc)

    return {"ollama": ollama_entries, "cf_text": catalog_entries}
# ── GET /run ───────────────────────────────────────────────────────────────────
@router.get("/run")
def run_voice_benchmark(
    models: str = Query("", description="Comma-separated model IDs (empty = all)"),
    use_cforch: bool = Query(False),
    max_vram: int = Query(7200, description="Max VRAM MB for cf-orch OOM filter"),
    include_large: bool = Query(False, description="Include large (30B+) ollama models"),
    workers: int = Query(1, description="Parallel workers — run N models simultaneously"),
) -> StreamingResponse:
    """Spawn benchmark_voice.py and stream its output as Server-Sent Events.

    Every non-empty line the script prints (stderr is merged into stdout) is
    sent as a ``type: progress`` event. When the script exits with code 0, the
    newest ``voice_*.json`` in the results directory is sent as a
    ``type: result`` event (if it can be read), followed by ``type: complete``.
    A missing script, a failure to start the process, or a non-zero exit code
    produces a ``type: error`` event instead.

    Raises:
        HTTPException: 409 if a voice benchmark is already running.
    """
    global _BENCH_RUNNING, _bench_proc
    if _BENCH_RUNNING:
        raise HTTPException(409, "A voice benchmark is already running")
    cfg = _load_config()
    python_bin = cfg["python_bin"]

    def generate():
        global _BENCH_RUNNING, _bench_proc
        if not _BENCH_SCRIPT.exists():
            yield f"data: {json.dumps({'type': 'error', 'message': f'benchmark_voice.py not found at {_BENCH_SCRIPT}'})}\n\n"
            return

        cmd = [python_bin, str(_BENCH_SCRIPT), "run"]
        # Drop blank entries first so "," or " , " means "all models" instead
        # of passing an empty --models value to the script.
        model_ids = [m.strip() for m in models.split(",") if m.strip()]
        if model_ids:
            cmd.extend(["--models", ",".join(model_ids)])
        if use_cforch:
            cmd.extend(["--cforch", "--cforch-url", cfg["coordinator_url"],
                        "--max-vram", str(max_vram)])
        if include_large:
            cmd.append("--include-large")
        if workers > 1:
            cmd.extend(["--workers", str(workers)])

        _BENCH_RUNNING = True
        proc = None
        try:
            proc = _subprocess.Popen(
                cmd,
                stdout=_subprocess.PIPE,
                stderr=_subprocess.STDOUT,
                text=True,
                bufsize=1,
                cwd=str(_ROOT),
            )
            _bench_proc = proc
            for line in proc.stdout:
                line = line.rstrip()
                if line:
                    yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n"
            proc.wait()
            if proc.returncode == 0:
                result_files = sorted(_RESULTS_DIR.glob("voice_*.json"))
                if result_files:
                    try:
                        results = json.loads(result_files[-1].read_text(encoding="utf-8"))
                        yield f"data: {json.dumps({'type': 'result', 'results': results, 'filename': result_files[-1].name})}\n\n"
                    except Exception as exc:
                        logger.warning("Failed to read voice results: %s", exc)
                yield f"data: {json.dumps({'type': 'complete'})}\n\n"
            else:
                yield f"data: {json.dumps({'type': 'error', 'message': f'Process exited with code {proc.returncode}'})}\n\n"
        except Exception as exc:
            yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
        finally:
            # Also reached when the generator is closed early (e.g. the client
            # disconnected mid-stream). Stop the child in that case so it does
            # not keep running after the "running" flag has been cleared.
            if proc is not None:
                if proc.poll() is None:
                    proc.terminate()
                    try:
                        proc.wait(timeout=5)
                    except _subprocess.TimeoutExpired:
                        proc.kill()
                        proc.wait()
                if proc.stdout is not None:
                    proc.stdout.close()
            _bench_proc = None
            _BENCH_RUNNING = False

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
# ── GET /results ───────────────────────────────────────────────────────────────
@router.get("/results")
def list_results() -> list[dict]:
    """List past voice benchmark runs, newest first.

    Each entry is a lightweight summary: file name, display date derived from
    the file name, number of models, and the highest ``avg_score``. A file
    that cannot be read or summarised is listed with zero counts. Use
    /results/{filename} to fetch the full model-level detail.
    """
    if not _RESULTS_DIR.exists():
        return []

    summaries: list[dict] = []
    for path in sorted(_RESULTS_DIR.glob("voice_*.json"), reverse=True):
        # voice_2026-04-22_1502 -> 2026-04-22_1502
        stamp = path.stem.removeprefix("voice_")
        try:
            day, clock = stamp.split("_")
            shown = f"{day} {clock[:2]}:{clock[2:]}"
        except Exception:
            shown = stamp

        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
            best = max((row.get("avg_score", 0) for row in payload), default=0)
            count = len(payload)
        except Exception:
            best, count = 0, 0

        summaries.append({
            "filename": path.name,
            "date": shown,
            "model_count": count,
            "top_score": round(best, 1),
        })
    return summaries
@router.get("/results/latest")
def get_latest_results() -> list[dict]:
    """Return the contents of the newest voice_*.json result file.

    Raises:
        HTTPException: 404 when no result file exists, 500 when the newest
            file cannot be read or parsed.
    """
    candidates = sorted(_RESULTS_DIR.glob("voice_*.json")) if _RESULTS_DIR.exists() else []
    if not candidates:
        raise HTTPException(404, "No benchmark results found")
    newest = candidates[-1]
    try:
        return json.loads(newest.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc
@router.get("/results/{filename}")
def get_results_by_filename(filename: str) -> list[dict]:
    """Return voice benchmark results for a specific run file.

    Raises:
        HTTPException: 400 unless ``filename`` is a bare ``voice_*.json`` name,
            404 if the file does not exist, 500 if it cannot be read or parsed.
    """
    # The name is joined onto the results directory, so refuse anything that
    # contains a path separator and could address a file outside it.
    is_bare_name = "/" not in filename and "\\" not in filename
    if not (filename.startswith("voice_") and filename.endswith(".json") and is_bare_name):
        raise HTTPException(400, "Invalid filename — expected voice_*.json")
    f = _RESULTS_DIR / filename
    if not f.exists():
        raise HTTPException(404, f"Results file not found: {filename}")
    try:
        return json.loads(f.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc
# ── POST /send-to-corrections ──────────────────────────────────────────────────
class SendToCorrectionsRequest(BaseModel):
    """Request body accepted by POST /send-to-corrections."""
    filename: str # voice_YYYY-MM-DD_HHMM.json — the source run file
    model_ids: list[str] = [] # empty = all models in the run
@router.post("/send-to-corrections")
def send_to_corrections(req: SendToCorrectionsRequest) -> dict:
    """Push voice benchmark outputs into the SFT corrections queue.

    Each prompt_result of the selected models becomes one SFT candidate with
    status='needs_review', appended to data/sft_candidates.jsonl. The
    candidate id is a deterministic UUIDv5 of run + model + prompt tag, so
    sending the same run again does not create duplicates.

    Returns:
        ``{"imported": <candidates written>, "skipped": <duplicates ignored>}``.

    Raises:
        HTTPException: 400 unless ``filename`` is a bare ``voice_*.json`` name,
            404 if the run file does not exist, 500 if it cannot be read.
    """
    # The name is joined onto the results directory, so refuse anything that
    # contains a path separator and could address a file outside it.
    is_bare_name = "/" not in req.filename and "\\" not in req.filename
    if not (req.filename.startswith("voice_") and req.filename.endswith(".json") and is_bare_name):
        raise HTTPException(400, "Invalid filename")
    src = _RESULTS_DIR / req.filename
    if not src.exists():
        raise HTTPException(404, f"Results file not found: {req.filename}")
    try:
        run_results: list[dict] = json.loads(src.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc

    # Resolve sft_candidates.jsonl path (same logic as sft.py)
    sft_data_dir = _ROOT / "data"
    sft_file = sft_data_dir / "sft_candidates.jsonl"

    # Load existing IDs to deduplicate
    existing_ids: set[str] = set()
    if sft_file.exists():
        with open(sft_file, encoding="utf-8") as fh:
            for line in fh:
                line = line.strip()
                if not line:
                    continue
                try:
                    existing_ids.add(json.loads(line)["id"])
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Best effort: a malformed or id-less line cannot collide
                    # with a new candidate, so it is simply ignored.
                    continue

    run_id = req.filename.removesuffix(".json")  # voice_2026-04-22_1502
    timestamp = datetime.now(tz=timezone.utc).isoformat()
    new_candidates: list[dict] = []
    skipped = 0
    for model_result in run_results:
        model_id = model_result.get("model_id", "")
        if req.model_ids and model_id not in req.model_ids:
            continue
        for pr in model_result.get("prompt_results", []):
            tag = pr.get("tag", "")
            # Stable id: deterministic hash of run + model + prompt tag
            candidate_id = str(uuid.uuid5(
                uuid.NAMESPACE_URL,
                f"voice-benchmark/{run_id}/{model_id}/{tag}",
            ))
            if candidate_id in existing_ids:
                skipped += 1
                continue
            score_pct = pr.get("score", 0.0) / 100.0
            signals = pr.get("signals", {})
            # Build the prompt message list matching the benchmark's actual request
            prompt_messages = [
                {"role": "system", "content": _VOICE_SYSTEM_PROMPT},
                {"role": "user", "content": pr.get("user_prompt", tag)},
            ]
            new_candidates.append({
                "id": candidate_id,
                "source": "voice-benchmark",
                "benchmark_run_id": run_id,
                "timestamp": timestamp,
                "status": "needs_review",
                "prompt_messages": prompt_messages,
                "model_response": pr.get("output", ""),
                "corrected_response": None,
                "quality_score": round(score_pct, 4),
                "failure_reason": _build_failure_reason(pr, signals),
                "failure_category": None,
                "task_id": f"voice/{tag}",
                "task_type": "voice-match",
                "task_name": tag.replace("_", " ").title(),
                "model_id": model_id,
                "model_name": model_id,
                "node_id": "",
                "gpu_id": 0,
                "tokens_per_sec": 0,
            })
            # Also guards against the same tag appearing twice within one run.
            existing_ids.add(candidate_id)

    if new_candidates:
        sft_data_dir.mkdir(parents=True, exist_ok=True)
        with open(sft_file, "a", encoding="utf-8") as fh:
            for c in new_candidates:
                fh.write(json.dumps(c) + "\n")
    return {"imported": len(new_candidates), "skipped": skipped}
# Excerpt of the system prompt used in benchmark_voice.py — reproduced here
# so the SFT candidate captures the full generation context.
# NOTE(review): this text is identical to _STYLE_SYSTEM_PROMPT in
# app/style.py; benchmark_voice.py is not visible from here — confirm it
# matches the prompt the voice benchmark actually sends.
_VOICE_SYSTEM_PROMPT = (
    "You are a writing assistant. Your job is to write a Reddit reply that matches "
    "the voice, tone, and style of the provided samples exactly.\n\n"
    "Voice characteristics:\n"
    "- Casual engineer tone. Short punchy sentences.\n"
    "- No em dashes. No semicolons. No filler phrases.\n"
    "- Direct. Opinionated. Community-first."
)
def _build_failure_reason(pr: dict, signals: dict) -> str | None:
"""Return a human-readable failure reason string if there are violations."""
reasons = []
if signals.get("em_dash_count", 0) > 0:
reasons.append(f"{signals['em_dash_count']} em dash(es)")
if signals.get("semicolon_count", 0) > 0:
reasons.append(f"{signals['semicolon_count']} semicolon(s)")
if signals.get("filler_hits"):
reasons.append(f"filler phrases: {', '.join(signals['filler_hits'])}")
if not pr.get("output", "").strip():
reasons.append("empty output")
return "; ".join(reasons) if reasons else None
# ── POST /cancel ───────────────────────────────────────────────────────────────
@router.post("/cancel")
def cancel_voice_benchmark() -> dict:
    """Terminate the running voice benchmark subprocess.

    Clears the module-level running flag and process handle whether or not
    the terminate call succeeds.

    Raises:
        HTTPException: 404 if no voice benchmark is running.
    """
    global _BENCH_RUNNING, _bench_proc
    if not _BENCH_RUNNING:
        raise HTTPException(404, "No voice benchmark is currently running")

    proc = _bench_proc
    if proc is not None:
        try:
            proc.terminate()
        except Exception as exc:
            logger.warning("Failed to terminate voice benchmark: %s", exc)

    _BENCH_RUNNING, _bench_proc = False, None
    return {"status": "cancelled"}

952
scripts/benchmark_style.py Normal file
View file

@ -0,0 +1,952 @@
#!/usr/bin/env python
"""
Writing style benchmark harness -- score local text-gen models for writing style match.
Runs each model against a set of test prompts, extracts style signals from the
outputs, compares them to a style corpus, and produces a ranked markdown table.
Usage:
# List available ollama models
conda run -n cf python scripts/benchmark_style.py --list-models
# Run against all models with default test prompts
conda run -n cf python scripts/benchmark_style.py --run
# Run specific models only
conda run -n cf python scripts/benchmark_style.py --run --models mistral:7b,llama3.1:8b
# Use a custom corpus directory
conda run -n cf python scripts/benchmark_style.py --run --samples data/style_corpus/
# Print last results table
conda run -n cf python scripts/benchmark_style.py --show-last
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
import httpx
# Parent of the directory that contains this script.
_ROOT = Path(__file__).parent.parent
# Style corpus location under the repository root.
_CORPUS_DIR = _ROOT / "data" / "style_corpus"
# Benchmark results location under the repository root.
_RESULTS_DIR = _ROOT / "benchmark_results"
# Base URLs of the local Ollama server and the cf-orch coordinator.
# NOTE(review): app/style.py falls back to http://10.1.10.71:7700 for the
# coordinator, not localhost — confirm which default is intended.
_OLLAMA_URL = "http://localhost:11434"
_CFORCH_URL = "http://localhost:7700"
# Subdirectories under --scan-disk root that may contain GGUFs
_SCAN_SUBDIRS = ["textgen/models", "llama.cpp/models", "cf-text/models", "vllm/models"]
# ── Filler phrases that should be absent from good style-match output ──────────
FILLER_PHRASES: list[str] = [
"delve", "certainly", "absolutely", "i apologize", "i'd be happy to",
"of course", "great question", "i understand", "let me know if",
"feel free to", "it's important to note", "it's worth noting",
"in conclusion", "to summarize", "in summary",
]
# ── Test prompts: dicts with keys "tag", "thread_title", "thread_body" ───────
# These are representative threads that Magpie might reply to.
# Extend this list with real examples as the corpus grows.
TEST_PROMPTS: list[dict[str, str]] = [
{
"tag": "selfhosted_ai_fatigue",
"thread_title": "Anyone else getting tired of re-explaining their setup every time an AI model forgets?",
"thread_body": (
"Every session I start over. My whole hardware setup, what tools I use, "
"what I've already tried. It's exhausting. There has to be a better way."
),
},
{
"tag": "privacy_local_llm",
"thread_title": "What's the point of running local LLMs if the apps still phone home?",
"thread_body": (
"I went through all the trouble of setting up ollama and now I find out "
"the frontend I'm using is sending telemetry. Kind of defeats the purpose."
),
},
{
"tag": "solarpunk_tech",
"thread_title": "What does solarpunk computing actually look like in practice?",
"thread_body": (
"I keep seeing the aesthetic but not a lot of concrete examples of "
"people living it out with their tech choices. What does it mean day to day?"
),
},
{
"tag": "nd_tools",
"thread_title": "Tools that actually help with executive function vs ones that just add friction",
"thread_body": (
"I've tried a dozen productivity apps and most of them require more "
"executive function to maintain than they save. What actually sticks for you?"
),
},
{
"tag": "data_ownership",
"thread_title": "Who actually owns your data when you use a 'free' AI tool?",
"thread_body": (
"Read the ToS on three different AI assistants today. In all three cases "
"your inputs can be used for training, shared with partners, and retained "
"indefinitely. At what point does 'free' just mean you're the product?"
),
},
{
"tag": "digital_culture",
"thread_title": "The internet used to feel like it belonged to everyone. What happened?",
"thread_body": (
"I grew up on forums, IRC, personal homepages. Now everything is a platform "
"owned by someone trying to extract value from the community that built it. "
"Is the fediverse / self-hosting movement actually reversing this or just "
"a niche hobby?"
),
},
]
GENERATION_PARAMS: dict[str, Any] = {
"temperature": 0.7,
"top_p": 0.9,
"num_predict": 300,
}
SYSTEM_PROMPT = (
"You are a writing assistant. Your job is to write a Reddit reply that matches "
"the voice, tone, and style of the provided samples exactly.\n\n"
"Voice characteristics:\n"
"- Casual engineer tone. Short punchy sentences.\n"
"- No hype, no buzzwords, no em dashes, no semicolons.\n"
"- Community-first perspective. Solarpunk values.\n"
"- Direct and opinionated. No throat-clearing or filler.\n"
"- When relevant, mention personal experience with real tools.\n\n"
"Write ONLY the reply. No preamble, no 'Here is a reply:', no meta-commentary."
)
# ── Style signal extraction ───────────────────────────────────────────────────
@dataclass
class StyleSignals:
    """Quantitative style signals extracted from a text sample."""
    sentence_count: int = 0  # sentences, split on whitespace that follows . ! or ?
    word_count: int = 0  # whitespace-separated tokens
    avg_sentence_length: float = 0.0  # words per sentence
    em_dash_count: int = 0  # em dash characters plus "--" sequences
    semicolon_count: int = 0  # ';' characters
    filler_hits: list[str] = field(default_factory=list)  # FILLER_PHRASES entries found in the text
    question_ratio: float = 0.0 # fraction of sentences ending in '?'
    first_person_ratio: float = 0.0 # fraction of sentences starting with 'I'
    avg_word_length: float = 0.0  # mean word length after stripping surrounding punctuation
def extract_signals(text: str) -> StyleSignals:
    """Extract style signals from a text sample.

    Sentences are split after '.', '!' or '?' followed by whitespace; words are
    whitespace-separated tokens. Em dashes are counted as the Unicode em dash
    plus every ASCII "--" stand-in (spaced or not), each occurrence once.

    Args:
        text: Raw sample or model output.

    Returns:
        The measured signals. An all-default ``StyleSignals()`` is returned for
        empty text and for generation failures (strings starting with
        "[ERROR:"). Note that score_against_profile() does not treat that
        all-zero result as a failure, so callers that need failures to score
        badly have to check the raw output themselves.
    """
    text = text.strip()
    if text.startswith("[ERROR:"):
        # Failure marker produced by the generate helpers: nothing to measure.
        return StyleSignals()

    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    if not sentences:
        return StyleSignals()

    words = text.split()
    sentence_total = len(sentences)
    # Lower-case once instead of once per filler phrase.
    lowered = text.lower()

    avg_word_length = (
        sum(len(w.strip('.,!?;:"\'')) for w in words) / len(words) if words else 0.0
    )
    # A spaced " -- " already contains "--"; counting both patterns would
    # charge the same dash twice, so only the bare "--" is counted.
    em_dash_count = text.count('\u2014') + text.count('--')

    return StyleSignals(
        sentence_count=sentence_total,
        word_count=len(words),
        avg_sentence_length=len(words) / sentence_total,
        em_dash_count=em_dash_count,
        semicolon_count=text.count(';'),
        filler_hits=[p for p in FILLER_PHRASES if p.lower() in lowered],
        question_ratio=sum(1 for s in sentences if s.endswith('?')) / sentence_total,
        first_person_ratio=sum(1 for s in sentences if re.match(r"^I\b", s)) / sentence_total,
        avg_word_length=avg_word_length,
    )
def build_corpus_profile(corpus_dir: Path) -> StyleSignals | None:
    """Average the style signals of every ``*.txt`` sample in corpus_dir.

    Args:
        corpus_dir: Directory that holds the corpus samples (not searched
            recursively).

    Returns:
        A StyleSignals whose numeric fields are the per-sample means (integer
        fields truncated with ``int()``), or None when the directory contains
        no ``*.txt`` file. ``filler_hits`` is not aggregated and stays empty.
    """
    sample_paths = list(corpus_dir.glob("*.txt"))
    if not sample_paths:
        return None

    per_sample = [extract_signals(path.read_text(encoding="utf-8")) for path in sample_paths]
    sample_total = len(per_sample)

    def mean(attr: str) -> float:
        # Arithmetic mean of one signal across all samples.
        return sum(getattr(sig, attr) for sig in per_sample) / sample_total

    return StyleSignals(
        sentence_count=int(mean("sentence_count")),
        word_count=int(mean("word_count")),
        avg_sentence_length=mean("avg_sentence_length"),
        em_dash_count=int(mean("em_dash_count")),
        semicolon_count=int(mean("semicolon_count")),
        question_ratio=mean("question_ratio"),
        first_person_ratio=mean("first_person_ratio"),
        avg_word_length=mean("avg_word_length"),
    )
def score_against_profile(output_signals: StyleSignals, profile: StyleSignals | None) -> float:
    """Score a model output against the corpus profile. Returns 0-100.

    The score starts at 100 and loses:
    - 5 points per em dash,
    - 3 points per semicolon,
    - 8 points per filler phrase hit,
    - 2 points per word of distance between the output's average sentence
      length and the corpus average, capped at 20 (profile required),
    - 10 points per unit of distance between the output's question ratio and
      the corpus ratio, capped at 10 (profile required).

    Word length is measured by extract_signals() but is not part of the score.
    NOTE(review): earlier documentation gave semicolons the same 5-point
    penalty as em dashes; the weights above are what the code applies --
    confirm which one is intended.

    Args:
        output_signals: Signals measured on the model output.
        profile: Target profile built from the corpus, or None to score on
            the punctuation and filler penalties only.

    Returns:
        The score, floored at 0.0.
    """
    score = 100.0

    # Hard violations -- always penalised regardless of corpus
    score -= output_signals.em_dash_count * 5
    score -= output_signals.semicolon_count * 3
    score -= len(output_signals.filler_hits) * 8

    if profile is None:
        return max(0.0, score)

    # Distance from the corpus, each term capped so that a single signal
    # cannot wipe out the whole score.
    length_delta = abs(output_signals.avg_sentence_length - profile.avg_sentence_length)
    score -= min(length_delta * 2, 20)
    question_delta = abs(output_signals.question_ratio - profile.question_ratio)
    score -= min(question_delta * 10, 10)
    return max(0.0, score)
# ── Generation backends (cf-orch / ollama) ────────────────────────────────────
# Node whose catalog is requested when the caller does not name one.
_CFORCH_NODE_ID = "heimdall"


def cforch_list_catalog(
    cforch_url: str = _CFORCH_URL,
    node_id: str = _CFORCH_NODE_ID,
) -> dict[str, int]:
    """Fetch the cf-text model catalog from cf-orch as ``{model_id: vram_mb}``.

    The request carries ``?node_id=`` so the coordinator answers with that
    node's profile, which avoids cross-node catalog shadowing when several
    nodes define a catalog for the same service. An empty node_id sends no
    query parameter.

    Args:
        cforch_url: Base URL of the cf-orch coordinator.
        node_id: Node whose catalog should be returned.

    Returns:
        The catalog; entries that are not dicts or lack "vram_mb" map to 0.
        Any failure (network, HTTP status, unexpected payload) is reported on
        stderr and yields an empty dict.
    """
    query = {"node_id": node_id} if node_id else {}
    try:
        response = httpx.get(
            f"{cforch_url}/api/services/cf-text/catalog",
            params=query,
            timeout=10.0,
        )
        response.raise_for_status()
        catalog: dict[str, int] = {}
        for model_id, entry in response.json().items():
            catalog[model_id] = entry.get("vram_mb", 0) if isinstance(entry, dict) else 0
        return catalog
    except Exception as exc:
        print(f"[warn] Could not reach cf-orch catalog at {cforch_url}: {exc}", file=sys.stderr)
        return {}
def _probe_instance_state(
    service: str,
    cforch_url: str,
    node_id: str,
    gpu_id: int | None,
) -> str | None:
    """Return the coordinator-reported state of our instance, or None if it is not visible."""
    try:
        status = httpx.get(f"{cforch_url}/api/services/{service}/status", timeout=5.0)
        if not status.is_success:
            return None
        instances = status.json().get("instances", [])
        # Find our specific instance by node+gpu
        match = next(
            (i for i in instances
             if i.get("node_id") == node_id and i.get("gpu_id") == gpu_id),
            None,
        )
        if not match:
            return None
        state = match.get("state", "")
    except Exception:
        return None
    # A visible instance without a usable state counts as "still starting".
    return state if isinstance(state, str) else ""


def _service_is_healthy(service_url: str, health_path: str) -> bool:
    """Return True when the service itself answers its health endpoint with a 2xx."""
    try:
        return httpx.get(f"{service_url}{health_path}", timeout=3.0).is_success
    except Exception:
        return False


def _release_failed_allocation(service: str, allocation_id: str, cforch_url: str) -> None:
    """Best-effort release of an allocation whose service never became ready."""
    if not allocation_id:
        return
    try:
        # NOTE(review): mirrors the path cforch_release() uses for cf-text, with
        # the service name substituted -- confirm the coordinator exposes it
        # for every service.
        httpx.delete(
            f"{cforch_url}/api/services/{service}/allocations/{allocation_id}",
            timeout=10.0,
        )
    except Exception as exc:
        print(f"[warn] Could not release allocation {allocation_id!r} ({service}): {exc}", file=sys.stderr)


def _cforch_allocate_service(
    service: str,
    model_id: str,
    cforch_url: str,
    startup_timeout_s: float,
    health_path: str,
) -> tuple[str, str] | None:
    """Generic cf-orch allocate + state-signal wait.

    After allocating, a cold start (response has ``started`` true and ``warm``
    false) is awaited by polling the coordinator's service status every three
    seconds until our instance (matched by node and GPU) reports 'running'.
    A 'stopped' state means the load crashed, so the wait fails immediately
    instead of sitting out the full timeout. When six consecutive polls show
    no matching instance (older coordinator, status call failing, or service
    not yet registered in the probe loop) the service's own health endpoint is
    polled as a fallback.

    If the cold start fails or times out, the allocation is released again so
    it does not stay reserved on the coordinator.

    Args:
        service: cf-orch service name used in the URL paths.
        model_id: Model to load; sent as the only allocation candidate.
        cforch_url: Base URL of the cf-orch coordinator.
        startup_timeout_s: Maximum time to wait for a cold start.
        health_path: Path appended to the service URL for the health fallback.

    Returns:
        ``(service_url, allocation_id)`` once the service is usable, or None
        when the allocation request fails or the service never becomes ready.
    """
    try:
        resp = httpx.post(
            f"{cforch_url}/api/services/{service}/allocate",
            json={
                "model_candidates": [model_id],
                "caller": "avocet",
                "pipeline": "style_benchmark",
            },
            timeout=120.0,
        )
        resp.raise_for_status()
        data = resp.json()
        service_url: str = data["url"]
        allocation_id: str = data.get("allocation_id", "")
        node_id: str = data.get("node_id", "")
        gpu_id: int | None = data.get("gpu_id")
        cold_start = bool(data.get("started", False) and not data.get("warm", True))
    except Exception as exc:
        print(f"[warn] cf-orch allocation failed for {model_id!r} ({service}): {exc}", file=sys.stderr)
        return None

    if not cold_start:
        return service_url, allocation_id

    print(f" [cold start] waiting for {service} to load {model_id!r}...", end=" ", flush=True)
    t0 = time.monotonic()
    deadline = t0 + startup_timeout_s
    probe_misses = 0  # consecutive polls with no matching instance in status
    while time.monotonic() < deadline:
        state = _probe_instance_state(service, cforch_url, node_id, gpu_id)
        if state is None:
            probe_misses += 1
            # After a grace period with no instance visible, fall back to a
            # direct health-poll (coordinator may not have probed yet).
            if probe_misses >= 6 and _service_is_healthy(service_url, health_path):
                elapsed = time.monotonic() - t0
                print(f"ready via health ({elapsed:.0f}s)", flush=True)
                return service_url, allocation_id
        else:
            probe_misses = 0
            if state == "running":
                elapsed = time.monotonic() - t0
                print(f"ready ({elapsed:.0f}s)", flush=True)
                return service_url, allocation_id
            if state == "stopped":
                print("failed (service stopped — model load error)", flush=True)
                _release_failed_allocation(service, allocation_id, cforch_url)
                return None
            # any other state ("starting" or unknown) -> keep waiting
        time.sleep(3.0)

    elapsed = time.monotonic() - t0
    print(f"timed out after {elapsed:.0f}s", flush=True)
    _release_failed_allocation(service, allocation_id, cforch_url)
    return None
def cforch_allocate(
    model_id: str,
    cforch_url: str = _CFORCH_URL,
    startup_timeout_s: float = 180.0,
) -> tuple[str, str] | None:
    """Allocate a cf-text instance for model_id through cf-orch.

    Returns:
        ``(service_url, allocation_id)``, or None when the allocation fails or
        the service does not become ready within startup_timeout_s.
    """
    return _cforch_allocate_service(
        service="cf-text",
        model_id=model_id,
        cforch_url=cforch_url,
        startup_timeout_s=startup_timeout_s,
        health_path="/health",
    )
def cforch_allocate_vllm(
    model_id: str,
    cforch_url: str = _CFORCH_URL,
    startup_timeout_s: float = 300.0,
) -> tuple[str, str] | None:
    """Allocate a vllm instance for model_id through cf-orch.

    vllm exposes an OpenAI-compatible API, so generate_cftext() works
    unchanged against the returned service_url. The default startup timeout is
    longer (300s) because vllm loads large model weights from disk before
    becoming ready.

    Returns:
        ``(service_url, allocation_id)``, or None when the allocation fails or
        the service does not become ready within startup_timeout_s.
    """
    return _cforch_allocate_service(
        service="vllm",
        model_id=model_id,
        cforch_url=cforch_url,
        startup_timeout_s=startup_timeout_s,
        health_path="/health",
    )
def cforch_release(
    allocation_id: str,
    cforch_url: str = _CFORCH_URL,
    service: str = "cf-text",
) -> None:
    """Release a cf-orch allocation (best effort).

    Args:
        allocation_id: Allocation to release; an empty id is a no-op.
        cforch_url: Base URL of the cf-orch coordinator.
        service: Service the allocation belongs to. Defaults to "cf-text",
            the path that was previously hard-coded; pass "vllm" for
            allocations obtained from cforch_allocate_vllm().

    Failures never raise -- a release must not abort a benchmark run -- but
    they are reported on stderr so leaked allocations can be noticed.
    """
    if not allocation_id:
        return
    try:
        resp = httpx.delete(
            f"{cforch_url}/api/services/{service}/allocations/{allocation_id}",
            timeout=10.0,
        )
        # A non-2xx answer means the allocation may still be held.
        resp.raise_for_status()
    except Exception as exc:
        print(f"[warn] Could not release allocation {allocation_id!r} ({service}): {exc}", file=sys.stderr)
def generate_cftext(
    service_url: str,
    model_id: str,
    prompt: str,
    system: str = "",
) -> tuple[str, float]:
    """Generate a reply through an OpenAI-compatible /v1/chat/completions endpoint.

    Args:
        service_url: Base URL of the allocated service (a trailing slash is
            tolerated).
        model_id: Model name placed in the request.
        prompt: User message.
        system: Optional system message, sent first when non-empty.

    Returns:
        ``(text, elapsed_ms)``. On any failure the text is "[ERROR: <reason>]"
        and elapsed_ms is the time spent until the failure.
    """
    chat: list[dict[str, str]] = [{"role": "user", "content": prompt}]
    if system:
        chat.insert(0, {"role": "system", "content": system})

    # Translate the shared sampling settings into OpenAI-style field names.
    body: dict[str, Any] = {
        "model": model_id,
        "messages": chat,
        "max_tokens": GENERATION_PARAMS.get("num_predict", 300),
        "temperature": GENERATION_PARAMS.get("temperature", 0.7),
        "top_p": GENERATION_PARAMS.get("top_p", 0.9),
        "stream": False,
    }

    started = time.monotonic()
    try:
        endpoint = f"{service_url.rstrip('/')}/v1/chat/completions"
        response = httpx.post(endpoint, json=body, timeout=180.0)
        response.raise_for_status()
        took_ms = (time.monotonic() - started) * 1000
        reply = response.json()["choices"][0]["message"]["content"]
        return reply.strip(), took_ms
    except Exception as exc:
        took_ms = (time.monotonic() - started) * 1000
        return f"[ERROR: {exc}]", took_ms
def generate(model_id: str, prompt: str, system: str = "") -> tuple[str, float]:
    """Generate a reply through ollama's /api/generate endpoint.

    Args:
        model_id: ollama model name.
        prompt: Prompt text.
        system: Optional system prompt, included only when non-empty.

    Returns:
        ``(text, elapsed_ms)``. On any failure the text is "[ERROR: <reason>]"
        and elapsed_ms is the time spent until the failure.
    """
    request: dict[str, Any] = {
        "model": model_id,
        "prompt": prompt,
        "stream": False,
        "options": GENERATION_PARAMS,
    }
    if system:
        request["system"] = system

    started = time.monotonic()
    try:
        response = httpx.post(f"{_OLLAMA_URL}/api/generate", json=request, timeout=120.0)
        response.raise_for_status()
        took_ms = (time.monotonic() - started) * 1000
        text = response.json().get("response", "")
        return text.strip(), took_ms
    except Exception as exc:
        took_ms = (time.monotonic() - started) * 1000
        return f"[ERROR: {exc}]", took_ms
def find_disk_ggufs(llm_root: Path) -> list[Path]:
    """Collect the .gguf files below llm_root and its known model subdirectories.

    Each existing directory from _SCAN_SUBDIRS, and then llm_root itself, is
    searched recursively. The searches overlap, so every path is kept once.
    Vocab-only GGUFs (``ggml-vocab-*``) are dropped because they are not
    standalone models.

    Returns:
        The matching paths in sorted order.
    """
    roots = [llm_root / sub for sub in _SCAN_SUBDIRS]
    roots.append(llm_root)

    unique: set[Path] = set()
    for root in roots:
        if root.exists():
            unique.update(root.rglob("*.gguf"))

    return sorted(path for path in unique if not path.name.startswith("ggml-vocab-"))
def gguf_to_ollama_tag(gguf_path: Path) -> str:
    """Derive a stable ollama tag from a GGUF path.

    The tag is built from the lower-cased parent directory name plus the file
    stem to avoid collisions; when the stem (ignoring '-' and '_') is already
    contained in the directory name, the directory name alone is used. Every
    run of characters outside ``a-z0-9`` becomes a single '-', e.g.:

        claude-3.7-sonnet-reasoning-gemma3-12B/foo.Q8_0.gguf
        -> bench-claude-3-7-sonnet-reasoning-gemma3-12b-foo-q8-0:latest
    """
    def squash(name: str) -> str:
        # Ignore separator differences when comparing stem and directory.
        return name.replace("-", "").replace("_", "")

    folder = gguf_path.parent.name.lower()
    base = gguf_path.stem.lower()
    if squash(base) in squash(folder):
        raw = folder
    else:
        raw = f"{folder}-{base}"

    slug = re.sub(r"[^a-z0-9]+", "-", raw).strip("-")
    return f"bench-{slug}:latest"
def register_gguf(gguf_path: Path, tag: str) -> bool:
    """Register a GGUF file with ollama under the given tag.

    A one-line Modelfile (``FROM <absolute path>``) is written to a temporary
    file and handed to ``ollama create``; the temporary file is removed again
    whatever the outcome.

    Returns:
        True when ``ollama create`` exits with status 0. False when it exits
        non-zero or cannot be run at all (the latter is reported on stderr).
    """
    import subprocess
    import tempfile

    contents = f"FROM {gguf_path.resolve()}\n"
    with tempfile.NamedTemporaryFile(mode="w", suffix=".Modelfile", delete=False) as handle:
        handle.write(contents)
        modelfile_path = handle.name

    command = ["ollama", "create", tag, "-f", modelfile_path]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, timeout=60)
    except Exception as exc:
        print(f"[warn] Could not register {gguf_path.name}: {exc}", file=sys.stderr)
        return False
    else:
        return completed.returncode == 0
    finally:
        Path(modelfile_path).unlink(missing_ok=True)
def deregister_gguf(tag: str) -> None:
    """Remove an ollama model entry by running ``ollama rm <tag>``.

    This is best-effort cleanup: the command's output and exit status are
    ignored, and a failure to run it at all is swallowed.
    """
    import subprocess

    command = ["ollama", "rm", tag]
    try:
        subprocess.run(command, capture_output=True, timeout=30)
    except Exception:
        # Nothing useful can be done if the cleanup command cannot run.
        return
def backfill_disk_models(
    llm_root: Path,
    existing_tags: set[str],
    max_vram_mb: int = 0,
) -> list[str]:
    """Register on-disk GGUFs that ollama does not know yet.

    Args:
        llm_root: Directory to scan for .gguf files.
        existing_tags: Tags already present in ollama; files whose derived tag
            is in this set are skipped.
        max_vram_mb: Skip files larger than this many MB (0 = no limit). The
            GGUF file size serves as the VRAM estimate -- quantized weights
            load roughly 1:1.

    Returns:
        The tags that were newly registered, in scan order.
    """
    candidates = find_disk_ggufs(llm_root)
    if not candidates:
        print(f"No .gguf files found under {llm_root}", file=sys.stderr)
        return []

    bytes_per_mb = 1024 * 1024
    registered: list[str] = []
    too_large = 0

    for path in candidates:
        size_mb = path.stat().st_size // bytes_per_mb
        if max_vram_mb and size_mb > max_vram_mb:
            print(f" [skip-oom] {path.name} ({size_mb} MB > {max_vram_mb} MB limit)")
            too_large += 1
            continue

        tag = gguf_to_ollama_tag(path)
        if tag in existing_tags:
            print(f" [skip] {path.name} already registered as {tag}")
            continue

        # Progress line is completed with "ok"/"failed" once ollama answers.
        print(f" [register] {path.name} ({size_mb} MB) → {tag} ...", end=" ", flush=True)
        succeeded = register_gguf(path, tag)
        print("ok" if succeeded else "failed")
        if succeeded:
            registered.append(tag)

    if too_large:
        print(f" [info] {too_large} GGUF(s) skipped (exceed {max_vram_mb} MB VRAM limit)")
    return registered
def list_ollama_models() -> list[str]:
    """List the model names reported by ollama's /api/tags, minus embedding models.

    Returns:
        Model names in the order ollama reports them. Names containing a known
        embedding-model identifier are left out. When ollama cannot be
        queried, a warning goes to stderr and the list is empty.
    """
    # Embedding-only models cannot generate text.
    embedding_markers = {"mxbai-embed-large", "nomic-embed-text", "all-minilm"}
    try:
        response = httpx.get(f"{_OLLAMA_URL}/api/tags", timeout=10.0)
        response.raise_for_status()
        names: list[str] = []
        for model in response.json().get("models", []):
            lowered = model["name"].lower()
            if any(marker in lowered for marker in embedding_markers):
                continue
            names.append(model["name"])
        return names
    except Exception as exc:
        print(f"[warn] Could not reach ollama: {exc}", file=sys.stderr)
        return []
# ── Run benchmark ─────────────────────────────────────────────────────────────
@dataclass
class ModelResult:
    """Benchmark outcome for one model: per-prompt records plus aggregates."""

    model_id: str
    # One dict per prompt: tag, user_prompt, output, signals, score, latency_ms.
    prompt_results: list[dict[str, Any]] = field(default_factory=list)
    # Means over prompt_results.
    avg_score: float = 0.0
    avg_latency_ms: float = 0.0
    # Totals over prompt_results.
    total_filler_hits: int = 0
    total_em_dashes: int = 0
    total_semicolons: int = 0
def _bench_one_model(
    model_id: str,
    prompts: list[dict[str, str]],
    profile: Any,
    use_cforch: bool,
    cforch_url: str,
    use_vllm: bool = False,
) -> "ModelResult | None":
    """Run all prompts for a single model and aggregate the scores.

    Every progress line is prefixed with model_id so that output stays
    attributable when several models run in parallel threads.

    Dispatch priority:
        use_vllm=True   -> allocate vllm via cf-orch, then generate_cftext()
        use_cforch=True -> allocate cf-text via cf-orch, then generate_cftext()
        otherwise       -> direct ollama generate()

    Both vllm and cf-text expose /v1/chat/completions, so generate_cftext()
    serves both.

    A generation that fails ("[ERROR: ...]") or comes back empty scores 0.
    Its signals are all zero, which score_against_profile() would otherwise
    rate as a nearly perfect reply and push a broken model up the ranking.

    Args:
        model_id: Model to benchmark.
        prompts: Prompt definitions with "tag", "thread_title", "thread_body".
        profile: Corpus profile passed to score_against_profile(), or None.
        use_cforch: Route generation through a cf-orch cf-text allocation.
        cforch_url: Base URL of the cf-orch coordinator.
        use_vllm: Route generation through a cf-orch vllm allocation.

    Returns:
        The populated ModelResult, or None when the allocation failed or no
        prompt produced a record.
    """
    prefix = f"[{model_id}]"
    result = ModelResult(model_id=model_id)
    service_url: str | None = None
    allocation_id: str = ""

    if use_vllm:
        alloc = cforch_allocate_vllm(model_id, cforch_url)
        if alloc is None:
            print(f"{prefix} [skip] vllm allocation failed", flush=True)
            return None
        service_url, allocation_id = alloc
        print(f"{prefix} vllm allocated: {service_url}", flush=True)
    elif use_cforch:
        alloc = cforch_allocate(model_id, cforch_url)
        if alloc is None:
            print(f"{prefix} [skip] cf-orch allocation failed", flush=True)
            return None
        service_url, allocation_id = alloc
        print(f"{prefix} allocated: {service_url}", flush=True)

    try:
        for prompt_def in prompts:
            tag = prompt_def["tag"]
            user_prompt = (
                f"Thread: {prompt_def['thread_title']}\n\n"
                f"{prompt_def['thread_body']}\n\n"
                f"Write a reply:"
            )
            print(f"{prefix} [{tag}] generating...", flush=True)
            if (use_cforch or use_vllm) and service_url:
                # Both cf-text and vllm expose /v1/chat/completions — same call
                output, elapsed_ms = generate_cftext(service_url, model_id, user_prompt, system=SYSTEM_PROMPT)
            else:
                output, elapsed_ms = generate(model_id, user_prompt, system=SYSTEM_PROMPT)

            signals = extract_signals(output)
            # extract_signals() returns all-zero signals for failures, which
            # is not the same as a zero score -- force it here.
            stripped = output.strip()
            failed = not stripped or stripped.startswith("[ERROR:")
            score = 0.0 if failed else score_against_profile(signals, profile)

            print(f"{prefix} [{tag}] {score:.0f}/100 ({elapsed_ms:.0f}ms)", flush=True)
            if failed:
                print(f"{prefix} ⚠ generation failed or empty — scored 0", flush=True)
            if signals.filler_hits:
                print(f"{prefix} ⚠ filler: {signals.filler_hits}", flush=True)
            if signals.em_dash_count:
                print(f"{prefix} ⚠ em-dashes: {signals.em_dash_count}", flush=True)

            result.prompt_results.append({
                "tag": tag,
                "user_prompt": user_prompt,
                "output": output,
                "signals": {
                    "avg_sentence_length": signals.avg_sentence_length,
                    "em_dash_count": signals.em_dash_count,
                    "semicolon_count": signals.semicolon_count,
                    "filler_hits": signals.filler_hits,
                    "question_ratio": signals.question_ratio,
                    "word_count": signals.word_count,
                },
                "score": score,
                "latency_ms": elapsed_ms,
            })
    finally:
        # Always hand the allocation back, even when a prompt blows up.
        # NOTE(review): cforch_release() addresses the cf-text endpoint; confirm
        # that it also releases allocations obtained for vllm.
        if (use_cforch or use_vllm) and allocation_id:
            cforch_release(allocation_id, cforch_url)

    if not result.prompt_results:
        return None

    scores = [r["score"] for r in result.prompt_results]
    latencies = [r["latency_ms"] for r in result.prompt_results]
    result.avg_score = sum(scores) / len(scores)
    result.avg_latency_ms = sum(latencies) / len(latencies)
    result.total_filler_hits = sum(len(r["signals"]["filler_hits"]) for r in result.prompt_results)
    result.total_em_dashes = sum(r["signals"]["em_dash_count"] for r in result.prompt_results)
    result.total_semicolons = sum(r["signals"]["semicolon_count"] for r in result.prompt_results)
    print(f"{prefix} done — avg score {result.avg_score:.0f}/100", flush=True)
    return result
def run_benchmark(
    model_ids: list[str],
    corpus_dir: Path,
    prompts: list[dict[str, str]],
    use_cforch: bool = False,
    use_vllm: bool = False,
    cforch_url: str = _CFORCH_URL,
    workers: int = 1,
) -> list[ModelResult]:
    """Benchmark every model and return the results ranked best-first.

    Args:
        model_ids: Models to benchmark.
        corpus_dir: Directory with the style corpus used as the scoring target.
        prompts: Prompt definitions handed to each model.
        use_cforch: Generate through cf-text allocated via cf-orch.
        use_vllm: Generate through vllm allocated via cf-orch (takes priority).
        cforch_url: Base URL of the cf-orch coordinator.
        workers: Number of models to run at the same time; capped at the
            number of models. Values of 1 or less run sequentially.

    Returns:
        One ModelResult per model that produced results, sorted by avg_score
        in descending order.
    """
    profile = build_corpus_profile(corpus_dir)
    if profile:
        sample_total = len(list(corpus_dir.glob('*.txt')))
        print(f"Corpus profile loaded from {corpus_dir} ({sample_total} samples)")
        print(f" Target avg sentence length: {profile.avg_sentence_length:.1f} words")
    else:
        print(f"[warn] No corpus samples found in {corpus_dir} -- scoring on hard violations only")

    if use_vllm:
        backend = "vllm via cf-orch"
    elif use_cforch:
        backend = "cf-text via cf-orch"
    else:
        backend = "ollama"
    print(f" Backend: {backend}")

    effective_workers = min(workers, len(model_ids)) if model_ids else 1
    print(f" Workers: {effective_workers} (of {len(model_ids)} models)", flush=True)

    def bench(mid: str) -> "ModelResult | None":
        # Same arguments for every model; only the model id varies.
        return _bench_one_model(mid, prompts, profile, use_cforch, cforch_url, use_vllm)

    results: list[ModelResult] = []
    if effective_workers <= 1:
        # Sequential path — simpler output, easier to follow for single-model runs
        for model_id in model_ids:
            print(f"\n{'='*60}\nModel: {model_id}", flush=True)
            outcome = bench(model_id)
            if outcome:
                results.append(outcome)
    else:
        from concurrent.futures import ThreadPoolExecutor, as_completed

        print(f" Fanning out {len(model_ids)} models across {effective_workers} workers...", flush=True)
        with ThreadPoolExecutor(max_workers=effective_workers) as pool:
            pending = [pool.submit(bench, mid) for mid in model_ids]
            for finished in as_completed(pending):
                outcome = finished.result()
                if outcome:
                    results.append(outcome)

    results.sort(key=lambda r: r.avg_score, reverse=True)
    return results
# ── Markdown report ───────────────────────────────────────────────────────────
def render_report(results: list[ModelResult], corpus_dir: Path) -> str:
    """Render the benchmark results as a markdown report.

    The report has a header, a ranking table with one row per result (in the
    order given) and the full outputs of the first three results.

    Args:
        results: Results in ranking order.
        corpus_dir: Corpus directory, shown in the header.

    Returns:
        The report as one string with lines joined by newlines.
    """
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M")
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}

    report: list[str] = [
        "# Writing Style Benchmark Results",
        "",
        f"**Date:** {generated_at} ",
        f"**Corpus:** `{corpus_dir}` ",
        f"**Models tested:** {len(results)} ",
        f"**Prompts per model:** {len(TEST_PROMPTS)}",
        "",
        "## Rankings",
        "",
        "| Rank | Model | Score | Latency | Em-dashes | Fillers | Semicolons |",
        "|------|-------|-------|---------|-----------|---------|------------|",
    ]

    for rank, entry in enumerate(results, 1):
        cells = [
            medals.get(rank, f"#{rank}"),
            f"`{entry.model_id}`",
            f"{entry.avg_score:.0f}/100",
            f"{entry.avg_latency_ms:.0f}ms",
            f"{entry.total_em_dashes}",
            f"{entry.total_filler_hits}",
            f"{entry.total_semicolons}",
        ]
        report.append("| " + " | ".join(cells) + " |")

    report.extend(["", "## Sample Outputs", ""])
    for entry in results[:3]:  # top 3 only to keep report readable
        report.append(f"### `{entry.model_id}` (avg score: {entry.avg_score:.0f})")
        report.append("")
        for sample in entry.prompt_results:
            report.extend([
                f"**Prompt:** {sample['tag']} ",
                f"**Score:** {sample['score']:.0f}/100 ",
                "",
                "```",
                sample["output"],
                "```",
                "",
            ])

    return "\n".join(report)
def save_report(results: list[ModelResult], corpus_dir: Path) -> Path:
    """Write the markdown report and a raw JSON dump to the results directory.

    Both files are named ``style_<YYYY-MM-DD_HHMM>`` with ``.md`` and ``.json``
    extensions; the directory is created when missing.

    Returns:
        Path of the markdown report.
    """
    _RESULTS_DIR.mkdir(exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d_%H%M")

    markdown_path = _RESULTS_DIR / f"style_{stamp}.md"
    markdown_path.write_text(render_report(results, corpus_dir), encoding="utf-8")

    # Also save raw JSON for programmatic use
    records = []
    for entry in results:
        records.append({
            "model_id": entry.model_id,
            "avg_score": entry.avg_score,
            "avg_latency_ms": entry.avg_latency_ms,
            "total_filler_hits": entry.total_filler_hits,
            "total_em_dashes": entry.total_em_dashes,
            "total_semicolons": entry.total_semicolons,
            "prompt_results": entry.prompt_results,
        })
    raw_path = _RESULTS_DIR / f"style_{stamp}.json"
    raw_path.write_text(json.dumps(records, indent=2), encoding="utf-8")

    return markdown_path
# ── CLI commands ──────────────────────────────────────────────────────────────
def cmd_list_models(_args: argparse.Namespace) -> None:
    """CLI command: print the text-generation models ollama reports."""
    available = list_ollama_models()
    if available:
        print(f"{len(available)} models available:\n")
        for name in available:
            print(f" {name}")
    else:
        print("No models found (is ollama running?)")
def cmd_run(args: argparse.Namespace) -> None:
    """CLI command: pick the models, run the benchmark and save the report.

    Model selection:
    - ``--models a,b``: exactly those models.
    - ``--cforch`` without explicit models: every cf-orch catalog entry whose
      VRAM figure is 0 or within ``--max-vram``; if the catalog is empty or
      unreachable, fall back to the size-filtered ollama list.
    - otherwise: the ollama list, optionally extended with GGUFs registered
      from ``--scan-disk``, then size-filtered by name.

    GGUFs registered for this run are removed from ollama again afterwards,
    also when the benchmark fails. Exits with status 1 when the corpus
    directory is missing or no model can be found.
    """
    corpus_dir = Path(args.samples)
    if not corpus_dir.exists():
        print(f"[error] Corpus directory not found: {corpus_dir}", file=sys.stderr)
        sys.exit(1)

    max_vram_mb: int = getattr(args, "max_vram", 7200)
    use_cforch: bool = getattr(args, "cforch", False)
    use_vllm: bool = getattr(args, "vllm", False)
    cforch_url: str = getattr(args, "cforch_url", _CFORCH_URL)
    registered_tags: list[str] = []

    def _filter_ollama_by_size(ids: list[str], include_large: bool) -> list[str]:
        """Apply name-pattern size filter to ollama model list."""
        if include_large:
            return ids
        skip_patterns = ["270b", "70b", "32b", "30b", "21b", "20b", "deepseek-r1"]
        kept = []
        for name in ids:
            lowered = name.lower()
            if not any(pattern in lowered for pattern in skip_patterns):
                kept.append(name)
        dropped = len(ids) - len(kept)
        if dropped:
            print(f"[info] Skipped {dropped} large model(s) by name pattern. "
                  "Pass --include-large to include them.")
        return kept

    def _exit_if_no_models(ids: list[str]) -> None:
        """Abort the command when there is nothing to benchmark."""
        if not ids:
            print("[error] No models found. Pass --models explicitly or check ollama.", file=sys.stderr)
            sys.exit(1)

    if args.models and args.models != "all":
        model_ids = [name.strip() for name in args.models.split(",") if name.strip()]
    elif use_cforch:
        # cf-orch path: pull model list from catalog, filter by vram_mb
        catalog = cforch_list_catalog(cforch_url)
        if catalog:
            entries = list(catalog.items())
            allowed = {mid: mb for mid, mb in entries if mb == 0 or mb <= max_vram_mb}
            skipped_oom = {mid: mb for mid, mb in entries if mid not in allowed}
            model_ids = list(allowed)
            print(f"[info] cf-orch catalog: {len(entries)} model(s), "
                  f"{len(allowed)} within {max_vram_mb} MB VRAM limit")
            if skipped_oom:
                print(f"[info] Skipped (OOM risk): "
                      + ", ".join(f"{mid} ({mb} MB)" for mid, mb in sorted(skipped_oom.items())))
        else:
            print("[warn] cf-orch catalog empty or unreachable -- falling back to ollama models")
            use_cforch = False
            model_ids = _filter_ollama_by_size(list_ollama_models(), args.include_large)
            _exit_if_no_models(model_ids)
    else:
        # Ollama path
        model_ids = list_ollama_models()
        _exit_if_no_models(model_ids)
        # Backfill GGUFs from disk before filtering -- skips files that exceed VRAM limit
        if getattr(args, "scan_disk", None):
            llm_root = Path(args.scan_disk)
            print(f"\nScanning {llm_root} for unregistered GGUFs (limit: {max_vram_mb} MB)...")
            registered_tags = backfill_disk_models(llm_root, set(model_ids), max_vram_mb=max_vram_mb)
            model_ids = list_ollama_models()  # re-fetch with new registrations
        model_ids = _filter_ollama_by_size(model_ids, args.include_large)

    print(f"\nRunning writing style benchmark on {len(model_ids)} model(s)...")
    try:
        results = run_benchmark(
            model_ids,
            corpus_dir,
            TEST_PROMPTS,
            use_cforch=use_cforch,
            use_vllm=use_vllm,
            cforch_url=cforch_url,
            workers=args.workers,
        )
        report_path = save_report(results, corpus_dir)
        print(f"\n{'='*60}")
        print(f"Results saved to: {report_path}")
        print(f"\n{render_report(results, corpus_dir)}")
    finally:
        if registered_tags:
            print(f"\nCleaning up {len(registered_tags)} temporary ollama registrations...")
            for tag in registered_tags:
                deregister_gguf(tag)
def cmd_show_last(_args: argparse.Namespace) -> None:
    """CLI command: print the most recent markdown report.

    "Most recent" is the ``style_*.md`` file whose path sorts last; the file
    names embed a sortable timestamp.
    """
    latest = max(_RESULTS_DIR.glob("style_*.md"), default=None)
    if latest is None:
        print("No benchmark results found. Run --run first.")
        return
    print(latest.read_text(encoding="utf-8"))
# ── Entry point ───────────────────────────────────────────────────────────────
def main() -> None:
    """Parse the command line and dispatch to the matching command.

    Commands can be given as subcommands (``list-models``, ``run``,
    ``show-last``) or through the legacy ``--list-models`` / ``--run`` /
    ``--show-last`` flags. Without either, the help text is printed.
    """
    parser = argparse.ArgumentParser(
        description="Writing style benchmark harness for local text-gen models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = parser.add_subparsers(dest="cmd")

    sub.add_parser("list-models", help="List available ollama models")

    run_p = sub.add_parser("run", help="Run the benchmark")
    run_p.add_argument("--models", default="all", help="Comma-separated model IDs, or 'all'")
    run_p.add_argument("--samples", default=str(_CORPUS_DIR), help="Path to style corpus directory")
    run_p.add_argument("--include-large", action="store_true", help="Include models >20B params")
    run_p.add_argument("--scan-disk", metavar="LLM_ROOT", help="Scan directory for GGUFs not yet in ollama (e.g. /Library/Assets/LLM)")
    run_p.add_argument("--cforch", action="store_true", help="Route generation through cf-orch/cf-text instead of direct ollama")
    run_p.add_argument("--vllm", action="store_true", help="Route generation through cf-orch/vllm (OpenAI-compatible) instead of ollama")
    run_p.add_argument("--cforch-url", default=_CFORCH_URL, help=f"cf-orch coordinator URL (default: {_CFORCH_URL})")
    run_p.add_argument("--max-vram", type=int, default=7200, metavar="MB",
                       help="Skip models whose VRAM footprint exceeds this limit in MB (default: 7200)")
    run_p.add_argument("--workers", type=int, default=1, metavar="N",
                       help="Parallel workers — run N models simultaneously (default: 1; use 4+ with cf-orch)")

    sub.add_parser("show-last", help="Print the most recent benchmark report")

    # Also support legacy --list-models / --run / --show-last flags for manage.sh compat
    parser.add_argument("--list-models", action="store_true")
    parser.add_argument("--run", action="store_true")
    parser.add_argument("--show-last", action="store_true")
    parser.add_argument("--models", default="all")
    parser.add_argument("--samples", default=str(_CORPUS_DIR))
    parser.add_argument("--include-large", action="store_true")
    parser.add_argument("--scan-disk", metavar="LLM_ROOT")
    parser.add_argument("--cforch", action="store_true")
    parser.add_argument("--vllm", action="store_true")
    parser.add_argument("--cforch-url", default=_CFORCH_URL)
    parser.add_argument("--max-vram", type=int, default=7200, metavar="MB")
    parser.add_argument("--workers", type=int, default=1, metavar="N")

    args = parser.parse_args()

    # (subcommand name, legacy flag value, handler), checked in this order.
    commands = (
        ("list-models", args.list_models, cmd_list_models),
        ("run", args.run, cmd_run),
        ("show-last", args.show_last, cmd_show_last),
    )
    for name, legacy_flag, handler in commands:
        if args.cmd == name or legacy_flag:
            handler(args)
            return
    parser.print_help()


if __name__ == "__main__":
    main()

909
scripts/benchmark_voice.py Normal file
View file

@ -0,0 +1,909 @@
#!/usr/bin/env python
"""
Voice benchmark harness -- score local text-gen models for writing style match.
Runs each model against a set of test prompts, extracts style signals from the
outputs, compares them to a voice corpus, and produces a ranked markdown table.
Usage:
# List available ollama models
conda run -n cf python scripts/benchmark_voice.py --list-models
# Run against all models with default test prompts
conda run -n cf python scripts/benchmark_voice.py --run
# Run specific models only
conda run -n cf python scripts/benchmark_voice.py --run --models mistral:7b,llama3.1:8b
# Use a custom corpus directory
conda run -n cf python scripts/benchmark_voice.py --run --samples data/voice_corpus/
# Print last results table
conda run -n cf python scripts/benchmark_voice.py --show-last
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import time
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any
import httpx
# Repository root: two levels up from this file (the script is run as
# scripts/benchmark_voice.py).
_ROOT = Path(__file__).parent.parent
# Default voice corpus directory; --samples selects a different one.
_CORPUS_DIR = _ROOT / "data" / "voice_corpus"
# NOTE(review): presumably where reports are written and read back -- the code
# using it is outside this excerpt.
_RESULTS_DIR = _ROOT / "benchmark_results"
# Base URLs of the local services (presumably ollama and the cf-orch
# coordinator, going by the names -- confirm against the callers).
_OLLAMA_URL = "http://localhost:11434"
_CFORCH_URL = "http://localhost:7700"
# Subdirectories under --scan-disk root that may contain GGUFs
_SCAN_SUBDIRS = ["textgen/models", "llama.cpp/models", "cf-text/models", "vllm/models"]
# ── Filler phrases that should be absent from good voice-match output ─────────
# Matched case-insensitively as substrings of the model output; every phrase
# found counts as one filler hit.
FILLER_PHRASES: list[str] = [
    "delve",
    "certainly",
    "absolutely",
    "i apologize",
    "i'd be happy to",
    "of course",
    "great question",
    "i understand",
    "let me know if",
    "feel free to",
    "it's important to note",
    "it's worth noting",
    "in conclusion",
    "to summarize",
    "in summary",
]
# ── Test prompts: dicts with "tag", "thread_title", "thread_body" ────────────
# These are representative threads that Magpie might reply to.
# Extend this list with real examples as the corpus grows.
# Each entry describes one thread: "tag" identifies the prompt, "thread_title"
# and "thread_body" hold the thread text.
TEST_PROMPTS: list[dict[str, str]] = [
    {
        "tag": "selfhosted_ai_fatigue",
        "thread_title": "Anyone else getting tired of re-explaining their setup every time an AI model forgets?",
        "thread_body": (
            "Every session I start over. My whole hardware setup, what tools I use, "
            "what I've already tried. It's exhausting. There has to be a better way."
        ),
    },
    {
        "tag": "privacy_local_llm",
        "thread_title": "What's the point of running local LLMs if the apps still phone home?",
        "thread_body": (
            "I went through all the trouble of setting up ollama and now I find out "
            "the frontend I'm using is sending telemetry. Kind of defeats the purpose."
        ),
    },
    {
        "tag": "solarpunk_tech",
        "thread_title": "What does solarpunk computing actually look like in practice?",
        "thread_body": (
            "I keep seeing the aesthetic but not a lot of concrete examples of "
            "people living it out with their tech choices. What does it mean day to day?"
        ),
    },
    {
        "tag": "nd_tools",
        "thread_title": "Tools that actually help with executive function vs ones that just add friction",
        "thread_body": (
            "I've tried a dozen productivity apps and most of them require more "
            "executive function to maintain than they save. What actually sticks for you?"
        ),
    },
    {
        "tag": "data_ownership",
        "thread_title": "Who actually owns your data when you use a 'free' AI tool?",
        "thread_body": (
            "Read the ToS on three different AI assistants today. In all three cases "
            "your inputs can be used for training, shared with partners, and retained "
            "indefinitely. At what point does 'free' just mean you're the product?"
        ),
    },
    {
        "tag": "digital_culture",
        "thread_title": "The internet used to feel like it belonged to everyone. What happened?",
        "thread_body": (
            "I grew up on forums, IRC, personal homepages. Now everything is a platform "
            "owned by someone trying to extract value from the community that built it. "
            "Is the fediverse / self-hosting movement actually reversing this or just "
            "a niche hobby?"
        ),
    },
]
# Sampling settings shared by every generation request.
GENERATION_PARAMS: dict[str, Any] = {
    "temperature": 0.7,
    "top_p": 0.9,
    "num_predict": 300,
}

# System prompt that describes the target voice to the model in words.
SYSTEM_PROMPT = (
    "You are a writing assistant. Your job is to write a Reddit reply that matches "
    "the voice, tone, and style of the provided samples exactly.\n\n"
    "Voice characteristics:\n"
    "- Casual engineer tone. Short punchy sentences.\n"
    "- No hype, no buzzwords, no em dashes, no semicolons.\n"
    "- Community-first perspective. Solarpunk values.\n"
    "- Direct and opinionated. No throat-clearing or filler.\n"
    "- When relevant, mention personal experience with real tools.\n\n"
    "Write ONLY the reply. No preamble, no 'Here is a reply:', no meta-commentary."
)
# ── Style signal extraction ───────────────────────────────────────────────────
@dataclass
class StyleSignals:
    """Measured style features of one text sample.

    Every field defaults to zero / empty, so a bare ``StyleSignals()`` stands
    for "nothing was measured".
    """

    # Size of the sample.
    sentence_count: int = 0
    word_count: int = 0
    avg_sentence_length: float = 0.0  # words per sentence
    # Punctuation the target voice is supposed to avoid.
    em_dash_count: int = 0
    semicolon_count: int = 0
    # Entries of FILLER_PHRASES that were found in the sample.
    filler_hits: list[str] = field(default_factory=list)
    question_ratio: float = 0.0  # fraction of sentences ending in '?'
    first_person_ratio: float = 0.0  # fraction of sentences starting with 'I'
    avg_word_length: float = 0.0  # characters per word
def extract_signals(text: str) -> StyleSignals:
    """Extract quantitative style signals from a text sample.

    Args:
        text: Model output or corpus sample. Leading/trailing whitespace is
            ignored.

    Returns:
        A populated StyleSignals. An all-zero StyleSignals is returned when
        the text is a generation error sentinel (starts with "[ERROR:") or
        contains no sentences at all.
    """
    text = text.strip()
    if text.startswith("[ERROR:"):
        return StyleSignals()  # zero-score sentinel — caller checks for empty output

    # A sentence ends at '.', '!' or '?' followed by whitespace.
    sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
    if not sentences:
        return StyleSignals()

    words = text.split()
    sentence_total = len(sentences)
    lowered = text.lower()  # lower-case once, not once per filler phrase

    avg_sentence_length = len(words) / sentence_total
    avg_word_length = (sum(len(w.strip('.,!?;:"\'')) for w in words) / len(words)) if words else 0.0

    # Real em dashes plus the ASCII "--" stand-in. A spaced " -- " already
    # contains "--", so counting it separately would score it twice.
    em_dash_count = text.count('\u2014') + text.count('--')
    semicolon_count = text.count(';')
    filler_hits = [p for p in FILLER_PHRASES if p.lower() in lowered]
    question_ratio = sum(1 for s in sentences if s.endswith('?')) / sentence_total
    first_person_ratio = sum(1 for s in sentences if re.match(r"^I\b", s)) / sentence_total

    return StyleSignals(
        sentence_count=sentence_total,
        word_count=len(words),
        avg_sentence_length=avg_sentence_length,
        em_dash_count=em_dash_count,
        semicolon_count=semicolon_count,
        filler_hits=filler_hits,
        question_ratio=question_ratio,
        first_person_ratio=first_person_ratio,
        avg_word_length=avg_word_length,
    )
def build_corpus_profile(corpus_dir: Path) -> StyleSignals | None:
    """Average the style signals of every ``*.txt`` sample in *corpus_dir*.

    Returns None when the directory holds no ``*.txt`` files. Integer fields
    of the result are the truncated means; ``filler_hits`` is left empty.
    """
    sample_paths = list(corpus_dir.glob("*.txt"))
    if not sample_paths:
        return None

    per_sample = [extract_signals(path.read_text(encoding="utf-8")) for path in sample_paths]
    total = len(per_sample)

    def mean(attr: str) -> float:
        """Arithmetic mean of one signal across all samples."""
        return sum(getattr(sig, attr) for sig in per_sample) / total

    return StyleSignals(
        sentence_count=int(mean("sentence_count")),
        word_count=int(mean("word_count")),
        avg_sentence_length=mean("avg_sentence_length"),
        em_dash_count=int(mean("em_dash_count")),
        semicolon_count=int(mean("semicolon_count")),
        question_ratio=mean("question_ratio"),
        first_person_ratio=mean("first_person_ratio"),
        avg_word_length=mean("avg_word_length"),
    )
def score_against_profile(output_signals: StyleSignals, profile: StyleSignals | None) -> float:
    """Score one model output against the corpus profile, from 0 to 100.

    Starts at 100 and subtracts:
      - 5 per em dash, 3 per semicolon, 8 per filler phrase (always applied);
      - 2 per word of difference in average sentence length, capped at 20
        (only when a profile is given);
      - 10 times the difference in question ratio, capped at 10
        (only when a profile is given).

    The result is clamped so it never drops below 0.
    """
    hard_penalties = (
        output_signals.em_dash_count * 5,
        output_signals.semicolon_count * 3,
        len(output_signals.filler_hits) * 8,
    )
    remaining = 100.0
    for penalty in hard_penalties:
        remaining -= penalty

    if profile is None:
        # No corpus to compare with: hard violations are the whole score.
        return max(0.0, remaining)

    sentence_gap = abs(output_signals.avg_sentence_length - profile.avg_sentence_length)
    remaining -= min(sentence_gap * 2, 20)

    question_gap = abs(output_signals.question_ratio - profile.question_ratio)
    remaining -= min(question_gap * 10, 10)

    return max(0.0, remaining)
# ── cf-orch allocation + Ollama generation ─────────────────────────────────────
# Default cf-orch node whose catalog cforch_list_catalog() requests via ?node_id=.
_CFORCH_NODE_ID = "heimdall"
def cforch_list_catalog(
    cforch_url: str = _CFORCH_URL,
    node_id: str = _CFORCH_NODE_ID,
) -> dict[str, int]:
    """Fetch the cf-text catalog from cf-orch as ``{model_id: vram_mb}``.

    When *node_id* is non-empty it is sent as ``?node_id=`` so the catalog of
    that specific node is returned rather than one shadowed by another node
    defining a catalog for the same service.

    Entries that are not dicts, or that lack ``vram_mb``, map to 0. Any
    failure (network, HTTP status, unexpected payload) is reported on stderr
    and yields an empty dict.
    """
    query = {"node_id": node_id} if node_id else {}
    try:
        response = httpx.get(
            f"{cforch_url}/api/services/cf-text/catalog",
            params=query,
            timeout=10.0,
        )
        response.raise_for_status()
        catalog: dict[str, int] = {}
        for model_id, entry in response.json().items():
            catalog[model_id] = entry.get("vram_mb", 0) if isinstance(entry, dict) else 0
        return catalog
    except Exception as exc:
        print(f"[warn] Could not reach cf-orch catalog at {cforch_url}: {exc}", file=sys.stderr)
        return {}
def _cforch_allocate_service(
    service: str,
    model_id: str,
    cforch_url: str,
    startup_timeout_s: float,
    health_path: str,
) -> tuple[str, str] | None:
    """Allocate *service* for *model_id* through cf-orch and wait until it is healthy.

    Args:
        service: cf-orch service name used in the allocate URL.
        model_id: Sole entry of ``model_candidates`` in the request.
        cforch_url: Base URL of the cf-orch coordinator.
        startup_timeout_s: How long to poll the health endpoint after a cold start.
        health_path: Path appended to the returned service URL for health polling.

    Returns:
        ``(service_url, allocation_id)`` on success, or None when the
        allocation request fails or a cold-started service does not become
        healthy in time. On such a timeout the allocation is released before
        returning, because the caller never learns its id and could not
        release it itself.
    """
    try:
        resp = httpx.post(
            f"{cforch_url}/api/services/{service}/allocate",
            json={
                "model_candidates": [model_id],
                "caller": "avocet",
                "pipeline": "voice_benchmark",
            },
            timeout=120.0,
        )
        resp.raise_for_status()
        data = resp.json()
        service_url: str = data["url"]
        allocation_id: str = data.get("allocation_id", "")

        # Only a freshly started, not-yet-warm instance needs the health poll.
        if data.get("started", False) and not data.get("warm", True):
            print(f" [cold start] waiting for {service} to load {model_id!r}...", end=" ", flush=True)
            started_at = time.monotonic()
            deadline = started_at + startup_timeout_s
            while time.monotonic() < deadline:
                try:
                    health = httpx.get(f"{service_url}{health_path}", timeout=3.0)
                    if health.is_success:
                        print(f"ready ({time.monotonic() - started_at:.0f}s)", flush=True)
                        break
                except Exception:
                    # Any probe failure is treated as "not ready yet"; keep polling.
                    pass
                time.sleep(2.0)
            else:
                # Loop ran out without a successful health check.
                print(f"timed out after {startup_timeout_s:.0f}s", flush=True)
                cforch_release(allocation_id, cforch_url)
                return None
        return service_url, allocation_id
    except Exception as exc:
        print(f"[warn] cf-orch allocation failed for {model_id!r} ({service}): {exc}", file=sys.stderr)
        return None
def cforch_allocate(
    model_id: str,
    cforch_url: str = _CFORCH_URL,
    startup_timeout_s: float = 180.0,
) -> tuple[str, str] | None:
    """Request a cf-text instance serving *model_id*.

    Returns ``(service_url, allocation_id)``, or None if allocation failed.
    """
    return _cforch_allocate_service(
        service="cf-text",
        model_id=model_id,
        cforch_url=cforch_url,
        startup_timeout_s=startup_timeout_s,
        health_path="/health",
    )
def cforch_allocate_vllm(
    model_id: str,
    cforch_url: str = _CFORCH_URL,
    startup_timeout_s: float = 300.0,
) -> tuple[str, str] | None:
    """Request a vllm instance serving *model_id*.

    Returns ``(service_url, allocation_id)``, or None if allocation failed.

    vllm exposes an OpenAI-compatible API, so generate_cftext() works
    unchanged against the returned service_url. The default startup timeout
    is longer than for cf-text (300s) because vllm loads large model weights
    from disk before becoming ready.
    """
    return _cforch_allocate_service(
        service="vllm",
        model_id=model_id,
        cforch_url=cforch_url,
        startup_timeout_s=startup_timeout_s,
        health_path="/health",
    )
def cforch_release(allocation_id: str, cforch_url: str = _CFORCH_URL) -> None:
    """Release a cf-orch allocation (best-effort; never raises).

    Does nothing for an empty *allocation_id*. A failed release, including a
    non-2xx response, is reported on stderr so a stuck allocation is visible
    instead of disappearing silently.
    """
    if not allocation_id:
        return
    try:
        resp = httpx.post(f"{cforch_url}/api/leases/{allocation_id}/release", timeout=10.0)
        resp.raise_for_status()
    except Exception as exc:
        # Still best-effort: a failed release must not abort the benchmark.
        print(f"[warn] Could not release cf-orch allocation {allocation_id!r}: {exc}", file=sys.stderr)
def generate_cftext(
    service_url: str,
    model_id: str,
    prompt: str,
    system: str = "",
) -> tuple[str, float]:
    """Generate a reply through an OpenAI-compatible ``/v1/chat/completions`` endpoint.

    Returns ``(text, elapsed_ms)``. On any failure the text is an
    ``[ERROR: ...]`` sentinel instead of an exception being raised.
    """
    chat: list[dict[str, str]] = [{"role": "user", "content": prompt}]
    if system:
        # The system message, when present, goes ahead of the user turn.
        chat.insert(0, {"role": "system", "content": system})

    body: dict[str, Any] = {
        "model": model_id,
        "messages": chat,
        "max_tokens": GENERATION_PARAMS.get("num_predict", 300),
        "temperature": GENERATION_PARAMS.get("temperature", 0.7),
        "top_p": GENERATION_PARAMS.get("top_p", 0.9),
        "stream": False,
    }

    started = time.monotonic()
    try:
        reply = httpx.post(
            f"{service_url.rstrip('/')}/v1/chat/completions",
            json=body,
            timeout=180.0,
        )
        reply.raise_for_status()
        elapsed_ms = (time.monotonic() - started) * 1000
        first_choice = reply.json()["choices"][0]
        return first_choice["message"]["content"].strip(), elapsed_ms
    except Exception as exc:
        elapsed_ms = (time.monotonic() - started) * 1000
        return f"[ERROR: {exc}]", elapsed_ms
def generate(model_id: str, prompt: str, system: str = "") -> tuple[str, float]:
    """Generate a reply through ollama's ``/api/generate``.

    Returns ``(text, elapsed_ms)``. On any failure the text is an
    ``[ERROR: ...]`` sentinel instead of an exception being raised.
    """
    request_body: dict[str, Any] = {
        "model": model_id,
        "prompt": prompt,
        "stream": False,
        "options": GENERATION_PARAMS,
        # "system" is only sent when a system prompt was supplied.
        **({"system": system} if system else {}),
    }

    started = time.monotonic()
    try:
        reply = httpx.post(
            f"{_OLLAMA_URL}/api/generate",
            json=request_body,
            timeout=120.0,
        )
        reply.raise_for_status()
        elapsed_ms = (time.monotonic() - started) * 1000
        generated = reply.json().get("response", "")
        return generated.strip(), elapsed_ms
    except Exception as exc:
        elapsed_ms = (time.monotonic() - started) * 1000
        return f"[ERROR: {exc}]", elapsed_ms
def find_disk_ggufs(llm_root: Path) -> list[Path]:
    """Collect every ``.gguf`` file under *llm_root* and its known subdirectories.

    Each file is reported once even when it is reachable from several search
    roots. Vocab-only GGUFs (``ggml-vocab-*``) are left out because they are
    not standalone models. The result is sorted.
    """
    roots = [llm_root / sub for sub in _SCAN_SUBDIRS]
    roots.append(llm_root)

    # A set collapses files found through more than one root.
    unique = {
        gguf
        for root in roots
        if root.exists()
        for gguf in root.rglob("*.gguf")
    }
    return sorted(path for path in unique if not path.name.startswith("ggml-vocab-"))
def gguf_to_ollama_tag(gguf_path: Path) -> str:
    """Derive a stable ollama tag from a GGUF path.

    The tag is built from the lower-cased parent directory name, plus the
    file stem unless the stem (ignoring "-" and "_") is already contained in
    the directory name. Every run of characters outside ``[a-z0-9]`` becomes
    a single "-", e.g.:

        claude-3.7-sonnet-reasoning-gemma3-12B/foo.Q8_0.gguf
        -> bench-claude-3-7-sonnet-reasoning-gemma3-12b-foo-q8-0:latest
    """
    def squash(name: str) -> str:
        """Drop the separators that commonly differ between dir and file names."""
        return name.replace("-", "").replace("_", "")

    folder = gguf_path.parent.name.lower()
    base = gguf_path.stem.lower()

    if squash(base) in squash(folder):
        raw = folder  # the directory name already says everything the stem does
    else:
        raw = f"{folder}-{base}"

    cleaned = re.sub(r"[^a-z0-9]+", "-", raw).strip("-")
    return f"bench-{cleaned}:latest"
def register_gguf(gguf_path: Path, tag: str) -> bool:
    """Register *gguf_path* with ollama under *tag* via a throwaway Modelfile.

    Returns True when ``ollama create`` exits with status 0. The temporary
    Modelfile is removed afterwards whatever the outcome.
    """
    import subprocess
    import tempfile

    with tempfile.NamedTemporaryFile(mode="w", suffix=".Modelfile", delete=False) as handle:
        handle.write(f"FROM {gguf_path.resolve()}\n")
        modelfile_path = handle.name

    command = ["ollama", "create", tag, "-f", modelfile_path]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, timeout=60)
    except Exception as exc:
        print(f"[warn] Could not register {gguf_path.name}: {exc}", file=sys.stderr)
        return False
    else:
        return completed.returncode == 0
    finally:
        Path(modelfile_path).unlink(missing_ok=True)
def deregister_gguf(tag: str) -> None:
    """Delete the ollama model entry *tag*; any failure is ignored."""
    import subprocess

    command = ["ollama", "rm", tag]
    try:
        subprocess.run(command, capture_output=True, timeout=30)
    except Exception:
        # Cleanup is best-effort: a leftover tag is not worth failing the run.
        pass
def backfill_disk_models(
    llm_root: Path,
    existing_tags: set[str],
    max_vram_mb: int = 0,
) -> list[str]:
    """Register on-disk GGUFs that ollama does not know yet.

    Args:
        llm_root: Directory tree to scan for ``.gguf`` files.
        existing_tags: Tags already present; matching files are skipped.
        max_vram_mb: Files larger than this many MB are skipped (0 = no
            limit). File size stands in for VRAM use, on the assumption that
            quantized weights load roughly 1:1.

    Returns:
        The tags that were newly registered.
    """
    candidates = find_disk_ggufs(llm_root)
    if not candidates:
        print(f"No .gguf files found under {llm_root}", file=sys.stderr)
        return []

    created: list[str] = []
    too_large = 0
    for gguf in candidates:
        size_mb = gguf.stat().st_size // 1_048_576
        over_limit = bool(max_vram_mb) and size_mb > max_vram_mb
        if over_limit:
            print(f" [skip-oom] {gguf.name} ({size_mb} MB > {max_vram_mb} MB limit)")
            too_large += 1
            continue

        tag = gguf_to_ollama_tag(gguf)
        if tag in existing_tags:
            print(f" [skip] {gguf.name} already registered as {tag}")
            continue

        print(f" [register] {gguf.name} ({size_mb} MB) → {tag} ...", end=" ", flush=True)
        registered = register_gguf(gguf, tag)
        print("ok" if registered else "failed")
        if registered:
            created.append(tag)

    if too_large:
        print(f" [info] {too_large} GGUF(s) skipped (exceed {max_vram_mb} MB VRAM limit)")
    return created
def list_ollama_models() -> list[str]:
    """List model names from ollama ``/api/tags``, minus known embedding models.

    Returns an empty list (after a warning on stderr) when ollama cannot be
    queried.
    """
    embedding_markers = ("mxbai-embed-large", "nomic-embed-text", "all-minilm")

    def is_embedding(name: str) -> bool:
        """True when the model name contains one of the embedding-model markers."""
        lowered = name.lower()
        return any(marker in lowered for marker in embedding_markers)

    try:
        reply = httpx.get(f"{_OLLAMA_URL}/api/tags", timeout=10.0)
        reply.raise_for_status()
        entries = reply.json().get("models", [])
        return [entry["name"] for entry in entries if not is_embedding(entry["name"])]
    except Exception as exc:
        print(f"[warn] Could not reach ollama: {exc}", file=sys.stderr)
        return []
# ── Run benchmark ─────────────────────────────────────────────────────────────
@dataclass
class ModelResult:
    """Benchmark outcome for one model: per-prompt records plus aggregates."""

    model_id: str
    # One dict per prompt (tag, user_prompt, output, signals, score, latency_ms).
    prompt_results: list[dict[str, Any]] = field(default_factory=list)

    # Means over prompt_results.
    avg_score: float = 0.0
    avg_latency_ms: float = 0.0

    # Violation totals summed over prompt_results.
    total_filler_hits: int = 0
    total_em_dashes: int = 0
    total_semicolons: int = 0
def _bench_one_model(
    model_id: str,
    prompts: list[dict[str, str]],
    profile: Any,
    use_cforch: bool,
    cforch_url: str,
    use_vllm: bool = False,
) -> "ModelResult | None":
    """Run every prompt against one model and aggregate its scores.

    Backend selection (first match wins):
        use_vllm=True    -> allocate vllm via cf-orch, then generate_cftext()
        use_cforch=True  -> allocate cf-text via cf-orch, then generate_cftext()
        otherwise        -> direct ollama generate()

    Both vllm and cf-text expose /v1/chat/completions, so generate_cftext()
    serves both. Every progress line is prefixed with the model id so output
    from parallel workers stays attributable.

    Args:
        model_id: Model to benchmark.
        prompts: Prompt definitions with "tag", "thread_title" and
            "thread_body" keys.
        profile: Corpus profile passed to score_against_profile(), or None.
        use_cforch: Route through a cf-orch-allocated cf-text instance.
        cforch_url: Base URL of the cf-orch coordinator.
        use_vllm: Route through a cf-orch-allocated vllm instance.

    Returns:
        The populated ModelResult, or None when allocation failed or no
        prompt produced a result.
    """
    prefix = f"[{model_id}]"
    result = ModelResult(model_id=model_id)
    service_url: str | None = None
    allocation_id: str = ""

    if use_vllm:
        alloc = cforch_allocate_vllm(model_id, cforch_url)
        if alloc is None:
            print(f"{prefix} [skip] vllm allocation failed", flush=True)
            return None
        service_url, allocation_id = alloc
        print(f"{prefix} vllm allocated: {service_url}", flush=True)
    elif use_cforch:
        alloc = cforch_allocate(model_id, cforch_url)
        if alloc is None:
            print(f"{prefix} [skip] cf-orch allocation failed", flush=True)
            return None
        service_url, allocation_id = alloc
        print(f"{prefix} allocated: {service_url}", flush=True)

    try:
        for prompt_def in prompts:
            tag = prompt_def["tag"]
            user_prompt = (
                f"Thread: {prompt_def['thread_title']}\n\n"
                f"{prompt_def['thread_body']}\n\n"
                f"Write a reply:"
            )
            print(f"{prefix} [{tag}] generating...", flush=True)

            if (use_cforch or use_vllm) and service_url:
                # Both cf-text and vllm expose /v1/chat/completions — same call
                output, elapsed_ms = generate_cftext(service_url, model_id, user_prompt, system=SYSTEM_PROMPT)
            else:
                output, elapsed_ms = generate(model_id, user_prompt, system=SYSTEM_PROMPT)

            signals = extract_signals(output)
            score = score_against_profile(signals, profile)
            print(f"{prefix} [{tag}] {score:.0f}/100 ({elapsed_ms:.0f}ms)", flush=True)
            if signals.filler_hits:
                print(f"{prefix} ⚠ filler: {signals.filler_hits}", flush=True)
            if signals.em_dash_count:
                print(f"{prefix} ⚠ em-dashes: {signals.em_dash_count}", flush=True)

            result.prompt_results.append({
                "tag": tag,
                "user_prompt": user_prompt,
                "output": output,
                "signals": {
                    "avg_sentence_length": signals.avg_sentence_length,
                    "em_dash_count": signals.em_dash_count,
                    "semicolon_count": signals.semicolon_count,
                    "filler_hits": signals.filler_hits,
                    "question_ratio": signals.question_ratio,
                    "word_count": signals.word_count,
                },
                "score": score,
                "latency_ms": elapsed_ms,
            })
    finally:
        # Release whatever was allocated above. Keying this on use_cforch
        # alone would leak the allocation of every vllm-only run.
        if allocation_id:
            cforch_release(allocation_id, cforch_url)

    if not result.prompt_results:
        return None

    scores = [r["score"] for r in result.prompt_results]
    latencies = [r["latency_ms"] for r in result.prompt_results]
    result.avg_score = sum(scores) / len(scores)
    result.avg_latency_ms = sum(latencies) / len(latencies)
    result.total_filler_hits = sum(len(r["signals"]["filler_hits"]) for r in result.prompt_results)
    result.total_em_dashes = sum(r["signals"]["em_dash_count"] for r in result.prompt_results)
    result.total_semicolons = sum(r["signals"]["semicolon_count"] for r in result.prompt_results)
    print(f"{prefix} done — avg score {result.avg_score:.0f}/100", flush=True)
    return result
def run_benchmark(
    model_ids: list[str],
    corpus_dir: Path,
    prompts: list[dict[str, str]],
    use_cforch: bool = False,
    use_vllm: bool = False,
    cforch_url: str = _CFORCH_URL,
    workers: int = 1,
) -> list[ModelResult]:
    """Benchmark every model in *model_ids* and return results, best score first.

    The corpus profile is built once from *corpus_dir* and shared by all
    models. With one effective worker the models run one after another;
    otherwise they are fanned out over a thread pool. Models that produce no
    result are left out of the returned list.
    """
    profile = build_corpus_profile(corpus_dir)
    if profile:
        sample_total = len(list(corpus_dir.glob('*.txt')))
        print(f"Corpus profile loaded from {corpus_dir} ({sample_total} samples)")
        print(f" Target avg sentence length: {profile.avg_sentence_length:.1f} words")
    else:
        print(f"[warn] No corpus samples found in {corpus_dir} -- scoring on hard violations only")

    if use_vllm:
        backend = "vllm via cf-orch"
    elif use_cforch:
        backend = "cf-text via cf-orch"
    else:
        backend = "ollama"
    print(f" Backend: {backend}")

    # Never start more workers than there are models to run.
    effective_workers = min(workers, len(model_ids)) if model_ids else 1
    print(f" Workers: {effective_workers} (of {len(model_ids)} models)", flush=True)

    def bench(model_id: str) -> "ModelResult | None":
        """Benchmark a single model with the shared settings."""
        return _bench_one_model(model_id, prompts, profile, use_cforch, cforch_url, use_vllm)

    collected: list[ModelResult] = []
    if effective_workers <= 1:
        # One at a time: output stays in order and is easy to follow.
        for model_id in model_ids:
            print(f"\n{'='*60}\nModel: {model_id}", flush=True)
            outcome = bench(model_id)
            if outcome:
                collected.append(outcome)
    else:
        from concurrent.futures import ThreadPoolExecutor, as_completed

        print(f" Fanning out {len(model_ids)} models across {effective_workers} workers...", flush=True)
        with ThreadPoolExecutor(max_workers=effective_workers) as pool:
            pending = [pool.submit(bench, model_id) for model_id in model_ids]
            for finished in as_completed(pending):
                outcome = finished.result()
                if outcome:
                    collected.append(outcome)

    return sorted(collected, key=lambda res: res.avg_score, reverse=True)
# ── Markdown report ───────────────────────────────────────────────────────────
def render_report(results: list[ModelResult], corpus_dir: Path) -> str:
    """Render the benchmark results as a Markdown report.

    The report has a header, a ranking table covering every model (in the
    order given), and the raw outputs of the first three models.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M")
    medals = {1: "🥇", 2: "🥈", 3: "🥉"}

    report: list[str] = [
        "# Voice Benchmark Results",
        "",
        f"**Date:** {timestamp} ",
        f"**Corpus:** `{corpus_dir}` ",
        f"**Models tested:** {len(results)} ",
        f"**Prompts per model:** {len(TEST_PROMPTS)}",
        "",
        "## Rankings",
        "",
        "| Rank | Model | Score | Latency | Em-dashes | Fillers | Semicolons |",
        "|------|-------|-------|---------|-----------|---------|------------|",
    ]

    for rank, res in enumerate(results, 1):
        badge = medals.get(rank, f"#{rank}")
        report.append(
            f"| {badge} | `{res.model_id}` | {res.avg_score:.0f}/100 "
            f"| {res.avg_latency_ms:.0f}ms "
            f"| {res.total_em_dashes} "
            f"| {res.total_filler_hits} "
            f"| {res.total_semicolons} |"
        )

    report.extend(["", "## Sample Outputs", ""])
    # Only the first three models get their outputs printed, to keep the report short.
    for res in results[:3]:
        report.extend([f"### `{res.model_id}` (avg score: {res.avg_score:.0f})", ""])
        for entry in res.prompt_results:
            report.extend([
                f"**Prompt:** {entry['tag']} ",
                f"**Score:** {entry['score']:.0f}/100 ",
                "",
                "```",
                entry["output"],
                "```",
                "",
            ])

    return "\n".join(report)
def save_report(results: list[ModelResult], corpus_dir: Path) -> Path:
    """Write the Markdown report and a JSON twin into the results directory.

    Both files share a ``voice_<timestamp>`` stem. Returns the path of the
    Markdown file.
    """
    _RESULTS_DIR.mkdir(exist_ok=True)
    stamp = datetime.now().strftime("%Y-%m-%d_%H%M")

    markdown_path = _RESULTS_DIR / f"voice_{stamp}.md"
    markdown_path.write_text(render_report(results, corpus_dir), encoding="utf-8")

    # Raw numbers alongside the report, for programmatic use.
    records = []
    for res in results:
        records.append({
            "model_id": res.model_id,
            "avg_score": res.avg_score,
            "avg_latency_ms": res.avg_latency_ms,
            "total_filler_hits": res.total_filler_hits,
            "total_em_dashes": res.total_em_dashes,
            "total_semicolons": res.total_semicolons,
            "prompt_results": res.prompt_results,
        })
    raw_path = _RESULTS_DIR / f"voice_{stamp}.json"
    raw_path.write_text(json.dumps(records, indent=2), encoding="utf-8")

    return markdown_path
# ── CLI commands ──────────────────────────────────────────────────────────────
def cmd_list_models(_args: argparse.Namespace) -> None:
    """Print the ollama models that are candidates for benchmarking."""
    available = list_ollama_models()
    if available:
        print(f"{len(available)} models available:\n")
        for name in available:
            print(f" {name}")
    else:
        print("No models found (is ollama running?)")
def cmd_run(args: argparse.Namespace) -> None:
    """Resolve the model list, run the benchmark and save/print the report.

    Model selection, in order:
      1. an explicit ``--models`` list (anything other than "all");
      2. with ``--cforch``: the cf-orch catalog filtered by ``--max-vram``,
         falling back to ollama models if the catalog is empty or unreachable;
      3. otherwise: ollama models, optionally extended with GGUFs registered
         from ``--scan-disk``, then filtered by the large-model name patterns.

    Exits with status 1 when the corpus directory is missing or no models are
    found. GGUFs registered for this run are deregistered again at the end.
    """
    corpus_dir = Path(args.samples)
    if not corpus_dir.exists():
        print(f"[error] Corpus directory not found: {corpus_dir}", file=sys.stderr)
        sys.exit(1)

    max_vram_mb: int = getattr(args, "max_vram", 7200)
    use_cforch: bool = getattr(args, "cforch", False)
    use_vllm: bool = getattr(args, "vllm", False)
    cforch_url: str = getattr(args, "cforch_url", _CFORCH_URL)
    registered_tags: list[str] = []

    def drop_large(names: list[str], include_large: bool) -> list[str]:
        """Filter out models whose name matches a large-model pattern."""
        if include_large:
            return names
        large_markers = ["270b", "70b", "32b", "30b", "21b", "20b", "deepseek-r1"]
        kept = []
        for name in names:
            lowered = name.lower()
            if not any(marker in lowered for marker in large_markers):
                kept.append(name)
        dropped = len(names) - len(kept)
        if dropped:
            print(f"[info] Skipped {dropped} large model(s) by name pattern. "
                  "Pass --include-large to include them.")
        return kept

    def require_models(found: list[str]) -> None:
        """Abort the run when there is nothing to benchmark."""
        if not found:
            print("[error] No models found. Pass --models explicitly or check ollama.", file=sys.stderr)
            sys.exit(1)

    if args.models and args.models != "all":
        model_ids = [part.strip() for part in args.models.split(",") if part.strip()]
    elif use_cforch:
        # cf-orch path: the catalog is the model list, filtered by vram_mb.
        catalog = cforch_list_catalog(cforch_url)
        if catalog:
            entries = list(catalog.items())
            # vram_mb == 0 means "unknown" and is let through.
            allowed = {mid: mb for mid, mb in entries if mb == 0 or mb <= max_vram_mb}
            too_big = {mid: mb for mid, mb in entries if mid not in allowed}
            model_ids = list(allowed.keys())
            print(f"[info] cf-orch catalog: {len(entries)} model(s), "
                  f"{len(allowed)} within {max_vram_mb} MB VRAM limit")
            if too_big:
                print(f"[info] Skipped (OOM risk): "
                      + ", ".join(f"{mid} ({mb} MB)" for mid, mb in sorted(too_big.items())))
        else:
            print("[warn] cf-orch catalog empty or unreachable -- falling back to ollama models")
            use_cforch = False
            model_ids = drop_large(list_ollama_models(), args.include_large)
            require_models(model_ids)
    else:
        # Ollama path
        model_ids = list_ollama_models()
        require_models(model_ids)
        # Register GGUFs from disk before the name filter; files over the
        # VRAM limit are skipped by backfill_disk_models().
        if getattr(args, "scan_disk", None):
            llm_root = Path(args.scan_disk)
            print(f"\nScanning {llm_root} for unregistered GGUFs (limit: {max_vram_mb} MB)...")
            registered_tags = backfill_disk_models(llm_root, set(model_ids), max_vram_mb=max_vram_mb)
            model_ids = list_ollama_models()  # pick up the new registrations
        model_ids = drop_large(model_ids, args.include_large)

    print(f"\nRunning voice benchmark on {len(model_ids)} model(s)...")
    try:
        results = run_benchmark(
            model_ids,
            corpus_dir,
            TEST_PROMPTS,
            use_cforch=use_cforch,
            use_vllm=use_vllm,
            cforch_url=cforch_url,
            workers=args.workers,
        )
        report_path = save_report(results, corpus_dir)
        print(f"\n{'='*60}")
        print(f"Results saved to: {report_path}")
        print(f"\n{render_report(results, corpus_dir)}")
    finally:
        # Temporary registrations are removed even when the run fails.
        if registered_tags:
            print(f"\nCleaning up {len(registered_tags)} temporary ollama registrations...")
            for tag in registered_tags:
                deregister_gguf(tag)
def cmd_show_last(_args: argparse.Namespace) -> None:
    """Print the saved Markdown report whose file name sorts last."""
    candidates = list(_RESULTS_DIR.glob("voice_*.md"))
    if not candidates:
        print("No benchmark results found. Run --run first.")
        return
    # Report names embed a sortable timestamp, so the greatest path is the newest.
    newest = max(candidates)
    print(newest.read_text(encoding="utf-8"))
# ── Entry point ───────────────────────────────────────────────────────────────
def main() -> None:
    """Parse the command line and dispatch to the matching ``cmd_*`` handler.

    Both the subcommand form (``list-models`` / ``run`` / ``show-last``) and
    the legacy flag form (``--list-models`` / ``--run`` / ``--show-last``)
    are accepted. With neither, the help text is printed.
    """
    parser = argparse.ArgumentParser(
        description="Voice benchmark harness for local text-gen models",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    subcommands = parser.add_subparsers(dest="cmd")

    subcommands.add_parser("list-models", help="List available ollama models")

    run_parser = subcommands.add_parser("run", help="Run the benchmark")
    run_parser.add_argument("--models", default="all", help="Comma-separated model IDs, or 'all'")
    run_parser.add_argument("--samples", default=str(_CORPUS_DIR), help="Path to voice corpus directory")
    run_parser.add_argument("--include-large", action="store_true", help="Include models >20B params")
    run_parser.add_argument("--scan-disk", metavar="LLM_ROOT", help="Scan directory for GGUFs not yet in ollama (e.g. /Library/Assets/LLM)")
    run_parser.add_argument("--cforch", action="store_true", help="Route generation through cf-orch/cf-text instead of direct ollama")
    run_parser.add_argument("--vllm", action="store_true", help="Route generation through cf-orch/vllm (OpenAI-compatible) instead of ollama")
    run_parser.add_argument("--cforch-url", default=_CFORCH_URL, help=f"cf-orch coordinator URL (default: {_CFORCH_URL})")
    run_parser.add_argument("--max-vram", type=int, default=7200, metavar="MB",
                            help="Skip models whose VRAM footprint exceeds this limit in MB (default: 7200)")
    run_parser.add_argument("--workers", type=int, default=1, metavar="N",
                            help="Parallel workers — run N models simultaneously (default: 1; use 4+ with cf-orch)")

    subcommands.add_parser("show-last", help="Print the most recent benchmark report")

    # Legacy --list-models / --run / --show-last flags, kept for manage.sh
    # compat; the run options are repeated on the top-level parser for them.
    parser.add_argument("--list-models", action="store_true")
    parser.add_argument("--run", action="store_true")
    parser.add_argument("--show-last", action="store_true")
    parser.add_argument("--models", default="all")
    parser.add_argument("--samples", default=str(_CORPUS_DIR))
    parser.add_argument("--include-large", action="store_true")
    parser.add_argument("--scan-disk", metavar="LLM_ROOT")
    parser.add_argument("--cforch", action="store_true")
    parser.add_argument("--vllm", action="store_true")
    parser.add_argument("--cforch-url", default=_CFORCH_URL)
    parser.add_argument("--max-vram", type=int, default=7200, metavar="MB")
    parser.add_argument("--workers", type=int, default=1, metavar="N")

    args = parser.parse_args()

    # (subcommand name, legacy flag value, handler) — first match wins.
    routes = (
        ("list-models", args.list_models, cmd_list_models),
        ("run", args.run, cmd_run),
        ("show-last", args.show_last, cmd_show_last),
    )
    for name, legacy_flag, handler in routes:
        if args.cmd == name or legacy_flag:
            handler(args)
            return
    parser.print_help()
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,708 @@
<template>
  <!--
    Compare tab: pick a prompt (cf-orch task or writing-style prompt), edit it,
    choose ollama models, run them and show the responses side by side.
  -->
  <div class="compare-tab">
    <!-- Source toggle -->
    <div class="source-toggle" role="group" aria-label="Prompt source">
      <button class="source-btn" :class="{ active: promptSource === 'tasks' }" @click="promptSource = 'tasks'">
        📋 cf-orch Tasks
      </button>
      <button class="source-btn" :class="{ active: promptSource === 'style' }" @click="promptSource = 'style'">
        Writing Style Prompts
      </button>
    </div>

    <!-- Task selector (cf-orch tasks) -->
    <details v-if="promptSource === 'tasks'" class="model-picker" open>
      <summary class="picker-summary">
        <span class="picker-title">📋 Pick a Task</span>
        <span class="picker-badge">{{ cmpSelectedTask ? cmpSelectedTask.name : 'None selected' }}</span>
      </summary>
      <div class="picker-body">
        <div v-if="llmTasksLoading" class="picker-loading">Loading tasks</div>
        <div v-else-if="llmTasks.length === 0" class="picker-empty">No tasks found check cforch config.</div>
        <template v-else>
          <div v-for="(tasks, type) in llmTasksByType" :key="type" class="picker-category">
            <span class="picker-cat-name picker-cat-section">{{ type }}</span>
            <div class="picker-model-list">
              <label v-for="t in tasks" :key="t.id" class="picker-model-row">
                <input
                  type="radio"
                  name="cmp-task"
                  :checked="cmpSelectedTask?.id === t.id"
                  @change="selectCmpTask(t)"
                />
                <span class="picker-model-name" :title="t.name">{{ t.name }}</span>
              </label>
            </div>
          </div>
        </template>
      </div>
    </details>

    <!-- Writing style prompt selector -->
    <details v-if="promptSource === 'style'" class="model-picker" open>
      <summary class="picker-summary">
        <span class="picker-title"> Pick a Writing Style Prompt</span>
        <span class="picker-badge">{{ selectedVoicePrompt ? selectedVoicePrompt.tag : 'None selected' }}</span>
      </summary>
      <div class="picker-body">
        <div class="picker-model-list style-prompt-list">
          <label v-for="vp in STYLE_PROMPTS" :key="vp.tag" class="picker-model-row style-prompt-row">
            <input
              type="radio"
              name="cmp-style-prompt"
              :checked="selectedVoicePrompt?.tag === vp.tag"
              @change="selectVoicePrompt(vp)"
            />
            <span class="style-prompt-tag">{{ vp.tag }}</span>
            <span class="style-prompt-title">{{ vp.thread_title }}</span>
          </label>
        </div>
      </div>
    </details>

    <!-- Prompt editor + model picker (shown once a prompt source is ready) -->
    <template v-if="promptSource === 'tasks' ? !!cmpSelectedTask : !!selectedVoicePrompt">
      <label class="prompt-label" for="cmp-prompt">Prompt</label>
      <textarea
        id="cmp-prompt"
        class="cmp-prompt-editor"
        v-model="cmpPrompt"
        rows="6"
      />

      <!-- Ollama model picker -->
      <details class="model-picker" open>
        <summary class="picker-summary">
          <span class="picker-title">🤖 Ollama Models</span>
          <span class="picker-badge">{{ cmpSelectedModels.size }} / {{ ollamaLlmModels.length }}</span>
        </summary>
        <div class="picker-body">
          <label class="picker-cat-header">
            <!-- The length guard keeps "select all" unchecked when there are no models (0 === 0). -->
            <input
              type="checkbox"
              :checked="ollamaLlmModels.length > 0 && cmpSelectedModels.size === ollamaLlmModels.length"
              :indeterminate="cmpSelectedModels.size > 0 && cmpSelectedModels.size < ollamaLlmModels.length"
              @change="toggleAllCmpModels(($event.target as HTMLInputElement).checked)"
            />
            <span class="picker-cat-name">All ollama models</span>
          </label>
          <div class="picker-model-list">
            <label v-for="m in ollamaLlmModels" :key="m.id" class="picker-model-row">
              <input
                type="checkbox"
                :checked="cmpSelectedModels.has(m.id)"
                @change="toggleCmpModel(m.id, ($event.target as HTMLInputElement).checked)"
              />
              <span class="picker-model-name">{{ m.name }}</span>
              <span class="picker-adapter-type">{{ m.tags.slice(0, 3).join(', ') }}</span>
            </label>
          </div>
        </div>
      </details>

      <!-- Run controls -->
      <div class="run-controls">
        <button
          class="btn-run"
          :disabled="cmpRunning || cmpSelectedModels.size === 0"
          @click="startCompare"
        >{{ cmpRunning ? '⏳ Running…' : '⚖️ Compare Models' }}</button>
        <button v-if="cmpRunning" class="btn-cancel" @click="cancelCompare"> Cancel</button>
      </div>

      <!-- Progress log -->
      <div v-if="cmpLog.length > 0" class="run-log">
        <div class="log-lines">
          <div v-for="(line, i) in cmpLog" :key="i" class="log-line">{{ line }}</div>
        </div>
      </div>

      <!-- Side-by-side results -->
      <template v-if="cmpResults.length > 0">
        <h2 class="chart-title">Side-by-Side Responses</h2>
        <div class="cmp-results-grid">
          <div
            v-for="r in cmpResults"
            :key="r.model"
            class="cmp-result-card"
            :class="{ 'cmp-error': !!r.error }"
          >
            <div class="cmp-result-header">
              <span class="cmp-model-name">{{ r.model }}</span>
              <span class="cmp-meta">
                <template v-if="r.error"><span class="err-badge">error</span></template>
                <template v-else>{{ (r.elapsed_ms / 1000).toFixed(1) }}s</template>
              </span>
            </div>
            <pre v-if="r.error" class="cmp-error-text">{{ r.error }}</pre>
            <pre v-else class="cmp-response">{{ r.response }}</pre>
          </div>
        </div>
      </template>
    </template>
  </div>
</template>
<script setup lang="ts">
import { ref, computed, onMounted } from 'vue'
import { useApiFetch } from '../composables/useApi'
// Types
/** A task returned by /api/cforch/tasks; selectable as a prompt source. */
interface CfOrchTask {
  id: string
  name: string
  /** Grouping key used by the task picker. */
  type: string
  /** Copied into the prompt editor when the task is selected. */
  prompt: string
  /** Copied into the system prompt when the task is selected. */
  system: string
}
/** A model entry returned by /api/cforch/models. */
interface CfOrchModel {
  name: string
  id: string
  /** Backend serving the model; this tab lists only the 'ollama' ones. */
  service: string
  /** The picker shows the first three next to the model name. */
  tags: string[]
  vram_estimate_mb?: number
}
/** One model's outcome in a comparison run. */
interface CmpResult {
  model: string
  response: string
  /** Rendered in seconds (value / 1000) on the result card. */
  elapsed_ms: number
  /** When set, the card shows this text instead of the response. */
  error: string | null
}
/** A canned writing-style prompt: a forum thread to reply to. */
interface VoicePrompt {
  /** Short identifier; used as the list key and shown in the picker badge. */
  tag: string
  thread_title: string
  thread_body: string
}
// Writing style prompts (mirrors TEST_PROMPTS in benchmark_style.py)
// STYLE_SYSTEM becomes the system prompt whenever a writing style prompt is selected.
const STYLE_SYSTEM = "You are a writing assistant. Your job is to write a Reddit reply that matches the user's voice — casual, direct, community-first. No em dashes. No filler phrases. No semicolons. Short punchy sentences."
// Selectable writing-style threads; `tag` is the list key, so tags must stay unique.
// NOTE(review): this is a hand-maintained copy of the Python prompt list; the
// thread bodies may be worded differently there — confirm they are still in sync.
const STYLE_PROMPTS: VoicePrompt[] = [
  {
    tag: 'selfhosted_ai_fatigue',
    thread_title: "Anyone else getting tired of re-explaining their setup every time an AI model forgets?",
    thread_body: "Every session I start over. My whole hardware setup, what tools I use, what I've already tried. It's exhausting. There has to be a better way.",
  },
  {
    tag: 'privacy_local_llm',
    thread_title: "What's the point of running local LLMs if the apps still phone home?",
    thread_body: "I went through all the trouble of setting up ollama and now I find out the frontend I'm using is sending telemetry. Kind of defeats the purpose.",
  },
  {
    tag: 'solarpunk_tech',
    thread_title: "What does solarpunk computing actually look like in practice?",
    thread_body: "I keep seeing the aesthetic but not a lot of concrete examples of people living it out with their tech choices. What does it mean day to day?",
  },
  {
    tag: 'nd_tools',
    thread_title: "Tools that actually help with executive function vs ones that just add friction",
    thread_body: "I've tried a dozen productivity apps and most of them require more executive function to maintain than they save. What actually sticks for you?",
  },
  {
    tag: 'data_ownership',
    thread_title: "Who actually owns your data when you use a 'free' AI tool?",
    thread_body: "Read the ToS on three different AI assistants today. In all three cases your inputs can be used for training, shared with partners, and retained indefinitely. Is this just accepted now?",
  },
  {
    tag: 'digital_culture',
    thread_title: "The internet used to feel like it belonged to everyone. What happened?",
    thread_body: "I grew up on forums, IRC, personal homepages. Now everything is a platform owned by someone trying to extract value from the community that built it.",
  },
]
// State
// Tasks returned by /api/cforch/tasks (filled by loadLlmTasks).
const llmTasks = ref<CfOrchTask[]>([])
// True while the task request issued by loadLlmTasks is in flight.
const llmTasksLoading = ref(false)
// Models returned by /api/cforch/models (filled by loadLlmModels).
const llmModels = ref<CfOrchModel[]>([])
// Which prompt list is active; presumably switched by the template's source toggle — not written in this script.
const promptSource = ref<'tasks' | 'style'>('tasks')
// Task most recently passed to selectCmpTask.
const cmpSelectedTask = ref<CfOrchTask | null>(null)
// Style prompt most recently passed to selectVoicePrompt.
const selectedVoicePrompt = ref<VoicePrompt | null>(null)
// Sent as the `system` query parameter by startCompare.
const cmpSystemPrompt = ref('')
// Sent as the `prompt` query parameter by startCompare; a blank prompt blocks the run.
const cmpPrompt = ref('')
// Ids of the models to compare; joined with ',' into `model_ids` by startCompare.
const cmpSelectedModels = ref<Set<string>>(new Set())
// True from startCompare until the stream completes, errors, or is cancelled.
const cmpRunning = ref(false)
// Human-readable progress lines for the current/last run.
const cmpLog = ref<string[]>([])
// One entry per `model_done` event of the current/last run.
const cmpResults = ref<CmpResult[]>([])
// Live SSE connection opened by startCompare, or null when none is held.
const cmpEventSource = ref<EventSource | null>(null)
// Computed
// Loaded models whose `service` is exactly 'ollama'; toggleAllCmpModels selects from this list.
const ollamaLlmModels = computed(() => {
  const servedByOllama = (m: CfOrchModel) => m.service === 'ollama'
  return llmModels.value.filter(servedByOllama)
})
// Loaded tasks bucketed by their `type` field; bucket order and in-bucket order follow llmTasks.
const llmTasksByType = computed((): Record<string, CfOrchTask[]> =>
  llmTasks.value.reduce<Record<string, CfOrchTask[]>>((byType, task) => {
    const bucket = byType[task.type]
    if (bucket) {
      bucket.push(task)
    } else {
      byType[task.type] = [task]
    }
    return byType
  }, {})
)
// Helpers
/**
 * Make `t` the active task: copy its prompt and system text into the editors
 * (falling back to '' when either is empty/missing) and discard the log and
 * results of any previous comparison.
 */
function selectCmpTask(t: CfOrchTask) {
  const { prompt, system } = t
  cmpSelectedTask.value = t
  cmpSystemPrompt.value = system || ''
  cmpPrompt.value = prompt || ''
  cmpLog.value = []
  cmpResults.value = []
}
/**
 * Make `vp` the active style prompt: build the user prompt from its thread
 * title and body, install STYLE_SYSTEM as the system prompt, and discard the
 * log and results of any previous comparison.
 */
function selectVoicePrompt(vp: VoicePrompt) {
  const { thread_title, thread_body } = vp
  selectedVoicePrompt.value = vp
  cmpSystemPrompt.value = STYLE_SYSTEM
  cmpPrompt.value = `Thread: ${thread_title}\n\n${thread_body}\n\nWrite a reply:`
  cmpLog.value = []
  cmpResults.value = []
}
/**
 * Add (`checked` true) or remove (`checked` false) one model id from the
 * comparison selection. The ref receives a new Set; the previous Set is left
 * untouched.
 */
function toggleCmpModel(id: string, checked: boolean) {
  const updated = new Set(cmpSelectedModels.value)
  if (checked) {
    updated.add(id)
  } else {
    updated.delete(id)
  }
  cmpSelectedModels.value = updated
}
/**
 * Select every Ollama model (`checked` true) or clear the selection entirely
 * (`checked` false).
 */
function toggleAllCmpModels(checked: boolean) {
  if (!checked) {
    cmpSelectedModels.value = new Set<string>()
    return
  }
  const ollamaIds = ollamaLlmModels.value.map(m => m.id)
  cmpSelectedModels.value = new Set(ollamaIds)
}
// Data loaders
/**
 * Fetch the task list from /api/cforch/tasks. llmTasksLoading is raised for
 * the duration of the request; llmTasks is only replaced when the response
 * carries a `tasks` field.
 */
async function loadLlmTasks() {
  llmTasksLoading.value = true
  const response = await useApiFetch<{ tasks: CfOrchTask[]; types: string[] }>('/api/cforch/tasks')
  llmTasksLoading.value = false

  const tasks = response.data?.tasks
  if (!tasks) return
  llmTasks.value = tasks
}
/**
 * Fetch the model list from /api/cforch/models. When the response carries a
 * `models` field, store it and pre-select every model whose service is
 * 'ollama'; otherwise leave the current state alone.
 */
async function loadLlmModels() {
  const response = await useApiFetch<{ models: CfOrchModel[] }>('/api/cforch/models')
  const models = response.data?.models
  if (!models) return

  llmModels.value = models
  const ollamaIds = models.filter(m => m.service === 'ollama').map(m => m.id)
  cmpSelectedModels.value = new Set(ollamaIds)
}
// Run / cancel
/**
 * Start a streamed comparison of the selected models against the current prompt.
 *
 * Does nothing when the prompt is blank or no model is selected. Otherwise it
 * resets the log and results, opens an EventSource on /api/imitate/run with
 * `prompt`, `model_ids` (comma-joined) and `system` as query parameters, and
 * handles these JSON message types:
 *   - start:       logs how many models will be compared
 *   - model_start: logs the model name
 *   - model_done:  logs elapsed seconds or the error, and appends a result row
 *   - complete:    ends the run
 * A connection error also ends the run after logging 'Connection error.'.
 */
function startCompare() {
  if (!cmpPrompt.value.trim() || cmpSelectedModels.value.size === 0) return

  // Close any stream left over from an earlier run so two streams can never
  // feed the same log and results.
  cmpEventSource.value?.close()

  cmpRunning.value = true
  cmpResults.value = []
  cmpLog.value = []

  const params = new URLSearchParams({
    prompt: cmpPrompt.value,
    model_ids: [...cmpSelectedModels.value].join(','),
    system: cmpSystemPrompt.value,
  })
  const es = new EventSource(`/api/imitate/run?${params}`)
  cmpEventSource.value = es

  // Single teardown path: completion and connection errors both release the
  // stream handle instead of leaving a closed EventSource in the ref.
  const finish = () => {
    cmpRunning.value = false
    es.close()
    cmpEventSource.value = null
  }

  es.onmessage = (event: MessageEvent) => {
    try {
      const msg = JSON.parse(event.data)
      if (msg.type === 'start') {
        cmpLog.value.push(`Comparing ${msg.total_models} models…`)
      } else if (msg.type === 'model_start') {
        cmpLog.value.push(`${msg.model}`)
      } else if (msg.type === 'model_done') {
        const status = msg.error
          ? `${msg.error}`
          : `${(msg.elapsed_ms / 1000).toFixed(1)}s`
        cmpLog.value.push(` ${msg.model}: ${status}`)
        cmpResults.value.push({
          model: msg.model,
          response: msg.response,
          elapsed_ms: msg.elapsed_ms,
          error: msg.error ?? null,
        })
      } else if (msg.type === 'complete') {
        finish()
      }
    } catch { /* ignore malformed frames */ }
  }

  es.onerror = () => {
    cmpLog.value.push('Connection error.')
    finish()
  }
}
/**
 * Abort the current comparison: close the stream if one is held, drop the
 * handle, clear the running flag, and log 'Cancelled.'.
 */
function cancelCompare() {
  const source = cmpEventSource.value
  if (source) source.close()
  cmpEventSource.value = null
  cmpRunning.value = false
  cmpLog.value.push('Cancelled.')
}
// Start both initial fetches when the tab mounts. Neither call is awaited here.
onMounted(() => {
  const loaders = [loadLlmTasks, loadLlmModels]
  loaders.forEach(load => { load() })
})
</script>
<style scoped>
.compare-tab {
display: flex;
flex-direction: column;
gap: 1.75rem;
}
/* ── Source toggle ──────────────────────────────────────── */
.source-toggle {
display: inline-flex;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
overflow: hidden;
align-self: flex-start;
}
.source-btn {
padding: 0.4rem 1rem;
font-size: 0.83rem;
font-family: var(--font-body, sans-serif);
font-weight: 500;
border: none;
background: var(--color-surface, #fff);
color: var(--color-text-secondary, #6b7a99);
cursor: pointer;
transition: background 0.15s, color 0.15s;
}
.source-btn:not(:last-child) { border-right: 1px solid var(--color-border, #d0d7e8); }
.source-btn.active { background: var(--app-primary, #2A6080); color: #fff; }
.source-btn:not(.active):hover { background: var(--color-surface-raised, #e4ebf5); }
/* ── Voice prompt list ──────────────────────────────────── */
.style-prompt-list { flex-direction: column !important; flex-wrap: nowrap !important; padding-left: 0 !important; gap: 0.4rem !important; }
.style-prompt-row {
flex-direction: column !important;
align-items: flex-start !important;
gap: 0.15rem !important;
padding: 0.5rem 0.6rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.35rem;
background: var(--color-surface, #f4f7fc);
cursor: pointer;
transition: background 0.1s;
}
.style-prompt-row:hover { background: var(--color-surface-raised, #e4ebf5); }
.style-prompt-row:has(input:checked) {
background: color-mix(in srgb, var(--app-primary, #2A6080) 10%, transparent);
border-color: var(--app-primary, #2A6080);
}
.style-prompt-row input { display: none; }
.style-prompt-tag {
font-family: var(--font-mono, monospace);
font-size: 0.72rem;
color: var(--app-primary, #2A6080);
font-weight: 700;
text-transform: uppercase;
letter-spacing: 0.04em;
}
.style-prompt-title {
font-size: 0.83rem;
color: var(--color-text, #1a2338);
line-height: 1.4;
}
/* ── Buttons ────────────────────────────────────────────── */
.btn-run {
padding: 0.45rem 1.1rem;
border-radius: 0.375rem;
border: none;
background: var(--app-primary, #2A6080);
color: #fff;
font-size: 0.88rem;
font-family: var(--font-body, sans-serif);
cursor: pointer;
transition: opacity 0.15s;
}
.btn-run:disabled { opacity: 0.5; cursor: not-allowed; }
.btn-run:not(:disabled):hover { opacity: 0.85; }
.btn-cancel {
padding: 0.45rem 0.9rem;
background: transparent;
border: 1px solid var(--color-text-secondary, #6b7a99);
color: var(--color-text-secondary, #6b7a99);
border-radius: 0.4rem;
font-size: 0.85rem;
font-weight: 500;
cursor: pointer;
transition: background 0.15s;
}
.btn-cancel:hover {
background: color-mix(in srgb, var(--color-text-secondary, #6b7a99) 12%, transparent);
}
/* ── Run controls row ───────────────────────────────────── */
.run-controls {
display: flex;
align-items: center;
gap: 0.75rem;
flex-wrap: wrap;
}
/* ── Run log ────────────────────────────────────────────── */
.run-log {
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
overflow: hidden;
font-family: var(--font-mono, monospace);
font-size: 0.78rem;
}
.log-lines {
max-height: 160px;
overflow-y: auto;
padding: 0.5rem 0.75rem;
background: var(--color-surface, #fff);
display: flex;
flex-direction: column;
gap: 0.1rem;
}
.log-line { color: var(--color-text, #1a2338); line-height: 1.5; }
/* ── Chart title ────────────────────────────────────────── */
.chart-title {
font-size: 0.95rem;
font-weight: 600;
color: var(--color-text, #1a2338);
margin: 0;
}
/* ── Model Picker ───────────────────────────────────────── */
.model-picker {
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
overflow: hidden;
}
.picker-summary {
display: flex;
align-items: center;
gap: 0.6rem;
padding: 0.65rem 0.9rem;
cursor: pointer;
user-select: none;
list-style: none;
background: var(--color-surface-raised, #e4ebf5);
}
.picker-summary::-webkit-details-marker { display: none; }
.picker-summary::before { content: '▶ '; font-size: 0.65rem; color: var(--color-text-secondary, #6b7a99); }
details[open] .picker-summary::before { content: '▼ '; }
.picker-title {
font-size: 0.9rem;
font-weight: 600;
color: var(--color-text, #1a2338);
}
.picker-badge {
font-size: 0.75rem;
color: var(--color-text-secondary, #6b7a99);
background: var(--color-surface, #fff);
border: 1px solid var(--color-border, #d0d7e8);
padding: 0.15rem 0.5rem;
border-radius: 1rem;
font-family: var(--font-mono, monospace);
margin-left: auto;
}
.picker-body {
padding: 0.75rem;
border-top: 1px solid var(--color-border, #d0d7e8);
display: flex;
flex-direction: column;
gap: 0.75rem;
}
.picker-loading, .picker-empty {
font-size: 0.85rem;
color: var(--color-text-secondary, #6b7a99);
padding: 0.5rem 0;
}
.picker-category {
display: flex;
flex-direction: column;
gap: 0.3rem;
}
.picker-cat-header {
display: flex;
align-items: center;
gap: 0.45rem;
font-size: 0.82rem;
font-weight: 700;
color: var(--color-text, #1a2338);
text-transform: uppercase;
letter-spacing: 0.04em;
cursor: pointer;
}
.picker-cat-name { /* inherits from cat-header or section */ }
.picker-cat-section {
font-weight: 600;
font-size: 0.82rem;
padding: 0.35rem 0;
display: block;
color: var(--color-text, #1a2338);
}
.picker-model-list {
display: flex;
flex-wrap: wrap;
gap: 0.35rem 0.75rem;
padding-left: 1.4rem;
}
.picker-model-row {
display: flex;
align-items: center;
gap: 0.35rem;
font-size: 0.82rem;
cursor: pointer;
color: var(--color-text, #1a2338);
}
.picker-model-name {
font-family: var(--font-mono, monospace);
font-size: 0.78rem;
white-space: nowrap;
max-width: 18ch;
overflow: hidden;
text-overflow: ellipsis;
}
.picker-adapter-type {
font-size: 0.68rem;
color: var(--color-text-secondary, #6b7a99);
background: var(--color-surface-raised, #e4ebf5);
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.25rem;
padding: 0.05rem 0.3rem;
font-family: var(--font-mono, monospace);
}
/* ── Prompt editor ──────────────────────────────────────── */
.prompt-label {
font-size: 0.85rem;
font-weight: 600;
color: var(--color-text-secondary, #6b7a99);
margin-top: 0.5rem;
}
.cmp-prompt-editor {
width: 100%;
font-family: var(--font-mono, monospace);
font-size: 0.85rem;
padding: 0.75rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.375rem;
background: var(--color-surface, #f0f4fc);
color: var(--color-text, #1a2338);
resize: vertical;
line-height: 1.5;
box-sizing: border-box;
}
.cmp-prompt-editor:focus {
outline: 2px solid var(--app-primary, #2A6080);
outline-offset: -1px;
}
/* ── Results grid ───────────────────────────────────────── */
.cmp-results-grid {
display: grid;
grid-template-columns: repeat(auto-fill, minmax(280px, 1fr));
gap: 1rem;
margin-top: 0.5rem;
}
.cmp-result-card {
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
overflow: hidden;
background: var(--color-surface, #f0f4fc);
display: flex;
flex-direction: column;
}
.cmp-result-card.cmp-error {
border-color: #fca5a5;
}
.cmp-result-header {
display: flex;
justify-content: space-between;
align-items: center;
padding: 0.5rem 0.75rem;
background: var(--color-surface-raised, #e4ebf5);
border-bottom: 1px solid var(--color-border, #d0d7e8);
}
.cmp-model-name {
font-size: 0.82rem;
font-weight: 600;
color: var(--color-text, #1a2338);
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
}
.cmp-meta {
font-size: 0.75rem;
color: var(--color-text-secondary, #6b7a99);
flex-shrink: 0;
margin-left: 0.5rem;
}
.err-badge {
background: #fee2e2;
color: #991b1b;
border-radius: 9999px;
padding: 0.1rem 0.45rem;
font-size: 0.7rem;
font-weight: 600;
}
.cmp-response, .cmp-error-text {
padding: 0.75rem;
font-size: 0.82rem;
white-space: pre-wrap;
word-break: break-word;
max-height: 300px;
overflow-y: auto;
margin: 0;
flex: 1;
color: var(--color-text, #1a2338);
}
.cmp-error-text { color: #b91c1c; }
@media (max-width: 600px) {
.picker-model-list { padding-left: 0; }
.picker-model-name { max-width: 14ch; }
}
</style>

View file

@ -0,0 +1,715 @@
<template>
<div class="llm-eval-tab">
<!-- Task Selection -->
<details class="model-picker" open>
<summary class="picker-summary">
<span class="picker-title">📋 Task Selection</span>
<span class="picker-badge">{{ llmTaskBadge }}</span>
</summary>
<div class="picker-body">
<div v-if="llmTasksLoading" class="picker-loading">Loading tasks</div>
<div v-else-if="Object.keys(llmTasksByType).length === 0" class="picker-empty">
No tasks found — check API connection.
</div>
<template v-else>
<div v-for="(tasks, type) in llmTasksByType" :key="type" class="picker-category">
<label class="picker-cat-header">
<input
type="checkbox"
:checked="isTaskTypeAllSelected(tasks)"
:indeterminate="isTaskTypeIndeterminate(tasks)"
@change="toggleTaskType(tasks, ($event.target as HTMLInputElement).checked)"
/>
<span class="picker-cat-name">{{ type }}</span>
<span class="picker-cat-count">({{ tasks.length }})</span>
</label>
<div class="picker-model-list">
<label v-for="t in tasks" :key="t.id" class="picker-model-row">
<input
type="checkbox"
:checked="selectedLlmTasks.has(t.id)"
@change="toggleLlmTask(t.id, ($event.target as HTMLInputElement).checked)"
/>
<span class="picker-model-name" :title="t.name">{{ t.name }}</span>
</label>
</div>
</div>
</template>
</div>
</details>
<!-- Model Selection -->
<details class="model-picker" open>
<summary class="picker-summary">
<span class="picker-title">🎯 Model Selection</span>
<span class="picker-badge">{{ llmModelBadge }}</span>
</summary>
<div class="picker-body">
<div v-if="llmModelsLoading" class="picker-loading">Loading models</div>
<div v-else-if="Object.keys(llmModelsByService).length === 0" class="picker-empty">
No models found — check cf-orch connection.
</div>
<template v-else>
<div v-for="(models, service) in llmModelsByService" :key="service" class="picker-category">
<label class="picker-cat-header">
<input
type="checkbox"
:checked="isServiceAllSelected(models)"
:indeterminate="isServiceIndeterminate(models)"
@change="toggleService(models, ($event.target as HTMLInputElement).checked)"
/>
<span class="picker-cat-name">{{ service }}</span>
<span class="picker-cat-count">({{ models.length }})</span>
</label>
<div class="picker-model-list">
<label v-for="m in models" :key="m.id" class="picker-model-row">
<input
type="checkbox"
:checked="selectedLlmModels.has(m.id)"
@change="toggleLlmModel(m.id, ($event.target as HTMLInputElement).checked)"
/>
<span class="picker-model-name" :title="m.name">{{ m.name }}</span>
<span class="picker-adapter-type" v-if="m.tags.length">{{ m.tags.join(', ') }}</span>
</label>
</div>
</div>
</template>
</div>
</details>
<!-- Run Controls -->
<div class="run-controls">
<button
class="btn-run"
:disabled="llmRunning || selectedLlmTasks.size === 0 || selectedLlmModels.size === 0"
@click="startLlmBenchmark"
>
{{ llmRunning ? '⏳ Running…' : '▶ Run LLM Eval' }}
</button>
<button v-if="llmRunning" class="btn-cancel" @click="cancelLlmBenchmark"> Cancel</button>
<span v-if="selectedLlmTasks.size === 0 || selectedLlmModels.size === 0" class="run-hint">
Select at least one task and one model to run.
</span>
</div>
<!-- Progress log -->
<!-- Also shown when only llmError is set, so a failure that happens before
     any progress line arrives is still visible to the user. -->
<div v-if="llmRunning || llmRunLog.length || llmError" class="run-log">
  <div class="run-log-title">
    <span>{{ llmRunning ? '⏳ Running LLM eval…' : llmError ? '❌ Failed' : '✅ Done' }}</span>
    <button class="btn-ghost" @click="llmRunLog = []; llmError = ''">Clear</button>
  </div>
  <div class="log-lines" ref="llmLogEl">
    <div
      v-for="(line, i) in llmRunLog"
      :key="i"
      class="log-line"
      :class="{ 'log-error': line.startsWith('ERROR') || line.startsWith('[error]') }"
    >{{ line }}</div>
  </div>
  <p v-if="llmError" class="run-error">{{ llmError }}</p>
</div>
<!-- Results table -->
<template v-if="llmResults.length > 0">
<h2 class="chart-title">LLM Eval Results</h2>
<div class="heatmap-scroll">
<table class="heatmap llm-results-table">
<thead>
<tr>
<th class="hm-label-col">Model</th>
<th class="hm-model-col">overall</th>
<th v-for="col in llmTaskTypeCols" :key="col" class="hm-model-col">{{ col }}</th>
<th class="hm-model-col">tok/s</th>
</tr>
</thead>
<tbody>
<tr v-for="row in llmResults" :key="row.model_id">
<td class="hm-label-cell llm-model-name-cell" :title="row.model_id">{{ row.model_name }}</td>
<td
class="hm-value-cell"
:class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }"
>{{ pct(row.avg_quality_score) }}</td>
<td
v-for="col in llmTaskTypeCols"
:key="col"
class="hm-value-cell"
:class="{ 'bt-best': llmBestByCol[col] === row.model_id }"
>{{ row.quality_by_task_type[col] != null ? pct(row.quality_by_task_type[col]) : '—' }}</td>
<td class="hm-value-cell llm-tps-cell">{{ row.avg_tokens_per_sec.toFixed(1) }}</td>
</tr>
</tbody>
</table>
</div>
<p class="heatmap-hint">Run LLM Eval to refresh. Green = best per column.</p>
</template>
</div>
</template>
<script setup lang="ts">
import { ref, computed, onMounted, nextTick } from 'vue'
import { useApiFetch } from '../composables/useApi'
// Types
// One benchmark task as returned under `tasks` by /api/cforch/tasks.
interface CfOrchTask {
// Selection key stored in selectedLlmTasks and sent in `task_ids`.
id: string
// Label shown in the task picker.
name: string
// Grouping key for the task picker (see llmTasksByType).
type: string
// Not read by this component.
prompt: string
// Not read by this component.
system: string
}
// One model as returned under `models` by /api/cforch/models.
interface CfOrchModel {
// Label shown in the model picker.
name: string
// Selection key stored in selectedLlmModels.
id: string
// Grouping key for the model picker (see llmModelsByService).
service: string
// Rendered joined by ', ' next to the model name when non-empty.
tags: string[]
// Not read by this component.
vram_estimate_mb?: number
}
// One row of the results table (from /api/cforch/results or a `result` stream event).
interface LlmModelResult {
// Text of the first table column.
model_name: string
// Row key; also the value stored per column in llmBestByCol.
model_id: string
// Not read by this component.
node_id: string
// Shown in the tok/s column with one decimal place.
avg_tokens_per_sec: number
// Not read by this component.
avg_completion_ms: number
// Shown in the "overall" column via pct(), i.e. treated as a 0-1 fraction.
avg_quality_score: number
// Not read by this component.
finetune_candidates: number
// Not read by this component.
error_count: number
// Per-column scores; keys become table columns, values are shown via pct().
quality_by_task_type: Record<string, number>
}
// State
// Tasks returned by /api/cforch/tasks (filled by loadLlmTasks).
const llmTasks = ref<CfOrchTask[]>([])
// True while the task request is in flight.
const llmTasksLoading = ref(false)
// Models returned by /api/cforch/models (filled by loadLlmModels).
const llmModels = ref<CfOrchModel[]>([])
// True while the model request is in flight.
const llmModelsLoading = ref(false)
// Ids of the tasks ticked in the picker; all tasks are ticked after loading.
const selectedLlmTasks = ref<Set<string>>(new Set())
// Ids of the models ticked in the picker; all models are ticked after loading.
const selectedLlmModels = ref<Set<string>>(new Set())
// True from startLlmBenchmark until the stream completes, errors, or is cancelled.
const llmRunning = ref(false)
// Progress messages received during the current/last run.
const llmRunLog = ref<string[]>([])
// Error text of the current/last run, '' when there is none.
const llmError = ref('')
// Rows of the results table.
const llmResults = ref<LlmModelResult[]>([])
// Live SSE connection opened by startLlmBenchmark, or null when none is held.
const llmEventSource = ref<EventSource | null>(null)
// Template ref to the scrolling log container; used to auto-scroll on new lines.
const llmLogEl = ref<HTMLElement | null>(null)
// Computed
// Loaded tasks bucketed by their `type` field; bucket order and in-bucket order follow llmTasks.
const llmTasksByType = computed((): Record<string, CfOrchTask[]> =>
  llmTasks.value.reduce<Record<string, CfOrchTask[]>>((byType, task) => {
    const bucket = byType[task.type]
    if (bucket) {
      bucket.push(task)
    } else {
      byType[task.type] = [task]
    }
    return byType
  }, {})
)
// Loaded models bucketed by their `service` field; bucket order and in-bucket order follow llmModels.
const llmModelsByService = computed((): Record<string, CfOrchModel[]> =>
  llmModels.value.reduce<Record<string, CfOrchModel[]>>((byService, model) => {
    const bucket = byService[model.service]
    if (bucket) {
      bucket.push(model)
    } else {
      byService[model.service] = [model]
    }
    return byService
  }, {})
)
// Badge text for the task picker: none available, all selected, or "x of y".
const llmTaskBadge = computed(() => {
  const available = llmTasks.value.length
  if (available === 0) return 'No tasks available'

  const chosen = selectedLlmTasks.value.size
  return chosen === available
    ? `All tasks (${available})`
    : `${chosen} of ${available} tasks selected`
})
// Badge text for the model picker: none available, all selected, or "x of y".
const llmModelBadge = computed(() => {
  const available = llmModels.value.length
  if (available === 0) return 'No models available'

  const chosen = selectedLlmModels.value.size
  return chosen === available
    ? `All models (${available})`
    : `${chosen} of ${available} selected`
})
// Sorted, de-duplicated list of every key found in any row's quality_by_task_type;
// these become the per-task-type columns of the results table.
const llmTaskTypeCols = computed(() => {
  const allKeys = llmResults.value.flatMap(row => Object.keys(row.quality_by_task_type))
  const unique = new Set<string>(allKeys)
  return Array.from(unique).sort()
})
// Maps each table column ('overall' plus every task-type column) to the model_id
// with the highest score in that column. Ties keep the earliest row. Returns an
// empty object when there are no results.
const llmBestByCol = computed((): Record<string, string> => {
  const rows = llmResults.value
  const winners: Record<string, string> = {}
  if (rows.length === 0) return winners

  // Scan all rows for the strictly greatest score. `skipMissing` reproduces the
  // per-column rule of ignoring null/undefined scores; the overall column
  // compares every row's score as-is.
  const leaderBy = (read: (r: LlmModelResult) => number, skipMissing: boolean): string => {
    let leader = ''
    let high = -Infinity
    for (const r of rows) {
      const score = read(r)
      if (skipMissing && score == null) continue
      if (score > high) {
        high = score
        leader = r.model_id
      }
    }
    return leader
  }

  winners['overall'] = leaderBy(r => r.avg_quality_score, false)
  for (const col of llmTaskTypeCols.value) {
    winners[col] = leaderBy(r => r.quality_by_task_type[col], true)
  }
  return winners
})
// Helpers
/** Format a fraction as a percentage with one decimal place, e.g. 0.5 -> "50.0%". */
function pct(v: number): string {
  const scaled = v * 100
  return scaled.toFixed(1) + '%'
}
// Task picker helpers
/** True when `tasks` is non-empty and every task in it is currently selected. */
function isTaskTypeAllSelected(tasks: CfOrchTask[]): boolean {
  if (tasks.length === 0) return false
  const chosen = selectedLlmTasks.value
  for (const task of tasks) {
    if (!chosen.has(task.id)) return false
  }
  return true
}
/** True when at least one, but not every, task in `tasks` is currently selected. */
function isTaskTypeIndeterminate(tasks: CfOrchTask[]): boolean {
  const chosen = selectedLlmTasks.value
  const picked = tasks.filter(task => chosen.has(task.id)).length
  return picked > 0 && picked < tasks.length
}
/**
 * Add (`checked` true) or remove (`checked` false) one task id from the
 * selection. The ref receives a new Set; the previous Set is left untouched.
 */
function toggleLlmTask(id: string, checked: boolean) {
  const updated = new Set(selectedLlmTasks.value)
  if (checked) {
    updated.add(id)
  } else {
    updated.delete(id)
  }
  selectedLlmTasks.value = updated
}
/**
 * Select (`checked` true) or deselect (`checked` false) every task in `tasks`
 * at once, leaving the selection state of all other tasks as it was.
 */
function toggleTaskType(tasks: CfOrchTask[], checked: boolean) {
  const updated = new Set(selectedLlmTasks.value)
  const ids = tasks.map(task => task.id)
  if (checked) {
    ids.forEach(id => { updated.add(id) })
  } else {
    ids.forEach(id => { updated.delete(id) })
  }
  selectedLlmTasks.value = updated
}
// Model picker helpers
/** True when `models` is non-empty and every model in it is currently selected. */
function isServiceAllSelected(models: CfOrchModel[]): boolean {
  if (models.length === 0) return false
  const chosen = selectedLlmModels.value
  for (const model of models) {
    if (!chosen.has(model.id)) return false
  }
  return true
}
/** True when at least one, but not every, model in `models` is currently selected. */
function isServiceIndeterminate(models: CfOrchModel[]): boolean {
  const chosen = selectedLlmModels.value
  const picked = models.filter(model => chosen.has(model.id)).length
  return picked > 0 && picked < models.length
}
/**
 * Add (`checked` true) or remove (`checked` false) one model id from the
 * selection. The ref receives a new Set; the previous Set is left untouched.
 */
function toggleLlmModel(id: string, checked: boolean) {
  const updated = new Set(selectedLlmModels.value)
  if (checked) {
    updated.add(id)
  } else {
    updated.delete(id)
  }
  selectedLlmModels.value = updated
}
/**
 * Select (`checked` true) or deselect (`checked` false) every model in
 * `models` at once, leaving the selection state of all other models as it was.
 */
function toggleService(models: CfOrchModel[], checked: boolean) {
  const updated = new Set(selectedLlmModels.value)
  const ids = models.map(model => model.id)
  if (checked) {
    ids.forEach(id => { updated.add(id) })
  } else {
    ids.forEach(id => { updated.delete(id) })
  }
  selectedLlmModels.value = updated
}
// Data loaders
/**
 * Fetch the task list from /api/cforch/tasks. llmTasksLoading is raised for
 * the duration of the request. When the response carries a `tasks` field, the
 * list is stored and every task is selected.
 */
async function loadLlmTasks() {
  llmTasksLoading.value = true
  const response = await useApiFetch<{ tasks: CfOrchTask[]; types: string[] }>('/api/cforch/tasks')
  llmTasksLoading.value = false

  const tasks = response.data?.tasks
  if (!tasks) return
  llmTasks.value = tasks
  selectedLlmTasks.value = new Set(tasks.map(task => task.id))
}
/**
 * Fetch the model list from /api/cforch/models. llmModelsLoading is raised for
 * the duration of the request. When the response carries a `models` field, the
 * list is stored and every model is selected.
 */
async function loadLlmModels() {
  llmModelsLoading.value = true
  const response = await useApiFetch<{ models: CfOrchModel[] }>('/api/cforch/models')
  llmModelsLoading.value = false

  const models = response.data?.models
  if (!models) return
  llmModels.value = models
  selectedLlmModels.value = new Set(models.map(model => model.id))
}
/**
 * Fetch previously stored results from /api/cforch/results. The table is only
 * replaced when the response is a non-empty array; anything else leaves the
 * current rows in place.
 */
async function loadLlmResults() {
  const response = await useApiFetch<LlmModelResult[]>('/api/cforch/results')
  const rows = response.data
  if (!Array.isArray(rows) || rows.length === 0) return
  llmResults.value = rows
}
// Run / cancel
/**
 * Start a streamed LLM eval run.
 *
 * Resets the log and error, opens an EventSource on /api/cforch/run (with the
 * selected task ids comma-joined in `task_ids` when any are selected), and
 * handles these JSON message types:
 *   - progress: appends `message` to the log and scrolls the log to the bottom
 *   - result:   replaces the results table with `summary`
 *   - complete: ends the run
 *   - error:    stores `message` as the run error and ends the run
 * A connection error while running stores 'Connection lost' and ends the run.
 * Frames that are not valid JSON are ignored.
 */
function startLlmBenchmark() {
  // Close any stream left over from an earlier run so two streams can never
  // feed the same log and results.
  llmEventSource.value?.close()

  llmRunning.value = true
  llmRunLog.value = []
  llmError.value = ''

  const params = new URLSearchParams()
  const taskIds = [...selectedLlmTasks.value].join(',')
  if (taskIds) params.set('task_ids', taskIds)
  // NOTE(review): selectedLlmModels gates the Run button but is not sent to the
  // endpoint — confirm whether /api/cforch/run should receive the model selection.

  const es = new EventSource(`/api/cforch/run?${params}`)
  llmEventSource.value = es

  // Single teardown path shared by completion, server errors and connection errors.
  const finish = () => {
    llmRunning.value = false
    es.close()
    llmEventSource.value = null
  }

  es.onmessage = async (e: MessageEvent) => {
    let msg: any
    try {
      msg = JSON.parse(e.data)
    } catch {
      // Ignore malformed frames; throwing here would surface as an unhandled
      // promise rejection because the handler is async.
      return
    }

    // Optional chaining also covers a frame that parses to null.
    const kind = msg?.type
    if (kind === 'progress' && typeof msg.message === 'string') {
      llmRunLog.value.push(msg.message)
      // Wait for the new line to render before scrolling to it.
      await nextTick()
      llmLogEl.value?.scrollTo({ top: llmLogEl.value.scrollHeight, behavior: 'smooth' })
    } else if (kind === 'result' && Array.isArray(msg.summary)) {
      llmResults.value = msg.summary
    } else if (kind === 'complete') {
      finish()
    } else if (kind === 'error' && typeof msg.message === 'string') {
      llmError.value = msg.message
      finish()
    }
  }

  es.onerror = () => {
    if (llmRunning.value) llmError.value = 'Connection lost'
    finish()
  }
}
/**
 * Abort the current run: close the stream if one is held, drop the handle,
 * clear the running flag, then POST to /api/cforch/cancel. A rejected cancel
 * request is deliberately ignored.
 */
async function cancelLlmBenchmark() {
  const source = llmEventSource.value
  if (source) source.close()
  llmEventSource.value = null
  llmRunning.value = false

  const request = fetch('/api/cforch/cancel', { method: 'POST' })
  await request.catch(() => {})
}
// Start the three initial fetches (tasks, models, stored results) when the tab
// mounts. None of the calls is awaited here.
onMounted(() => {
  const loaders = [loadLlmTasks, loadLlmModels, loadLlmResults]
  loaders.forEach(load => { load() })
})
</script>
<style scoped>
.llm-eval-tab {
display: flex;
flex-direction: column;
gap: 1.75rem;
}
/* ── Buttons ────────────────────────────────────────────── */
.btn-run {
padding: 0.45rem 1.1rem;
border-radius: 0.375rem;
border: none;
background: var(--app-primary, #2A6080);
color: #fff;
font-size: 0.88rem;
font-family: var(--font-body, sans-serif);
cursor: pointer;
transition: opacity 0.15s;
}
.btn-run:disabled { opacity: 0.5; cursor: not-allowed; }
.btn-run:not(:disabled):hover { opacity: 0.85; }
.btn-cancel {
padding: 0.45rem 0.9rem;
background: transparent;
border: 1px solid var(--color-text-secondary, #6b7a99);
color: var(--color-text-secondary, #6b7a99);
border-radius: 0.4rem;
font-size: 0.85rem;
font-weight: 500;
cursor: pointer;
transition: background 0.15s;
}
.btn-cancel:hover {
background: color-mix(in srgb, var(--color-text-secondary, #6b7a99) 12%, transparent);
}
.btn-ghost {
background: none;
border: none;
color: var(--color-text-secondary, #6b7a99);
cursor: pointer;
font-size: 0.78rem;
padding: 0.1rem 0.3rem;
border-radius: 0.2rem;
}
.btn-ghost:hover { background: var(--color-border, #d0d7e8); }
/* ── Run controls row ───────────────────────────────────── */
.run-controls {
display: flex;
align-items: center;
gap: 0.75rem;
flex-wrap: wrap;
}
.run-hint {
font-size: 0.8rem;
color: var(--color-text-secondary, #6b7a99);
}
/* ── Run log ────────────────────────────────────────────── */
.run-log {
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
overflow: hidden;
font-family: var(--font-mono, monospace);
font-size: 0.78rem;
}
.run-log-title {
display: flex;
justify-content: space-between;
align-items: center;
padding: 0.4rem 0.75rem;
background: var(--color-surface-raised, #e4ebf5);
border-bottom: 1px solid var(--color-border, #d0d7e8);
font-size: 0.8rem;
color: var(--color-text-secondary, #6b7a99);
}
.log-lines {
max-height: 200px;
overflow-y: auto;
padding: 0.5rem 0.75rem;
background: var(--color-surface, #fff);
display: flex;
flex-direction: column;
gap: 0.1rem;
}
.log-line { color: var(--color-text, #1a2338); line-height: 1.5; }
.log-line.log-error { color: var(--color-error, #ef4444); }
.run-error {
margin: 0;
padding: 0.4rem 0.75rem;
background: color-mix(in srgb, var(--color-error, #ef4444) 10%, transparent);
color: var(--color-error, #ef4444);
font-size: 0.82rem;
font-family: var(--font-mono, monospace);
}
/* ── Chart title ────────────────────────────────────────── */
.chart-title {
font-size: 0.95rem;
font-weight: 600;
color: var(--color-text, #1a2338);
margin: 0;
}
/* ── Heatmap ────────────────────────────────────────────── */
.heatmap-scroll {
overflow-x: auto;
border-radius: 0.5rem;
border: 1px solid var(--color-border, #d0d7e8);
}
.heatmap {
border-collapse: collapse;
min-width: 100%;
font-size: 0.78rem;
}
.hm-label-col {
text-align: left;
min-width: 11rem;
padding: 0.4rem 0.6rem;
background: var(--color-surface-raised, #e4ebf5);
font-weight: 600;
border-bottom: 1px solid var(--color-border, #d0d7e8);
position: sticky;
left: 0;
}
.hm-model-col {
min-width: 5rem;
max-width: 8rem;
padding: 0.4rem 0.5rem;
background: var(--color-surface-raised, #e4ebf5);
border-bottom: 1px solid var(--color-border, #d0d7e8);
font-family: var(--font-mono, monospace);
font-size: 0.7rem;
text-overflow: ellipsis;
overflow: hidden;
white-space: nowrap;
text-align: center;
}
.hm-label-cell {
padding: 0.35rem 0.6rem;
background: var(--color-surface, #fff);
border-top: 1px solid var(--color-border, #d0d7e8);
white-space: nowrap;
font-family: var(--font-mono, monospace);
font-size: 0.74rem;
position: sticky;
left: 0;
}
.hm-value-cell {
padding: 0.35rem 0.5rem;
text-align: center;
font-family: var(--font-mono, monospace);
font-variant-numeric: tabular-nums;
border-top: 1px solid var(--color-border, #d0d7e8);
cursor: default;
}
.heatmap-hint {
font-size: 0.75rem;
color: var(--color-text-secondary, #6b7a99);
margin: 0;
}
/* LLM-specific table styles */
.llm-results-table .bt-best {
color: var(--color-success, #3a7a32);
font-weight: 700;
background: color-mix(in srgb, var(--color-success, #3a7a32) 8%, transparent);
}
.llm-model-name-cell {
font-family: var(--font-mono, monospace);
font-size: 0.75rem;
white-space: nowrap;
max-width: 16rem;
overflow: hidden;
text-overflow: ellipsis;
background: var(--color-surface, #fff);
border-top: 1px solid var(--color-border, #d0d7e8);
padding: 0.35rem 0.6rem;
position: sticky;
left: 0;
}
.llm-tps-cell {
font-family: var(--font-mono, monospace);
font-variant-numeric: tabular-nums;
white-space: nowrap;
}
/* ── Model Picker ───────────────────────────────────────── */
.model-picker {
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
overflow: hidden;
}
.picker-summary {
display: flex;
align-items: center;
gap: 0.6rem;
padding: 0.65rem 0.9rem;
cursor: pointer;
user-select: none;
list-style: none;
background: var(--color-surface-raised, #e4ebf5);
}
.picker-summary::-webkit-details-marker { display: none; }
.picker-summary::before { content: '▶ '; font-size: 0.65rem; color: var(--color-text-secondary, #6b7a99); }
details[open] .picker-summary::before { content: '▼ '; }
.picker-title {
font-size: 0.9rem;
font-weight: 600;
color: var(--color-text, #1a2338);
}
.picker-badge {
font-size: 0.75rem;
color: var(--color-text-secondary, #6b7a99);
background: var(--color-surface, #fff);
border: 1px solid var(--color-border, #d0d7e8);
padding: 0.15rem 0.5rem;
border-radius: 1rem;
font-family: var(--font-mono, monospace);
margin-left: auto;
}
.picker-body {
padding: 0.75rem;
border-top: 1px solid var(--color-border, #d0d7e8);
display: flex;
flex-direction: column;
gap: 0.75rem;
}
.picker-loading, .picker-empty {
font-size: 0.85rem;
color: var(--color-text-secondary, #6b7a99);
padding: 0.5rem 0;
}
.picker-category {
display: flex;
flex-direction: column;
gap: 0.3rem;
}
.picker-cat-header {
display: flex;
align-items: center;
gap: 0.45rem;
font-size: 0.82rem;
font-weight: 700;
color: var(--color-text, #1a2338);
text-transform: uppercase;
letter-spacing: 0.04em;
cursor: pointer;
}
.picker-cat-name { /* inherits from cat-header */ }
.picker-cat-count {
font-weight: 400;
color: var(--color-text-secondary, #6b7a99);
font-family: var(--font-mono, monospace);
font-size: 0.75rem;
text-transform: none;
letter-spacing: 0;
}
.picker-model-list {
display: flex;
flex-wrap: wrap;
gap: 0.35rem 0.75rem;
padding-left: 1.4rem;
}
.picker-model-row {
display: flex;
align-items: center;
gap: 0.35rem;
font-size: 0.82rem;
cursor: pointer;
color: var(--color-text, #1a2338);
}
.picker-model-name {
font-family: var(--font-mono, monospace);
font-size: 0.78rem;
white-space: nowrap;
max-width: 18ch;
overflow: hidden;
text-overflow: ellipsis;
}
.picker-adapter-type {
font-size: 0.68rem;
color: var(--color-text-secondary, #6b7a99);
background: var(--color-surface-raised, #e4ebf5);
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.25rem;
padding: 0.05rem 0.3rem;
font-family: var(--font-mono, monospace);
}
@media (max-width: 600px) {
.picker-model-list { padding-left: 0; }
.picker-model-name { max-width: 14ch; }
}
</style>

919
web/src/views/StyleTab.vue Normal file
View file

@ -0,0 +1,919 @@
<template>
<div class="style-tab">
<!-- Controls row -->
<div class="style-controls">
<!-- Model picker -->
<details class="model-picker" open>
<summary class="picker-summary">
<span class="picker-title"> Models</span>
<span class="picker-badge">{{ selectedCount }} selected</span>
<button class="btn-refresh" :disabled="modelsLoading" @click.stop="loadModels" title="Refresh model list">
{{ modelsLoading ? '⏳' : '🔄' }}
</button>
</summary>
<div class="picker-body">
<div v-if="modelsLoading" class="picker-loading">Loading models</div>
<div v-else-if="loadError" class="picker-error">{{ loadError }}</div>
<template v-else>
<!-- Ollama group -->
<div class="picker-group" v-if="ollamaModels.length">
<div class="group-header">
<label class="group-check">
<input
type="checkbox"
:checked="isGroupAllSelected('ollama')"
:indeterminate="isGroupIndeterminate('ollama')"
@change="toggleGroup('ollama', ($event.target as HTMLInputElement).checked)"
/>
<span class="group-label">Ollama</span>
<span class="group-count">({{ ollamaModels.length }})</span>
</label>
<span class="group-note">auto-synced with Models view</span>
</div>
<div class="model-list">
<label v-for="m in ollamaModels" :key="m.id" class="model-item">
<input type="checkbox" :value="m.id" v-model="selectedModels" />
<span class="model-name">{{ m.name }}</span>
<span v-if="m.size_mb" class="model-meta">{{ formatMb(m.size_mb) }}</span>
</label>
</div>
</div>
<!-- cf-text group -->
<div class="picker-group" v-if="cftextModels.length">
<div class="group-header">
<label class="group-check">
<input
type="checkbox"
:checked="isGroupAllSelected('cf-text')"
:indeterminate="isGroupIndeterminate('cf-text')"
@change="toggleGroup('cf-text', ($event.target as HTMLInputElement).checked)"
/>
<span class="group-label">cf-text (cf-orch)</span>
<span class="group-count">({{ cftextModels.length }})</span>
</label>
<span class="group-note">GGUFs via coordinator; enable cf-orch below</span>
</div>
<div class="model-list">
<label v-for="m in cftextModels" :key="m.id" class="model-item">
<input type="checkbox" :value="m.id" v-model="selectedModels" />
<span class="model-name">{{ m.name }}</span>
<span v-if="m.vram_mb" class="model-meta">{{ formatMb(m.vram_mb) }} VRAM</span>
</label>
</div>
</div>
<div v-if="!ollamaModels.length && !cftextModels.length" class="picker-empty">
No models available. Check Ollama and cf-orch connections.
</div>
</template>
</div>
</details>
<!-- Options panel -->
<details class="options-panel">
<summary class="picker-summary">
<span class="picker-title"> Options</span>
</summary>
<div class="options-body">
<label class="option-row">
<input type="checkbox" v-model="useCforch" :disabled="running" />
<span class="option-label">Use cf-orch backend</span>
<span class="option-hint">Routes generation through cf-text instead of ollama</span>
</label>
<label class="option-row" :class="{ dimmed: !useCforch }">
<span class="option-label">Max VRAM (MB)</span>
<input
type="number"
v-model.number="maxVram"
:disabled="running || !useCforch"
min="1024"
max="24576"
step="512"
class="option-number"
/>
<span class="option-hint">Skip models exceeding this VRAM limit</span>
</label>
<label class="option-row">
<span class="option-label">Parallel workers</span>
<input
type="number"
v-model.number="workers"
:disabled="running"
min="1"
max="16"
step="1"
class="option-number"
/>
<span class="option-hint">Models to score simultaneously (1 = sequential)</span>
</label>
<label class="option-row">
<input type="checkbox" v-model="includeLarge" :disabled="running" />
<span class="option-label">Include large models (30B+)</span>
<span class="option-hint">Off by default; these take much longer</span>
</label>
</div>
</details>
</div>
<!-- Run controls -->
<div class="run-bar">
<button class="btn-run" :disabled="running || selectedCount === 0" @click="startBenchmark">
{{ running ? '⏳ Running…' : results.length ? '🔄 Re-run' : '▶ Run Benchmark' }}
</button>
<button v-if="running" class="btn-cancel" @click="cancelBenchmark"> Cancel</button>
<span v-if="selectedCount === 0 && !running" class="run-hint">Select at least one model above</span>
</div>
<!-- Progress log -->
<div v-if="runLog.length" class="run-log">
<div class="run-log-header">
<span class="run-log-title">Run log</span>
<button class="btn-clear" @click="runLog = []">Clear</button>
</div>
<pre class="run-log-body" ref="logEl">{{ runLog.join('\n') }}</pre>
</div>
<!-- Past runs picker -->
<div class="history-bar" v-if="pastRuns.length">
<label class="history-label">📂 Past runs:</label>
<select class="history-select" v-model="selectedRun" @change="loadRun(selectedRun)">
<option value=""> select a past run </option>
<option v-for="r in pastRuns" :key="r.filename" :value="r.filename">
{{ r.date }} · {{ r.model_count }} model{{ r.model_count !== 1 ? 's' : '' }} · top {{ r.top_score }}/100
</option>
</select>
</div>
<!-- Results table -->
<div v-if="results.length" class="results-section">
<div class="results-header">
<h2 class="results-title">Rankings</h2>
<button
class="btn-corrections"
:disabled="sendingCorrections"
@click="sendToCorrections"
title="Push all outputs from this run into the Corrections review queue"
>
{{ sendingCorrections ? '⏳ Sending…' : correctionsMsg || '✍️ Send to Corrections' }}
</button>
</div>
<div class="results-table-wrap">
<table class="results-table">
<thead>
<tr>
<th>Rank</th>
<th>Model</th>
<th>Score</th>
<th>Latency</th>
<th title="Em-dash count">—</th>
<th title="Filler phrase hits">Fillers</th>
<th title="Semicolons">;</th>
</tr>
</thead>
<tbody>
<tr
v-for="(r, i) in results"
:key="r.model_id"
class="result-row"
:class="{ 'top-row': i === 0 }"
@click="toggleExpanded(r.model_id)"
>
<td class="rank-cell">{{ medal(i) }}</td>
<td class="model-cell">
<span class="model-name-text">{{ r.model_id }}</span>
</td>
<td class="score-cell">
<span class="score-pill" :style="scorePillStyle(r.avg_score)">
{{ r.avg_score.toFixed(0) }}
</span>
</td>
<td class="latency-cell">{{ formatLatency(r.avg_latency_ms) }}</td>
<td class="violation-cell" :class="{ 'has-violation': r.total_em_dashes > 0 }">
{{ r.total_em_dashes }}
</td>
<td class="violation-cell" :class="{ 'has-violation': r.total_filler_hits > 0 }">
{{ r.total_filler_hits }}
</td>
<td class="violation-cell" :class="{ 'has-violation': r.total_semicolons > 0 }">
{{ r.total_semicolons }}
</td>
</tr>
</tbody>
</table>
</div>
<!-- Expandable sample outputs -->
<div v-for="r in results" :key="'exp-' + r.model_id">
<div v-if="expandedModels.has(r.model_id)" class="sample-outputs">
<div class="sample-header">
<strong>{{ r.model_id }}</strong>
<button class="btn-collapse" @click="toggleExpanded(r.model_id)"> Close</button>
</div>
<div v-for="pr in r.prompt_results" :key="pr.tag" class="sample-prompt">
<div class="sample-tag">
<span class="tag-name">{{ pr.tag }}</span>
<span class="tag-score">{{ pr.score.toFixed(0) }}/100</span>
<span class="tag-latency">{{ formatLatency(pr.latency_ms) }}</span>
</div>
<pre class="sample-text">{{ pr.output || '(no output)' }}</pre>
</div>
</div>
</div>
</div>
</div>
</template>
<script setup lang="ts">
import { ref, computed, onMounted, nextTick, watch } from 'vue'
// Types
// One selectable model entry, as found in the `ollama` or `cf_text` list of
// the /api/style/models response.
interface StyleModel {
// Identifier stored in `selectedModels` and sent in the `models` run parameter.
id: string
// Label shown next to the checkbox in the model picker.
name: string
// Not read by this component; presumably mirrors the list the entry came
// from — verify against the API.
source: 'ollama' | 'cf-text'
// Rendered through formatMb() beside Ollama entries when truthy.
size_mb?: number | null
// Rendered through formatMb() with a "VRAM" suffix beside cf-text entries
// when truthy.
vram_mb?: number | null
// Not read by this component.
description?: string
}
// Result of one benchmark prompt for one model; rendered in the expandable
// sample-output panel.
interface PromptResult {
// Prompt label. Also used as the v-for key, so presumably unique within a
// model's results — TODO confirm.
tag: string
// Generated text; the panel shows "(no output)" when this is empty.
output: string
// Displayed as "<score>/100" with no decimals.
score: number
// Displayed through formatLatency().
latency_ms: number
// Not read by this component.
signals: Record<string, unknown>
}
// One row of the rankings table: aggregate numbers for a single model plus
// its per-prompt results.
interface ModelResult {
// Shown in the Model column; also the row key and the key used to track
// which sample panels are expanded.
model_id: string
// Shown in the score pill with no decimals; also drives the pill colour.
avg_score: number
// Shown in the Latency column through formatLatency().
avg_latency_ms: number
// Column titled "Filler phrase hits"; highlighted when greater than 0.
total_filler_hits: number
// Column titled "Em-dash count"; highlighted when greater than 0.
total_em_dashes: number
// Column titled "Semicolons"; highlighted when greater than 0.
total_semicolons: number
// Listed in the expandable sample-output panel for this model.
prompt_results: PromptResult[]
}
// Summary of one saved benchmark run, as returned by /api/style/results and
// listed in the "Past runs" picker.
interface PastRun {
// Option value and key; used as the path segment of
// /api/style/results/<filename> and sent to /api/style/send-to-corrections.
filename: string
// Shown verbatim in the option label.
date: string
// Shown in the option label as "<n> model(s)".
model_count: number
// Shown in the option label as "top <n>/100".
top_score: number
}
// State
// Model lists filled by loadModels() from the `ollama` and `cf_text` keys of
// the /api/style/models response.
const ollamaModels = ref<StyleModel[]>([])
const cftextModels = ref<StyleModel[]>([])
// Ids of the ticked models (bound to the picker checkboxes via v-model).
const selectedModels = ref<string[]>([])
// True while loadModels() is in flight; disables the refresh button.
const modelsLoading = ref(false)
// Message shown in the picker instead of the lists when loading failed.
const loadError = ref('')
// Run options, sent as the use_cforch / max_vram / workers / include_large
// query parameters of /api/style/run.
const useCforch = ref(false)
const maxVram = ref(7200)
const workers = ref(1)
const includeLarge = ref(false)
// True from startBenchmark() until the stream completes, errors or is cancelled.
const running = ref(false)
// Lines shown in the run-log panel.
const runLog = ref<string[]>([])
// Template ref to the run-log <pre>; used to keep it scrolled to the bottom.
const logEl = ref<HTMLPreElement | null>(null)
// Rows of the rankings table (from a live run or a loaded past run).
const results = ref<ModelResult[]>([])
// Entries of the "Past runs" picker, filled by loadPastRuns().
const pastRuns = ref<PastRun[]>([])
// Filename chosen in the picker ('' = placeholder option); also the run that
// sendToCorrections() submits.
const selectedRun = ref('')
// model_id values whose sample-output panel is open.
const expandedModels = ref(new Set<string>())
// True while sendToCorrections() is in flight; disables its button.
const sendingCorrections = ref(false)
// Outcome of the last send; replaces the button label when non-empty.
const correctionsMsg = ref('')
// Computed
// Number of ticked models; gates the Run button and feeds the picker badge.
const selectedCount = computed(() => selectedModels.value.length)
// True when every model of a picker group is ticked.
// `source` === 'ollama' selects the Ollama list; any other value selects the
// cf-text list. An empty group is never reported as fully selected.
function isGroupAllSelected(source: string): boolean {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  if (members.length === 0) return false
  for (const model of members) {
    if (!selectedModels.value.includes(model.id)) return false
  }
  return true
}
// True when some, but not all, models of a picker group are ticked; drives
// the group checkbox's indeterminate state.
// `source` === 'ollama' selects the Ollama list; any other value selects the
// cf-text list.
function isGroupIndeterminate(source: string): boolean {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  let ticked = 0
  for (const model of members) {
    if (selectedModels.value.includes(model.id)) ticked += 1
  }
  return ticked > 0 && ticked < members.length
}
// Actions
// Fetch the selectable models from /api/style/models and refresh both picker
// groups. A failure is reported through `loadError` rather than thrown.
async function loadModels() {
  modelsLoading.value = true
  loadError.value = ''
  try {
    const resp = await fetch('/api/style/models')
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    const data = await resp.json()
    ollamaModels.value = data.ollama ?? []
    cftextModels.value = data.cf_text ?? []
    // Drop selections for models that are no longer listed. Otherwise they
    // stay invisible in the picker yet are still counted in the badge and
    // sent to the next run.
    const available = new Set(
      [...ollamaModels.value, ...cftextModels.value].map(m => m.id),
    )
    if (selectedModels.value.some(id => !available.has(id))) {
      selectedModels.value = selectedModels.value.filter(id => available.has(id))
    }
  } catch (e: unknown) {
    loadError.value = `Failed to load models: ${e instanceof Error ? e.message : String(e)}`
  } finally {
    modelsLoading.value = false
  }
}
// Refresh the "Past runs" picker from /api/style/results.
// Best-effort: a non-OK response or any thrown error leaves the current
// list untouched.
async function loadPastRuns() {
  try {
    const response = await fetch('/api/style/results')
    if (!response.ok) return
    pastRuns.value = await response.json()
  } catch {
    /* non-fatal */
  }
}
// Load one saved run into the rankings table.
// `filename` is a PastRun.filename; the empty string (the picker's
// placeholder option) is ignored. Failures are appended to the run log
// rather than thrown.
async function loadRun(filename: string) {
  if (!filename) return
  try {
    // Encode the name so characters such as '#', '?' or spaces cannot alter
    // the request path.
    const resp = await fetch(`/api/style/results/${encodeURIComponent(filename)}`)
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    results.value = await resp.json()
    expandedModels.value.clear()
    // The Corrections button shows the outcome of the last send as its
    // label; clear it so a message about another run is not shown for this one.
    correctionsMsg.value = ''
  } catch (e: unknown) {
    runLog.value.push(`[error] Failed to load ${filename}: ${e instanceof Error ? e.message : String(e)}`)
  }
}
// Tick or untick every model of one picker group at once.
// `source` === 'ollama' selects the Ollama list; any other value selects the
// cf-text list. Selections from the other group are left alone, and existing
// order is preserved when adding.
function toggleGroup(source: string, checked: boolean) {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  const groupIds = members.map(model => model.id)
  if (!checked) {
    selectedModels.value = selectedModels.value.filter(id => !groupIds.includes(id))
    return
  }
  // A Set removes ids that were already selected.
  selectedModels.value = Array.from(new Set(selectedModels.value.concat(groupIds)))
}
// Open or close the sample-output panel of one model.
function toggleExpanded(modelId: string) {
  const openPanels = expandedModels.value
  const wasOpen = openPanels.has(modelId)
  if (wasOpen) openPanels.delete(modelId)
  if (!wasOpen) openPanels.add(modelId)
  // Publish a fresh Set instance, as the original code does.
  expandedModels.value = new Set(openPanels)
}
// Start a benchmark run for the ticked models and stream its progress.
//
// Opens an EventSource on /api/style/run and handles four message types:
//   progress -> append `message` to the run log and keep it scrolled down
//   result   -> replace the rankings table and refresh the past-runs list
//   complete -> mark the run finished and close the stream
//   error    -> log the message, mark the run finished and close the stream
// Does nothing while a run is active or when no model is ticked.
function startBenchmark() {
  if (running.value || selectedCount.value === 0) return
  running.value = true
  runLog.value = []
  results.value = []
  // The Corrections button label reports the last send; it belongs to the
  // previous results, so reset it together with them.
  correctionsMsg.value = ''
  expandedModels.value.clear()
  const params = new URLSearchParams({
    models: selectedModels.value.join(','),
    use_cforch: String(useCforch.value),
    max_vram: String(maxVram.value),
    workers: String(workers.value),
    include_large: String(includeLarge.value),
  })
  const es = new EventSource(`/api/style/run?${params}`)
  es.onmessage = async (ev) => {
    try {
      const msg = JSON.parse(ev.data)
      if (msg.type === 'progress') {
        runLog.value.push(msg.message)
        // Wait for the DOM update so scrollHeight includes the new line.
        await nextTick()
        if (logEl.value) logEl.value.scrollTop = logEl.value.scrollHeight
      } else if (msg.type === 'result') {
        results.value = msg.results ?? []
        await loadPastRuns()
        // Point the history picker at the run now on screen. Without this,
        // "Send to Corrections" would submit whichever older run was still
        // selected. Assumes the list is newest-first, as onMounted does when
        // it treats pastRuns[0] as the latest run — TODO confirm.
        if (pastRuns.value.length) selectedRun.value = pastRuns.value[0].filename
      } else if (msg.type === 'complete') {
        running.value = false
        es.close()
      } else if (msg.type === 'error') {
        runLog.value.push(`[error] ${msg.message}`)
        running.value = false
        es.close()
      }
    } catch { /* ignore parse errors */ }
  }
  es.onerror = () => {
    // Only report a lost connection for a run that was still in progress.
    if (running.value) {
      runLog.value.push('[error] Connection lost')
      running.value = false
    }
    // Close explicitly; EventSource would otherwise try to reconnect.
    es.close()
  }
}
// Ask the server to stop the current run via POST /api/style/cancel.
//
// The outcome is written to the run log: "[cancelled]" on success, an
// "[error] ..." line when the request fails or returns a non-OK status.
// The UI leaves the running state in either case, as before, so the controls
// are never stuck disabled. Never rejects.
async function cancelBenchmark() {
  try {
    const resp = await fetch('/api/style/cancel', { method: 'POST' })
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    runLog.value.push('[cancelled]')
  } catch (e: unknown) {
    runLog.value.push(`[error] Cancel request failed: ${e instanceof Error ? e.message : String(e)}`)
  } finally {
    running.value = false
  }
}
// Push the outputs of the selected saved run into the Corrections queue via
// POST /api/style/send-to-corrections.
//
// The result is reported through `correctionsMsg`, which the button shows as
// its label. Re-entrant clicks while a request is in flight are ignored.
async function sendToCorrections() {
  if (sendingCorrections.value) return
  if (!selectedRun.value) {
    // The endpoint is keyed by a saved run's filename; tell the user why
    // nothing happened instead of silently ignoring the click.
    correctionsMsg.value = 'Select a past run first'
    return
  }
  sendingCorrections.value = true
  correctionsMsg.value = ''
  try {
    const resp = await fetch('/api/style/send-to-corrections', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ filename: selectedRun.value, model_ids: [] }),
    })
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    const data = await resp.json()
    // Fall back to 0 so a response without `imported` does not render as
    // "undefined added to Corrections".
    correctionsMsg.value = `${data?.imported ?? 0} added to Corrections`
  } catch (e: unknown) {
    correctionsMsg.value = `Error: ${e instanceof Error ? e.message : String(e)}`
  } finally {
    sendingCorrections.value = false
  }
}
// Formatting helpers
// Format a size given in megabytes: whole "N MB" below 1024, otherwise
// "N.N GB". The GB threshold is tested on the rounded value so fractional
// input never prints raw decimals or "1024 MB".
function formatMb(mb: number): string {
  const whole = Math.round(mb)
  return whole >= 1024 ? `${(mb / 1024).toFixed(1)} GB` : `${whole} MB`
}
// Format a duration given in milliseconds: whole "Nms" below one second,
// otherwise "N.Ns". The threshold is tested on the rounded value so that
// e.g. 999.6 renders as "1.0s" rather than "1000ms".
function formatLatency(ms: number): string {
  const whole = Math.round(ms)
  return whole >= 1000 ? `${(ms / 1000).toFixed(1)}s` : `${whole}ms`
}
// Rank label for a zero-based table position: a medal for the first three
// rows, "#<position>" (one-based) for the rest.
function medal(index: number): string {
  const podium = ['🥇', '🥈', '🥉']
  const icon = podium[index]
  if (icon !== undefined && icon !== null) return icon
  return `#${index + 1}`
}
// Inline colours for a score pill: the hue runs from red (score 0) to green
// (score 100), with a light background and a dark text colour of that hue.
// The score is clamped to 0..100 first, and NaN is treated as 0. Without
// that, out-of-range scores would wrap around the colour wheel and NaN
// would produce an invalid CSS colour.
function scorePillStyle(score: number): Record<string, string> {
  const bounded = Number.isNaN(score) ? 0 : Math.min(100, Math.max(0, score))
  const hue = Math.round((bounded / 100) * 120) // 0=red, 120=green
  return {
    background: `hsl(${hue} 60% 88%)`,
    color: `hsl(${hue} 60% 28%)`,
  }
}
// Watchers
// Switch the cf-orch option on as soon as any cf-text model is ticked.
// It is never switched off again from here.
watch(selectedModels, (chosenIds) => {
  for (const model of cftextModels.value) {
    if (chosenIds.includes(model.id)) {
      useCforch.value = true
      return
    }
  }
})
// Lifecycle
// On mount, load the model lists and the past-runs list in parallel, then
// show the first listed run (treated as the latest) if there is one.
onMounted(async () => {
  await Promise.all([loadModels(), loadPastRuns()])
  if (!pastRuns.value.length) return
  const latest = pastRuns.value[0].filename
  selectedRun.value = latest
  await loadRun(latest)
})
</script>
<style scoped>
.style-tab {
display: flex;
flex-direction: column;
gap: 1rem;
padding: 1rem 0;
}
/* ── Controls ─────────────────────────────────────────────────────────────── */
.style-controls {
display: flex;
flex-wrap: wrap;
gap: 0.75rem;
align-items: flex-start;
}
.model-picker,
.options-panel {
flex: 1;
min-width: 280px;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
background: var(--color-surface, #f4f7fc);
overflow: hidden;
}
.picker-summary {
display: flex;
align-items: center;
gap: 0.5rem;
padding: 0.65rem 0.85rem;
cursor: pointer;
user-select: none;
font-size: 0.9rem;
font-weight: 600;
list-style: none;
}
.picker-summary::-webkit-details-marker { display: none; }
.picker-title { flex: 1; color: var(--color-text, #1a2338); }
.picker-badge {
background: var(--app-primary, #2A6080);
color: #fff;
border-radius: 9999px;
padding: 0.1rem 0.5rem;
font-size: 0.72rem;
font-weight: 700;
}
.btn-refresh {
border: none;
background: transparent;
cursor: pointer;
font-size: 0.85rem;
padding: 0.1rem 0.25rem;
border-radius: 0.25rem;
color: var(--color-text-secondary, #6b7a99);
}
.btn-refresh:hover { background: var(--color-border, #d0d7e8); }
.btn-refresh:disabled { opacity: 0.5; cursor: not-allowed; }
.picker-body,
.options-body {
padding: 0.75rem;
border-top: 1px solid var(--color-border, #d0d7e8);
}
.picker-loading, .picker-empty {
color: var(--color-text-secondary, #6b7a99);
font-size: 0.85rem;
padding: 0.25rem 0;
}
.picker-error {
color: #b91c1c;
font-size: 0.85rem;
}
/* ── Model groups ──────────────────────────────────────────────────────────── */
.picker-group {
margin-bottom: 0.75rem;
}
.picker-group:last-child { margin-bottom: 0; }
.group-header {
display: flex;
align-items: center;
gap: 0.5rem;
margin-bottom: 0.4rem;
}
.group-check {
display: flex;
align-items: center;
gap: 0.35rem;
font-size: 0.85rem;
font-weight: 600;
cursor: pointer;
color: var(--color-text, #1a2338);
}
.group-count {
color: var(--color-text-secondary, #6b7a99);
font-weight: 400;
font-size: 0.8rem;
}
.group-note {
margin-left: auto;
font-size: 0.72rem;
color: var(--color-text-secondary, #6b7a99);
font-style: italic;
}
.model-list {
display: flex;
flex-direction: column;
gap: 0.2rem;
padding-left: 1.25rem;
max-height: 220px;
overflow-y: auto;
}
.model-item {
display: flex;
align-items: center;
gap: 0.4rem;
font-size: 0.82rem;
cursor: pointer;
padding: 0.15rem 0;
}
.model-name { flex: 1; font-family: var(--font-mono, monospace); }
.model-meta {
font-size: 0.72rem;
color: var(--color-text-secondary, #6b7a99);
}
/* ── Options ──────────────────────────────────────────────────────────────── */
.option-row {
display: flex;
align-items: flex-start;
gap: 0.5rem;
padding: 0.35rem 0;
cursor: pointer;
font-size: 0.85rem;
}
.option-label { font-weight: 500; white-space: nowrap; }
.option-hint {
flex: 1;
font-size: 0.75rem;
color: var(--color-text-secondary, #6b7a99);
margin-left: auto;
text-align: right;
}
.option-number {
width: 90px;
padding: 0.2rem 0.4rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.25rem;
font-size: 0.85rem;
background: var(--color-bg, #fff);
color: var(--color-text, #1a2338);
}
.option-row.dimmed { opacity: 0.45; pointer-events: none; }
/* ── Run bar ──────────────────────────────────────────────────────────────── */
.run-bar {
display: flex;
align-items: center;
gap: 0.65rem;
}
.btn-run {
padding: 0.5rem 1.25rem;
border: none;
border-radius: 0.375rem;
background: var(--app-primary, #2A6080);
color: #fff;
font-size: 0.9rem;
font-weight: 600;
cursor: pointer;
transition: background 0.15s;
}
.btn-run:hover:not(:disabled) { background: color-mix(in srgb, var(--app-primary, #2A6080) 80%, #000); }
.btn-run:disabled { opacity: 0.5; cursor: not-allowed; }
.btn-cancel {
padding: 0.5rem 0.9rem;
border: 1px solid #f85149;
border-radius: 0.375rem;
background: transparent;
color: #b91c1c;
font-size: 0.85rem;
cursor: pointer;
transition: background 0.15s;
}
.btn-cancel:hover { background: #fee2e2; }
.run-hint {
font-size: 0.8rem;
color: var(--color-text-secondary, #6b7a99);
}
/* ── Run log ──────────────────────────────────────────────────────────────── */
.run-log {
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
overflow: hidden;
}
.run-log-header {
display: flex;
align-items: center;
justify-content: space-between;
padding: 0.4rem 0.75rem;
background: var(--color-surface, #f4f7fc);
border-bottom: 1px solid var(--color-border, #d0d7e8);
font-size: 0.8rem;
font-weight: 600;
color: var(--color-text-secondary, #6b7a99);
}
.run-log-title { text-transform: uppercase; letter-spacing: 0.05em; }
.btn-clear {
border: none;
background: transparent;
font-size: 0.75rem;
color: var(--color-text-secondary, #6b7a99);
cursor: pointer;
padding: 0.1rem 0.3rem;
border-radius: 0.25rem;
}
.btn-clear:hover { background: var(--color-border, #d0d7e8); }
.run-log-body {
margin: 0;
padding: 0.65rem 0.85rem;
font-size: 0.78rem;
font-family: var(--font-mono, monospace);
white-space: pre-wrap;
word-break: break-all;
max-height: 260px;
overflow-y: auto;
background: var(--color-bg, #fff);
color: var(--color-text, #1a2338);
}
/* ── History bar ──────────────────────────────────────────────────────────── */
.history-bar {
display: flex;
align-items: center;
gap: 0.6rem;
font-size: 0.85rem;
}
.history-label { font-weight: 500; white-space: nowrap; }
.history-select {
flex: 1;
padding: 0.3rem 0.5rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.375rem;
background: var(--color-surface, #f4f7fc);
color: var(--color-text, #1a2338);
font-size: 0.85rem;
}
/* ── Results table ────────────────────────────────────────────────────────── */
.results-section { display: flex; flex-direction: column; gap: 0.75rem; }
.results-header {
display: flex;
align-items: center;
justify-content: space-between;
gap: 0.75rem;
}
.results-title {
font-size: 1rem;
font-weight: 700;
color: var(--color-text, #1a2338);
margin: 0;
}
.btn-corrections {
padding: 0.4rem 0.9rem;
border: 1px solid var(--app-primary, #2A6080);
border-radius: 0.375rem;
background: transparent;
color: var(--app-primary, #2A6080);
font-size: 0.83rem;
font-weight: 600;
cursor: pointer;
white-space: nowrap;
transition: background 0.15s, color 0.15s;
}
.btn-corrections:hover:not(:disabled) {
background: var(--app-primary, #2A6080);
color: #fff;
}
.btn-corrections:disabled { opacity: 0.55; cursor: not-allowed; }
.results-table-wrap {
overflow-x: auto;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
}
.results-table {
width: 100%;
border-collapse: collapse;
font-size: 0.85rem;
}
.results-table th {
padding: 0.5rem 0.75rem;
text-align: left;
background: var(--color-surface, #f4f7fc);
border-bottom: 1px solid var(--color-border, #d0d7e8);
font-size: 0.78rem;
font-weight: 700;
text-transform: uppercase;
letter-spacing: 0.04em;
color: var(--color-text-secondary, #6b7a99);
white-space: nowrap;
}
.result-row {
cursor: pointer;
transition: background 0.1s;
}
.result-row:hover { background: color-mix(in srgb, var(--app-primary, #2A6080) 6%, transparent); }
.result-row.top-row { font-weight: 600; }
.result-row td {
padding: 0.5rem 0.75rem;
border-bottom: 1px solid var(--color-border, #d0d7e8);
}
.result-row:last-child td { border-bottom: none; }
.rank-cell { width: 2.5rem; text-align: center; font-size: 1.1rem; }
.model-cell { font-family: var(--font-mono, monospace); word-break: break-all; }
.score-cell { width: 5rem; text-align: center; }
.latency-cell { width: 5rem; text-align: right; color: var(--color-text-secondary, #6b7a99); }
.violation-cell { width: 4rem; text-align: center; color: var(--color-text-secondary, #6b7a99); }
.violation-cell.has-violation { color: #b91c1c; font-weight: 700; }
.score-pill {
display: inline-block;
padding: 0.15rem 0.55rem;
border-radius: 9999px;
font-weight: 700;
font-size: 0.82rem;
}
/* ── Sample outputs ───────────────────────────────────────────────────────── */
.sample-outputs {
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
overflow: hidden;
}
.sample-header {
display: flex;
align-items: center;
justify-content: space-between;
padding: 0.5rem 0.85rem;
background: var(--color-surface, #f4f7fc);
border-bottom: 1px solid var(--color-border, #d0d7e8);
font-size: 0.85rem;
}
.btn-collapse {
border: none;
background: transparent;
font-size: 0.78rem;
color: var(--color-text-secondary, #6b7a99);
cursor: pointer;
}
.sample-prompt {
padding: 0.65rem 0.85rem;
border-bottom: 1px solid var(--color-border, #d0d7e8);
}
.sample-prompt:last-child { border-bottom: none; }
.sample-tag {
display: flex;
align-items: center;
gap: 0.5rem;
margin-bottom: 0.35rem;
font-size: 0.8rem;
}
.tag-name { font-weight: 600; color: var(--color-text, #1a2338); }
.tag-score { color: var(--app-primary, #2A6080); font-weight: 700; }
.tag-latency { color: var(--color-text-secondary, #6b7a99); margin-left: auto; }
.sample-text {
margin: 0;
font-size: 0.82rem;
white-space: pre-wrap;
word-break: break-word;
max-height: 200px;
overflow-y: auto;
background: var(--color-bg, #fff);
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.35rem;
padding: 0.5rem 0.65rem;
color: var(--color-text, #1a2338);
font-family: inherit;
}
@media (max-width: 640px) {
.style-controls { flex-direction: column; }
.model-picker, .options-panel { min-width: 0; }
.option-hint { display: none; }
.group-note { display: none; }
}
</style>

919
web/src/views/VoiceTab.vue Normal file
View file

@ -0,0 +1,919 @@
<template>
<div class="voice-tab">
<!-- Controls row -->
<div class="voice-controls">
<!-- Model picker -->
<details class="model-picker" open>
<summary class="picker-summary">
<span class="picker-title">🎙 Models</span>
<span class="picker-badge">{{ selectedCount }} selected</span>
<button class="btn-refresh" :disabled="modelsLoading" @click.stop="loadModels" title="Refresh model list">
{{ modelsLoading ? '⏳' : '🔄' }}
</button>
</summary>
<div class="picker-body">
<div v-if="modelsLoading" class="picker-loading">Loading models</div>
<div v-else-if="loadError" class="picker-error">{{ loadError }}</div>
<template v-else>
<!-- Ollama group -->
<div class="picker-group" v-if="ollamaModels.length">
<div class="group-header">
<label class="group-check">
<input
type="checkbox"
:checked="isGroupAllSelected('ollama')"
:indeterminate="isGroupIndeterminate('ollama')"
@change="toggleGroup('ollama', ($event.target as HTMLInputElement).checked)"
/>
<span class="group-label">Ollama</span>
<span class="group-count">({{ ollamaModels.length }})</span>
</label>
<span class="group-note">auto-synced with Models view</span>
</div>
<div class="model-list">
<label v-for="m in ollamaModels" :key="m.id" class="model-item">
<input type="checkbox" :value="m.id" v-model="selectedModels" />
<span class="model-name">{{ m.name }}</span>
<span v-if="m.size_mb" class="model-meta">{{ formatMb(m.size_mb) }}</span>
</label>
</div>
</div>
<!-- cf-text group -->
<div class="picker-group" v-if="cftextModels.length">
<div class="group-header">
<label class="group-check">
<input
type="checkbox"
:checked="isGroupAllSelected('cf-text')"
:indeterminate="isGroupIndeterminate('cf-text')"
@change="toggleGroup('cf-text', ($event.target as HTMLInputElement).checked)"
/>
<span class="group-label">cf-text (cf-orch)</span>
<span class="group-count">({{ cftextModels.length }})</span>
</label>
<span class="group-note">GGUFs via coordinator; enable cf-orch below</span>
</div>
<div class="model-list">
<label v-for="m in cftextModels" :key="m.id" class="model-item">
<input type="checkbox" :value="m.id" v-model="selectedModels" />
<span class="model-name">{{ m.name }}</span>
<span v-if="m.vram_mb" class="model-meta">{{ formatMb(m.vram_mb) }} VRAM</span>
</label>
</div>
</div>
<div v-if="!ollamaModels.length && !cftextModels.length" class="picker-empty">
No models available. Check Ollama and cf-orch connections.
</div>
</template>
</div>
</details>
<!-- Options panel -->
<details class="options-panel">
<summary class="picker-summary">
<span class="picker-title"> Options</span>
</summary>
<div class="options-body">
<label class="option-row">
<input type="checkbox" v-model="useCforch" :disabled="running" />
<span class="option-label">Use cf-orch backend</span>
<span class="option-hint">Routes generation through cf-text instead of ollama</span>
</label>
<label class="option-row" :class="{ dimmed: !useCforch }">
<span class="option-label">Max VRAM (MB)</span>
<input
type="number"
v-model.number="maxVram"
:disabled="running || !useCforch"
min="1024"
max="24576"
step="512"
class="option-number"
/>
<span class="option-hint">Skip models exceeding this VRAM limit</span>
</label>
<label class="option-row">
<span class="option-label">Parallel workers</span>
<input
type="number"
v-model.number="workers"
:disabled="running"
min="1"
max="16"
step="1"
class="option-number"
/>
<span class="option-hint">Models to score simultaneously (1 = sequential)</span>
</label>
<label class="option-row">
<input type="checkbox" v-model="includeLarge" :disabled="running" />
<span class="option-label">Include large models (30B+)</span>
<span class="option-hint">Off by default; these take much longer</span>
</label>
</div>
</details>
</div>
<!-- Run controls -->
<div class="run-bar">
<button class="btn-run" :disabled="running || selectedCount === 0" @click="startBenchmark">
{{ running ? '⏳ Running…' : results.length ? '🔄 Re-run' : '▶ Run Benchmark' }}
</button>
<button v-if="running" class="btn-cancel" @click="cancelBenchmark"> Cancel</button>
<span v-if="selectedCount === 0 && !running" class="run-hint">Select at least one model above</span>
</div>
<!-- Progress log -->
<div v-if="runLog.length" class="run-log">
<div class="run-log-header">
<span class="run-log-title">Run log</span>
<button class="btn-clear" @click="runLog = []">Clear</button>
</div>
<pre class="run-log-body" ref="logEl">{{ runLog.join('\n') }}</pre>
</div>
<!-- Past runs picker -->
<div class="history-bar" v-if="pastRuns.length">
<label class="history-label">📂 Past runs:</label>
<select class="history-select" v-model="selectedRun" @change="loadRun(selectedRun)">
<option value=""> select a past run </option>
<option v-for="r in pastRuns" :key="r.filename" :value="r.filename">
{{ r.date }} · {{ r.model_count }} model{{ r.model_count !== 1 ? 's' : '' }} · top {{ r.top_score }}/100
</option>
</select>
</div>
<!-- Results table -->
<div v-if="results.length" class="results-section">
<div class="results-header">
<h2 class="results-title">Rankings</h2>
<button
class="btn-corrections"
:disabled="sendingCorrections"
@click="sendToCorrections"
title="Push all outputs from this run into the Corrections review queue"
>
{{ sendingCorrections ? '⏳ Sending…' : correctionsMsg || '✍️ Send to Corrections' }}
</button>
</div>
<div class="results-table-wrap">
<table class="results-table">
<thead>
<tr>
<th>Rank</th>
<th>Model</th>
<th>Score</th>
<th>Latency</th>
<th title="Em-dash count">—</th>
<th title="Filler phrase hits">Fillers</th>
<th title="Semicolons">;</th>
</tr>
</thead>
<tbody>
<tr
v-for="(r, i) in results"
:key="r.model_id"
class="result-row"
:class="{ 'top-row': i === 0 }"
@click="toggleExpanded(r.model_id)"
>
<td class="rank-cell">{{ medal(i) }}</td>
<td class="model-cell">
<span class="model-name-text">{{ r.model_id }}</span>
</td>
<td class="score-cell">
<span class="score-pill" :style="scorePillStyle(r.avg_score)">
{{ r.avg_score.toFixed(0) }}
</span>
</td>
<td class="latency-cell">{{ formatLatency(r.avg_latency_ms) }}</td>
<td class="violation-cell" :class="{ 'has-violation': r.total_em_dashes > 0 }">
{{ r.total_em_dashes }}
</td>
<td class="violation-cell" :class="{ 'has-violation': r.total_filler_hits > 0 }">
{{ r.total_filler_hits }}
</td>
<td class="violation-cell" :class="{ 'has-violation': r.total_semicolons > 0 }">
{{ r.total_semicolons }}
</td>
</tr>
</tbody>
</table>
</div>
<!-- Expandable sample outputs -->
<div v-for="r in results" :key="'exp-' + r.model_id">
<div v-if="expandedModels.has(r.model_id)" class="sample-outputs">
<div class="sample-header">
<strong>{{ r.model_id }}</strong>
<button class="btn-collapse" @click="toggleExpanded(r.model_id)"> Close</button>
</div>
<div v-for="pr in r.prompt_results" :key="pr.tag" class="sample-prompt">
<div class="sample-tag">
<span class="tag-name">{{ pr.tag }}</span>
<span class="tag-score">{{ pr.score.toFixed(0) }}/100</span>
<span class="tag-latency">{{ formatLatency(pr.latency_ms) }}</span>
</div>
<pre class="sample-text">{{ pr.output || '(no output)' }}</pre>
</div>
</div>
</div>
</div>
</div>
</template>
<script setup lang="ts">
import { ref, computed, onMounted, nextTick, watch } from 'vue'
// Types
// One selectable model entry, as found in the `ollama` or `cf_text` list of
// the /api/voice/models response.
interface VoiceModel {
// Identifier stored in `selectedModels` and sent in the `models` run parameter.
id: string
// Label shown next to the checkbox in the model picker.
name: string
// Not read by this component; presumably mirrors the list the entry came
// from — verify against the API.
source: 'ollama' | 'cf-text'
// Rendered through formatMb() beside Ollama entries when truthy.
size_mb?: number | null
// Rendered through formatMb() with a "VRAM" suffix beside cf-text entries
// when truthy.
vram_mb?: number | null
// Not read by this component.
description?: string
}
// Result of one benchmark prompt for one model; rendered in the expandable
// sample-output panel.
interface PromptResult {
// Prompt label. Also used as the v-for key, so presumably unique within a
// model's results — TODO confirm.
tag: string
// Generated text; the panel shows "(no output)" when this is empty.
output: string
// Displayed as "<score>/100" with no decimals.
score: number
// Displayed through formatLatency().
latency_ms: number
// Not read by this component.
signals: Record<string, unknown>
}
// One row of the rankings table: aggregate numbers for a single model plus
// its per-prompt results.
interface ModelResult {
// Shown in the Model column; also the row key and the key used to track
// which sample panels are expanded.
model_id: string
// Shown in the score pill with no decimals; also drives the pill colour.
avg_score: number
// Shown in the Latency column through formatLatency().
avg_latency_ms: number
// Column titled "Filler phrase hits"; highlighted when greater than 0.
total_filler_hits: number
// Column titled "Em-dash count"; highlighted when greater than 0.
total_em_dashes: number
// Column titled "Semicolons"; highlighted when greater than 0.
total_semicolons: number
// Listed in the expandable sample-output panel for this model.
prompt_results: PromptResult[]
}
// Summary of one saved benchmark run, as returned by /api/voice/results and
// listed in the "Past runs" picker.
interface PastRun {
// Option value and key; used as the path segment of
// /api/voice/results/<filename> and sent to /api/voice/send-to-corrections.
filename: string
// Shown verbatim in the option label.
date: string
// Shown in the option label as "<n> model(s)".
model_count: number
// Shown in the option label as "top <n>/100".
top_score: number
}
// State
// Model lists filled by loadModels() from the `ollama` and `cf_text` keys of
// the /api/voice/models response.
const ollamaModels = ref<VoiceModel[]>([])
const cftextModels = ref<VoiceModel[]>([])
// Ids of the ticked models (bound to the picker checkboxes via v-model).
const selectedModels = ref<string[]>([])
// True while loadModels() is in flight; disables the refresh button.
const modelsLoading = ref(false)
// Message shown in the picker instead of the lists when loading failed.
const loadError = ref('')
// Run options, sent as the use_cforch / max_vram / workers / include_large
// query parameters of /api/voice/run.
const useCforch = ref(false)
const maxVram = ref(7200)
const workers = ref(1)
const includeLarge = ref(false)
// True from startBenchmark() until the stream completes, errors or is cancelled.
const running = ref(false)
// Lines shown in the run-log panel.
const runLog = ref<string[]>([])
// Template ref to the run-log <pre>; used to keep it scrolled to the bottom.
const logEl = ref<HTMLPreElement | null>(null)
// Rows of the rankings table (from a live run or a loaded past run).
const results = ref<ModelResult[]>([])
// Entries of the "Past runs" picker, filled by loadPastRuns().
const pastRuns = ref<PastRun[]>([])
// Filename chosen in the picker ('' = placeholder option); also the run that
// sendToCorrections() submits.
const selectedRun = ref('')
// model_id values whose sample-output panel is open.
const expandedModels = ref(new Set<string>())
// True while sendToCorrections() is in flight; disables its button.
const sendingCorrections = ref(false)
// Outcome of the last send; replaces the button label when non-empty.
const correctionsMsg = ref('')
// Computed
// Number of ticked models; gates the Run button and feeds the picker badge.
const selectedCount = computed(() => selectedModels.value.length)
// True when every model of a picker group is ticked.
// `source` === 'ollama' selects the Ollama list; any other value selects the
// cf-text list. An empty group is never reported as fully selected.
function isGroupAllSelected(source: string): boolean {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  if (members.length === 0) return false
  for (const model of members) {
    if (!selectedModels.value.includes(model.id)) return false
  }
  return true
}
// True when some, but not all, models of a picker group are ticked; drives
// the group checkbox's indeterminate state.
// `source` === 'ollama' selects the Ollama list; any other value selects the
// cf-text list.
function isGroupIndeterminate(source: string): boolean {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  let ticked = 0
  for (const model of members) {
    if (selectedModels.value.includes(model.id)) ticked += 1
  }
  return ticked > 0 && ticked < members.length
}
// Actions
// Fetch the selectable models from /api/voice/models and refresh both picker
// groups. A failure is reported through `loadError` rather than thrown.
async function loadModels() {
  modelsLoading.value = true
  loadError.value = ''
  try {
    const resp = await fetch('/api/voice/models')
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    const data = await resp.json()
    ollamaModels.value = data.ollama ?? []
    cftextModels.value = data.cf_text ?? []
    // Drop selections for models that are no longer listed. Otherwise they
    // stay invisible in the picker yet are still counted in the badge and
    // sent to the next run.
    const available = new Set(
      [...ollamaModels.value, ...cftextModels.value].map(m => m.id),
    )
    if (selectedModels.value.some(id => !available.has(id))) {
      selectedModels.value = selectedModels.value.filter(id => available.has(id))
    }
  } catch (e: unknown) {
    loadError.value = `Failed to load models: ${e instanceof Error ? e.message : String(e)}`
  } finally {
    modelsLoading.value = false
  }
}
// Refresh the "Past runs" picker from /api/voice/results.
// Best-effort: a non-OK response or any thrown error leaves the current
// list untouched.
async function loadPastRuns() {
  try {
    const response = await fetch('/api/voice/results')
    if (!response.ok) return
    pastRuns.value = await response.json()
  } catch {
    /* non-fatal */
  }
}
// Load one saved run into the rankings table.
// `filename` is a PastRun.filename; the empty string (the picker's
// placeholder option) is ignored. Failures are appended to the run log
// rather than thrown.
async function loadRun(filename: string) {
  if (!filename) return
  try {
    // Encode the name so characters such as '#', '?' or spaces cannot alter
    // the request path.
    const resp = await fetch(`/api/voice/results/${encodeURIComponent(filename)}`)
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`)
    results.value = await resp.json()
    expandedModels.value.clear()
    // The Corrections button shows the outcome of the last send as its
    // label; clear it so a message about another run is not shown for this one.
    correctionsMsg.value = ''
  } catch (e: unknown) {
    runLog.value.push(`[error] Failed to load ${filename}: ${e instanceof Error ? e.message : String(e)}`)
  }
}
// Tick or untick every model of one picker group at once.
// `source` === 'ollama' selects the Ollama list; any other value selects the
// cf-text list. Selections from the other group are left alone, and existing
// order is preserved when adding.
function toggleGroup(source: string, checked: boolean) {
  const members = source === 'ollama' ? ollamaModels.value : cftextModels.value
  const groupIds = members.map(model => model.id)
  if (!checked) {
    selectedModels.value = selectedModels.value.filter(id => !groupIds.includes(id))
    return
  }
  // A Set removes ids that were already selected.
  selectedModels.value = Array.from(new Set(selectedModels.value.concat(groupIds)))
}
// Open or close the sample-output panel of one model.
function toggleExpanded(modelId: string) {
  const openPanels = expandedModels.value
  const wasOpen = openPanels.has(modelId)
  if (wasOpen) openPanels.delete(modelId)
  if (!wasOpen) openPanels.add(modelId)
  // Publish a fresh Set instance, as the original code does.
  expandedModels.value = new Set(openPanels)
}
// Start a benchmark run for the ticked models and stream its progress.
//
// Opens an EventSource on /api/voice/run and handles four message types:
//   progress -> append `message` to the run log and keep it scrolled down
//   result   -> replace the rankings table and refresh the past-runs list
//   complete -> mark the run finished and close the stream
//   error    -> log the message, mark the run finished and close the stream
// Does nothing while a run is active or when no model is ticked.
function startBenchmark() {
  if (running.value || selectedCount.value === 0) return
  running.value = true
  runLog.value = []
  results.value = []
  // The Corrections button label reports the last send; it belongs to the
  // previous results, so reset it together with them.
  correctionsMsg.value = ''
  expandedModels.value.clear()
  const params = new URLSearchParams({
    models: selectedModels.value.join(','),
    use_cforch: String(useCforch.value),
    max_vram: String(maxVram.value),
    workers: String(workers.value),
    include_large: String(includeLarge.value),
  })
  const es = new EventSource(`/api/voice/run?${params}`)
  es.onmessage = async (ev) => {
    try {
      const msg = JSON.parse(ev.data)
      if (msg.type === 'progress') {
        runLog.value.push(msg.message)
        // Wait for the DOM update so scrollHeight includes the new line.
        await nextTick()
        if (logEl.value) logEl.value.scrollTop = logEl.value.scrollHeight
      } else if (msg.type === 'result') {
        results.value = msg.results ?? []
        await loadPastRuns()
        // Point the history picker at the run now on screen. Without this,
        // "Send to Corrections" would submit whichever older run was still
        // selected. Assumes /api/voice/results lists the newest run first
        // — TODO confirm.
        if (pastRuns.value.length) selectedRun.value = pastRuns.value[0].filename
      } else if (msg.type === 'complete') {
        running.value = false
        es.close()
      } else if (msg.type === 'error') {
        runLog.value.push(`[error] ${msg.message}`)
        running.value = false
        es.close()
      }
    } catch { /* ignore parse errors */ }
  }
  es.onerror = () => {
    // Only report a lost connection for a run that was still in progress.
    if (running.value) {
      runLog.value.push('[error] Connection lost')
      running.value = false
    }
    // Close explicitly; EventSource would otherwise try to reconnect.
    es.close()
  }
}
async function cancelBenchmark() {
try {
await fetch('/api/voice/cancel', { method: 'POST' })
} finally {
running.value = false
runLog.value.push('[cancelled]')
}
}
/**
 * Send the currently selected run to the Corrections queue.
 *
 * No-op when no run is selected or a send is already in flight. Posts the run
 * filename with an empty `model_ids` list and reports the outcome (imported
 * count or error text) through `correctionsMsg`.
 */
async function sendToCorrections() {
  if (sendingCorrections.value || !selectedRun.value) return
  correctionsMsg.value = ''
  sendingCorrections.value = true
  try {
    const payload = JSON.stringify({ filename: selectedRun.value, model_ids: [] })
    const response = await fetch('/api/voice/send-to-corrections', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: payload,
    })
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}`)
    }
    const summary = await response.json()
    correctionsMsg.value = `${summary.imported} added to Corrections`
  } catch (e: unknown) {
    const reason = e instanceof Error ? e.message : String(e)
    correctionsMsg.value = `Error: ${reason}`
  } finally {
    sendingCorrections.value = false
  }
}
// Formatting helpers
/**
 * Render a size given in megabytes.
 *
 * @param mb Size in MB.
 * @returns `"<x.y> GB"` (one decimal) from 1024 MB upwards, otherwise `"<mb> MB"`.
 */
function formatMb(mb: number): string {
  if (mb >= 1024) {
    const gb = mb / 1024
    return `${gb.toFixed(1)} GB`
  }
  return `${mb} MB`
}
/**
 * Render a latency given in milliseconds.
 *
 * @param ms Latency in milliseconds.
 * @returns `"<x.y>s"` (one decimal) once the rounded value reaches 1000 ms,
 *          otherwise `"<n>ms"` rounded to the nearest millisecond.
 */
function formatLatency(ms: number): string {
  // Decide on the rounded value so 999.5–999.99 shows "1.0s" instead of "1000ms".
  const wholeMs = Math.round(ms)
  if (wholeMs >= 1000) {
    return `${(ms / 1000).toFixed(1)}s`
  }
  return `${wholeMs}ms`
}
/**
 * Rank badge for a zero-based leaderboard position.
 *
 * @param index Zero-based position in the ranking.
 * @returns A medal emoji for the first three positions, otherwise `"#<index + 1>"`.
 */
function medal(index: number): string {
  const podium = ['🥇', '🥈', '🥉']
  const icon = podium[index]
  if (icon === undefined) {
    return `#${index + 1}`
  }
  return icon
}
/**
 * Inline style for a score pill, shading from red (0) to green (100).
 *
 * @param score Score on a 0–100 scale. Values outside that range are clamped
 *              and non-finite values are treated as 0, so the result is
 *              always a valid colour on the red→green ramp.
 * @returns `background` and `color` declarations sharing one hue.
 */
function scorePillStyle(score: number): Record<string, string> {
  const clamped = Number.isFinite(score) ? Math.min(100, Math.max(0, score)) : 0
  const hue = Math.round((clamped / 100) * 120) // 0=red, 120=green
  return {
    background: `hsl(${hue} 60% 88%)`,
    color: `hsl(${hue} 60% 28%)`,
  }
}
// Lifecycle
// Switch cf-orch on whenever the selection contains at least one cf-text model.
// One-way only: removing those models does not switch it back off.
watch(selectedModels, (ids) => {
  const cftextIds = new Set(cftextModels.value.map(model => model.id))
  if (ids.some(id => cftextIds.has(id))) {
    useCforch.value = true
  }
})
// On mount: fetch the model lists and the run history in parallel, then show
// the first entry of the history (if there is one) as the initial results.
onMounted(async () => {
  await Promise.all([loadModels(), loadPastRuns()])
  if (!pastRuns.value.length) return
  const latestFilename = pastRuns.value[0].filename
  selectedRun.value = latestFilename
  await loadRun(latestFilename)
})
</script>
<style scoped>
/* Root of the tab: sections stacked vertically with a 1rem gap between them. */
.voice-tab {
display: flex;
flex-direction: column;
gap: 1rem;
padding: 1rem 0;
}
/* ── Controls ─────────────────────────────────────────────────────────────── */
/* Row holding the model picker and options panel; wraps when space runs out. */
.voice-controls {
display: flex;
flex-wrap: wrap;
gap: 0.75rem;
align-items: flex-start;
}
/* Both panels share the row equally and are drawn as bordered cards. */
.model-picker,
.options-panel {
flex: 1;
min-width: 280px;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
background: var(--color-surface, #f4f7fc);
overflow: hidden;
}
/* Clickable panel header. list-style: none plus the -webkit rule below hide
   the default disclosure marker (presumably this sits on a <summary>). */
.picker-summary {
display: flex;
align-items: center;
gap: 0.5rem;
padding: 0.65rem 0.85rem;
cursor: pointer;
user-select: none;
font-size: 0.9rem;
font-weight: 600;
list-style: none;
}
.picker-summary::-webkit-details-marker { display: none; }
/* Title takes the free space so the badge/refresh button sit at the right. */
.picker-title { flex: 1; color: var(--color-text, #1a2338); }
/* Pill-shaped badge in the primary colour. */
.picker-badge {
background: var(--app-primary, #2A6080);
color: #fff;
border-radius: 9999px;
padding: 0.1rem 0.5rem;
font-size: 0.72rem;
font-weight: 700;
}
/* Borderless icon-style button; dimmed and non-interactive looking when disabled. */
.btn-refresh {
border: none;
background: transparent;
cursor: pointer;
font-size: 0.85rem;
padding: 0.1rem 0.25rem;
border-radius: 0.25rem;
color: var(--color-text-secondary, #6b7a99);
}
.btn-refresh:hover { background: var(--color-border, #d0d7e8); }
.btn-refresh:disabled { opacity: 0.5; cursor: not-allowed; }
/* Panel bodies, separated from the header by a top border. */
.picker-body,
.options-body {
padding: 0.75rem;
border-top: 1px solid var(--color-border, #d0d7e8);
}
/* Secondary-coloured status text (loading / empty states). */
.picker-loading, .picker-empty {
color: var(--color-text-secondary, #6b7a99);
font-size: 0.85rem;
padding: 0.25rem 0;
}
/* Error text uses a fixed red rather than a theme variable. */
.picker-error {
color: #b91c1c;
font-size: 0.85rem;
}
/* ── Model groups ──────────────────────────────────────────────────────────── */
/* One group of models; groups are spaced apart except after the last one. */
.picker-group {
margin-bottom: 0.75rem;
}
.picker-group:last-child { margin-bottom: 0; }
/* Header row of a group: checkbox label, count and optional note. */
.group-header {
display: flex;
align-items: center;
gap: 0.5rem;
margin-bottom: 0.4rem;
}
.group-check {
display: flex;
align-items: center;
gap: 0.35rem;
font-size: 0.85rem;
font-weight: 600;
cursor: pointer;
color: var(--color-text, #1a2338);
}
.group-count {
color: var(--color-text-secondary, #6b7a99);
font-weight: 400;
font-size: 0.8rem;
}
/* margin-left: auto pushes the note to the right edge of the header row. */
.group-note {
margin-left: auto;
font-size: 0.72rem;
color: var(--color-text-secondary, #6b7a99);
font-style: italic;
}
/* Indented, vertically stacked model entries; scrolls beyond 220px. */
.model-list {
display: flex;
flex-direction: column;
gap: 0.2rem;
padding-left: 1.25rem;
max-height: 220px;
overflow-y: auto;
}
.model-item {
display: flex;
align-items: center;
gap: 0.4rem;
font-size: 0.82rem;
cursor: pointer;
padding: 0.15rem 0;
}
/* Name is monospaced and absorbs free width so the meta text aligns right. */
.model-name { flex: 1; font-family: var(--font-mono, monospace); }
.model-meta {
font-size: 0.72rem;
color: var(--color-text-secondary, #6b7a99);
}
/* ── Options ──────────────────────────────────────────────────────────────── */
/* One option per row: control/label on the left, hint text on the right. */
.option-row {
display: flex;
align-items: flex-start;
gap: 0.5rem;
padding: 0.35rem 0;
cursor: pointer;
font-size: 0.85rem;
}
.option-label { font-weight: 500; white-space: nowrap; }
/* Hint fills the remaining width and is right-aligned. */
.option-hint {
flex: 1;
font-size: 0.75rem;
color: var(--color-text-secondary, #6b7a99);
margin-left: auto;
text-align: right;
}
/* Fixed-width numeric input. */
.option-number {
width: 90px;
padding: 0.2rem 0.4rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.25rem;
font-size: 0.85rem;
background: var(--color-bg, #fff);
color: var(--color-text, #1a2338);
}
/* Dimmed rows are faded and ignore pointer input. */
.option-row.dimmed { opacity: 0.45; pointer-events: none; }
/* ── Run bar ──────────────────────────────────────────────────────────────── */
/* Horizontal strip with the run/cancel buttons and a hint. */
.run-bar {
display: flex;
align-items: center;
gap: 0.65rem;
}
/* Primary action button, filled with the primary colour. */
.btn-run {
padding: 0.5rem 1.25rem;
border: none;
border-radius: 0.375rem;
background: var(--app-primary, #2A6080);
color: #fff;
font-size: 0.9rem;
font-weight: 600;
cursor: pointer;
transition: background 0.15s;
}
/* Hover darkens the primary colour by mixing in 20% black; skipped when disabled. */
.btn-run:hover:not(:disabled) { background: color-mix(in srgb, var(--app-primary, #2A6080) 80%, #000); }
.btn-run:disabled { opacity: 0.5; cursor: not-allowed; }
/* Outlined red button; fills with a light red on hover. */
.btn-cancel {
padding: 0.5rem 0.9rem;
border: 1px solid #f85149;
border-radius: 0.375rem;
background: transparent;
color: #b91c1c;
font-size: 0.85rem;
cursor: pointer;
transition: background 0.15s;
}
.btn-cancel:hover { background: #fee2e2; }
.run-hint {
font-size: 0.8rem;
color: var(--color-text-secondary, #6b7a99);
}
/* ── Run log ──────────────────────────────────────────────────────────────── */
/* Bordered card; overflow: hidden clips children to the rounded corners. */
.run-log {
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
overflow: hidden;
}
/* Header strip: title on the left, clear button on the right. */
.run-log-header {
display: flex;
align-items: center;
justify-content: space-between;
padding: 0.4rem 0.75rem;
background: var(--color-surface, #f4f7fc);
border-bottom: 1px solid var(--color-border, #d0d7e8);
font-size: 0.8rem;
font-weight: 600;
color: var(--color-text-secondary, #6b7a99);
}
.run-log-title { text-transform: uppercase; letter-spacing: 0.05em; }
.btn-clear {
border: none;
background: transparent;
font-size: 0.75rem;
color: var(--color-text-secondary, #6b7a99);
cursor: pointer;
padding: 0.1rem 0.3rem;
border-radius: 0.25rem;
}
.btn-clear:hover { background: var(--color-border, #d0d7e8); }
/* Monospaced log text: keeps line breaks, may break anywhere inside long
   tokens, and scrolls vertically beyond 260px. */
.run-log-body {
margin: 0;
padding: 0.65rem 0.85rem;
font-size: 0.78rem;
font-family: var(--font-mono, monospace);
white-space: pre-wrap;
word-break: break-all;
max-height: 260px;
overflow-y: auto;
background: var(--color-bg, #fff);
color: var(--color-text, #1a2338);
}
/* ── History bar ──────────────────────────────────────────────────────────── */
/* Single row: label followed by a select that takes the remaining width. */
.history-bar {
display: flex;
align-items: center;
gap: 0.6rem;
font-size: 0.85rem;
}
.history-label { font-weight: 500; white-space: nowrap; }
.history-select {
flex: 1;
padding: 0.3rem 0.5rem;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.375rem;
background: var(--color-surface, #f4f7fc);
color: var(--color-text, #1a2338);
font-size: 0.85rem;
}
/* ── Results table ────────────────────────────────────────────────────────── */
.results-section { display: flex; flex-direction: column; gap: 0.75rem; }
/* Title on the left, corrections button on the right. */
.results-header {
display: flex;
align-items: center;
justify-content: space-between;
gap: 0.75rem;
}
.results-title {
font-size: 1rem;
font-weight: 700;
color: var(--color-text, #1a2338);
margin: 0;
}
/* Outlined button in the primary colour; inverts to filled on hover. */
.btn-corrections {
padding: 0.4rem 0.9rem;
border: 1px solid var(--app-primary, #2A6080);
border-radius: 0.375rem;
background: transparent;
color: var(--app-primary, #2A6080);
font-size: 0.83rem;
font-weight: 600;
cursor: pointer;
white-space: nowrap;
transition: background 0.15s, color 0.15s;
}
.btn-corrections:hover:not(:disabled) {
background: var(--app-primary, #2A6080);
color: #fff;
}
.btn-corrections:disabled { opacity: 0.55; cursor: not-allowed; }
/* Wrapper lets a wide table scroll horizontally inside a bordered card. */
.results-table-wrap {
overflow-x: auto;
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
}
.results-table {
width: 100%;
border-collapse: collapse;
font-size: 0.85rem;
}
/* Column headers: small uppercase labels that never wrap. */
.results-table th {
padding: 0.5rem 0.75rem;
text-align: left;
background: var(--color-surface, #f4f7fc);
border-bottom: 1px solid var(--color-border, #d0d7e8);
font-size: 0.78rem;
font-weight: 700;
text-transform: uppercase;
letter-spacing: 0.04em;
color: var(--color-text-secondary, #6b7a99);
white-space: nowrap;
}
/* Rows show a pointer cursor and a faint primary tint on hover. */
.result-row {
cursor: pointer;
transition: background 0.1s;
}
.result-row:hover { background: color-mix(in srgb, var(--app-primary, #2A6080) 6%, transparent); }
.result-row.top-row { font-weight: 600; }
.result-row td {
padding: 0.5rem 0.75rem;
border-bottom: 1px solid var(--color-border, #d0d7e8);
}
/* No divider under the last row (the wrapper already draws the outer border). */
.result-row:last-child td { border-bottom: none; }
/* Per-column widths and alignment. */
.rank-cell { width: 2.5rem; text-align: center; font-size: 1.1rem; }
.model-cell { font-family: var(--font-mono, monospace); word-break: break-all; }
.score-cell { width: 5rem; text-align: center; }
.latency-cell { width: 5rem; text-align: right; color: var(--color-text-secondary, #6b7a99); }
.violation-cell { width: 4rem; text-align: center; color: var(--color-text-secondary, #6b7a99); }
/* Violation cells flagged with has-violation are emphasised in red. */
.violation-cell.has-violation { color: #b91c1c; font-weight: 700; }
/* Pill shape only; no colours are set in this rule. */
.score-pill {
display: inline-block;
padding: 0.15rem 0.55rem;
border-radius: 9999px;
font-weight: 700;
font-size: 0.82rem;
}
/* ── Sample outputs ───────────────────────────────────────────────────────── */
/* Bordered card containing one entry per sample prompt. */
.sample-outputs {
border: 1px solid var(--color-border, #d0d7e8);
border-radius: 0.5rem;
overflow: hidden;
}
/* Header strip: content on the left, collapse button on the right. */
.sample-header {
display: flex;
align-items: center;
justify-content: space-between;
padding: 0.5rem 0.85rem;
background: var(--color-surface, #f4f7fc);
border-bottom: 1px solid var(--color-border, #d0d7e8);
font-size: 0.85rem;
}
.btn-collapse {
border: none;
background: transparent;
font-size: 0.78rem;
color: var(--color-text-secondary, #6b7a99);
cursor: pointer;
}
/* Entries are separated by a divider, omitted after the last one. */
.sample-prompt {
padding: 0.65rem 0.85rem;
border-bottom: 1px solid var(--color-border, #d0d7e8);
}
.sample-prompt:last-child { border-bottom: none; }
/* Tag line above each sample: name, score, then latency pushed to the right. */
.sample-tag {
display: flex;
align-items: center;
gap: 0.5rem;
margin-bottom: 0.35rem;
font-size: 0.8rem;
}
.tag-name { font-weight: 600; color: var(--color-text, #1a2338); }
.tag-score { color: var(--app-primary, #2A6080); font-weight: 700; }
.tag-latency { color: var(--color-text-secondary, #6b7a99); margin-left: auto; }
/* Sample text box: keeps the text's own line breaks, scrolls vertically
   beyond 200px, and uses the inherited (non-monospace) font. */
.sample-text {
  margin: 0;
  font-size: 0.82rem;
  white-space: pre-wrap;
  /* Replaces the deprecated `word-break: break-word`; long unbroken tokens
     still wrap instead of overflowing the box. */
  overflow-wrap: anywhere;
  max-height: 200px;
  overflow-y: auto;
  background: var(--color-bg, #fff);
  border: 1px solid var(--color-border, #d0d7e8);
  border-radius: 0.35rem;
  padding: 0.5rem 0.65rem;
  color: var(--color-text, #1a2338);
  font-family: inherit;
}
/* Narrow viewports (≤ 640px): stack the two control panels, drop their
   minimum width, and hide the secondary hint/note text. */
@media (max-width: 640px) {
.voice-controls { flex-direction: column; }
.model-picker, .options-panel { min-width: 0; }
.option-hint { display: none; }
.group-note { display: none; }
}
</style>