- BenchmarkView.vue: convert from monolithic view to tabbed shell; each tab is now its own component (ClassifierTab, CompareTab, LlmEvalTab, StyleTab, VoiceTab) - StyleTab + VoiceTab: new benchmark modes for style and voice model evaluation - app/style.py: FastAPI router for style imitation benchmarks - app/voice.py: FastAPI router for voice benchmark endpoints - scripts/benchmark_style.py + benchmark_voice.py: headless runner scripts
909 lines
36 KiB
Python
909 lines
36 KiB
Python
#!/usr/bin/env python
|
|
"""
|
|
Voice benchmark harness -- score local text-gen models for writing style match.
|
|
|
|
Runs each model against a set of test prompts, extracts style signals from the
|
|
outputs, compares them to a voice corpus, and produces a ranked markdown table.
|
|
|
|
Usage:
|
|
# List available ollama models
|
|
conda run -n cf python scripts/benchmark_voice.py --list-models
|
|
|
|
# Run against all models with default test prompts
|
|
conda run -n cf python scripts/benchmark_voice.py --run
|
|
|
|
# Run specific models only
|
|
conda run -n cf python scripts/benchmark_voice.py --run --models mistral:7b,llama3.1:8b
|
|
|
|
# Use a custom corpus directory
|
|
conda run -n cf python scripts/benchmark_voice.py --run --samples data/voice_corpus/
|
|
|
|
# Print last results table
|
|
conda run -n cf python scripts/benchmark_voice.py --show-last
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
import httpx
|
|
|
|
_ROOT = Path(__file__).parent.parent
|
|
_CORPUS_DIR = _ROOT / "data" / "voice_corpus"
|
|
_RESULTS_DIR = _ROOT / "benchmark_results"
|
|
_OLLAMA_URL = "http://localhost:11434"
|
|
_CFORCH_URL = "http://localhost:7700"
|
|
|
|
# Subdirectories under --scan-disk root that may contain GGUFs
|
|
_SCAN_SUBDIRS = ["textgen/models", "llama.cpp/models", "cf-text/models", "vllm/models"]
|
|
|
|
# ── Filler phrases that should be absent from good voice-match output ─────────
|
|
FILLER_PHRASES: list[str] = [
|
|
"delve", "certainly", "absolutely", "i apologize", "i'd be happy to",
|
|
"of course", "great question", "i understand", "let me know if",
|
|
"feel free to", "it's important to note", "it's worth noting",
|
|
"in conclusion", "to summarize", "in summary",
|
|
]
|
|
|
|
# ── Test prompts: (thread_title, thread_body, context_tag) ───────────────────
|
|
# These are representative threads that Magpie might reply to.
|
|
# Extend this list with real examples as the corpus grows.
|
|
TEST_PROMPTS: list[dict[str, str]] = [
|
|
{
|
|
"tag": "selfhosted_ai_fatigue",
|
|
"thread_title": "Anyone else getting tired of re-explaining their setup every time an AI model forgets?",
|
|
"thread_body": (
|
|
"Every session I start over. My whole hardware setup, what tools I use, "
|
|
"what I've already tried. It's exhausting. There has to be a better way."
|
|
),
|
|
},
|
|
{
|
|
"tag": "privacy_local_llm",
|
|
"thread_title": "What's the point of running local LLMs if the apps still phone home?",
|
|
"thread_body": (
|
|
"I went through all the trouble of setting up ollama and now I find out "
|
|
"the frontend I'm using is sending telemetry. Kind of defeats the purpose."
|
|
),
|
|
},
|
|
{
|
|
"tag": "solarpunk_tech",
|
|
"thread_title": "What does solarpunk computing actually look like in practice?",
|
|
"thread_body": (
|
|
"I keep seeing the aesthetic but not a lot of concrete examples of "
|
|
"people living it out with their tech choices. What does it mean day to day?"
|
|
),
|
|
},
|
|
{
|
|
"tag": "nd_tools",
|
|
"thread_title": "Tools that actually help with executive function vs ones that just add friction",
|
|
"thread_body": (
|
|
"I've tried a dozen productivity apps and most of them require more "
|
|
"executive function to maintain than they save. What actually sticks for you?"
|
|
),
|
|
},
|
|
{
|
|
"tag": "data_ownership",
|
|
"thread_title": "Who actually owns your data when you use a 'free' AI tool?",
|
|
"thread_body": (
|
|
"Read the ToS on three different AI assistants today. In all three cases "
|
|
"your inputs can be used for training, shared with partners, and retained "
|
|
"indefinitely. At what point does 'free' just mean you're the product?"
|
|
),
|
|
},
|
|
{
|
|
"tag": "digital_culture",
|
|
"thread_title": "The internet used to feel like it belonged to everyone. What happened?",
|
|
"thread_body": (
|
|
"I grew up on forums, IRC, personal homepages. Now everything is a platform "
|
|
"owned by someone trying to extract value from the community that built it. "
|
|
"Is the fediverse / self-hosting movement actually reversing this or just "
|
|
"a niche hobby?"
|
|
),
|
|
},
|
|
]
|
|
|
|
GENERATION_PARAMS: dict[str, Any] = {
|
|
"temperature": 0.7,
|
|
"top_p": 0.9,
|
|
"num_predict": 300,
|
|
}
|
|
|
|
SYSTEM_PROMPT = (
|
|
"You are a writing assistant. Your job is to write a Reddit reply that matches "
|
|
"the voice, tone, and style of the provided samples exactly.\n\n"
|
|
"Voice characteristics:\n"
|
|
"- Casual engineer tone. Short punchy sentences.\n"
|
|
"- No hype, no buzzwords, no em dashes, no semicolons.\n"
|
|
"- Community-first perspective. Solarpunk values.\n"
|
|
"- Direct and opinionated. No throat-clearing or filler.\n"
|
|
"- When relevant, mention personal experience with real tools.\n\n"
|
|
"Write ONLY the reply. No preamble, no 'Here is a reply:', no meta-commentary."
|
|
)
|
|
|
|
|
|
# ── Style signal extraction ───────────────────────────────────────────────────
|
|
|
|
@dataclass
|
|
class StyleSignals:
|
|
"""Quantitative style signals extracted from a text sample."""
|
|
sentence_count: int = 0
|
|
word_count: int = 0
|
|
avg_sentence_length: float = 0.0
|
|
em_dash_count: int = 0
|
|
semicolon_count: int = 0
|
|
filler_hits: list[str] = field(default_factory=list)
|
|
question_ratio: float = 0.0 # fraction of sentences ending in '?'
|
|
first_person_ratio: float = 0.0 # fraction of sentences starting with 'I'
|
|
avg_word_length: float = 0.0
|
|
|
|
|
|
def extract_signals(text: str) -> StyleSignals:
|
|
"""Extract style signals from a text sample."""
|
|
text = text.strip()
|
|
if text.startswith("[ERROR:"):
|
|
return StyleSignals() # zero-score sentinel — caller checks for empty output
|
|
sentences = [s.strip() for s in re.split(r'(?<=[.!?])\s+', text) if s.strip()]
|
|
words = text.split()
|
|
|
|
if not sentences:
|
|
return StyleSignals()
|
|
|
|
avg_sentence_length = len(words) / len(sentences) if sentences else 0.0
|
|
avg_word_length = (sum(len(w.strip('.,!?;:"\'')) for w in words) / len(words)) if words else 0.0
|
|
|
|
em_dash_count = text.count('\u2014') + text.count(' -- ') + text.count('--')
|
|
semicolon_count = text.count(';')
|
|
|
|
filler_hits = [p for p in FILLER_PHRASES if p.lower() in text.lower()]
|
|
|
|
question_ratio = sum(1 for s in sentences if s.endswith('?')) / len(sentences)
|
|
first_person_ratio = sum(1 for s in sentences if re.match(r"^I\b", s)) / len(sentences)
|
|
|
|
return StyleSignals(
|
|
sentence_count=len(sentences),
|
|
word_count=len(words),
|
|
avg_sentence_length=avg_sentence_length,
|
|
em_dash_count=em_dash_count,
|
|
semicolon_count=semicolon_count,
|
|
filler_hits=filler_hits,
|
|
question_ratio=question_ratio,
|
|
first_person_ratio=first_person_ratio,
|
|
avg_word_length=avg_word_length,
|
|
)
|
|
|
|
|
|
def build_corpus_profile(corpus_dir: Path) -> StyleSignals | None:
|
|
"""Aggregate style signals across all corpus samples into a target profile."""
|
|
samples = list(corpus_dir.glob("*.txt"))
|
|
if not samples:
|
|
return None
|
|
|
|
all_signals = [extract_signals(p.read_text(encoding="utf-8")) for p in samples]
|
|
n = len(all_signals)
|
|
|
|
return StyleSignals(
|
|
sentence_count=int(sum(s.sentence_count for s in all_signals) / n),
|
|
word_count=int(sum(s.word_count for s in all_signals) / n),
|
|
avg_sentence_length=sum(s.avg_sentence_length for s in all_signals) / n,
|
|
em_dash_count=int(sum(s.em_dash_count for s in all_signals) / n),
|
|
semicolon_count=int(sum(s.semicolon_count for s in all_signals) / n),
|
|
question_ratio=sum(s.question_ratio for s in all_signals) / n,
|
|
first_person_ratio=sum(s.first_person_ratio for s in all_signals) / n,
|
|
avg_word_length=sum(s.avg_word_length for s in all_signals) / n,
|
|
)
|
|
|
|
|
|
def score_against_profile(output_signals: StyleSignals, profile: StyleSignals | None) -> float:
|
|
"""Score a model output against the corpus profile. Returns 0-100.
|
|
|
|
Penalties:
|
|
- Em dashes / semicolons: -5 each occurrence (hard CF style violation)
|
|
- Filler phrases: -8 each hit (strong signal of non-voice output)
|
|
- Sentence length delta: proportional penalty (target: close to corpus avg)
|
|
- Word length delta: smaller penalty
|
|
|
|
When no corpus profile is available, falls back to absolute signal scores only.
|
|
"""
|
|
score = 100.0
|
|
|
|
# Hard violations -- always penalised regardless of corpus
|
|
score -= output_signals.em_dash_count * 5
|
|
score -= output_signals.semicolon_count * 3
|
|
score -= len(output_signals.filler_hits) * 8
|
|
|
|
if profile is not None:
|
|
# Sentence length delta: penalise proportionally
|
|
length_delta = abs(output_signals.avg_sentence_length - profile.avg_sentence_length)
|
|
score -= min(length_delta * 2, 20)
|
|
|
|
# Question ratio delta
|
|
question_delta = abs(output_signals.question_ratio - profile.question_ratio)
|
|
score -= min(question_delta * 10, 10)
|
|
|
|
return max(0.0, score)
|
|
|
|
|
|
# ── Ollama generation ─────────────────────────────────────────────────────────
|
|
|
|
_CFORCH_NODE_ID = "heimdall"
|
|
|
|
|
|
def cforch_list_catalog(
|
|
cforch_url: str = _CFORCH_URL,
|
|
node_id: str = _CFORCH_NODE_ID,
|
|
) -> dict[str, int]:
|
|
"""Return the cf-text catalog from cf-orch as {model_id: vram_mb}.
|
|
|
|
Uses ?node_id= to request the catalog from a specific node's profile,
|
|
avoiding cross-node catalog shadowing when multiple nodes define catalogs
|
|
for the same service.
|
|
"""
|
|
try:
|
|
resp = httpx.get(
|
|
f"{cforch_url}/api/services/cf-text/catalog",
|
|
params={"node_id": node_id} if node_id else {},
|
|
timeout=10.0,
|
|
)
|
|
resp.raise_for_status()
|
|
raw = resp.json()
|
|
return {
|
|
model_id: (entry.get("vram_mb", 0) if isinstance(entry, dict) else 0)
|
|
for model_id, entry in raw.items()
|
|
}
|
|
except Exception as exc:
|
|
print(f"[warn] Could not reach cf-orch catalog at {cforch_url}: {exc}", file=sys.stderr)
|
|
return {}
|
|
|
|
|
|
def _cforch_allocate_service(
|
|
service: str,
|
|
model_id: str,
|
|
cforch_url: str,
|
|
startup_timeout_s: float,
|
|
health_path: str,
|
|
) -> tuple[str, str] | None:
|
|
"""Generic cf-orch allocate + health-poll. Returns (service_url, allocation_id) or None."""
|
|
try:
|
|
resp = httpx.post(
|
|
f"{cforch_url}/api/services/{service}/allocate",
|
|
json={
|
|
"model_candidates": [model_id],
|
|
"caller": "avocet",
|
|
"pipeline": "voice_benchmark",
|
|
},
|
|
timeout=120.0,
|
|
)
|
|
resp.raise_for_status()
|
|
data = resp.json()
|
|
service_url: str = data["url"]
|
|
allocation_id: str = data.get("allocation_id", "")
|
|
|
|
if data.get("started", False) and not data.get("warm", True):
|
|
label = service
|
|
print(f" [cold start] waiting for {label} to load {model_id!r}...", end=" ", flush=True)
|
|
deadline = time.monotonic() + startup_timeout_s
|
|
while time.monotonic() < deadline:
|
|
try:
|
|
health = httpx.get(f"{service_url}{health_path}", timeout=3.0)
|
|
if health.is_success:
|
|
print(f"ready ({time.monotonic() - (deadline - startup_timeout_s):.0f}s)", flush=True)
|
|
break
|
|
except Exception:
|
|
pass
|
|
time.sleep(2.0)
|
|
else:
|
|
print(f"timed out after {startup_timeout_s:.0f}s", flush=True)
|
|
return None
|
|
|
|
return service_url, allocation_id
|
|
except Exception as exc:
|
|
print(f"[warn] cf-orch allocation failed for {model_id!r} ({service}): {exc}", file=sys.stderr)
|
|
return None
|
|
|
|
|
|
def cforch_allocate(
|
|
model_id: str,
|
|
cforch_url: str = _CFORCH_URL,
|
|
startup_timeout_s: float = 180.0,
|
|
) -> tuple[str, str] | None:
|
|
"""Allocate a cf-text instance for model_id. Returns (service_url, allocation_id) or None."""
|
|
return _cforch_allocate_service("cf-text", model_id, cforch_url, startup_timeout_s, "/health")
|
|
|
|
|
|
def cforch_allocate_vllm(
|
|
model_id: str,
|
|
cforch_url: str = _CFORCH_URL,
|
|
startup_timeout_s: float = 300.0,
|
|
) -> tuple[str, str] | None:
|
|
"""Allocate a vllm instance for model_id. Returns (service_url, allocation_id) or None.
|
|
|
|
vllm exposes an OpenAI-compatible API — generate_cftext() works unchanged
|
|
against the returned service_url. Startup timeout is longer (300s) because
|
|
vllm loads large model weights from disk before becoming ready.
|
|
"""
|
|
return _cforch_allocate_service("vllm", model_id, cforch_url, startup_timeout_s, "/health")
|
|
|
|
|
|
def cforch_release(allocation_id: str, cforch_url: str = _CFORCH_URL) -> None:
|
|
"""Release a cf-orch allocation."""
|
|
if not allocation_id:
|
|
return
|
|
try:
|
|
httpx.post(f"{cforch_url}/api/leases/{allocation_id}/release", timeout=10.0)
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
def generate_cftext(
|
|
service_url: str,
|
|
model_id: str,
|
|
prompt: str,
|
|
system: str = "",
|
|
) -> tuple[str, float]:
|
|
"""Call cf-text via OpenAI-compatible /v1/chat/completions. Returns (text, elapsed_ms)."""
|
|
messages: list[dict[str, str]] = []
|
|
if system:
|
|
messages.append({"role": "system", "content": system})
|
|
messages.append({"role": "user", "content": prompt})
|
|
|
|
payload: dict[str, Any] = {
|
|
"model": model_id,
|
|
"messages": messages,
|
|
"max_tokens": GENERATION_PARAMS.get("num_predict", 300),
|
|
"temperature": GENERATION_PARAMS.get("temperature", 0.7),
|
|
"top_p": GENERATION_PARAMS.get("top_p", 0.9),
|
|
"stream": False,
|
|
}
|
|
|
|
t0 = time.monotonic()
|
|
try:
|
|
resp = httpx.post(
|
|
f"{service_url.rstrip('/')}/v1/chat/completions",
|
|
json=payload,
|
|
timeout=180.0,
|
|
)
|
|
resp.raise_for_status()
|
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
|
content = resp.json()["choices"][0]["message"]["content"]
|
|
return content.strip(), elapsed_ms
|
|
except Exception as exc:
|
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
|
return f"[ERROR: {exc}]", elapsed_ms
|
|
|
|
|
|
def generate(model_id: str, prompt: str, system: str = "") -> tuple[str, float]:
|
|
"""Call ollama /api/generate. Returns (text, elapsed_ms)."""
|
|
payload: dict[str, Any] = {
|
|
"model": model_id,
|
|
"prompt": prompt,
|
|
"stream": False,
|
|
"options": GENERATION_PARAMS,
|
|
}
|
|
if system:
|
|
payload["system"] = system
|
|
|
|
t0 = time.monotonic()
|
|
try:
|
|
resp = httpx.post(
|
|
f"{_OLLAMA_URL}/api/generate",
|
|
json=payload,
|
|
timeout=120.0,
|
|
)
|
|
resp.raise_for_status()
|
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
|
return resp.json().get("response", "").strip(), elapsed_ms
|
|
except Exception as exc:
|
|
elapsed_ms = (time.monotonic() - t0) * 1000
|
|
return f"[ERROR: {exc}]", elapsed_ms
|
|
|
|
|
|
def find_disk_ggufs(llm_root: Path) -> list[Path]:
|
|
"""Recursively find .gguf files under known subdirs of llm_root.
|
|
|
|
Skips vocab-only GGUFs (ggml-vocab-*) which aren't standalone models.
|
|
"""
|
|
found: list[Path] = []
|
|
search_dirs = [llm_root / sub for sub in _SCAN_SUBDIRS] + [llm_root]
|
|
seen: set[Path] = set()
|
|
for base in search_dirs:
|
|
if not base.exists():
|
|
continue
|
|
for gguf in base.rglob("*.gguf"):
|
|
if gguf in seen:
|
|
continue
|
|
seen.add(gguf)
|
|
if gguf.name.startswith("ggml-vocab-"):
|
|
continue
|
|
found.append(gguf)
|
|
return sorted(found)
|
|
|
|
|
|
def gguf_to_ollama_tag(gguf_path: Path) -> str:
|
|
"""Derive a stable ollama tag from a GGUF path.
|
|
|
|
Uses parent dir name + stem to avoid collisions, e.g.:
|
|
claude-3.7-sonnet-reasoning-gemma3-12B/foo.Q8_0.gguf
|
|
→ bench-claude-3.7-sonnet-reasoning-gemma3-12b-foo-q8-0
|
|
"""
|
|
parent = gguf_path.parent.name.lower()
|
|
stem = gguf_path.stem.lower()
|
|
# If stem is contained in parent (common pattern), just use parent
|
|
slug = parent if stem.replace("-", "").replace("_", "") in parent.replace("-", "").replace("_", "") else f"{parent}-{stem}"
|
|
slug = re.sub(r"[^a-z0-9]+", "-", slug).strip("-")
|
|
return f"bench-{slug}:latest"
|
|
|
|
|
|
def register_gguf(gguf_path: Path, tag: str) -> bool:
|
|
"""Create a temporary ollama model entry from a GGUF file. Returns True on success."""
|
|
import subprocess
|
|
import tempfile
|
|
modelfile = f"FROM {gguf_path.resolve()}\n"
|
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".Modelfile", delete=False) as f:
|
|
f.write(modelfile)
|
|
modelfile_path = f.name
|
|
try:
|
|
result = subprocess.run(
|
|
["ollama", "create", tag, "-f", modelfile_path],
|
|
capture_output=True, text=True, timeout=60,
|
|
)
|
|
return result.returncode == 0
|
|
except Exception as exc:
|
|
print(f"[warn] Could not register {gguf_path.name}: {exc}", file=sys.stderr)
|
|
return False
|
|
finally:
|
|
Path(modelfile_path).unlink(missing_ok=True)
|
|
|
|
|
|
def deregister_gguf(tag: str) -> None:
|
|
"""Remove a temporary ollama model entry."""
|
|
import subprocess
|
|
try:
|
|
subprocess.run(["ollama", "rm", tag], capture_output=True, timeout=30)
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
def backfill_disk_models(
|
|
llm_root: Path,
|
|
existing_tags: set[str],
|
|
max_vram_mb: int = 0,
|
|
) -> list[str]:
|
|
"""Register GGUFs from disk that aren't already in ollama. Returns new tags.
|
|
|
|
max_vram_mb: skip files whose size exceeds this threshold (0 = no limit).
|
|
GGUF file size is a reliable VRAM proxy -- quantized weights load ~1:1.
|
|
"""
|
|
ggufs = find_disk_ggufs(llm_root)
|
|
if not ggufs:
|
|
print(f"No .gguf files found under {llm_root}", file=sys.stderr)
|
|
return []
|
|
|
|
new_tags: list[str] = []
|
|
skipped_oom = 0
|
|
for gguf in ggufs:
|
|
size_mb = gguf.stat().st_size // (1024 * 1024)
|
|
if max_vram_mb and size_mb > max_vram_mb:
|
|
print(f" [skip-oom] {gguf.name} ({size_mb} MB > {max_vram_mb} MB limit)")
|
|
skipped_oom += 1
|
|
continue
|
|
tag = gguf_to_ollama_tag(gguf)
|
|
if tag in existing_tags:
|
|
print(f" [skip] {gguf.name} already registered as {tag}")
|
|
continue
|
|
print(f" [register] {gguf.name} ({size_mb} MB) → {tag} ...", end=" ", flush=True)
|
|
if register_gguf(gguf, tag):
|
|
print("ok")
|
|
new_tags.append(tag)
|
|
else:
|
|
print("failed")
|
|
|
|
if skipped_oom:
|
|
print(f" [info] {skipped_oom} GGUF(s) skipped (exceed {max_vram_mb} MB VRAM limit)")
|
|
return new_tags
|
|
|
|
|
|
def list_ollama_models() -> list[str]:
|
|
"""Return model names from ollama /api/tags, filtered to text-gen candidates."""
|
|
try:
|
|
resp = httpx.get(f"{_OLLAMA_URL}/api/tags", timeout=10.0)
|
|
resp.raise_for_status()
|
|
models = resp.json().get("models", [])
|
|
# Exclude embedding-only models
|
|
exclude = {"mxbai-embed-large", "nomic-embed-text", "all-minilm"}
|
|
return [
|
|
m["name"] for m in models
|
|
if not any(x in m["name"].lower() for x in exclude)
|
|
]
|
|
except Exception as exc:
|
|
print(f"[warn] Could not reach ollama: {exc}", file=sys.stderr)
|
|
return []
|
|
|
|
|
|
# ── Run benchmark ─────────────────────────────────────────────────────────────
|
|
|
|
@dataclass
|
|
class ModelResult:
|
|
model_id: str
|
|
prompt_results: list[dict[str, Any]] = field(default_factory=list)
|
|
avg_score: float = 0.0
|
|
avg_latency_ms: float = 0.0
|
|
total_filler_hits: int = 0
|
|
total_em_dashes: int = 0
|
|
total_semicolons: int = 0
|
|
|
|
|
|
def _bench_one_model(
|
|
model_id: str,
|
|
prompts: list[dict[str, str]],
|
|
profile: Any,
|
|
use_cforch: bool,
|
|
cforch_url: str,
|
|
use_vllm: bool = False,
|
|
) -> "ModelResult | None":
|
|
"""Run all prompts for a single model. Thread-safe — all output is prefixed with model_id.
|
|
|
|
Dispatch priority:
|
|
use_vllm=True → allocate vllm via cf-orch, then generate_cftext() (OpenAI-compatible)
|
|
use_cforch=True → allocate cf-text via cf-orch, then generate_cftext()
|
|
else → direct ollama generate()
|
|
Both vllm and cf-text expose /v1/chat/completions so generate_cftext() works for both.
|
|
"""
|
|
prefix = f"[{model_id}]"
|
|
result = ModelResult(model_id=model_id)
|
|
|
|
service_url: str | None = None
|
|
allocation_id: str = ""
|
|
if use_vllm:
|
|
alloc = cforch_allocate_vllm(model_id, cforch_url)
|
|
if alloc is None:
|
|
print(f"{prefix} [skip] vllm allocation failed", flush=True)
|
|
return None
|
|
service_url, allocation_id = alloc
|
|
print(f"{prefix} vllm allocated: {service_url}", flush=True)
|
|
elif use_cforch:
|
|
alloc = cforch_allocate(model_id, cforch_url)
|
|
if alloc is None:
|
|
print(f"{prefix} [skip] cf-orch allocation failed", flush=True)
|
|
return None
|
|
service_url, allocation_id = alloc
|
|
print(f"{prefix} allocated: {service_url}", flush=True)
|
|
|
|
try:
|
|
for prompt_def in prompts:
|
|
tag = prompt_def["tag"]
|
|
user_prompt = (
|
|
f"Thread: {prompt_def['thread_title']}\n\n"
|
|
f"{prompt_def['thread_body']}\n\n"
|
|
f"Write a reply:"
|
|
)
|
|
print(f"{prefix} [{tag}] generating...", flush=True)
|
|
|
|
if (use_cforch or use_vllm) and service_url:
|
|
# Both cf-text and vllm expose /v1/chat/completions — same call
|
|
output, elapsed_ms = generate_cftext(service_url, model_id, user_prompt, system=SYSTEM_PROMPT)
|
|
else:
|
|
output, elapsed_ms = generate(model_id, user_prompt, system=SYSTEM_PROMPT)
|
|
|
|
signals = extract_signals(output)
|
|
score = score_against_profile(signals, profile)
|
|
|
|
print(f"{prefix} [{tag}] {score:.0f}/100 ({elapsed_ms:.0f}ms)", flush=True)
|
|
if signals.filler_hits:
|
|
print(f"{prefix} ⚠ filler: {signals.filler_hits}", flush=True)
|
|
if signals.em_dash_count:
|
|
print(f"{prefix} ⚠ em-dashes: {signals.em_dash_count}", flush=True)
|
|
|
|
result.prompt_results.append({
|
|
"tag": tag,
|
|
"user_prompt": user_prompt,
|
|
"output": output,
|
|
"signals": {
|
|
"avg_sentence_length": signals.avg_sentence_length,
|
|
"em_dash_count": signals.em_dash_count,
|
|
"semicolon_count": signals.semicolon_count,
|
|
"filler_hits": signals.filler_hits,
|
|
"question_ratio": signals.question_ratio,
|
|
"word_count": signals.word_count,
|
|
},
|
|
"score": score,
|
|
"latency_ms": elapsed_ms,
|
|
})
|
|
finally:
|
|
if use_cforch and allocation_id:
|
|
cforch_release(allocation_id, cforch_url)
|
|
|
|
if not result.prompt_results:
|
|
return None
|
|
|
|
scores = [r["score"] for r in result.prompt_results]
|
|
latencies = [r["latency_ms"] for r in result.prompt_results]
|
|
result.avg_score = sum(scores) / len(scores)
|
|
result.avg_latency_ms = sum(latencies) / len(latencies)
|
|
result.total_filler_hits = sum(len(r["signals"]["filler_hits"]) for r in result.prompt_results)
|
|
result.total_em_dashes = sum(r["signals"]["em_dash_count"] for r in result.prompt_results)
|
|
result.total_semicolons = sum(r["signals"]["semicolon_count"] for r in result.prompt_results)
|
|
|
|
print(f"{prefix} done — avg score {result.avg_score:.0f}/100", flush=True)
|
|
return result
|
|
|
|
|
|
def run_benchmark(
|
|
model_ids: list[str],
|
|
corpus_dir: Path,
|
|
prompts: list[dict[str, str]],
|
|
use_cforch: bool = False,
|
|
use_vllm: bool = False,
|
|
cforch_url: str = _CFORCH_URL,
|
|
workers: int = 1,
|
|
) -> list[ModelResult]:
|
|
profile = build_corpus_profile(corpus_dir)
|
|
if profile:
|
|
print(f"Corpus profile loaded from {corpus_dir} ({len(list(corpus_dir.glob('*.txt')))} samples)")
|
|
print(f" Target avg sentence length: {profile.avg_sentence_length:.1f} words")
|
|
else:
|
|
print(f"[warn] No corpus samples found in {corpus_dir} -- scoring on hard violations only")
|
|
|
|
backend = "vllm via cf-orch" if use_vllm else ("cf-text via cf-orch" if use_cforch else "ollama")
|
|
print(f" Backend: {backend}")
|
|
|
|
effective_workers = min(workers, len(model_ids)) if model_ids else 1
|
|
print(f" Workers: {effective_workers} (of {len(model_ids)} models)", flush=True)
|
|
|
|
results: list[ModelResult] = []
|
|
|
|
if effective_workers <= 1:
|
|
# Sequential path — simpler output, easier to follow for single-model runs
|
|
for model_id in model_ids:
|
|
print(f"\n{'='*60}\nModel: {model_id}", flush=True)
|
|
r = _bench_one_model(model_id, prompts, profile, use_cforch, cforch_url, use_vllm)
|
|
if r:
|
|
results.append(r)
|
|
else:
|
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
print(f" Fanning out {len(model_ids)} models across {effective_workers} workers...", flush=True)
|
|
with ThreadPoolExecutor(max_workers=effective_workers) as pool:
|
|
futures = {
|
|
pool.submit(_bench_one_model, mid, prompts, profile, use_cforch, cforch_url, use_vllm): mid
|
|
for mid in model_ids
|
|
}
|
|
for future in as_completed(futures):
|
|
r = future.result()
|
|
if r:
|
|
results.append(r)
|
|
|
|
return sorted(results, key=lambda r: r.avg_score, reverse=True)
|
|
|
|
|
|
# ── Markdown report ───────────────────────────────────────────────────────────
|
|
|
|
def render_report(results: list[ModelResult], corpus_dir: Path) -> str:
|
|
date_str = datetime.now().strftime("%Y-%m-%d %H:%M")
|
|
lines: list[str] = [
|
|
f"# Voice Benchmark Results",
|
|
f"",
|
|
f"**Date:** {date_str} ",
|
|
f"**Corpus:** `{corpus_dir}` ",
|
|
f"**Models tested:** {len(results)} ",
|
|
f"**Prompts per model:** {len(TEST_PROMPTS)}",
|
|
f"",
|
|
f"## Rankings",
|
|
f"",
|
|
f"| Rank | Model | Score | Latency | Em-dashes | Fillers | Semicolons |",
|
|
f"|------|-------|-------|---------|-----------|---------|------------|",
|
|
]
|
|
|
|
for i, r in enumerate(results, 1):
|
|
medal = {1: "🥇", 2: "🥈", 3: "🥉"}.get(i, f"#{i}")
|
|
lines.append(
|
|
f"| {medal} | `{r.model_id}` | {r.avg_score:.0f}/100 "
|
|
f"| {r.avg_latency_ms:.0f}ms "
|
|
f"| {r.total_em_dashes} "
|
|
f"| {r.total_filler_hits} "
|
|
f"| {r.total_semicolons} |"
|
|
)
|
|
|
|
lines += ["", "## Sample Outputs", ""]
|
|
|
|
for r in results[:3]: # top 3 only to keep report readable
|
|
lines += [f"### `{r.model_id}` (avg score: {r.avg_score:.0f})", ""]
|
|
for pr in r.prompt_results:
|
|
lines += [
|
|
f"**Prompt:** {pr['tag']} ",
|
|
f"**Score:** {pr['score']:.0f}/100 ",
|
|
f"",
|
|
f"```",
|
|
pr["output"],
|
|
f"```",
|
|
f"",
|
|
]
|
|
|
|
return "\n".join(lines)
|
|
|
|
|
|
def save_report(results: list[ModelResult], corpus_dir: Path) -> Path:
|
|
_RESULTS_DIR.mkdir(exist_ok=True)
|
|
date_str = datetime.now().strftime("%Y-%m-%d_%H%M")
|
|
report_path = _RESULTS_DIR / f"voice_{date_str}.md"
|
|
report_path.write_text(render_report(results, corpus_dir), encoding="utf-8")
|
|
|
|
# Also save raw JSON for programmatic use
|
|
json_path = _RESULTS_DIR / f"voice_{date_str}.json"
|
|
json_path.write_text(
|
|
json.dumps(
|
|
[
|
|
{
|
|
"model_id": r.model_id,
|
|
"avg_score": r.avg_score,
|
|
"avg_latency_ms": r.avg_latency_ms,
|
|
"total_filler_hits": r.total_filler_hits,
|
|
"total_em_dashes": r.total_em_dashes,
|
|
"total_semicolons": r.total_semicolons,
|
|
"prompt_results": r.prompt_results,
|
|
}
|
|
for r in results
|
|
],
|
|
indent=2,
|
|
),
|
|
encoding="utf-8",
|
|
)
|
|
|
|
return report_path
|
|
|
|
|
|
# ── CLI commands ──────────────────────────────────────────────────────────────
|
|
|
|
def cmd_list_models(_args: argparse.Namespace) -> None:
|
|
models = list_ollama_models()
|
|
if not models:
|
|
print("No models found (is ollama running?)")
|
|
return
|
|
print(f"{len(models)} models available:\n")
|
|
for m in models:
|
|
print(f" {m}")
|
|
|
|
|
|
def cmd_run(args: argparse.Namespace) -> None:
|
|
corpus_dir = Path(args.samples)
|
|
if not corpus_dir.exists():
|
|
print(f"[error] Corpus directory not found: {corpus_dir}", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
max_vram_mb: int = getattr(args, "max_vram", 7200)
|
|
use_cforch: bool = getattr(args, "cforch", False)
|
|
use_vllm: bool = getattr(args, "vllm", False)
|
|
cforch_url: str = getattr(args, "cforch_url", _CFORCH_URL)
|
|
registered_tags: list[str] = []
|
|
|
|
def _filter_ollama_by_size(ids: list[str], include_large: bool) -> list[str]:
|
|
"""Apply name-pattern size filter to ollama model list."""
|
|
if include_large:
|
|
return ids
|
|
skip_patterns = ["270b", "70b", "32b", "30b", "21b", "20b", "deepseek-r1"]
|
|
filtered = [m for m in ids if not any(p in m.lower() for p in skip_patterns)]
|
|
skipped = len(ids) - len(filtered)
|
|
if skipped:
|
|
print(f"[info] Skipped {skipped} large model(s) by name pattern. "
|
|
"Pass --include-large to include them.")
|
|
return filtered
|
|
|
|
if args.models and args.models != "all":
|
|
model_ids = [m.strip() for m in args.models.split(",") if m.strip()]
|
|
elif use_cforch:
|
|
# cf-orch path: pull model list from catalog, filter by vram_mb
|
|
catalog = cforch_list_catalog(cforch_url)
|
|
if not catalog:
|
|
print("[warn] cf-orch catalog empty or unreachable -- falling back to ollama models")
|
|
use_cforch = False
|
|
model_ids = _filter_ollama_by_size(list_ollama_models(), args.include_large)
|
|
if not model_ids:
|
|
print("[error] No models found. Pass --models explicitly or check ollama.", file=sys.stderr)
|
|
sys.exit(1)
|
|
else:
|
|
before = list(catalog.items())
|
|
allowed = {mid: mb for mid, mb in before if mb == 0 or mb <= max_vram_mb}
|
|
skipped_oom = {mid: mb for mid, mb in before if mid not in allowed}
|
|
model_ids = list(allowed.keys())
|
|
print(f"[info] cf-orch catalog: {len(before)} model(s), "
|
|
f"{len(allowed)} within {max_vram_mb} MB VRAM limit")
|
|
if skipped_oom:
|
|
print(f"[info] Skipped (OOM risk): "
|
|
+ ", ".join(f"{mid} ({mb} MB)" for mid, mb in sorted(skipped_oom.items())))
|
|
else:
|
|
# Ollama path
|
|
model_ids = list_ollama_models()
|
|
if not model_ids:
|
|
print("[error] No models found. Pass --models explicitly or check ollama.", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
# Backfill GGUFs from disk before filtering -- skips files that exceed VRAM limit
|
|
if getattr(args, "scan_disk", None):
|
|
llm_root = Path(args.scan_disk)
|
|
print(f"\nScanning {llm_root} for unregistered GGUFs (limit: {max_vram_mb} MB)...")
|
|
registered_tags = backfill_disk_models(llm_root, set(model_ids), max_vram_mb=max_vram_mb)
|
|
model_ids = list_ollama_models() # re-fetch with new registrations
|
|
|
|
model_ids = _filter_ollama_by_size(model_ids, args.include_large)
|
|
|
|
print(f"\nRunning voice benchmark on {len(model_ids)} model(s)...")
|
|
try:
|
|
results = run_benchmark(model_ids, corpus_dir, TEST_PROMPTS, use_cforch=use_cforch, use_vllm=use_vllm, cforch_url=cforch_url, workers=args.workers)
|
|
report_path = save_report(results, corpus_dir)
|
|
print(f"\n{'='*60}")
|
|
print(f"Results saved to: {report_path}")
|
|
print(f"\n{render_report(results, corpus_dir)}")
|
|
finally:
|
|
if registered_tags:
|
|
print(f"\nCleaning up {len(registered_tags)} temporary ollama registrations...")
|
|
for tag in registered_tags:
|
|
deregister_gguf(tag)
|
|
|
|
|
|
def cmd_show_last(_args: argparse.Namespace) -> None:
|
|
reports = sorted(_RESULTS_DIR.glob("voice_*.md"), reverse=True)
|
|
if not reports:
|
|
print("No benchmark results found. Run --run first.")
|
|
return
|
|
print(reports[0].read_text(encoding="utf-8"))
|
|
|
|
|
|
# ── Entry point ───────────────────────────────────────────────────────────────
|
|
|
|
def main() -> None:
|
|
parser = argparse.ArgumentParser(
|
|
description="Voice benchmark harness for local text-gen models",
|
|
formatter_class=argparse.RawDescriptionHelpFormatter,
|
|
)
|
|
sub = parser.add_subparsers(dest="cmd")
|
|
|
|
sub.add_parser("list-models", help="List available ollama models")
|
|
|
|
run_p = sub.add_parser("run", help="Run the benchmark")
|
|
run_p.add_argument("--models", default="all", help="Comma-separated model IDs, or 'all'")
|
|
run_p.add_argument("--samples", default=str(_CORPUS_DIR), help="Path to voice corpus directory")
|
|
run_p.add_argument("--include-large", action="store_true", help="Include models >20B params")
|
|
run_p.add_argument("--scan-disk", metavar="LLM_ROOT", help="Scan directory for GGUFs not yet in ollama (e.g. /Library/Assets/LLM)")
|
|
run_p.add_argument("--cforch", action="store_true", help="Route generation through cf-orch/cf-text instead of direct ollama")
|
|
run_p.add_argument("--vllm", action="store_true", help="Route generation through cf-orch/vllm (OpenAI-compatible) instead of ollama")
|
|
run_p.add_argument("--cforch-url", default=_CFORCH_URL, help=f"cf-orch coordinator URL (default: {_CFORCH_URL})")
|
|
run_p.add_argument("--max-vram", type=int, default=7200, metavar="MB",
|
|
help="Skip models whose VRAM footprint exceeds this limit in MB (default: 7200)")
|
|
run_p.add_argument("--workers", type=int, default=1, metavar="N",
|
|
help="Parallel workers — run N models simultaneously (default: 1; use 4+ with cf-orch)")
|
|
|
|
sub.add_parser("show-last", help="Print the most recent benchmark report")
|
|
|
|
# Also support legacy --list-models / --run / --show-last flags for manage.sh compat
|
|
parser.add_argument("--list-models", action="store_true")
|
|
parser.add_argument("--run", action="store_true")
|
|
parser.add_argument("--show-last", action="store_true")
|
|
parser.add_argument("--models", default="all")
|
|
parser.add_argument("--samples", default=str(_CORPUS_DIR))
|
|
parser.add_argument("--include-large", action="store_true")
|
|
parser.add_argument("--scan-disk", metavar="LLM_ROOT")
|
|
parser.add_argument("--cforch", action="store_true")
|
|
parser.add_argument("--vllm", action="store_true")
|
|
parser.add_argument("--cforch-url", default=_CFORCH_URL)
|
|
parser.add_argument("--max-vram", type=int, default=7200, metavar="MB")
|
|
parser.add_argument("--workers", type=int, default=1, metavar="N")
|
|
|
|
args = parser.parse_args()
|
|
|
|
if args.cmd == "list-models" or args.list_models:
|
|
cmd_list_models(args)
|
|
elif args.cmd == "run" or args.run:
|
|
cmd_run(args)
|
|
elif args.cmd == "show-last" or args.show_last:
|
|
cmd_show_last(args)
|
|
else:
|
|
parser.print_help()
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|