feat(avocet): benchmark UI, label fixes, BenchmarkView with charts and SSE run
This commit is contained in:
parent
d3ae5b576a
commit
8c22dd62de
14 changed files with 2573 additions and 33 deletions
53
app/api.py
53
app/api.py
|
|
@ -287,6 +287,59 @@ def test_account(req: AccountTestRequest):
|
|||
from fastapi.responses import StreamingResponse
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Benchmark endpoints
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@app.get("/api/benchmark/results")
def get_benchmark_results():
    """Return the most recently saved benchmark results, or an empty envelope.

    The empty envelope ``{"models": {}, "sample_count": 0, "timestamp": None}``
    is returned when the results file does not exist, and also when it cannot
    be parsed as JSON (for example if it is read while being rewritten), so
    the caller always receives the same response shape.
    """
    path = _DATA_DIR / "benchmark_results.json"
    try:
        # EAFP: read directly instead of exists()-then-read, which can race
        # with the file being created or removed between the two calls.
        return json.loads(path.read_text(encoding="utf-8"))
    except (FileNotFoundError, json.JSONDecodeError):
        return {"models": {}, "sample_count": 0, "timestamp": None}
|
||||
|
||||
|
||||
@app.get("/api/benchmark/run")
def run_benchmark(include_slow: bool = False):
    """Spawn the benchmark script and stream stdout as SSE progress events.

    Each non-empty output line (stderr is merged into stdout) is sent as a
    ``{"type": "progress", "message": ...}`` event. When the process exits, a
    ``{"type": "complete"}`` event is sent for exit code 0, otherwise a
    ``{"type": "error", "message": ...}`` event. Failures to start or read
    from the process are also reported as an ``error`` event.

    Args:
        include_slow: When true, ``--include-slow`` is passed to the script.

    Returns:
        A ``StreamingResponse`` with media type ``text/event-stream``.
    """
    import subprocess

    # NOTE(review): interpreter path is hard-coded to one machine's conda env;
    # consider making it configurable.
    python_bin = "/devl/miniconda3/envs/job-seeker-classifiers/bin/python"
    script = str(_ROOT / "scripts" / "benchmark_classifier.py")
    cmd = [python_bin, script, "--score", "--save"]
    if include_slow:
        cmd.append("--include-slow")

    def _sse(payload: dict) -> str:
        """Frame *payload* as a single SSE ``data:`` event."""
        return f"data: {json.dumps(payload)}\n\n"

    def generate():
        proc = None
        try:
            proc = subprocess.Popen(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,  # line-buffered so progress lines arrive promptly
                cwd=str(_ROOT),
            )
            for line in proc.stdout:
                line = line.rstrip()
                if line:
                    yield _sse({"type": "progress", "message": line})
            proc.wait()
            if proc.returncode == 0:
                yield _sse({"type": "complete"})
            else:
                yield _sse({
                    "type": "error",
                    "message": f"Process exited with code {proc.returncode}",
                })
        except Exception as exc:
            # Boundary handler: surface the failure to the client as an event.
            yield _sse({"type": "error", "message": str(exc)})
        finally:
            # Also runs when the generator is closed early (e.g. the client
            # disconnects): GeneratorExit is not an Exception, so without this
            # the child process and its pipe would be left behind.
            if proc is not None:
                if proc.poll() is None:
                    proc.terminate()
                    try:
                        proc.wait(timeout=5)
                    except subprocess.TimeoutExpired:
                        proc.kill()
                        proc.wait()
                if proc.stdout is not None:
                    proc.stdout.close()

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
|
||||
|
||||
|
||||
@app.get("/api/fetch/stream")
|
||||
def fetch_stream(
|
||||
accounts: str = Query(default=""),
|
||||
|
|
|
|||
1861
docs/superpowers/plans/2026-03-15-finetune-classifier.md
Normal file
1861
docs/superpowers/plans/2026-03-15-finetune-classifier.md
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -163,7 +163,8 @@ def run_scoring(
|
|||
gold = [r["label"] for r in rows]
|
||||
results: dict[str, Any] = {}
|
||||
|
||||
for adapter in adapters:
|
||||
for i, adapter in enumerate(adapters, 1):
|
||||
print(f"[{i}/{len(adapters)}] Running {adapter.name} ({len(rows)} samples) …", flush=True)
|
||||
preds: list[str] = []
|
||||
t0 = time.monotonic()
|
||||
for row in rows:
|
||||
|
|
@ -177,6 +178,7 @@ def run_scoring(
|
|||
metrics = compute_metrics(preds, gold, LABELS)
|
||||
metrics["latency_ms"] = round(elapsed_ms / len(rows), 1)
|
||||
results[adapter.name] = metrics
|
||||
print(f" → macro-F1 {metrics['__macro_f1__']:.3f} accuracy {metrics['__accuracy__']:.3f} {metrics['latency_ms']:.1f} ms/email", flush=True)
|
||||
adapter.unload()
|
||||
|
||||
return results
|
||||
|
|
@ -375,6 +377,31 @@ def cmd_score(args: argparse.Namespace) -> None:
|
|||
print(row_str)
|
||||
print()
|
||||
|
||||
if args.save:
|
||||
import datetime
|
||||
rows = load_scoring_jsonl(args.score_file)
|
||||
save_data = {
|
||||
"timestamp": datetime.datetime.utcnow().isoformat() + "Z",
|
||||
"sample_count": len(rows),
|
||||
"models": {
|
||||
name: {
|
||||
"macro_f1": round(m["__macro_f1__"], 4),
|
||||
"accuracy": round(m["__accuracy__"], 4),
|
||||
"latency_ms": m["latency_ms"],
|
||||
"per_label": {
|
||||
label: {k: round(v, 4) for k, v in m[label].items()}
|
||||
for label in LABELS
|
||||
if label in m
|
||||
},
|
||||
}
|
||||
for name, m in results.items()
|
||||
},
|
||||
}
|
||||
save_path = Path(args.score_file).parent / "benchmark_results.json"
|
||||
with open(save_path, "w") as f:
|
||||
json.dump(save_data, f, indent=2)
|
||||
print(f"Results saved → {save_path}", flush=True)
|
||||
|
||||
|
||||
def cmd_compare(args: argparse.Namespace) -> None:
|
||||
active = _active_models(args.include_slow)
|
||||
|
|
@ -431,6 +458,8 @@ def main() -> None:
|
|||
parser.add_argument("--days", type=int, default=90, help="Days back for IMAP search")
|
||||
parser.add_argument("--include-slow", action="store_true", help="Include non-default heavy models")
|
||||
parser.add_argument("--models", nargs="+", help="Override: run only these model names")
|
||||
parser.add_argument("--save", action="store_true",
|
||||
help="Save results to data/benchmark_results.json (for the web UI)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
|
|
|
|||
|
|
@ -4,7 +4,12 @@
|
|||
<meta charset="UTF-8" />
|
||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||
<title>web</title>
|
||||
<title>Avocet — Label Tool</title>
|
||||
<!-- Inline background prevents blank-white flash before the CSS bundle loads -->
|
||||
<style>
|
||||
html, body { margin: 0; background: #eaeff8; min-height: 100vh; }
|
||||
@media (prefers-color-scheme: dark) { html, body { background: #16202e; } }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div id="app"></div>
|
||||
|
|
|
|||
|
|
@ -11,11 +11,13 @@
|
|||
import { onMounted } from 'vue'
|
||||
import { RouterView } from 'vue-router'
|
||||
import { useMotion } from './composables/useMotion'
|
||||
import { useHackerMode } from './composables/useEasterEgg'
|
||||
import { useHackerMode, useKonamiCode } from './composables/useEasterEgg'
|
||||
import AppSidebar from './components/AppSidebar.vue'
|
||||
|
||||
const motion = useMotion()
|
||||
const { restore } = useHackerMode()
|
||||
const { toggle, restore } = useHackerMode()
|
||||
|
||||
useKonamiCode(toggle)
|
||||
|
||||
onMounted(() => {
|
||||
restore() // re-apply hacker mode from localStorage on page load
|
||||
|
|
|
|||
|
|
@ -8,8 +8,29 @@
|
|||
Accent — Russet (#B8622A) — inspired by avocet's vivid orange-russet head
|
||||
*/
|
||||
|
||||
/* ── Page-level overrides — must be in avocet.css (applied after theme.css base) ── */
|
||||
html {
|
||||
/* Prevent Mac Chrome's horizontal swipe-to-navigate page animation
|
||||
from triggering when the user scrolls near the viewport edge */
|
||||
overscroll-behavior-x: none;
|
||||
/* clip (not hidden) — prevents overflowing content from expanding the html layout
|
||||
width beyond the viewport. Without this, body's overflow-x:hidden propagates to
|
||||
the viewport and body has no BFC, so long email URLs inflate the layout and
|
||||
margin:0 auto centering drifts rightward as fonts load. */
|
||||
overflow-x: clip;
|
||||
}
|
||||
|
||||
body {
|
||||
/* Prevent horizontal scroll from card swipe animations */
|
||||
overflow-x: hidden;
|
||||
}
|
||||
|
||||
|
||||
/* ── Light mode (default) ──────────────────────────── */
|
||||
:root {
|
||||
/* Aliases bridging avocet component vars to CircuitForge base theme vars */
|
||||
--color-bg: var(--color-surface); /* App.vue body bg → #eaeff8 in light */
|
||||
--color-text-secondary: var(--color-text-muted); /* muted label text */
|
||||
/* Primary — Slate Teal */
|
||||
--app-primary: #2A6080; /* 4.8:1 on light surface #eaeff8 — ✅ AA */
|
||||
--app-primary-hover: #1E4D66; /* darker for hover */
|
||||
|
|
|
|||
|
|
@ -62,10 +62,11 @@ import { RouterLink } from 'vue-router'
|
|||
const LS_KEY = 'cf-avocet-nav-stowed'
|
||||
|
||||
const navItems = [
|
||||
{ path: '/', icon: '🃏', label: 'Label' },
|
||||
{ path: '/fetch', icon: '📥', label: 'Fetch' },
|
||||
{ path: '/stats', icon: '📊', label: 'Stats' },
|
||||
{ path: '/settings', icon: '⚙️', label: 'Settings' },
|
||||
{ path: '/', icon: '🃏', label: 'Label' },
|
||||
{ path: '/fetch', icon: '📥', label: 'Fetch' },
|
||||
{ path: '/stats', icon: '📊', label: 'Stats' },
|
||||
{ path: '/benchmark', icon: '🏁', label: 'Benchmark' },
|
||||
{ path: '/settings', icon: '⚙️', label: 'Settings' },
|
||||
]
|
||||
|
||||
const stowed = ref(localStorage.getItem(LS_KEY) === 'true')
|
||||
|
|
|
|||
|
|
@ -86,6 +86,7 @@ const displayBody = computed(() => {
|
|||
font-size: 0.9375rem;
|
||||
line-height: 1.6;
|
||||
white-space: pre-wrap;
|
||||
overflow-wrap: break-word;
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -84,6 +84,8 @@ const FLING_WINDOW_MS = 50 // rolling sample window in ms
|
|||
let velocityBuf: { x: number; y: number; t: number }[] = []
|
||||
|
||||
function onPointerDown(e: PointerEvent) {
|
||||
// Let clicks on interactive children (expand/collapse, links, etc.) pass through
|
||||
if ((e.target as Element).closest('button, a, input, select, textarea')) return
|
||||
if (!motion.rich.value) return
|
||||
;(e.currentTarget as HTMLElement).setPointerCapture(e.pointerId)
|
||||
pickupX.value = e.clientX
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
<template>
|
||||
<div class="label-grid" :class="{ 'bucket-mode': isBucketMode }" role="group" aria-label="Label buttons">
|
||||
<button
|
||||
v-for="label in labels"
|
||||
v-for="label in displayLabels"
|
||||
:key="label.key"
|
||||
data-testid="label-btn"
|
||||
:data-label-key="label.name"
|
||||
|
|
@ -19,6 +19,8 @@
|
|||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import { computed } from 'vue'
|
||||
|
||||
interface Label { name: string; emoji: string; color: string; key: string }
|
||||
|
||||
const props = defineProps<{
|
||||
|
|
@ -27,6 +29,16 @@ const props = defineProps<{
|
|||
hoveredBucket?: string | null
|
||||
}>()
|
||||
const emit = defineEmits<{ label: [name: string] }>()
|
||||
|
||||
// Numpad layout: labels with numeric keys are grouped into rows of three and
// the row order is flipped (so 7-8-9 ends up on top, 1-2-3 on the bottom).
// Labels with non-numeric keys (e.g. 'h' for hired) are appended afterwards.
const displayLabels = computed(() => {
  const numericLabels: Label[] = []
  const otherLabels: Label[] = []
  for (const label of props.labels) {
    if (Number.isNaN(Number(label.key))) {
      otherLabels.push(label)
    } else {
      numericLabels.push(label)
    }
  }

  // Prepending each chunk of three yields the rows in reversed order.
  const reversedRows: Label[][] = []
  for (let start = 0; start < numericLabels.length; start += 3) {
    reversedRows.unshift(numericLabels.slice(start, start + 3))
  }

  return reversedRows.flat().concat(otherLabels)
})
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
|
|
@ -38,11 +50,9 @@ const emit = defineEmits<{ label: [name: string] }>()
|
|||
padding var(--bucket-expand, 250ms cubic-bezier(0.34, 1.56, 0.64, 1));
|
||||
}
|
||||
|
||||
/* 10th button (hired / key h) — centered below the 3×3 like a numpad 0 */
|
||||
/* 10th button (hired / key h) — full-width bar below the 3×3 */
|
||||
.label-btn:last-child {
|
||||
grid-column: 1 / -1;
|
||||
max-width: calc(33.333% - 0.34rem);
|
||||
justify-self: center;
|
||||
}
|
||||
|
||||
.label-grid.bucket-mode {
|
||||
|
|
|
|||
|
|
@ -1,14 +1,15 @@
|
|||
import { onMounted, onUnmounted } from 'vue'
|
||||
|
||||
const KONAMI = ['ArrowUp','ArrowUp','ArrowDown','ArrowDown','ArrowLeft','ArrowRight','ArrowLeft','ArrowRight','b','a']
|
||||
const KONAMI = ['ArrowUp','ArrowUp','ArrowDown','ArrowDown','ArrowLeft','ArrowRight','ArrowLeft','ArrowRight','b','a']
|
||||
const KONAMI_AB = ['ArrowUp','ArrowUp','ArrowDown','ArrowDown','ArrowLeft','ArrowRight','ArrowLeft','ArrowRight','a','b']
|
||||
|
||||
export function useKonamiCode(onActivate: () => void) {
|
||||
export function useKeySequence(sequence: string[], onActivate: () => void) {
|
||||
let pos = 0
|
||||
|
||||
function handler(e: KeyboardEvent) {
|
||||
if (e.key === KONAMI[pos]) {
|
||||
if (e.key === sequence[pos]) {
|
||||
pos++
|
||||
if (pos === KONAMI.length) {
|
||||
if (pos === sequence.length) {
|
||||
pos = 0
|
||||
onActivate()
|
||||
}
|
||||
|
|
@ -21,6 +22,11 @@ export function useKonamiCode(onActivate: () => void) {
|
|||
onUnmounted(() => window.removeEventListener('keydown', handler))
|
||||
}
|
||||
|
||||
// Registers a key-sequence listener for each Konami variant (ending "b, a"
// and ending "a, b"); completing either one calls onActivate.
export function useKonamiCode(onActivate: () => void) {
  for (const sequence of [KONAMI, KONAMI_AB]) {
    useKeySequence(sequence, onActivate)
  }
}
|
||||
|
||||
export function useHackerMode() {
|
||||
function toggle() {
|
||||
const root = document.documentElement
|
||||
|
|
|
|||
|
|
@ -2,16 +2,18 @@ import { createRouter, createWebHashHistory } from 'vue-router'
|
|||
import LabelView from '../views/LabelView.vue'
|
||||
|
||||
// Views are lazy-loaded to keep initial bundle small
|
||||
const FetchView = () => import('../views/FetchView.vue')
|
||||
const StatsView = () => import('../views/StatsView.vue')
|
||||
const SettingsView = () => import('../views/SettingsView.vue')
|
||||
const FetchView = () => import('../views/FetchView.vue')
|
||||
const StatsView = () => import('../views/StatsView.vue')
|
||||
const BenchmarkView = () => import('../views/BenchmarkView.vue')
|
||||
const SettingsView = () => import('../views/SettingsView.vue')
|
||||
|
||||
export const router = createRouter({
|
||||
history: createWebHashHistory(),
|
||||
routes: [
|
||||
{ path: '/', component: LabelView, meta: { title: 'Label' } },
|
||||
{ path: '/fetch', component: FetchView, meta: { title: 'Fetch' } },
|
||||
{ path: '/stats', component: StatsView, meta: { title: 'Stats' } },
|
||||
{ path: '/settings', component: SettingsView, meta: { title: 'Settings' } },
|
||||
{ path: '/', component: LabelView, meta: { title: 'Label' } },
|
||||
{ path: '/fetch', component: FetchView, meta: { title: 'Fetch' } },
|
||||
{ path: '/stats', component: StatsView, meta: { title: 'Stats' } },
|
||||
{ path: '/benchmark', component: BenchmarkView, meta: { title: 'Benchmark' } },
|
||||
{ path: '/settings', component: SettingsView, meta: { title: 'Settings' } },
|
||||
],
|
||||
})
|
||||
|
|
|
|||
551
web/src/views/BenchmarkView.vue
Normal file
551
web/src/views/BenchmarkView.vue
Normal file
|
|
@ -0,0 +1,551 @@
|
|||
<template>
|
||||
<div class="bench-view">
|
||||
<header class="bench-header">
|
||||
<h1 class="page-title">🏁 Benchmark</h1>
|
||||
<div class="header-actions">
|
||||
<label class="slow-toggle" :class="{ disabled: running }">
|
||||
<input type="checkbox" v-model="includeSlow" :disabled="running" />
|
||||
Include slow models
|
||||
</label>
|
||||
<button
|
||||
class="btn-run"
|
||||
:disabled="running"
|
||||
@click="startBenchmark"
|
||||
>
|
||||
{{ running ? '⏳ Running…' : results ? '🔄 Re-run' : '▶ Run Benchmark' }}
|
||||
</button>
|
||||
</div>
|
||||
</header>
|
||||
|
||||
<!-- Progress log -->
|
||||
<div v-if="running || runLog.length" class="run-log">
|
||||
<div class="run-log-title">
|
||||
<span>{{ running ? '⏳ Running benchmark…' : runError ? '❌ Failed' : '✅ Done' }}</span>
|
||||
<button class="btn-ghost" @click="runLog = []; runError = ''">Clear</button>
|
||||
</div>
|
||||
<div class="log-lines" ref="logEl">
|
||||
<div
|
||||
v-for="(line, i) in runLog"
|
||||
:key="i"
|
||||
class="log-line"
|
||||
:class="{ 'log-error': line.startsWith('ERROR') || line.startsWith('[error]') }"
|
||||
>{{ line }}</div>
|
||||
</div>
|
||||
<p v-if="runError" class="run-error">{{ runError }}</p>
|
||||
</div>
|
||||
|
||||
<!-- Loading -->
|
||||
<div v-if="loading" class="status-notice">Loading…</div>
|
||||
|
||||
<!-- No results yet -->
|
||||
<div v-else-if="!results" class="status-notice empty">
|
||||
<p>No benchmark results yet.</p>
|
||||
<p class="hint">Click <strong>Run Benchmark</strong> to score all default models against your labeled data.</p>
|
||||
</div>
|
||||
|
||||
<!-- Results -->
|
||||
<template v-else>
|
||||
<p class="meta-line">
|
||||
<span>{{ results.sample_count.toLocaleString() }} labeled emails</span>
|
||||
<span class="sep">·</span>
|
||||
<span>{{ modelCount }} model{{ modelCount === 1 ? '' : 's' }}</span>
|
||||
<span class="sep">·</span>
|
||||
<span>{{ formatDate(results.timestamp) }}</span>
|
||||
</p>
|
||||
|
||||
<!-- Macro-F1 chart -->
|
||||
<section class="chart-section">
|
||||
<h2 class="chart-title">Macro-F1 (higher = better)</h2>
|
||||
<div class="bar-chart">
|
||||
<div v-for="row in f1Rows" :key="row.name" class="bar-row">
|
||||
<span class="bar-label" :title="row.name">{{ row.name }}</span>
|
||||
<div class="bar-track">
|
||||
<div
|
||||
class="bar-fill"
|
||||
:style="{ width: `${row.pct}%`, background: scoreColor(row.value) }"
|
||||
/>
|
||||
</div>
|
||||
<span class="bar-value" :style="{ color: scoreColor(row.value) }">
|
||||
{{ row.value.toFixed(3) }}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Latency chart -->
|
||||
<section class="chart-section">
|
||||
<h2 class="chart-title">Latency (ms / email, lower = better)</h2>
|
||||
<div class="bar-chart">
|
||||
<div v-for="row in latencyRows" :key="row.name" class="bar-row">
|
||||
<span class="bar-label" :title="row.name">{{ row.name }}</span>
|
||||
<div class="bar-track">
|
||||
<div
|
||||
class="bar-fill latency-fill"
|
||||
:style="{ width: `${row.pct}%` }"
|
||||
/>
|
||||
</div>
|
||||
<span class="bar-value">{{ row.value.toFixed(1) }} ms</span>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Per-label F1 heatmap -->
|
||||
<section class="chart-section">
|
||||
<h2 class="chart-title">Per-label F1</h2>
|
||||
<div class="heatmap-scroll">
|
||||
<table class="heatmap">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="hm-label-col">Label</th>
|
||||
<th v-for="name in modelNames" :key="name" class="hm-model-col" :title="name">
|
||||
{{ name }}
|
||||
</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr v-for="label in labelNames" :key="label">
|
||||
<td class="hm-label-cell">
|
||||
<span class="hm-emoji">{{ LABEL_META[label]?.emoji ?? '🏷️' }}</span>
|
||||
{{ label.replace(/_/g, '\u00a0') }}
|
||||
</td>
|
||||
<td
|
||||
v-for="name in modelNames"
|
||||
:key="name"
|
||||
class="hm-value-cell"
|
||||
:style="{ background: heatmapBg(f1For(name, label)), color: heatmapFg(f1For(name, label)) }"
|
||||
:title="`${name} / ${label}: F1 ${f1For(name, label).toFixed(3)}, support ${supportFor(name, label)}`"
|
||||
>
|
||||
{{ f1For(name, label).toFixed(2) }}
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</div>
|
||||
<p class="heatmap-hint">Hover a cell for F1 and support. Color: 🟢 ≥ 0.7 · 🟡 0.4–0.7 · 🔴 &lt; 0.4</p>
|
||||
</section>
|
||||
</template>
|
||||
</div>
|
||||
</template>
|
||||
|
||||
<script setup lang="ts">
|
||||
import { ref, computed, onMounted, nextTick } from 'vue'
|
||||
import { useApiFetch, useApiSSE } from '../composables/useApi'
|
||||
|
||||
// ── Label metadata (same as StatsView) ──────────────────────────────────────
|
||||
const LABEL_META: Record<string, { emoji: string }> = {
|
||||
interview_scheduled: { emoji: '🗓️' },
|
||||
offer_received: { emoji: '🎉' },
|
||||
rejected: { emoji: '❌' },
|
||||
positive_response: { emoji: '👍' },
|
||||
survey_received: { emoji: '📋' },
|
||||
neutral: { emoji: '⬜' },
|
||||
event_rescheduled: { emoji: '🔄' },
|
||||
digest: { emoji: '📰' },
|
||||
new_lead: { emoji: '🤝' },
|
||||
hired: { emoji: '🎊' },
|
||||
}
|
||||
|
||||
// ── Types ────────────────────────────────────────────────────────────────────
|
||||
interface PerLabel { f1: number; precision: number; recall: number; support: number }
|
||||
interface ModelResult {
|
||||
macro_f1: number
|
||||
accuracy: number
|
||||
latency_ms: number
|
||||
per_label: Record<string, PerLabel>
|
||||
}
|
||||
interface BenchResults {
|
||||
timestamp: string | null
|
||||
sample_count: number
|
||||
models: Record<string, ModelResult>
|
||||
}
|
||||
|
||||
// ── State ────────────────────────────────────────────────────────────────────
|
||||
const results = ref<BenchResults | null>(null)
|
||||
const loading = ref(true)
|
||||
const running = ref(false)
|
||||
const runLog = ref<string[]>([])
|
||||
const runError = ref('')
|
||||
const includeSlow = ref(false)
|
||||
const logEl = ref<HTMLElement | null>(null)
|
||||
|
||||
// ── Derived ──────────────────────────────────────────────────────────────────
|
||||
const modelNames = computed(() => Object.keys(results.value?.models ?? {}))
|
||||
const modelCount = computed(() => modelNames.value.length)
|
||||
|
||||
// Heatmap row labels: labels known to LABEL_META first (in LABEL_META order),
// followed by any other labels that appear in the results.
const labelNames = computed(() => {
  const seen = new Set<string>()
  for (const model of modelNames.value) {
    for (const label of Object.keys(results.value!.models[model].per_label)) {
      seen.add(label)
    }
  }

  const known = Object.keys(LABEL_META)
  const knownPresent = known.filter(label => seen.has(label))
  const extras = Array.from(seen).filter(label => !known.includes(label))
  return knownPresent.concat(extras)
})
|
||||
|
||||
// Macro-F1 bar rows, best model first; pct is the bar width relative to the
// top score (falls back to a divisor of 1 when the top score is 0/missing).
const f1Rows = computed(() => {
  const current = results.value
  if (!current) return []

  const sorted = modelNames.value
    .map(name => ({ name, value: current.models[name].macro_f1 }))
    .sort((a, b) => b.value - a.value)

  const best = (sorted.length > 0 && sorted[0].value) || 1
  return sorted.map(row => ({ ...row, pct: Math.round((row.value / best) * 100) }))
})
|
||||
|
||||
// Latency bar rows, fastest model first; pct is the bar width relative to the
// slowest model (falls back to a divisor of 1 when that value is 0/missing).
const latencyRows = computed(() => {
  const current = results.value
  if (!current) return []

  const sorted = modelNames.value
    .map(name => ({ name, value: current.models[name].latency_ms }))
    .sort((a, b) => a.value - b.value)

  const slowest = (sorted.length > 0 && sorted[sorted.length - 1].value) || 1
  return sorted.map(row => ({ ...row, pct: Math.round((row.value / slowest) * 100) }))
})
|
||||
|
||||
// ── Helpers ──────────────────────────────────────────────────────────────────
|
||||
// F1 score for one model/label pair; 0 when the pair is absent from results.
function f1For(model: string, label: string): number {
  const entry = results.value?.models[model]?.per_label[label]
  return entry?.f1 ?? 0
}
|
||||
// Support count for one model/label pair; 0 when the pair is absent from results.
function supportFor(model: string, label: string): number {
  const entry = results.value?.models[model]?.per_label[label]
  return entry?.support ?? 0
}
|
||||
|
||||
// Maps a score to a CSS color: success at >= 0.7, accent at >= 0.4, error below.
function scoreColor(v: number): string {
  return v >= 0.7
    ? 'var(--color-success, #4CAF50)'
    : v >= 0.4
      ? 'var(--app-accent, #B8622A)'
      : 'var(--color-error, #ef4444)'
}
|
||||
|
||||
// Heatmap cell background for an F1 value: a green, orange or red tint
// (thresholds 0.7 / 0.4) mixed with the dark base color #1a2338.
function heatmapBg(v: number): string {
  // color-mix() is invalid when a percentage falls outside 0–100%, which the
  // red branch could produce for v just below 0.4 (v * 200 + 30 > 100), so
  // every computed percentage is clamped.
  const pct = (x: number): number => Math.min(100, Math.max(0, Math.round(x)))

  if (v >= 0.7) return `color-mix(in srgb, #4CAF50 ${pct(v * 100)}%, #1a2338 ${pct((1 - v) * 80)}%)`
  if (v >= 0.4) return `color-mix(in srgb, #FF9800 ${pct(v * 120)}%, #1a2338 40%)`
  return `color-mix(in srgb, #ef4444 ${pct(v * 200 + 30)}%, #1a2338 60%)`
}
|
||||
// Heatmap cell text color: solid white from 0.5 upward, translucent white below.
function heatmapFg(v: number): string {
  if (v >= 0.5) {
    return '#fff'
  }
  return 'rgba(255,255,255,0.75)'
}
|
||||
|
||||
// Formats an ISO timestamp for the meta line using the viewer's locale.
// Returns 'unknown date' when the timestamp is missing or cannot be parsed
// (instead of rendering the literal text "Invalid Date").
function formatDate(iso: string | null): string {
  if (!iso) return 'unknown date'
  const d = new Date(iso)
  if (Number.isNaN(d.getTime())) return 'unknown date'
  return d.toLocaleString(undefined, { dateStyle: 'medium', timeStyle: 'short' })
}
|
||||
|
||||
// ── Data loading ─────────────────────────────────────────────────────────────
|
||||
// Fetches the saved benchmark results and stores them when at least one model
// is present; an empty envelope leaves the current `results` untouched.
async function loadResults() {
  loading.value = true
  try {
    const { data } = await useApiFetch<BenchResults>('/api/benchmark/results')
    if (data && Object.keys(data.models).length > 0) {
      results.value = data
    }
  } finally {
    // Always clear the flag so a rejected fetch cannot leave the view stuck
    // on "Loading…".
    loading.value = false
  }
}
|
||||
|
||||
// ── Benchmark run ─────────────────────────────────────────────────────────────
|
||||
// Starts a benchmark run over SSE: resets the log/error state, appends each
// progress message to the log (auto-scrolling it), records error messages,
// and reloads the saved results once the run finishes.
function startBenchmark() {
  runError.value = ''
  runLog.value = []
  running.value = true

  const query = includeSlow.value ? '?include_slow=true' : ''

  useApiSSE(
    '/api/benchmark/run' + query,
    // Per-event handler
    async (event) => {
      if (typeof event.message !== 'string') return
      if (event.type === 'progress') {
        runLog.value.push(event.message)
        // Wait for the new line to render before scrolling to the bottom.
        await nextTick()
        logEl.value?.scrollTo({ top: logEl.value.scrollHeight, behavior: 'smooth' })
      } else if (event.type === 'error') {
        runError.value = event.message
      }
    },
    // Completion handler
    async () => {
      running.value = false
      await loadResults()
    },
    // Failure handler: keep an error already reported by the stream.
    () => {
      running.value = false
      if (runError.value === '') runError.value = 'Connection lost'
    },
  )
}
|
||||
|
||||
onMounted(loadResults)
|
||||
</script>
|
||||
|
||||
<style scoped>
|
||||
.bench-view {
|
||||
max-width: 860px;
|
||||
margin: 0 auto;
|
||||
padding: 1.5rem 1rem 4rem;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1.75rem;
|
||||
}
|
||||
|
||||
.bench-header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
flex-wrap: wrap;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.page-title {
|
||||
font-family: var(--font-display, var(--font-body, sans-serif));
|
||||
font-size: 1.4rem;
|
||||
font-weight: 700;
|
||||
color: var(--app-primary, #2A6080);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
.header-actions {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.75rem;
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
|
||||
.slow-toggle {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.4rem;
|
||||
font-size: 0.85rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
cursor: pointer;
|
||||
user-select: none;
|
||||
}
|
||||
.slow-toggle.disabled { opacity: 0.5; pointer-events: none; }
|
||||
|
||||
.btn-run {
|
||||
padding: 0.45rem 1.1rem;
|
||||
border-radius: 0.375rem;
|
||||
border: none;
|
||||
background: var(--app-primary, #2A6080);
|
||||
color: #fff;
|
||||
font-size: 0.88rem;
|
||||
font-family: var(--font-body, sans-serif);
|
||||
cursor: pointer;
|
||||
transition: opacity 0.15s;
|
||||
}
|
||||
.btn-run:disabled { opacity: 0.5; cursor: not-allowed; }
|
||||
.btn-run:not(:disabled):hover { opacity: 0.85; }
|
||||
|
||||
/* ── Run log ────────────────────────────────────────────── */
|
||||
.run-log {
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
border-radius: 0.5rem;
|
||||
overflow: hidden;
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.78rem;
|
||||
}
|
||||
|
||||
.run-log-title {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
padding: 0.4rem 0.75rem;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
font-size: 0.8rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
}
|
||||
|
||||
.btn-ghost {
|
||||
background: none;
|
||||
border: none;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
cursor: pointer;
|
||||
font-size: 0.78rem;
|
||||
padding: 0.1rem 0.3rem;
|
||||
border-radius: 0.2rem;
|
||||
}
|
||||
.btn-ghost:hover { background: var(--color-border, #d0d7e8); }
|
||||
|
||||
.log-lines {
|
||||
max-height: 200px;
|
||||
overflow-y: auto;
|
||||
padding: 0.5rem 0.75rem;
|
||||
background: var(--color-surface, #fff);
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.1rem;
|
||||
}
|
||||
|
||||
.log-line { color: var(--color-text, #1a2338); line-height: 1.5; }
|
||||
.log-line.log-error { color: var(--color-error, #ef4444); }
|
||||
|
||||
.run-error {
|
||||
margin: 0;
|
||||
padding: 0.4rem 0.75rem;
|
||||
background: color-mix(in srgb, var(--color-error, #ef4444) 10%, transparent);
|
||||
color: var(--color-error, #ef4444);
|
||||
font-size: 0.82rem;
|
||||
font-family: var(--font-mono, monospace);
|
||||
}
|
||||
|
||||
/* ── Status notices ─────────────────────────────────────── */
|
||||
.status-notice {
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
font-size: 0.9rem;
|
||||
padding: 1rem;
|
||||
}
|
||||
.status-notice.empty {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
padding: 3rem 1rem;
|
||||
text-align: center;
|
||||
}
|
||||
.hint { font-size: 0.85rem; opacity: 0.75; }
|
||||
|
||||
/* ── Meta line ──────────────────────────────────────────── */
|
||||
.meta-line {
|
||||
display: flex;
|
||||
gap: 0.5rem;
|
||||
align-items: center;
|
||||
font-size: 0.85rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
font-family: var(--font-mono, monospace);
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
.sep { opacity: 0.4; }
|
||||
|
||||
/* ── Chart sections ─────────────────────────────────────── */
|
||||
.chart-section {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.chart-title {
|
||||
font-size: 0.95rem;
|
||||
font-weight: 600;
|
||||
color: var(--color-text, #1a2338);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
/* ── Bar charts ─────────────────────────────────────────── */
|
||||
.bar-chart {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.4rem;
|
||||
}
|
||||
|
||||
.bar-row {
|
||||
display: grid;
|
||||
grid-template-columns: 14rem 1fr 5rem;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
font-size: 0.82rem;
|
||||
}
|
||||
|
||||
.bar-label {
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.76rem;
|
||||
white-space: nowrap;
|
||||
overflow: hidden;
|
||||
text-overflow: ellipsis;
|
||||
color: var(--color-text, #1a2338);
|
||||
}
|
||||
|
||||
.bar-track {
|
||||
height: 16px;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
border-radius: 99px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.bar-fill {
|
||||
height: 100%;
|
||||
border-radius: 99px;
|
||||
transition: width 0.5s cubic-bezier(0.16, 1, 0.3, 1);
|
||||
}
|
||||
|
||||
.latency-fill { background: var(--app-primary, #2A6080); opacity: 0.65; }
|
||||
|
||||
.bar-value {
|
||||
text-align: right;
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.8rem;
|
||||
font-variant-numeric: tabular-nums;
|
||||
}
|
||||
|
||||
/* ── Heatmap ────────────────────────────────────────────── */
|
||||
.heatmap-scroll {
|
||||
overflow-x: auto;
|
||||
border-radius: 0.5rem;
|
||||
border: 1px solid var(--color-border, #d0d7e8);
|
||||
}
|
||||
|
||||
.heatmap {
|
||||
border-collapse: collapse;
|
||||
min-width: 100%;
|
||||
font-size: 0.78rem;
|
||||
}
|
||||
|
||||
.hm-label-col {
|
||||
text-align: left;
|
||||
min-width: 11rem;
|
||||
padding: 0.4rem 0.6rem;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
font-weight: 600;
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
position: sticky;
|
||||
left: 0;
|
||||
}
|
||||
|
||||
.hm-model-col {
|
||||
min-width: 5rem;
|
||||
max-width: 8rem;
|
||||
padding: 0.4rem 0.5rem;
|
||||
background: var(--color-surface-raised, #e4ebf5);
|
||||
border-bottom: 1px solid var(--color-border, #d0d7e8);
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.7rem;
|
||||
text-overflow: ellipsis;
|
||||
overflow: hidden;
|
||||
white-space: nowrap;
|
||||
text-align: center;
|
||||
}
|
||||
|
||||
.hm-label-cell {
|
||||
padding: 0.35rem 0.6rem;
|
||||
background: var(--color-surface, #fff);
|
||||
border-top: 1px solid var(--color-border, #d0d7e8);
|
||||
white-space: nowrap;
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-size: 0.74rem;
|
||||
position: sticky;
|
||||
left: 0;
|
||||
}
|
||||
|
||||
.hm-emoji { margin-right: 0.3rem; }
|
||||
|
||||
.hm-value-cell {
|
||||
padding: 0.35rem 0.5rem;
|
||||
text-align: center;
|
||||
font-family: var(--font-mono, monospace);
|
||||
font-variant-numeric: tabular-nums;
|
||||
border-top: 1px solid rgba(255,255,255,0.08);
|
||||
cursor: default;
|
||||
transition: filter 0.15s;
|
||||
}
|
||||
.hm-value-cell:hover { filter: brightness(1.15); }
|
||||
|
||||
.heatmap-hint {
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-secondary, #6b7a99);
|
||||
margin: 0;
|
||||
}
|
||||
|
||||
/* ── Mobile tweaks ──────────────────────────────────────── */
|
||||
@media (max-width: 600px) {
|
||||
.bar-row { grid-template-columns: 9rem 1fr 4rem; }
|
||||
.bar-label { font-size: 0.7rem; }
|
||||
.bench-header { flex-direction: column; align-items: flex-start; }
|
||||
}
|
||||
</style>
|
||||
|
|
@ -103,7 +103,7 @@
|
|||
|
||||
<script setup lang="ts">
|
||||
import { ref, watch, onMounted, onUnmounted } from 'vue'
|
||||
import { animate, spring } from 'animejs'
|
||||
import { animate } from 'animejs'
|
||||
import { useLabelStore } from '../stores/label'
|
||||
import { useApiFetch } from '../composables/useApi'
|
||||
import { useHaptics } from '../composables/useHaptics'
|
||||
|
|
@ -132,8 +132,8 @@ watch(isHeld, (held) => {
|
|||
if (!motion.rich.value || !gridEl.value) return
|
||||
animate(gridEl.value,
|
||||
held
|
||||
? { y: -8, opacity: 0.45, ease: spring({ mass: 1, stiffness: 80, damping: 10 }), duration: 250 }
|
||||
: { y: 0, opacity: 1, ease: spring({ mass: 1, stiffness: 80, damping: 10 }), duration: 250 }
|
||||
? { y: -8, opacity: 0.45, ease: 'out(4)', duration: 380 }
|
||||
: { y: 0, opacity: 1, ease: 'out(4)', duration: 320 }
|
||||
)
|
||||
})
|
||||
|
||||
|
|
@ -469,15 +469,11 @@ onUnmounted(() => {
|
|||
padding: 0.5rem 0 0.75rem;
|
||||
z-index: 10;
|
||||
}
|
||||
/* During toss: switch to fixed so the grid is guaranteed in-viewport
|
||||
regardless of scroll position, then fade so ball aura shows through. */
|
||||
/* During toss: stay sticky so the grid holds its natural column position
|
||||
(fixed caused a horizontal jump on desktop due to sidebar offset).
|
||||
Opacity and translateY(-8px) are owned by Anime.js. */
|
||||
.bucket-grid-footer.grid-active {
|
||||
position: fixed;
|
||||
bottom: 0;
|
||||
left: calc(50% - min(50%, 320px));
|
||||
right: calc(50% - min(50%, 320px));
|
||||
opacity: 0.45;
|
||||
/* translateY(-8px) is owned by Anime.js — no transform here */
|
||||
}
|
||||
|
||||
/* ── Toss edge zones ── */
|
||||
|
|
|
|||
Loading…
Reference in a new issue