feat: multi-bench dashboard, API path migration, benchmark reliability fixes
- dashboard: eval card now shows last run + score for all bench types
(classifier, LLM, style, plans) via new _get_recent_bench_runs()
- dashboard: skip cforch LLM-bench list summaries when scanning for
classifier best_macro_f1 (fixes _find_latest_classifier_bench)
- cforch: stale _BENCH_RUNNING flag now auto-resets if process exited;
idle timeout (120s via select) kills hung benchmark if node crashes
- api: add /api/finetune/{run,cancel} backward-compat shims while
ClassifierTab fine-tune section is migrated to TrainJobsView
- ClassifierTab: migrate all /api/benchmark/* paths to /api/cforch/*;
fix null-safety on results.models access; load fine-tuned models from
/api/train/results instead of /api/finetune/status
- CompareTab: extend model picker to include vllm + cf-text alongside
ollama, grouped by service; pre-select all LLM_SERVICES on load
- LlmEvalTab: null-safety on quality_by_task_type lookups
- models: AVOCET_MODELS_DIR env var overrides default models/ path
This commit is contained in:
parent
71bf88d09b
commit
9fdaeeb3d6
8 changed files with 285 additions and 70 deletions
27
app/api.py
27
app/api.py
|
|
@ -40,6 +40,33 @@ app.include_router(plans_bench_router, prefix="/api/plans-bench")
|
||||||
# In-memory last-action store (single user, local tool — in-memory is fine)
|
# In-memory last-action store (single user, local tool — in-memory is fine)
|
||||||
_last_action: dict | None = None
|
_last_action: dict | None = None
|
||||||
|
|
||||||
|
# -- Backward-compat shims (ClassifierTab still uses old /api/finetune/* paths)
|
||||||
|
# Remove once ClassifierTab fine-tune section is migrated to TrainJobsView.
|
||||||
|
|
||||||
|
from fastapi import Query
|
||||||
|
from fastapi.responses import StreamingResponse as _StreamingResponse
|
||||||
|
|
||||||
|
@app.get("/api/finetune/run")
def finetune_run_compat(
    model: str = Query(...),
    epochs: int = Query(5),
) -> _StreamingResponse:
    """Backward-compat shim for the legacy ``/api/finetune/run`` endpoint.

    Builds a ``classifier`` job request from the query parameters, registers
    it through ``create_job`` and passes the new job's id to ``run_job``,
    whose result is returned as the response.

    Args:
        model: Required query parameter; forwarded as the job's ``model_key``.
        epochs: Optional query parameter (default 5); forwarded inside
            ``config_json`` under the ``"epochs"`` key.
    """
    # Function-scope import: app.train.train is only loaded when the shim is hit.
    from app.train.train import CreateJobRequest, create_job, run_job

    request = CreateJobRequest(
        type="classifier",
        model_key=model,
        config_json={"epochs": epochs},
    )
    created = create_job(request)
    return run_job(created["id"])
|
||||||
|
|
||||||
|
@app.post("/api/finetune/cancel")
def finetune_cancel_compat() -> dict:
    """Shim: cancel the most recent running classifier job.

    Selects the newest row (ordered by ``started_at``) from ``jobs`` with
    ``type='classifier'`` and ``status='running'`` and passes its id to
    ``cancel_job``.

    Returns:
        ``{"status": "nothing_running"}`` when no matching job exists,
        otherwise whatever ``cancel_job`` returns for that job id.
    """
    # Function-scope import: app.train.train is only loaded when the shim is hit.
    from app.train.train import _db, _init_db, cancel_job

    _init_db()
    with _db() as conn:
        row = conn.execute(
            "SELECT id FROM jobs WHERE type='classifier' AND status='running' ORDER BY started_at DESC LIMIT 1"
        ).fetchone()

    # NOTE(review): cancel_job is invoked after leaving the `with` block so the
    # lookup connection is not held while delegating — confirm this matches the
    # intended scoping.
    if row is None:
        return {"status": "nothing_running"}
    return cancel_job(row["id"])
|
||||||
|
|
||||||
from app.dashboard import router as dashboard_router
|
from app.dashboard import router as dashboard_router
|
||||||
app.include_router(dashboard_router, prefix="/api")
|
app.include_router(dashboard_router, prefix="/api")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,7 @@ import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import select as _select
|
||||||
import subprocess as _subprocess
|
import subprocess as _subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -311,8 +312,12 @@ def run_benchmark(
|
||||||
"""Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
|
"""Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
|
||||||
global _BENCH_RUNNING, _bench_proc
|
global _BENCH_RUNNING, _bench_proc
|
||||||
|
|
||||||
|
# Check if the process is actually still alive; reset stale flag if not.
|
||||||
if _BENCH_RUNNING:
|
if _BENCH_RUNNING:
|
||||||
raise HTTPException(409, "A benchmark is already running")
|
if _bench_proc is not None and _bench_proc.poll() is None:
|
||||||
|
raise HTTPException(409, "A benchmark is already running")
|
||||||
|
_BENCH_RUNNING = False
|
||||||
|
_bench_proc = None
|
||||||
|
|
||||||
cfg = _load_cforch_config()
|
cfg = _load_cforch_config()
|
||||||
bench_script = cfg.get("bench_script", "")
|
bench_script = cfg.get("bench_script", "")
|
||||||
|
|
@ -436,8 +441,23 @@ def run_benchmark(
|
||||||
env=proc_env,
|
env=proc_env,
|
||||||
)
|
)
|
||||||
_bench_proc = proc
|
_bench_proc = proc
|
||||||
|
_IDLE_TIMEOUT_S = 120 # kill if no output for 2 minutes (node crash)
|
||||||
try:
|
try:
|
||||||
for line in proc.stdout:
|
while True:
|
||||||
|
ready = _select.select([proc.stdout], [], [], _IDLE_TIMEOUT_S)
|
||||||
|
if not ready[0]:
|
||||||
|
# No output for IDLE_TIMEOUT_S — node likely crashed
|
||||||
|
proc.terminate()
|
||||||
|
try:
|
||||||
|
proc.wait(timeout=5)
|
||||||
|
except _subprocess.TimeoutExpired:
|
||||||
|
proc.kill()
|
||||||
|
msg = f"Benchmark timed out — no output for {_IDLE_TIMEOUT_S}s (cluster node may have crashed)"
|
||||||
|
yield f"data: {json.dumps({'type': 'error', 'message': msg})}\n\n"
|
||||||
|
break
|
||||||
|
line = proc.stdout.readline()
|
||||||
|
if not line:
|
||||||
|
break
|
||||||
line = _strip_ansi(line.rstrip())
|
line = _strip_ansi(line.rstrip())
|
||||||
if line:
|
if line:
|
||||||
yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n"
|
yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n"
|
||||||
|
|
|
||||||
109
app/dashboard.py
109
app/dashboard.py
|
|
@ -1,17 +1,18 @@
|
||||||
"""Avocet -- dashboard aggregate API.
|
"""Avocet -- dashboard aggregate API.
|
||||||
|
|
||||||
GET /api/dashboard returns the current flywheel state:
|
GET /api/dashboard returns the current flywheel state:
|
||||||
labeled_since_last_eval -- items labeled after the most recent eval run
|
labeled_since_last_eval -- items labeled after the most recent bench run
|
||||||
last_eval_timestamp -- ISO timestamp of newest bench_results summary
|
last_eval_timestamp -- ISO timestamp of newest bench_results summary
|
||||||
last_eval_best_score -- best macro_f1 from that summary
|
last_eval_best_score -- best macro_f1 from that summary
|
||||||
active_jobs -- jobs with status queued or running
|
active_jobs -- jobs with status queued or running
|
||||||
corrections_pending -- sft_candidates with status=needs_review
|
corrections_pending -- sft_candidates with status=needs_review
|
||||||
corrections_export_ready -- approved sft candidates with non-blank correction
|
corrections_export_ready -- approved sft candidates with non-blank correction
|
||||||
|
recent_bench_runs -- most-recent timestamp + score per bench type
|
||||||
signals -- computed booleans for UI nudge indicators
|
signals -- computed booleans for UI nudge indicators
|
||||||
|
|
||||||
Thresholds in label_tool.yaml pipeline: section:
|
Thresholds in label_tool.yaml pipeline: section:
|
||||||
pipeline:
|
pipeline:
|
||||||
data_eval_threshold: 50 # labeled items since last eval to trigger nudge
|
data_eval_threshold: 50 # labeled items since last bench to trigger nudge
|
||||||
eval_train_threshold: 0.05 # improvement delta needed before retraining (future)
|
eval_train_threshold: 0.05 # improvement delta needed before retraining (future)
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
@ -77,7 +78,7 @@ def _load_score_records() -> list[dict]:
|
||||||
pass
|
pass
|
||||||
return records
|
return records
|
||||||
|
|
||||||
def _find_latest_eval(results_dir_override: str = "") -> tuple[str | None, float | None]:
|
def _find_latest_classifier_bench(results_dir_override: str = "") -> tuple[str | None, float | None]:
|
||||||
"""Return (iso_timestamp, best_macro_f1) from the newest bench_results summary.
|
"""Return (iso_timestamp, best_macro_f1) from the newest bench_results summary.
|
||||||
|
|
||||||
Checks results_dir from cforch config if set, then falls back to
|
Checks results_dir from cforch config if set, then falls back to
|
||||||
|
|
@ -107,6 +108,8 @@ def _find_latest_eval(results_dir_override: str = "") -> tuple[str | None, float
|
||||||
if summary.exists():
|
if summary.exists():
|
||||||
try:
|
try:
|
||||||
data = json.loads(summary.read_text(encoding="utf-8"))
|
data = json.loads(summary.read_text(encoding="utf-8"))
|
||||||
|
if not isinstance(data, dict):
|
||||||
|
continue # cforch LLM-bench summaries are lists; skip
|
||||||
ts = data.get("timestamp") or subdir.name
|
ts = data.get("timestamp") or subdir.name
|
||||||
score = data.get("best_macro_f1") or data.get("macro_f1")
|
score = data.get("best_macro_f1") or data.get("macro_f1")
|
||||||
return ts, (float(score) if isinstance(score, (int, float)) else None)
|
return ts, (float(score) if isinstance(score, (int, float)) else None)
|
||||||
|
|
@ -114,6 +117,10 @@ def _find_latest_eval(results_dir_override: str = "") -> tuple[str | None, float
|
||||||
logger.warning("Failed to parse summary.json at %s: %s", summary, exc)
|
logger.warning("Failed to parse summary.json at %s: %s", summary, exc)
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
|
# Keep old name as alias so existing callers in tests still work.
|
||||||
|
_find_latest_eval = _find_latest_classifier_bench
|
||||||
|
|
||||||
|
|
||||||
def _count_corrections() -> tuple[int, int]:
|
def _count_corrections() -> tuple[int, int]:
|
||||||
"""Return (pending_count, export_ready_count)."""
|
"""Return (pending_count, export_ready_count)."""
|
||||||
pending = 0
|
pending = 0
|
||||||
|
|
@ -169,22 +176,106 @@ def _count_labeled_since(since_ts: str | None) -> int:
|
||||||
return sum(1 for r in records if r.get("labeled_at", "") > since_ts)
|
return sum(1 for r in records if r.get("labeled_at", "") > since_ts)
|
||||||
|
|
||||||
|
|
||||||
|
def _bench_payload_timestamp(payload: object, fallback: str) -> str | None:
    """Pick a display timestamp out of a parsed bench result file.

    Args:
        payload: Parsed JSON content of a result file.
        fallback: Value to use when the payload carries no ``timestamp``
            (callers pass the file stem).

    Returns:
        For a dict, its ``timestamp`` or *fallback*. For a non-empty list, the
        first element's ``timestamp`` (when that element is a dict) or
        *fallback*. ``None`` for an empty list or any other shape.
    """
    if isinstance(payload, dict):
        return payload.get("timestamp") or fallback
    if isinstance(payload, list) and payload:
        head = payload[0]
        if isinstance(head, dict):
            return head.get("timestamp") or fallback
        return fallback
    return None


def _get_recent_bench_runs() -> dict:
    """Return the most recent run summary for each bench type.

    Each entry: {"timestamp": str|None, "metric": str|None, "score": float|None}

    The lookup is best-effort: a missing directory or an unreadable/malformed
    result file leaves that bench type's entry at its defaults and is logged
    as a warning instead of raising.
    """
    runs: dict[str, dict] = {
        "classifier": {"timestamp": None, "metric": "macro_f1", "score": None},
        "llm": {"timestamp": None, "metric": None, "score": None},
        "style": {"timestamp": None, "metric": None, "score": None},
        "plans": {"timestamp": None, "metric": "avg_score", "score": None},
    }

    # ── Classifier: bench_results/<run>/summary.json ──────────────────────
    clf_ts, clf_score = _find_latest_classifier_bench()
    if clf_ts:
        runs["classifier"]["timestamp"] = clf_ts
        runs["classifier"]["score"] = clf_score

    # ── LLM bench + Style: benchmark_results/ ─────────────────────────────
    # Prefer cforch.results_dir from the config file; otherwise fall back to
    # <repo root>/benchmark_results.
    cfg_path = _config_file()
    bench_dir: Path | None = None
    if cfg_path.exists():
        try:
            raw = yaml.safe_load(cfg_path.read_text(encoding="utf-8")) or {}
            rd = (raw.get("cforch", {}) or {}).get("results_dir", "")
            if rd:
                bench_dir = Path(rd)
        except Exception as exc:
            logger.warning("Failed to read cforch.results_dir from %s: %s", cfg_path, exc)
    if bench_dir is None:
        bench_dir = _ROOT / "benchmark_results"

    if bench_dir.exists():
        # Newest non-style JSON file by modification time.
        llm_files = sorted(
            [p for p in bench_dir.glob("*.json") if not p.name.startswith("style_")],
            key=lambda p: p.stat().st_mtime, reverse=True,
        )
        if llm_files:
            newest_llm = llm_files[0]
            try:
                data = json.loads(newest_llm.read_text(encoding="utf-8"))
                # Result files may be a dict or a list of per-model records;
                # calling .get() on a list used to raise and silently leave
                # the timestamp empty.
                runs["llm"]["timestamp"] = _bench_payload_timestamp(data, newest_llm.stem)
            except Exception as exc:
                logger.warning("Failed to parse LLM bench result %s: %s", newest_llm, exc)

        # Style files are ordered by name (descending), not by mtime.
        style_files = sorted(bench_dir.glob("style_*.json"), reverse=True)
        if style_files:
            newest_style = style_files[0]
            try:
                data = json.loads(newest_style.read_text(encoding="utf-8"))
                runs["style"]["timestamp"] = _bench_payload_timestamp(data, newest_style.stem)
            except Exception as exc:
                logger.warning("Failed to parse style bench result %s: %s", newest_style, exc)

    # ── Plans bench: data/plans_bench_results/plans_*.json ────────────────
    plans_dir = _DATA_DIR / "plans_bench_results"
    if plans_dir.exists():
        plans_files = sorted(plans_dir.glob("plans_*.json"), reverse=True)
        if plans_files:
            newest_plans = plans_files[0]
            run_id = newest_plans.stem
            try:
                d: dict = json.loads(newest_plans.read_text(encoding="utf-8"))
                # Average total_score over every non-error result record.
                all_scores = [
                    r["total_score"]
                    for results in d.values()
                    for r in results
                    if isinstance(r, dict) and not r.get("error")
                ]
                avg = round(sum(all_scores) / len(all_scores), 3) if all_scores else None
                try:
                    # "plans_<date>_<HHMM...>" -> "<date> HH:MM"
                    date_part = run_id.removeprefix("plans_")
                    date, time_part = date_part.split("_")
                    ts_display = f"{date} {time_part[:2]}:{time_part[2:4]}"
                except ValueError:
                    # Stem does not split into exactly two parts; show it raw.
                    ts_display = run_id
                runs["plans"]["timestamp"] = ts_display
                runs["plans"]["score"] = avg
            except Exception as exc:
                logger.warning("Failed to parse plans bench result %s: %s", newest_plans, exc)

    return runs
|
||||||
|
|
||||||
|
|
||||||
@router.get("/dashboard")
|
@router.get("/dashboard")
|
||||||
def get_dashboard() -> dict:
|
def get_dashboard() -> dict:
|
||||||
data_eval_threshold, eval_train_threshold = _load_thresholds()
|
data_threshold, _train_threshold = _load_thresholds()
|
||||||
last_eval_ts, last_eval_score = _find_latest_eval()
|
last_ts, last_score = _find_latest_classifier_bench()
|
||||||
labeled_since = _count_labeled_since(last_eval_ts)
|
labeled_since = _count_labeled_since(last_ts)
|
||||||
corrections_pending, corrections_export_ready = _count_corrections()
|
corrections_pending, corrections_export_ready = _count_corrections()
|
||||||
active_jobs = _get_active_jobs()
|
active_jobs = _get_active_jobs()
|
||||||
|
recent_bench = _get_recent_bench_runs()
|
||||||
return {
|
return {
|
||||||
"labeled_since_last_eval": labeled_since,
|
"labeled_since_last_eval": labeled_since,
|
||||||
"last_eval_timestamp": last_eval_ts,
|
"last_eval_timestamp": last_ts,
|
||||||
"last_eval_best_score": last_eval_score,
|
"last_eval_best_score": last_score,
|
||||||
"active_jobs": active_jobs,
|
"active_jobs": active_jobs,
|
||||||
"corrections_pending": corrections_pending,
|
"corrections_pending": corrections_pending,
|
||||||
"corrections_export_ready": corrections_export_ready,
|
"corrections_export_ready": corrections_export_ready,
|
||||||
|
"recent_bench_runs": recent_bench,
|
||||||
"signals": {
|
"signals": {
|
||||||
"data_to_eval": labeled_since >= data_eval_threshold,
|
"data_to_eval": labeled_since >= data_threshold,
|
||||||
"eval_to_train": False, # future: implement delta-F1 comparison
|
"eval_to_train": False, # future: implement delta-F1 comparison
|
||||||
"train_to_fleet": False, # future: implement fleet sync signal
|
"train_to_fleet": False, # future: implement fleet sync signal
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -38,13 +38,17 @@ except ImportError: # pragma: no cover
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
_ROOT = Path(__file__).parent.parent
|
_ROOT = Path(__file__).parent.parent
|
||||||
_MODELS_DIR: Path = _ROOT / "models"
|
_MODELS_DIR: Path = Path(
|
||||||
|
os.environ.get("AVOCET_MODELS_DIR", str(_ROOT / "models"))
|
||||||
|
)
|
||||||
_QUEUE_DIR: Path = _ROOT / "data"
|
_QUEUE_DIR: Path = _ROOT / "data"
|
||||||
|
|
||||||
# Service-specific model destinations.
|
# Service-specific model destinations.
|
||||||
# cf-text models land on the NFS-mounted shared asset store so every cluster
|
# cf-text models land on the NFS-mounted shared asset store so every cluster
|
||||||
# node can reach them without a separate download. Avocet classifiers stay local
|
# node can reach them without a separate download. Avocet classifiers default
|
||||||
# because they are fine-tuned in-place and are only consumed by avocet itself.
|
# to a local path but can be redirected via AVOCET_MODELS_DIR — set this to
|
||||||
|
# /Library/Assets/LLM/avocet/models on NFS-connected nodes to keep all model
|
||||||
|
# weights out of the repo directory.
|
||||||
# Override via CF_TEXT_MODELS_DIR env var (useful for dev / non-NFS setups).
|
# Override via CF_TEXT_MODELS_DIR env var (useful for dev / non-NFS setups).
|
||||||
_CF_TEXT_MODELS_DIR: Path = Path(
|
_CF_TEXT_MODELS_DIR: Path = Path(
|
||||||
os.environ.get("CF_TEXT_MODELS_DIR", "/Library/Assets/LLM/cf-text/models")
|
os.environ.get("CF_TEXT_MODELS_DIR", "/Library/Assets/LLM/cf-text/models")
|
||||||
|
|
|
||||||
|
|
@ -325,7 +325,7 @@ function toggleCategory(models: AvailableModel[], checked: boolean) {
|
||||||
|
|
||||||
async function loadModelCategories() {
|
async function loadModelCategories() {
|
||||||
modelsLoading.value = true
|
modelsLoading.value = true
|
||||||
const { data } = await useApiFetch<ModelCategoriesResponse>('/api/benchmark/models')
|
const { data } = await useApiFetch<ModelCategoriesResponse>('/api/cforch/models')
|
||||||
modelsLoading.value = false
|
modelsLoading.value = false
|
||||||
if (data?.categories) {
|
if (data?.categories) {
|
||||||
modelCategories.value = data.categories
|
modelCategories.value = data.categories
|
||||||
|
|
@ -342,7 +342,7 @@ const modelCount = computed(() => modelNames.value.length)
|
||||||
const labelNames = computed(() => {
|
const labelNames = computed(() => {
|
||||||
const canonical = Object.keys(LABEL_META)
|
const canonical = Object.keys(LABEL_META)
|
||||||
const inResults = new Set(
|
const inResults = new Set(
|
||||||
modelNames.value.flatMap(n => Object.keys(results.value!.models[n].per_label))
|
modelNames.value.flatMap(n => Object.keys(results.value?.models[n]?.per_label ?? {}))
|
||||||
)
|
)
|
||||||
return [...canonical.filter(l => inResults.has(l)), ...[...inResults].filter(l => !canonical.includes(l))]
|
return [...canonical.filter(l => inResults.has(l)), ...[...inResults].filter(l => !canonical.includes(l))]
|
||||||
})
|
})
|
||||||
|
|
@ -401,16 +401,16 @@ function formatDate(iso: string | null): string {
|
||||||
// ── Data loading ─────────────────────────────────────────────────────────────
|
// ── Data loading ─────────────────────────────────────────────────────────────
|
||||||
async function loadResults() {
|
async function loadResults() {
|
||||||
loading.value = true
|
loading.value = true
|
||||||
const { data } = await useApiFetch<BenchResults>('/api/benchmark/results')
|
const { data } = await useApiFetch<BenchResults>('/api/cforch/results')
|
||||||
loading.value = false
|
loading.value = false
|
||||||
if (data && Object.keys(data.models).length > 0) {
|
if (data?.models && Object.keys(data.models).length > 0) {
|
||||||
results.value = data
|
results.value = data
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function loadFineTunedModels() {
|
async function loadFineTunedModels() {
|
||||||
const { data } = await useApiFetch<FineTunedModel[]>('/api/finetune/status')
|
const { data } = await useApiFetch<{ results: FineTunedModel[] }>('/api/train/results')
|
||||||
if (Array.isArray(data)) fineTunedModels.value = data
|
if (Array.isArray(data?.results)) fineTunedModels.value = data.results
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Benchmark run ────────────────────────────────────────────────────────────
|
// ── Benchmark run ────────────────────────────────────────────────────────────
|
||||||
|
|
@ -428,7 +428,7 @@ function startBenchmark() {
|
||||||
params.set('model_names', [...selectedModels.value].join(','))
|
params.set('model_names', [...selectedModels.value].join(','))
|
||||||
}
|
}
|
||||||
const qs = params.toString()
|
const qs = params.toString()
|
||||||
const url = `/api/benchmark/run${qs ? `?${qs}` : ''}`
|
const url = `/api/cforch/run${qs ? `?${qs}` : ''}`
|
||||||
useApiSSE(
|
useApiSSE(
|
||||||
url,
|
url,
|
||||||
async (event) => {
|
async (event) => {
|
||||||
|
|
@ -457,7 +457,7 @@ function startBenchmark() {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function cancelBenchmark() {
|
async function cancelBenchmark() {
|
||||||
await fetch('/api/benchmark/cancel', { method: 'POST' }).catch(() => {})
|
await fetch('/api/cforch/cancel', { method: 'POST' }).catch(() => {})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ── Fine-tune ─────────────────────────────────────────────────────────────────
|
// ── Fine-tune ─────────────────────────────────────────────────────────────────
|
||||||
|
|
|
||||||
|
|
@ -71,32 +71,35 @@
|
||||||
rows="6"
|
rows="6"
|
||||||
/>
|
/>
|
||||||
|
|
||||||
<!-- Ollama model picker -->
|
<!-- LLM model picker (ollama + vllm + cf-text) -->
|
||||||
<details class="model-picker" open>
|
<details class="model-picker" open>
|
||||||
<summary class="picker-summary">
|
<summary class="picker-summary">
|
||||||
<span class="picker-title">🤖 Ollama Models</span>
|
<span class="picker-title">🤖 LLM Models</span>
|
||||||
<span class="picker-badge">{{ cmpSelectedModels.size }} / {{ ollamaLlmModels.length }}</span>
|
<span class="picker-badge">{{ cmpSelectedModels.size }} / {{ llmSelectableModels.length }}</span>
|
||||||
</summary>
|
</summary>
|
||||||
<div class="picker-body">
|
<div class="picker-body">
|
||||||
<label class="picker-cat-header">
|
<label class="picker-cat-header">
|
||||||
<input
|
<input
|
||||||
type="checkbox"
|
type="checkbox"
|
||||||
:checked="cmpSelectedModels.size === ollamaLlmModels.length"
|
:checked="cmpSelectedModels.size === llmSelectableModels.length"
|
||||||
:indeterminate="cmpSelectedModels.size > 0 && cmpSelectedModels.size < ollamaLlmModels.length"
|
:indeterminate="cmpSelectedModels.size > 0 && cmpSelectedModels.size < llmSelectableModels.length"
|
||||||
@change="toggleAllCmpModels(($event.target as HTMLInputElement).checked)"
|
@change="toggleAllCmpModels(($event.target as HTMLInputElement).checked)"
|
||||||
/>
|
/>
|
||||||
<span class="picker-cat-name">All ollama models</span>
|
<span class="picker-cat-name">All LLM models</span>
|
||||||
</label>
|
</label>
|
||||||
<div class="picker-model-list">
|
<div v-for="(models, service) in llmModelsByService" :key="service" class="picker-category">
|
||||||
<label v-for="m in ollamaLlmModels" :key="m.id" class="picker-model-row">
|
<span class="picker-cat-section">{{ service }}</span>
|
||||||
<input
|
<div class="picker-model-list">
|
||||||
type="checkbox"
|
<label v-for="m in models" :key="m.id" class="picker-model-row">
|
||||||
:checked="cmpSelectedModels.has(m.id)"
|
<input
|
||||||
@change="toggleCmpModel(m.id, ($event.target as HTMLInputElement).checked)"
|
type="checkbox"
|
||||||
/>
|
:checked="cmpSelectedModels.has(m.id)"
|
||||||
<span class="picker-model-name">{{ m.name }}</span>
|
@change="toggleCmpModel(m.id, ($event.target as HTMLInputElement).checked)"
|
||||||
<span class="picker-adapter-type">{{ m.tags.slice(0, 3).join(', ') }}</span>
|
/>
|
||||||
</label>
|
<span class="picker-model-name">{{ m.name }}</span>
|
||||||
|
<span class="picker-adapter-type">{{ m.tags.slice(0, 2).join(', ') }}</span>
|
||||||
|
</label>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</details>
|
</details>
|
||||||
|
|
@ -232,10 +235,22 @@ const cmpResults = ref<CmpResult[]>([])
|
||||||
const cmpEventSource = ref<EventSource | null>(null)
|
const cmpEventSource = ref<EventSource | null>(null)
|
||||||
|
|
||||||
// ── Computed ────────────────────────────────────────────────────────────────
|
// ── Computed ────────────────────────────────────────────────────────────────
|
||||||
const ollamaLlmModels = computed(() =>
|
const LLM_SERVICES = new Set(['ollama', 'vllm', 'cf-text'])
|
||||||
llmModels.value.filter(m => m.service === 'ollama')
|
|
||||||
|
const llmSelectableModels = computed(() =>
|
||||||
|
llmModels.value.filter(m => LLM_SERVICES.has(m.service))
|
||||||
)
|
)
|
||||||
|
|
||||||
|
/**
 * Selectable LLM models bucketed by their `service` field, for the picker UI.
 * Services appear in the order they are first seen in `llmSelectableModels`;
 * models keep their relative order within each bucket.
 */
const llmModelsByService = computed((): Record<string, CfOrchModel[]> =>
  llmSelectableModels.value.reduce<Record<string, CfOrchModel[]>>((byService, model) => {
    // Create the bucket on first sight of a service, then append.
    const bucket = byService[model.service] ?? (byService[model.service] = [])
    bucket.push(model)
    return byService
  }, {})
)
|
||||||
|
|
||||||
const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
|
const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
|
||||||
const groups: Record<string, CfOrchTask[]> = {}
|
const groups: Record<string, CfOrchTask[]> = {}
|
||||||
for (const t of llmTasks.value) {
|
for (const t of llmTasks.value) {
|
||||||
|
|
@ -270,7 +285,7 @@ function toggleCmpModel(id: string, checked: boolean) {
|
||||||
|
|
||||||
function toggleAllCmpModels(checked: boolean) {
|
function toggleAllCmpModels(checked: boolean) {
|
||||||
cmpSelectedModels.value = checked
|
cmpSelectedModels.value = checked
|
||||||
? new Set(ollamaLlmModels.value.map(m => m.id))
|
? new Set(llmSelectableModels.value.map(m => m.id))
|
||||||
: new Set()
|
: new Set()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -288,9 +303,8 @@ async function loadLlmModels() {
|
||||||
const { data } = await useApiFetch<{ models: CfOrchModel[] }>('/api/cforch/models')
|
const { data } = await useApiFetch<{ models: CfOrchModel[] }>('/api/cforch/models')
|
||||||
if (data?.models) {
|
if (data?.models) {
|
||||||
llmModels.value = data.models
|
llmModels.value = data.models
|
||||||
// Pre-select all ollama models
|
|
||||||
cmpSelectedModels.value = new Set(
|
cmpSelectedModels.value = new Set(
|
||||||
data.models.filter(m => m.service === 'ollama').map(m => m.id)
|
data.models.filter(m => LLM_SERVICES.has(m.service)).map(m => m.id)
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -28,9 +28,6 @@
|
||||||
<span class="metric-label"> labeled since last eval</span>
|
<span class="metric-label"> labeled since last eval</span>
|
||||||
</p>
|
</p>
|
||||||
</div>
|
</div>
|
||||||
<div v-if="data.signals.data_to_eval" class="card-cta">
|
|
||||||
<RouterLink to="/eval/benchmark" class="cta-btn">Run Eval</RouterLink>
|
|
||||||
</div>
|
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- ② Eval card -->
|
<!-- ② Eval card -->
|
||||||
|
|
@ -40,18 +37,28 @@
|
||||||
<h2 class="card-title">Eval</h2>
|
<h2 class="card-title">Eval</h2>
|
||||||
</div>
|
</div>
|
||||||
<div class="card-body">
|
<div class="card-body">
|
||||||
<p class="card-metric">
|
<div class="bench-run-table">
|
||||||
<span class="metric-label">Last run: </span>
|
<div
|
||||||
<strong class="metric-value">{{ formattedEvalTime }}</strong>
|
v-for="(run, type) in data.recent_bench_runs"
|
||||||
</p>
|
:key="type"
|
||||||
<p v-if="data.last_eval_best_score != null" class="card-metric">
|
class="bench-run-row"
|
||||||
<span class="metric-label">Best score: </span>
|
>
|
||||||
<strong class="metric-value">{{ formatScore(data.last_eval_best_score) }}</strong>
|
<span class="bench-type-label">{{ BENCH_LABELS[type as BenchType] ?? type }}</span>
|
||||||
</p>
|
<span class="bench-run-time" :class="{ 'metric-muted': !run.timestamp }">
|
||||||
|
{{ run.timestamp ? formatBenchTs(run.timestamp) : '—' }}
|
||||||
|
</span>
|
||||||
|
<span v-if="run.score != null" class="bench-run-score">
|
||||||
|
{{ formatScore(run.score) }}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div v-if="data.signals.eval_to_train" class="card-cta">
|
<div v-if="data.signals.eval_to_train" class="card-cta">
|
||||||
<RouterLink to="/train/jobs" class="cta-btn">Queue Finetune</RouterLink>
|
<RouterLink to="/train/jobs" class="cta-btn">Queue Finetune</RouterLink>
|
||||||
</div>
|
</div>
|
||||||
|
<div v-if="data.signals.data_to_eval" class="card-cta">
|
||||||
|
<RouterLink to="/eval/benchmark" class="cta-btn">Run Eval</RouterLink>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<!-- ③ Train card -->
|
<!-- ③ Train card -->
|
||||||
|
|
@ -104,33 +111,49 @@ interface DashboardSignals {
|
||||||
train_to_fleet: boolean
|
train_to_fleet: boolean
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface BenchRun {
|
||||||
|
timestamp: string | null
|
||||||
|
metric: string | null
|
||||||
|
score: number | null
|
||||||
|
}
|
||||||
|
|
||||||
|
type BenchType = 'classifier' | 'llm' | 'style' | 'plans'
|
||||||
|
|
||||||
interface DashboardData {
|
interface DashboardData {
|
||||||
labeled_since_last_eval: number
|
labeled_since_last_eval: number
|
||||||
last_eval_timestamp: string | null
|
last_eval_timestamp: string | null
|
||||||
last_eval_best_score: number | null
|
last_eval_best_score: number | null
|
||||||
active_jobs: ActiveJob[]
|
active_jobs: ActiveJob[]
|
||||||
corrections_export_ready: number
|
corrections_export_ready: number
|
||||||
|
recent_bench_runs: Record<BenchType, BenchRun>
|
||||||
signals: DashboardSignals
|
signals: DashboardSignals
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const BENCH_LABELS: Record<BenchType, string> = {
|
||||||
|
classifier: 'Classifier',
|
||||||
|
llm: 'LLM Eval',
|
||||||
|
style: 'Style',
|
||||||
|
plans: 'Planning',
|
||||||
|
}
|
||||||
|
|
||||||
const data = ref<DashboardData | null>(null)
|
const data = ref<DashboardData | null>(null)
|
||||||
const loading = ref(false)
|
const loading = ref(false)
|
||||||
const error = ref<string | null>(null)
|
const error = ref<string | null>(null)
|
||||||
|
|
||||||
const formattedEvalTime = computed(() => {
|
function formatBenchTs(ts: string): string {
|
||||||
if (!data.value?.last_eval_timestamp) return 'Never'
|
const date = new Date(ts)
|
||||||
const date = new Date(data.value.last_eval_timestamp)
|
if (!isNaN(date.getTime())) {
|
||||||
if (isNaN(date.getTime())) return 'Unknown'
|
const diff = Date.now() - date.getTime()
|
||||||
const now = Date.now()
|
const mins = Math.floor(diff / 60000)
|
||||||
const diff = now - date.getTime()
|
if (mins < 1) return 'just now'
|
||||||
const mins = Math.floor(diff / 60000)
|
if (mins < 60) return `${mins}m ago`
|
||||||
if (mins < 1) return 'just now'
|
const hrs = Math.floor(mins / 60)
|
||||||
if (mins < 60) return `${mins}m ago`
|
if (hrs < 24) return `${hrs}h ago`
|
||||||
const hrs = Math.floor(mins / 60)
|
return `${Math.floor(hrs / 24)}d ago`
|
||||||
if (hrs < 24) return `${hrs}h ago`
|
}
|
||||||
const days = Math.floor(hrs / 24)
|
// Non-ISO: show as-is (plans bench uses "YYYY-MM-DD HH:MM")
|
||||||
return `${days}d ago`
|
return ts.length > 16 ? ts.slice(0, 16) : ts
|
||||||
})
|
}
|
||||||
|
|
||||||
function formatScore(score: number): string {
|
function formatScore(score: number): string {
|
||||||
return `${(score * 100).toFixed(1)}%`
|
return `${(score * 100).toFixed(1)}%`
|
||||||
|
|
@ -285,6 +308,42 @@ onMounted(() => load())
|
||||||
|
|
||||||
.cta-btn:hover { background: color-mix(in srgb, var(--app-primary, #2A6080) 85%, black); }
|
.cta-btn:hover { background: color-mix(in srgb, var(--app-primary, #2A6080) 85%, black); }
|
||||||
|
|
||||||
|
/* ── Bench run table ── */
|
||||||
|
.bench-run-table {
|
||||||
|
display: flex;
|
||||||
|
flex-direction: column;
|
||||||
|
gap: 0.3rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.bench-run-row {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: 6rem 1fr auto;
|
||||||
|
align-items: center;
|
||||||
|
gap: 0.5rem;
|
||||||
|
font-size: 0.82rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.bench-type-label {
|
||||||
|
font-weight: 600;
|
||||||
|
color: var(--color-text, #1a2338);
|
||||||
|
font-size: 0.78rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.bench-run-time {
|
||||||
|
color: var(--color-text-secondary, #6b7a99);
|
||||||
|
font-size: 0.78rem;
|
||||||
|
}
|
||||||
|
|
||||||
|
.bench-run-score {
|
||||||
|
font-family: var(--font-mono, monospace);
|
||||||
|
font-size: 0.75rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: var(--app-primary, #2A6080);
|
||||||
|
background: color-mix(in srgb, var(--app-primary, #2A6080) 10%, transparent);
|
||||||
|
padding: 0.1rem 0.35rem;
|
||||||
|
border-radius: 0.25rem;
|
||||||
|
}
|
||||||
|
|
||||||
/* ── Job pills ── */
|
/* ── Job pills ── */
|
||||||
.job-row {
|
.job-row {
|
||||||
display: flex;
|
display: flex;
|
||||||
|
|
|
||||||
|
|
@ -302,7 +302,7 @@ const llmModelBadge = computed(() => {
|
||||||
const llmTaskTypeCols = computed(() => {
|
const llmTaskTypeCols = computed(() => {
|
||||||
const types = new Set<string>()
|
const types = new Set<string>()
|
||||||
for (const r of llmResults.value) {
|
for (const r of llmResults.value) {
|
||||||
for (const k of Object.keys(r.quality_by_task_type)) types.add(k)
|
for (const k of Object.keys(r.quality_by_task_type ?? {})) types.add(k)
|
||||||
}
|
}
|
||||||
return [...types].sort()
|
return [...types].sort()
|
||||||
})
|
})
|
||||||
|
|
@ -338,7 +338,7 @@ const llmBestByCol = computed((): Record<string, string> => {
|
||||||
for (const col of llmTaskTypeCols.value) {
|
for (const col of llmTaskTypeCols.value) {
|
||||||
bestId = ''; bestVal = -Infinity
|
bestId = ''; bestVal = -Infinity
|
||||||
for (const r of llmResults.value) {
|
for (const r of llmResults.value) {
|
||||||
const v = r.quality_by_task_type[col]
|
const v = r.quality_by_task_type?.[col]
|
||||||
if (v != null && v > bestVal) { bestVal = v; bestId = r.model_id }
|
if (v != null && v > bestVal) { bestVal = v; bestId = r.model_id }
|
||||||
}
|
}
|
||||||
best[col] = bestId
|
best[col] = bestId
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue