- dashboard: eval card now shows last run + score for all bench types
(classifier, LLM, style, plans) via new _get_recent_bench_runs()
- dashboard: skip cforch LLM-bench list summaries when scanning for
classifier best_macro_f1 (fixes _find_latest_classifier_bench)
- cforch: stale _BENCH_RUNNING flag now auto-resets if process exited;
idle timeout (120s via select) kills hung benchmark if node crashes
- api: add /api/finetune/{run,cancel} backward-compat shims while
ClassifierTab fine-tune section is migrated to TrainJobsView
- ClassifierTab: migrate all /api/benchmark/* paths to /api/cforch/*;
fix null-safety on results.models access; load fine-tuned models from
/api/train/results instead of /api/finetune/status
- CompareTab: extend model picker to include vllm + cf-text alongside
ollama, grouped by service; pre-select all LLM_SERVICES on load
- LlmEvalTab: null-safety on quality_by_task_type lookups
- models: AVOCET_MODELS_DIR env var overrides default models/ path
282 lines
11 KiB
Python
282 lines
11 KiB
Python
"""Avocet -- dashboard aggregate API.

GET /api/dashboard returns the current flywheel state:
    labeled_since_last_eval  -- items labeled after the most recent bench run
    last_eval_timestamp      -- ISO timestamp of newest bench_results summary
    last_eval_best_score     -- best macro_f1 from that summary
    active_jobs              -- jobs with status queued or running
    corrections_pending      -- sft_candidates with status=needs_review
    corrections_export_ready -- approved sft candidates with non-blank correction
    recent_bench_runs        -- most-recent timestamp + score per bench type
    signals                  -- computed booleans for UI nudge indicators

Thresholds in label_tool.yaml pipeline: section:
    pipeline:
      data_eval_threshold: 50     # labeled items since last bench to trigger nudge
      eval_train_threshold: 0.05  # improvement delta needed before retraining (future)
"""
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import yaml
|
|
from pathlib import Path
|
|
|
|
from fastapi import APIRouter
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Two directory levels up from this file; base for the default config and
# bench-result locations used below.
_ROOT = Path(__file__).parent.parent

# Directory the JSONL data files are read from; rebindable via set_data_dir().
_DATA_DIR: Path = _ROOT / "data"

# Optional config directory; None means "use _ROOT / 'config'"
# (see _config_file()). Rebindable via set_config_dir().
_CONFIG_DIR: Path | None = None

router = APIRouter()

# Fallback thresholds used when label_tool.yaml does not provide them.
_DEFAULT_DATA_EVAL_THRESHOLD = 50
_DEFAULT_EVAL_TRAIN_THRESHOLD = 0.05
|
|
|
|
|
|
def set_data_dir(path: Path) -> None:
    """Rebind the module-level data directory.

    The JSONL readers in this module resolve their files against ``_DATA_DIR``
    at call time, so the new location takes effect on the next request.

    Args:
        path: Directory to read the data files from.
    """
    global _DATA_DIR
    _DATA_DIR = path
|
|
|
|
def set_config_dir(path: Path | None) -> None:
    """Rebind the module-level config directory.

    Args:
        path: Directory expected to contain ``label_tool.yaml``, or ``None``
            to go back to the default ``_ROOT / "config"`` location.
    """
    global _CONFIG_DIR
    _CONFIG_DIR = path
|
|
|
|
def _config_file() -> Path:
    """Return the path of ``label_tool.yaml``.

    The file is looked up in ``_CONFIG_DIR`` when one has been set, otherwise
    in ``_ROOT / "config"``. The path is returned whether or not it exists.
    """
    base_dir = _ROOT / "config" if _CONFIG_DIR is None else _CONFIG_DIR
    return base_dir / "label_tool.yaml"
|
|
|
|
def _load_thresholds() -> tuple[int, float]:
    """Read the pipeline thresholds from ``label_tool.yaml``.

    Returns:
        ``(data_eval_threshold, eval_train_threshold)``. The module defaults
        are returned when the config file is absent, and also (after logging
        a warning) when it cannot be read or a value cannot be converted.
    """
    defaults = (_DEFAULT_DATA_EVAL_THRESHOLD, _DEFAULT_EVAL_TRAIN_THRESHOLD)

    config_path = _config_file()
    if not config_path.exists():
        return defaults

    try:
        document = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
        # "pipeline:" may be present but empty (None) -- treat as no overrides.
        section = document.get("pipeline", {}) or {}
        data_eval = int(section.get("data_eval_threshold", _DEFAULT_DATA_EVAL_THRESHOLD))
        eval_train = float(section.get("eval_train_threshold", _DEFAULT_EVAL_TRAIN_THRESHOLD))
    except Exception as exc:
        logger.warning("Failed to read pipeline thresholds: %s", exc)
        return defaults
    return data_eval, eval_train
|
|
|
|
def _load_score_records() -> list[dict]:
    """Load the records stored in ``_DATA_DIR / "email_score.jsonl"``.

    The file is read as JSON Lines on a best-effort basis: blank lines, lines
    that are not valid JSON, and lines whose JSON value is not an object are
    skipped, so every returned element is a ``dict``.

    Returns:
        The decoded records in file order; ``[]`` when the file is absent.
    """
    path = _DATA_DIR / "email_score.jsonl"
    if not path.exists():
        return []

    records: list[dict] = []
    # Iterate the file object instead of using str.splitlines(): splitlines()
    # also breaks on U+2028/U+2029/NEL/VT/FF, which may legally appear
    # unescaped inside a JSON string and would cut such a record in two.
    with path.open(encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except json.JSONDecodeError:
                continue  # best effort: ignore malformed lines
            # Callers use dict methods on every record, so drop scalars/lists.
            if isinstance(record, dict):
                records.append(record)
    return records
|
|
|
|
def _find_latest_classifier_bench(results_dir_override: str = "") -> tuple[str | None, float | None]:
    """Return ``(timestamp, best_macro_f1)`` from the newest classifier bench summary.

    Each candidate results directory is scanned for run sub-directories,
    newest first by name. The first ``<run>/summary.json`` holding a JSON
    object wins; list-shaped summaries (cforch LLM-bench output) are skipped,
    and unreadable ones are logged and skipped.

    Args:
        results_dir_override: Results directory to search instead of the
            configured ones. Empty string means: ``cforch.results_dir`` from
            ``label_tool.yaml`` if set, then ``_ROOT / "bench_results"``.

    Returns:
        ``(timestamp, score)`` where ``timestamp`` is the summary's
        ``timestamp`` value (or the run directory name when absent) as a
        string, and ``score`` is ``best_macro_f1`` (falling back to
        ``macro_f1``) as a float, or ``None`` if neither is numeric.
        ``(None, None)`` when no usable summary exists.
    """
    candidates: list[Path] = []
    if results_dir_override:
        candidates.append(Path(results_dir_override))
    else:
        f = _config_file()
        if f.exists():
            try:
                raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
                rd = (raw.get("cforch", {}) or {}).get("results_dir", "")
                if rd:
                    candidates.append(Path(rd))
            except Exception as exc:
                logger.warning("Failed to read cforch.results_dir from config: %s", exc)
        # NOTE(review): indentation was lost in the reviewed copy of this file;
        # the default fallback is assumed to apply only when no override is
        # given -- confirm against the callers that pass an override.
        candidates.append(_ROOT / "bench_results")

    for rdir in candidates:
        # is_dir() rather than exists(): iterdir() raises on a regular file.
        if not rdir.is_dir():
            continue
        run_dirs = sorted(
            (d for d in rdir.iterdir() if d.is_dir()),
            key=lambda d: d.name,
            reverse=True,
        )
        for subdir in run_dirs:
            summary = subdir / "summary.json"
            if not summary.exists():
                continue
            try:
                data = json.loads(summary.read_text(encoding="utf-8"))
                if not isinstance(data, dict):
                    continue  # cforch LLM-bench summaries are lists; skip
                ts = data.get("timestamp") or subdir.name
                # Take the first key that holds a real number. A plain
                # `a or b` would discard a legitimate score of 0.0.
                score: float | None = None
                for key in ("best_macro_f1", "macro_f1"):
                    value = data.get(key)
                    # bool is an int subclass but is never a meaningful score.
                    if isinstance(value, (int, float)) and not isinstance(value, bool):
                        score = float(value)
                        break
                # str(): callers compare the timestamp against string values.
                return str(ts), score
            except Exception as exc:
                logger.warning("Failed to parse summary.json at %s: %s", summary, exc)
    return None, None


# Keep old name as alias so existing callers in tests still work.
_find_latest_eval = _find_latest_classifier_bench
|
|
|
|
|
|
def _read_jsonl_dicts(path: Path) -> list[dict]:
    """Return the JSON objects stored one per line in *path* (``[]`` if absent).

    Blank lines, malformed lines, and lines whose JSON value is not an object
    are skipped.
    """
    if not path.exists():
        return []
    rows: list[dict] = []
    # File iteration only splits on real newlines; str.splitlines() would also
    # split on U+2028 and similar characters that can occur inside JSON strings.
    with path.open(encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue
            try:
                row = json.loads(text)
            except json.JSONDecodeError:
                continue  # best effort: ignore malformed lines
            if isinstance(row, dict):
                rows.append(row)
    return rows


def _count_corrections() -> tuple[int, int]:
    """Return ``(pending_count, export_ready_count)``.

    * ``pending_count``: records in ``sft_candidates.jsonl`` whose ``status``
      is ``"needs_review"``.
    * ``export_ready_count``: records in ``sft_approved.jsonl`` whose
      ``status`` is ``"approved"`` and whose ``corrected_response`` is
      non-blank after stripping whitespace.

    Both files are read from ``_DATA_DIR``; a missing file contributes 0.
    """
    pending = sum(
        1
        for row in _read_jsonl_dicts(_DATA_DIR / "sft_candidates.jsonl")
        if row.get("status") == "needs_review"
    )
    export_ready = sum(
        1
        for row in _read_jsonl_dicts(_DATA_DIR / "sft_approved.jsonl")
        if row.get("status") == "approved"
        # Falsy values (None, "", missing) and whitespace-only text do not count.
        and str(row.get("corrected_response") or "").strip()
    )
    return pending, export_ready
|
|
|
|
def _get_active_jobs() -> list[dict]:
    """Return the queued/running jobs recorded in the train SQLite DB.

    The train module is imported lazily inside the function. Any failure
    (import error, DB error) is logged as a warning and reported as no jobs.

    Returns:
        One ``{"id", "type", "model_key", "status"}`` dict per active job;
        ``[]`` when the DB file is absent or the query fails.
    """
    columns = ("id", "type", "model_key", "status")
    try:
        from app.train.train import _DB_PATH, _db, _init_db

        if not _DB_PATH.exists():
            return []
        _init_db()
        with _db() as conn:
            cursor = conn.execute(
                "SELECT id, type, model_key, status FROM jobs WHERE status IN ('queued', 'running')"
            )
            job_rows = cursor.fetchall()
            return [{column: row[column] for column in columns} for row in job_rows]
    except Exception as exc:
        logger.warning("Failed to query train jobs DB: %s", exc)
        return []
|
|
|
|
def _count_labeled_since(since_ts: str | None) -> int:
    """Count score records labeled after *since_ts*.

    Args:
        since_ts: Timestamp of the last bench run, or ``None`` when there has
            been none.

    Returns:
        The total number of score records when ``since_ts`` is ``None``;
        otherwise the number of records whose ``labeled_at`` string sorts
        strictly after ``since_ts``. Records without a string ``labeled_at``
        are not counted.
    """
    records = _load_score_records()
    if since_ts is None:
        return len(records)

    # NOTE(review): this is a lexicographic string comparison, so it is only
    # meaningful when labeled_at and since_ts share one sortable format
    # (e.g. ISO 8601) -- confirm against the writers of both values.
    threshold = str(since_ts)
    count = 0
    for record in records:
        if not isinstance(record, dict):
            continue
        labeled_at = record.get("labeled_at")
        # A null or non-string labeled_at cannot be ordered against a string
        # (it would raise TypeError); treat it as "not labeled since".
        if isinstance(labeled_at, str) and labeled_at > threshold:
            count += 1
    return count
|
|
|
|
|
|
def _resolve_llm_bench_dir() -> Path:
    """Return ``cforch.results_dir`` from the config, else ``_ROOT/benchmark_results``."""
    cfg = _config_file()
    if cfg.exists():
        try:
            raw = yaml.safe_load(cfg.read_text(encoding="utf-8")) or {}
            rd = (raw.get("cforch", {}) or {}).get("results_dir", "")
            if rd:
                return Path(rd)
        except Exception as exc:
            logger.warning("Failed to read cforch.results_dir from config: %s", exc)
    return _ROOT / "benchmark_results"


def _read_bench_timestamp(path: Path) -> str | None:
    """Return the run timestamp recorded in one LLM/style bench result file.

    The payload may be a JSON object or a non-empty JSON list whose first
    element is an object. The file stem is used when no ``timestamp`` value is
    present. Returns ``None`` for unreadable files or any other payload shape.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except Exception as exc:
        logger.warning("Failed to read bench result %s: %s", path, exc)
        return None
    head = payload[0] if isinstance(payload, list) and payload else payload
    if not isinstance(head, dict):
        return None
    return head.get("timestamp") or path.stem


def _summarise_plans_run(path: Path) -> tuple[str, float | None] | None:
    """Return ``(display timestamp, average total_score)`` for one plans bench file.

    Returns ``None`` when the file cannot be read or is not a JSON object.
    """
    try:
        payload = json.loads(path.read_text(encoding="utf-8"))
    except Exception as exc:
        logger.warning("Failed to read plans bench result %s: %s", path, exc)
        return None
    if not isinstance(payload, dict):
        logger.warning("Unexpected plans bench result shape in %s", path)
        return None

    # Average over every non-error result that carries a numeric total_score;
    # one incomplete record must not blank out the whole run.
    scores = [
        result["total_score"]
        for results in payload.values()
        if isinstance(results, list)
        for result in results
        if isinstance(result, dict)
        and not result.get("error")
        and isinstance(result.get("total_score"), (int, float))
        and not isinstance(result.get("total_score"), bool)
    ]
    average = round(sum(scores) / len(scores), 3) if scores else None

    # File stems look like "plans_<date>_<HHMM...>"; anything else is shown raw.
    run_id = path.stem
    try:
        date, clock = run_id.removeprefix("plans_").split("_")
        ts_display = f"{date} {clock[:2]}:{clock[2:4]}"
    except ValueError:
        ts_display = run_id
    return ts_display, average


def _get_recent_bench_runs() -> dict:
    """Return most-recent run summary for each bench type.

    Each entry: {"timestamp": str|None, "metric": str|None, "score": float|None}

    Sources:
        classifier -- newest summary found by ``_find_latest_classifier_bench()``
        llm        -- most recently modified ``*.json`` (not ``style_*``) in the
                      cforch results dir
        style      -- last ``style_*.json`` by name in the same dir
        plans      -- last ``plans_*.json`` by name in
                      ``_DATA_DIR / "plans_bench_results"``

    Failures while reading any single source are logged and leave that entry
    at its ``None`` defaults; they never raise.
    """
    runs: dict[str, dict] = {
        "classifier": {"timestamp": None, "metric": "macro_f1", "score": None},
        "llm": {"timestamp": None, "metric": None, "score": None},
        "style": {"timestamp": None, "metric": None, "score": None},
        "plans": {"timestamp": None, "metric": "avg_score", "score": None},
    }

    # ── Classifier: bench_results/<run>/summary.json ──────────────────────
    clf_ts, clf_score = _find_latest_classifier_bench()
    if clf_ts:
        runs["classifier"]["timestamp"] = clf_ts
        runs["classifier"]["score"] = clf_score

    # ── LLM bench + Style: benchmark_results/ ─────────────────────────────
    bench_dir = _resolve_llm_bench_dir()
    if bench_dir.is_dir():
        try:
            # max() picks the same file a newest-first sort would put first.
            newest_llm = max(
                (p for p in bench_dir.glob("*.json") if not p.name.startswith("style_")),
                key=lambda p: p.stat().st_mtime,
                default=None,
            )
        except OSError as exc:  # e.g. a file vanished between glob() and stat()
            logger.warning("Failed to scan LLM bench results in %s: %s", bench_dir, exc)
            newest_llm = None
        if newest_llm is not None:
            runs["llm"]["timestamp"] = _read_bench_timestamp(newest_llm)

        newest_style = max(bench_dir.glob("style_*.json"), default=None)
        if newest_style is not None:
            runs["style"]["timestamp"] = _read_bench_timestamp(newest_style)

    # ── Plans bench: data/plans_bench_results/plans_*.json ────────────────
    plans_dir = _DATA_DIR / "plans_bench_results"
    if plans_dir.is_dir():
        newest_plans = max(plans_dir.glob("plans_*.json"), default=None)
        if newest_plans is not None:
            summary = _summarise_plans_run(newest_plans)
            if summary is not None:
                runs["plans"]["timestamp"], runs["plans"]["score"] = summary

    return runs
|
|
|
|
|
|
@router.get("/dashboard")
def get_dashboard() -> dict:
    """Assemble the dashboard payload from the individual collectors.

    Returns:
        A dict with the latest classifier bench timestamp/score, the number of
        items labeled since that bench, correction counts, active train jobs,
        the per-bench-type recent runs, and the UI nudge ``signals``.
    """
    # The train threshold is loaded but not used yet (see "eval_to_train").
    data_threshold, _unused_train_threshold = _load_thresholds()
    latest_ts, latest_score = _find_latest_classifier_bench()
    labeled_count = _count_labeled_since(latest_ts)
    pending_count, export_ready_count = _count_corrections()
    jobs = _get_active_jobs()
    bench_runs = _get_recent_bench_runs()

    signals = {
        "data_to_eval": labeled_count >= data_threshold,
        "eval_to_train": False,  # future: implement delta-F1 comparison
        "train_to_fleet": False,  # future: implement fleet sync signal
    }
    return {
        "labeled_since_last_eval": labeled_count,
        "last_eval_timestamp": latest_ts,
        "last_eval_best_score": latest_score,
        "active_jobs": jobs,
        "corrections_pending": pending_count,
        "corrections_export_ready": export_ready_count,
        "recent_bench_runs": bench_runs,
        "signals": signals,
    }
|