"""Avocet -- dashboard aggregate API. GET /api/dashboard returns the current flywheel state: labeled_since_last_eval -- items labeled after the most recent bench run last_eval_timestamp -- ISO timestamp of newest bench_results summary last_eval_best_score -- best macro_f1 from that summary active_jobs -- jobs with status queued or running corrections_pending -- sft_candidates with status=needs_review corrections_export_ready -- approved sft candidates with non-blank correction recent_bench_runs -- most-recent timestamp + score per bench type signals -- computed booleans for UI nudge indicators Thresholds in label_tool.yaml pipeline: section: pipeline: data_eval_threshold: 50 # labeled items since last bench to trigger nudge eval_train_threshold: 0.05 # improvement delta needed before retraining (future) """ from __future__ import annotations import json import logging import yaml from pathlib import Path from fastapi import APIRouter logger = logging.getLogger(__name__) _ROOT = Path(__file__).parent.parent _DATA_DIR: Path = _ROOT / "data" _CONFIG_DIR: Path | None = None router = APIRouter() _DEFAULT_DATA_EVAL_THRESHOLD = 50 _DEFAULT_EVAL_TRAIN_THRESHOLD = 0.05 def set_data_dir(path: Path) -> None: global _DATA_DIR _DATA_DIR = path def set_config_dir(path: Path | None) -> None: global _CONFIG_DIR _CONFIG_DIR = path def _config_file() -> Path: if _CONFIG_DIR is not None: return _CONFIG_DIR / "label_tool.yaml" return _ROOT / "config" / "label_tool.yaml" def _load_thresholds() -> tuple[int, float]: f = _config_file() if f.exists(): try: raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {} pipeline = raw.get("pipeline", {}) or {} return ( int(pipeline.get("data_eval_threshold", _DEFAULT_DATA_EVAL_THRESHOLD)), float(pipeline.get("eval_train_threshold", _DEFAULT_EVAL_TRAIN_THRESHOLD)), ) except Exception as exc: logger.warning("Failed to read pipeline thresholds: %s", exc) return _DEFAULT_DATA_EVAL_THRESHOLD, _DEFAULT_EVAL_TRAIN_THRESHOLD def _load_score_records() -> list[dict]: path = _DATA_DIR / "email_score.jsonl" if not path.exists(): return [] records = [] for line in path.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line: continue try: records.append(json.loads(line)) except json.JSONDecodeError: pass return records def _find_latest_classifier_bench(results_dir_override: str = "") -> tuple[str | None, float | None]: """Return (iso_timestamp, best_macro_f1) from the newest bench_results summary. Checks results_dir from cforch config if set, then falls back to _ROOT/bench_results/. Returns (None, None) if no results exist. """ candidates = [] if results_dir_override: candidates.append(Path(results_dir_override)) else: f = _config_file() if f.exists(): try: raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {} rd = (raw.get("cforch", {}) or {}).get("results_dir", "") if rd: candidates.append(Path(rd)) except Exception as exc: logger.warning("Failed to read cforch.results_dir from config: %s", exc) candidates.append(_ROOT / "bench_results") for rdir in candidates: if not rdir.exists(): continue subdirs = sorted([d for d in rdir.iterdir() if d.is_dir()], key=lambda d: d.name) for subdir in reversed(subdirs): summary = subdir / "summary.json" if summary.exists(): try: data = json.loads(summary.read_text(encoding="utf-8")) if not isinstance(data, dict): continue # cforch LLM-bench summaries are lists; skip ts = data.get("timestamp") or subdir.name score = data.get("best_macro_f1") or data.get("macro_f1") return ts, (float(score) if isinstance(score, (int, float)) else None) except Exception as exc: logger.warning("Failed to parse summary.json at %s: %s", summary, exc) return None, None # Keep old name as alias so existing callers in tests still work. _find_latest_eval = _find_latest_classifier_bench def _count_corrections() -> tuple[int, int]: """Return (pending_count, export_ready_count).""" pending = 0 export_ready = 0 candidates_path = _DATA_DIR / "sft_candidates.jsonl" approved_path = _DATA_DIR / "sft_approved.jsonl" if candidates_path.exists(): for line in candidates_path.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line: continue try: r = json.loads(line) if r.get("status") == "needs_review": pending += 1 except json.JSONDecodeError: pass if approved_path.exists(): for line in approved_path.read_text(encoding="utf-8").splitlines(): line = line.strip() if not line: continue try: r = json.loads(line) if (r.get("status") == "approved" and r.get("corrected_response") and str(r["corrected_response"]).strip()): export_ready += 1 except json.JSONDecodeError: pass return pending, export_ready def _get_active_jobs() -> list[dict]: """Query train SQLite DB for queued/running jobs. Returns [] if DB absent.""" try: from app.train.train import _DB_PATH, _db, _init_db if not _DB_PATH.exists(): return [] _init_db() with _db() as conn: rows = conn.execute( "SELECT id, type, model_key, status FROM jobs WHERE status IN ('queued', 'running')" ).fetchall() return [{"id": r["id"], "type": r["type"], "model_key": r["model_key"], "status": r["status"]} for r in rows] except Exception as exc: logger.warning("Failed to query train jobs DB: %s", exc) return [] def _count_labeled_since(since_ts: str | None) -> int: records = _load_score_records() if since_ts is None: return len(records) return sum(1 for r in records if r.get("labeled_at", "") > since_ts) def _get_recent_bench_runs() -> dict: """Return most-recent run summary for each bench type. Each entry: {"timestamp": str|None, "metric": str|None, "score": float|None} """ runs: dict[str, dict] = { "classifier": {"timestamp": None, "metric": "macro_f1", "score": None}, "llm": {"timestamp": None, "metric": None, "score": None}, "style": {"timestamp": None, "metric": None, "score": None}, "plans": {"timestamp": None, "metric": "avg_score", "score": None}, } # ── Classifier: bench_results//summary.json ────────────────────── clf_ts, clf_score = _find_latest_classifier_bench() if clf_ts: runs["classifier"]["timestamp"] = clf_ts runs["classifier"]["score"] = clf_score # ── LLM bench + Style: benchmark_results/ ───────────────────────────── f = _config_file() bench_dir: Path | None = None if f.exists(): try: raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {} rd = (raw.get("cforch", {}) or {}).get("results_dir", "") if rd: bench_dir = Path(rd) except Exception: pass if bench_dir is None: bench_dir = _ROOT / "benchmark_results" if bench_dir.exists(): llm_files = sorted( [p for p in bench_dir.glob("*.json") if not p.name.startswith("style_")], key=lambda p: p.stat().st_mtime, reverse=True, ) if llm_files: try: data = json.loads(llm_files[0].read_text(encoding="utf-8")) runs["llm"]["timestamp"] = data.get("timestamp") or llm_files[0].stem except Exception: pass style_files = sorted(bench_dir.glob("style_*.json"), reverse=True) if style_files: try: data = json.loads(style_files[0].read_text(encoding="utf-8")) if isinstance(data, list) and data: runs["style"]["timestamp"] = data[0].get("timestamp") or style_files[0].stem except Exception: pass # ── Plans bench: data/plans_bench_results/plans_*.json ──────────────── plans_dir = _DATA_DIR / "plans_bench_results" if plans_dir.exists(): plans_files = sorted(plans_dir.glob("plans_*.json"), reverse=True) if plans_files: run_id = plans_files[0].stem try: d: dict = json.loads(plans_files[0].read_text(encoding="utf-8")) all_scores = [ r["total_score"] for results in d.values() for r in results if isinstance(r, dict) and not r.get("error") ] avg = round(sum(all_scores) / len(all_scores), 3) if all_scores else None try: date_part = run_id.removeprefix("plans_") date, time_part = date_part.split("_") ts_display = f"{date} {time_part[:2]}:{time_part[2:4]}" except Exception: ts_display = run_id runs["plans"]["timestamp"] = ts_display runs["plans"]["score"] = avg except Exception: pass return runs @router.get("/dashboard") def get_dashboard() -> dict: data_threshold, _train_threshold = _load_thresholds() last_ts, last_score = _find_latest_classifier_bench() labeled_since = _count_labeled_since(last_ts) corrections_pending, corrections_export_ready = _count_corrections() active_jobs = _get_active_jobs() recent_bench = _get_recent_bench_runs() return { "labeled_since_last_eval": labeled_since, "last_eval_timestamp": last_ts, "last_eval_best_score": last_score, "active_jobs": active_jobs, "corrections_pending": corrections_pending, "corrections_export_ready": corrections_export_ready, "recent_bench_runs": recent_bench, "signals": { "data_to_eval": labeled_since >= data_threshold, "eval_to_train": False, # future: implement delta-F1 comparison "train_to_fleet": False, # future: implement fleet sync signal }, }