avocet/app/plans_bench.py
pyr0ball 13ca082a43 chore(models): refresh model registries with current cluster catalog
Replace stale llama/mistral/phi model refs with models active on the
cluster: deepseek-r1 (1.5b, 7b-4bit, 0528-qwen3-8b-gguf), granite-4.1-8b,
qwen2.5 (3b, 7b), capybarahermes-2.5-mistral-7b, darwin-9b-opus. Update
benchmark_plans.py doc examples to match.
2026-05-17 11:24:03 -07:00

327 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Avocet — CF planning benchmark integration API.
Wraps scripts/benchmark_plans.py and exposes it via the Avocet API.
Connection config (coordinator_url, python_bin) is read from label_tool.yaml
under the `cforch:` key, falling back to the `plans_bench:` key and then to
built-in defaults (both keys are optional).
All endpoints are registered on `router` (FastAPI APIRouter).
api.py includes this router with prefix="/api/plans-bench".
"""
from __future__ import annotations
import json
import logging
import subprocess as _subprocess
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import httpx
import yaml
from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import StreamingResponse
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Parent of the directory containing this file; base for scripts/, data/ and config/.
_ROOT = Path(__file__).parent.parent
_CONFIG_DIR: Path | None = None # override in tests via set_config_dir()
# True while a benchmark stream is active; set by the GET /run generator and
# cleared by that generator or by POST /cancel.
_BENCH_RUNNING: bool = False
# Handle of the benchmark subprocess spawned by GET /run, or None when idle.
_bench_proc: Any = None
# Script that GET /run spawns as a subprocess.
_BENCH_SCRIPT = _ROOT / "scripts" / "benchmark_plans.py"
# Directory holding one <run_id>.json results file per benchmark run.
_RESULTS_DIR = _ROOT / "data" / "plans_bench_results"
# Router on which every endpoint in this module is registered.
router = APIRouter()
# ── Registered model shortcuts (mirrors benchmark_plans.MODEL_REGISTRY) ────────
# Kept here so the UI can list them without importing the script.
# Maps model shortcut key -> human-readable description.
# GET /models returns these as a list of {"key", "description"} entries.
MODEL_REGISTRY: dict[str, str] = {
"deepseek-r1-1.5b": "DeepSeek R1 1.5B distill (cf-orch catalog key)",
"deepseek-r1-7b-4bit": "DeepSeek R1 7B distill, 4-bit (cf-orch catalog key)",
"deepseek-r1-0528-qwen3-8b-gguf": "DeepSeek R1 0528 Qwen3 8B GGUF (4 nodes)",
"deepseek-coder-6.7b-4bit": "DeepSeek Coder 6.7B instruct, 4-bit (cf-orch catalog key)",
"granite-4.1-8b": "IBM Granite 4.1 8B, 4-bit (cf-orch catalog key)",
"qwen2.5-3b": "Qwen 2.5 3B Q4 GGUF (cf-orch catalog key)",
"qwen2.5-7b": "Qwen 2.5 7B Q4 GGUF (cf-orch catalog key)",
"capybarahermes-2.5-mistral-7b-gguf": "CapybaraHermes 2.5 Mistral 7B GGUF (4 nodes)",
"darwin-9b-opus-gguf": "Darwin 9B Opus GGUF -- long-form writing (3 nodes)",
}
# Maps rubric criterion key -> display label.
# GET /models returns this mapping unchanged under "rubric_labels".
RUBRIC_LABELS: dict[str, str] = {
"task_structure": "Task structure (checkboxes + commits)",
"tier_awareness": "Tier awareness (Free/Paid/Premium/Ultra)",
"privacy_pillar": "Privacy pillar (local-first, no logging)",
"safety_pillar": "Safety pillar (human approval, reversibility)",
"accessibility": "Accessibility (ND/adaptive users)",
"license_split": "License awareness (MIT vs BSL)",
"file_paths": "File paths (plausible project paths)",
"cf_conventions": "CF conventions (conda, manage.sh, /Library/…)",
# NOTE(review): "2002500" looks like a numeric range whose separator was lost
# (possibly 200-2500) - confirm against scripts/benchmark_plans.py.
"length_ok": "Response length (2002500 words)",
}
# ── Testability seam ───────────────────────────────────────────────────────────
def set_config_dir(path: Path | None) -> None:
    """Override the directory that holds ``label_tool.yaml``.

    Test seam: pass a directory to redirect config lookup, or ``None`` to
    restore the default location.

    Args:
        path: Directory to read ``label_tool.yaml`` from, or ``None``.
    """
    global _CONFIG_DIR
    _CONFIG_DIR = path
# ── Internal helpers ───────────────────────────────────────────────────────────
def _config_file() -> Path:
if _CONFIG_DIR is not None:
return _CONFIG_DIR / "label_tool.yaml"
return _ROOT / "config" / "label_tool.yaml"
def _load_config() -> dict:
    """Load benchmark connection settings from ``label_tool.yaml``.

    Each setting is taken from the ``cforch:`` section first, then from the
    ``plans_bench:`` section, then from a built-in default. A missing,
    unreadable, or malformed config file is never fatal: the problem is
    logged and the defaults are used.

    Returns:
        A dict with exactly two keys, ``"coordinator_url"`` and
        ``"python_bin"``.
    """
    cforch_cfg: dict = {}
    bench_cfg: dict = {}
    cfg_path = _config_file()
    if cfg_path.exists():
        raw: Any = None
        try:
            raw = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            logger.warning("Failed to parse plans_bench config %s: %s", cfg_path, exc)
        if isinstance(raw, dict):
            # safe_load can return any YAML type; only mappings are usable as
            # sections, so anything else is treated as "section absent".
            cforch_section = raw.get("cforch")
            if isinstance(cforch_section, dict):
                cforch_cfg = cforch_section
            bench_section = raw.get("plans_bench")
            if isinstance(bench_section, dict):
                bench_cfg = bench_section
        elif raw is not None:
            logger.warning(
                "Ignoring plans_bench config %s: top level is not a mapping", cfg_path
            )

    def _pick(key: str, default: str) -> Any:
        # A null/empty value falls through to the next source instead of
        # shadowing it with an unusable setting.
        return cforch_cfg.get(key) or bench_cfg.get(key) or default

    return {
        "coordinator_url": _pick("coordinator_url", "http://10.1.10.71:7700"),
        "python_bin": _pick("python_bin", "/devl/miniconda3/envs/cf/bin/python"),
    }
def _results_file(run_id: str) -> Path:
return _RESULTS_DIR / f"{run_id}.json"
# ── GET /models ────────────────────────────────────────────────────────────────
@router.get("/models")
def get_models() -> dict:
    """Return model shortcuts, the live cf-orch catalog, and rubric labels.

    The catalog is fetched from the configured coordinator with a 5 second
    timeout. The fetch is best-effort: any failure is logged as a warning and
    ``cforch_models`` is returned empty.

    Returns:
        A dict with keys ``registry`` (list of ``{"key", "description"}``),
        ``cforch_models`` (list of ``{"id", "name", "vram_mb",
        "description"}``), ``coordinator_url`` and ``rubric_labels``.
    """
    cfg = _load_config()
    cforch_models: list[dict] = []
    try:
        resp = httpx.get(
            f"{cfg['coordinator_url']}/api/services/cf-text/catalog",
            timeout=5.0,
        )
        resp.raise_for_status()
        catalog = resp.json()
        # Only mapping-valued catalog entries carry model metadata.
        cforch_models.extend(
            {
                "id": model_id,
                "name": model_id,
                "vram_mb": entry.get("vram_mb"),
                "description": entry.get("description", ""),
            }
            for model_id, entry in catalog.items()
            if isinstance(entry, dict)
        )
    except Exception as exc:
        logger.warning("Failed to fetch cf-orch catalog: %s", exc)

    registry = [
        {"key": key, "description": description}
        for key, description in MODEL_REGISTRY.items()
    ]
    return {
        "registry": registry,
        "cforch_models": cforch_models,
        "coordinator_url": cfg["coordinator_url"],
        "rubric_labels": RUBRIC_LABELS,
    }
# ── GET /run ───────────────────────────────────────────────────────────────────
@router.get("/run")
def run_plans_benchmark(
    models: str = Query(..., description="Comma-separated model IDs (registry keys or cf-orch model names)"),
    prompt_ids: str = Query("", description="Comma-separated prompt IDs to run (empty = all 10)"),
    use_cforch: bool = Query(True, description="Route inference through cf-orch coordinator"),
    api_base: str = Query("", description="Direct API base URL when not using cf-orch"),
    workers: int = Query(1, ge=1, le=8, description="Number of models to benchmark concurrently"),
) -> StreamingResponse:
    """Spawn benchmark_plans.py and stream its output as SSE events.

    Every non-empty line the script prints (stdout and stderr merged) is
    emitted as a ``type: progress`` event. When the script exits with code 0
    and the output file exists, a ``type: result`` event carries the parsed
    JSON (the script writes it to data/plans_bench_results/<run_id>.json);
    if that file cannot be parsed, a ``type: complete`` event is sent instead.
    Any other outcome produces a ``type: error`` event.

    Args:
        models: Comma-separated model IDs; one ID uses ``--model``, several
            use ``--compare``.
        prompt_ids: Comma-separated prompt IDs; empty runs the script default.
        use_cforch: Route inference through the configured coordinator.
        api_base: Direct API base URL, used only when ``use_cforch`` is false.
        workers: Number of models to benchmark concurrently (1-8).

    Raises:
        HTTPException: 409 if a benchmark is already running; 400 if no model
            key is given or a value would be parsed as a command-line option.
    """
    global _BENCH_RUNNING, _bench_proc
    if _BENCH_RUNNING:
        raise HTTPException(409, "A planning benchmark is already running")
    cfg = _load_config()
    python_bin = cfg["python_bin"]
    coordinator_url = cfg["coordinator_url"]
    model_keys = [m.strip() for m in models.split(",") if m.strip()]
    if not model_keys:
        raise HTTPException(400, "At least one model key is required")
    prompt_keys = [p.strip() for p in prompt_ids.split(",") if p.strip()]
    direct_api_base = api_base.strip()

    # These values come straight from the query string and become argv entries
    # of the benchmark script; a leading "-" would be parsed as an option
    # (for example overriding --output), so reject it up front.
    argv_values = model_keys + prompt_keys
    if not use_cforch and direct_api_base:
        argv_values.append(direct_api_base)
    offending = next((v for v in argv_values if v.startswith("-")), None)
    if offending is not None:
        raise HTTPException(400, f"Invalid value {offending!r}: values must not start with '-'")

    run_id = datetime.now(tz=timezone.utc).strftime("plans_%Y-%m-%d_%H%M%S")
    output_path = _results_file(run_id)
    _RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    def _sse(payload: dict) -> str:
        # One Server-Sent Events frame: "data: <json>" followed by a blank line.
        return f"data: {json.dumps(payload)}\n\n"

    def generate():
        global _BENCH_RUNNING, _bench_proc
        if not _BENCH_SCRIPT.exists():
            yield _sse({"type": "error", "message": f"benchmark_plans.py not found at {_BENCH_SCRIPT}"})
            return
        cmd = [python_bin, str(_BENCH_SCRIPT)]
        if len(model_keys) > 1:
            cmd.extend(["--compare", *model_keys])
        else:
            cmd.extend(["--model", model_keys[0]])
        if use_cforch:
            cmd.extend(["--cforch", "--cforch-url", coordinator_url])
        elif direct_api_base:
            cmd.extend(["--api-base", direct_api_base])
        cmd.extend(["--verbose", "--output", str(output_path)])
        if workers > 1:
            cmd.extend(["--workers", str(workers)])
        if prompt_keys:
            # Skipped when prompt_ids held only separators, so the script never
            # receives a bare "--prompts" with no values.
            cmd.extend(["--prompts", *prompt_keys])

        _BENCH_RUNNING = True
        proc = None
        try:
            proc = _subprocess.Popen(
                cmd,
                stdout=_subprocess.PIPE,
                stderr=_subprocess.STDOUT,
                text=True,
                bufsize=1,
                cwd=str(_ROOT),
            )
            _bench_proc = proc
            for line in proc.stdout:
                line = line.rstrip()
                if line:
                    yield _sse({"type": "progress", "message": line})
            proc.wait()
            if proc.returncode == 0 and output_path.exists():
                try:
                    results = json.loads(output_path.read_text(encoding="utf-8"))
                    yield _sse({"type": "result", "run_id": run_id, "results": results})
                except Exception as exc:
                    logger.warning("Failed to read plans benchmark output: %s", exc)
                    yield _sse({"type": "complete"})
            else:
                yield _sse({"type": "error", "message": f"Process exited with code {proc.returncode}"})
        except Exception as exc:
            yield _sse({"type": "error", "message": str(exc)})
        finally:
            # Also reached when the stream is abandoned (generator closed on
            # client disconnect). The child must not outlive the stream:
            # otherwise it keeps running while the flag below says idle.
            try:
                if proc is not None:
                    if proc.poll() is None:
                        proc.terminate()
                        try:
                            proc.wait(timeout=5)
                        except _subprocess.TimeoutExpired:
                            proc.kill()
                            proc.wait()
                    if proc.stdout is not None:
                        proc.stdout.close()
            except OSError as exc:
                logger.warning("Failed to clean up plans benchmark process: %s", exc)
            finally:
                _bench_proc = None
                _BENCH_RUNNING = False

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )
# ── GET /results ───────────────────────────────────────────────────────────────
@router.get("/results")
def list_results() -> list[dict]:
    """List past planning benchmark runs, newest first.

    Scans ``_RESULTS_DIR`` for ``plans_*.json`` files, ordered by file name
    in descending order. A file that cannot be read or summarised is still
    listed, with an empty model list and an average score of 0.0; the failure
    is logged as a warning.

    Returns:
        One dict per run with keys ``run_id``, ``filename``, ``date``,
        ``models`` and ``avg_score``. Empty when the results directory does
        not exist.
    """
    if not _RESULTS_DIR.exists():
        return []
    runs: list[dict] = []
    for path in sorted(_RESULTS_DIR.glob("plans_*.json"), reverse=True):
        run_id = path.stem
        try:
            data: dict = json.loads(path.read_text(encoding="utf-8"))
            model_keys = list(data.keys())
            # Average total_score across all models and prompts, skipping
            # entries that recorded an error.
            all_scores = [
                r["total_score"]
                for results in data.values()
                for r in results
                if not r.get("error")
            ]
            avg_score = round(sum(all_scores) / len(all_scores), 3) if all_scores else 0.0
        except Exception as exc:
            # Best-effort listing: keep the run visible, but say why it has no summary.
            logger.warning("Failed to summarise plans benchmark results %s: %s", path, exc)
            model_keys = []
            avg_score = 0.0
        # Parse display date from run_id (plans_2026-04-27_143022).
        try:
            day, clock = run_id.removeprefix("plans_").split("_")
            display_date = f"{day} {clock[:2]}:{clock[2:4]}"
        except ValueError:
            # Name does not have exactly the <date>_<time> shape; show it as-is.
            display_date = run_id
        runs.append({
            "run_id": run_id,
            "filename": path.name,
            "date": display_date,
            "models": model_keys,
            "avg_score": avg_score,
        })
    return runs
@router.get("/results/latest")
def get_latest_results() -> dict:
    """Return the parsed results of the most recent planning benchmark run.

    "Most recent" is the ``plans_*.json`` file in ``_RESULTS_DIR`` that sorts
    last by path.

    Raises:
        HTTPException: 404 when the directory or any matching file is
            missing; 500 when the newest file cannot be read or parsed.
    """
    candidates = sorted(_RESULTS_DIR.glob("plans_*.json")) if _RESULTS_DIR.exists() else []
    if not candidates:
        raise HTTPException(404, "No benchmark results found")
    newest = candidates[-1]
    try:
        payload = json.loads(newest.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc
    return payload
@router.get("/results/{run_id}")
def get_results_by_run_id(run_id: str) -> dict:
    """Return planning benchmark results for a specific run.

    Args:
        run_id: Run identifier; must start with ``plans_`` and must not
            contain a path separator.

    Raises:
        HTTPException: 400 for a malformed ``run_id``; 404 when no results
            file exists for it; 500 when the file cannot be read or parsed.
    """
    # run_id is caller-supplied and is joined into a filesystem path, so it
    # must stay a single file name inside the results directory.
    if not run_id.startswith("plans_") or "/" in run_id or "\\" in run_id:
        raise HTTPException(400, "Invalid run_id — expected plans_YYYY-MM-DD_HHMMSS")
    f = _results_file(run_id)
    if not f.exists():
        raise HTTPException(404, f"Results not found: {run_id}")
    try:
        return json.loads(f.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc
# ── POST /cancel ───────────────────────────────────────────────────────────────
@router.post("/cancel")
def cancel_plans_benchmark() -> dict:
    """Stop the running planning benchmark subprocess.

    Sends ``terminate()`` to the tracked subprocess, if there is one, and
    resets the module's running state. A failure to terminate is logged and
    does not prevent the reset.

    Returns:
        ``{"status": "cancelled"}``.

    Raises:
        HTTPException: 404 when no benchmark is marked as running.
    """
    global _BENCH_RUNNING, _bench_proc
    if not _BENCH_RUNNING:
        raise HTTPException(404, "No planning benchmark is currently running")

    running = _bench_proc
    if running is not None:
        try:
            running.terminate()
        except Exception as exc:
            logger.warning("Failed to terminate plans benchmark: %s", exc)

    _bench_proc = None
    _BENCH_RUNNING = False
    return {"status": "cancelled"}