# avocet/app/plans_bench.py

"""Avocet — CF planning benchmark integration API.

Wraps scripts/benchmark_plans.py and exposes it via the Avocet API.
Connection config (coordinator_url, python_bin) is read from label_tool.yaml
under the `cforch:` key, falling back to the `plans_bench:` key and then to
built-in defaults (both keys are optional).

All endpoints are registered on `router` (FastAPI APIRouter).
api.py includes this router with prefix="/api/plans-bench".
"""
from __future__ import annotations

import json
import logging
import subprocess as _subprocess
import uuid
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

import httpx
import yaml
from fastapi import APIRouter, HTTPException, Query
from fastapi.responses import StreamingResponse

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Directory two levels above this file; scripts/, data/ and config/ are
# resolved relative to it.
_ROOT = Path(__file__).parent.parent
_CONFIG_DIR: Path | None = None  # override in tests via set_config_dir()
# Set to True by the GET /run stream once it is about to spawn the benchmark;
# reset to False when that stream finishes and by POST /cancel.
_BENCH_RUNNING: bool = False
# Popen handle of the benchmark subprocess while one is being streamed, else
# None.  POST /cancel calls terminate() on it.
_bench_proc: Any = None

# Script spawned by GET /run, and the directory its JSON output is written to.
_BENCH_SCRIPT = _ROOT / "scripts" / "benchmark_plans.py"
_RESULTS_DIR  = _ROOT / "data" / "plans_bench_results"

# All endpoints in this module are registered on this router.
router = APIRouter()

# ── Registered model shortcuts (mirrors benchmark_plans.MODEL_REGISTRY) ────────
# Kept here so the UI can list them without importing the script.

# Maps a model shortcut key to a human-readable description.  GET /models
# returns these pairs as the `registry` list ({"key": ..., "description": ...}).
MODEL_REGISTRY: dict[str, str] = {
    "deepseek-r1-1.5b":        "DeepSeek R1 1.5B distill (cf-orch catalog key)",
    "deepseek-r1-7b-4bit":     "DeepSeek R1 7B distill, 4-bit (cf-orch catalog key)",
    "deepseek-r1-0528-qwen3-8b-gguf": "DeepSeek R1 0528 Qwen3 8B GGUF (4 nodes)",
    "deepseek-coder-6.7b-4bit": "DeepSeek Coder 6.7B instruct, 4-bit (cf-orch catalog key)",
    "granite-4.1-8b":          "IBM Granite 4.1 8B, 4-bit (cf-orch catalog key)",
    "qwen2.5-3b":               "Qwen 2.5 3B Q4 GGUF (cf-orch catalog key)",
    "qwen2.5-7b":               "Qwen 2.5 7B Q4 GGUF (cf-orch catalog key)",
    "capybarahermes-2.5-mistral-7b-gguf": "CapybaraHermes 2.5 Mistral 7B GGUF (4 nodes)",
    "darwin-9b-opus-gguf":     "Darwin 9B Opus GGUF -- long-form writing (3 nodes)",
}

# Maps a rubric criterion id to its display label.  GET /models returns this
# dict unchanged under the `rubric_labels` key.
RUBRIC_LABELS: dict[str, str] = {
    "task_structure":  "Task structure (checkboxes + commits)",
    "tier_awareness":  "Tier awareness (Free/Paid/Premium/Ultra)",
    "privacy_pillar":  "Privacy pillar (local-first, no logging)",
    "safety_pillar":   "Safety pillar (human approval, reversibility)",
    "accessibility":   "Accessibility (ND/adaptive users)",
    "license_split":   "License awareness (MIT vs BSL)",
    "file_paths":      "File paths (plausible project paths)",
    "cf_conventions":  "CF conventions (conda, manage.sh, /Library/…)",
    "length_ok":       "Response length (200–2500 words)",
}


# ── Testability seam ───────────────────────────────────────────────────────────

def set_config_dir(path: Path | None) -> None:
    global _CONFIG_DIR
    _CONFIG_DIR = path


# ── Internal helpers ───────────────────────────────────────────────────────────

def _config_file() -> Path:
    """Return the path of the ``label_tool.yaml`` file to load.

    The directory set through ``set_config_dir()`` wins when present;
    otherwise the file is looked up in ``<root>/config``.
    """
    base_dir = _ROOT / "config" if _CONFIG_DIR is None else _CONFIG_DIR
    return base_dir / "label_tool.yaml"


def _load_config() -> dict:
    """Load the connection settings used by the planning benchmark.

    Reads ``label_tool.yaml`` (located via ``_config_file()``) and resolves
    each setting from the ``cforch:`` section first, then the ``plans_bench:``
    section, then a built-in default.

    The config is optional, so no problem with it is fatal: a missing file,
    a read error, invalid YAML, or a root/section that is not a mapping is
    logged (where applicable) and treated as "nothing configured".

    Returns:
        A dict with exactly two keys, ``coordinator_url`` and ``python_bin``.
    """
    f = _config_file()
    raw: Any = {}
    if f.exists():
        try:
            raw = yaml.safe_load(f.read_text(encoding="utf-8")) or {}
        except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
            logger.warning("Failed to parse plans_bench config %s: %s", f, exc)
            raw = {}

    # Valid YAML is not necessarily a mapping (it can be a list or a scalar);
    # calling .get() on those would raise AttributeError.
    if not isinstance(raw, dict):
        logger.warning("Ignoring plans_bench config %s: top level is not a mapping", f)
        raw = {}

    sections: dict[str, dict] = {}
    for name in ("cforch", "plans_bench"):
        section = raw.get(name) or {}
        if not isinstance(section, dict):
            logger.warning("Ignoring `%s` section in %s: expected a mapping", name, f)
            section = {}
        sections[name] = section
    cforch_cfg = sections["cforch"]
    bench_cfg = sections["plans_bench"]

    return {
        "coordinator_url": cforch_cfg.get("coordinator_url",
                           bench_cfg.get("coordinator_url", "http://10.1.10.71:7700")),
        "python_bin":      cforch_cfg.get("python_bin",
                           bench_cfg.get("python_bin", "/devl/miniconda3/envs/cf/bin/python")),
    }


def _results_file(run_id: str) -> Path:
    """Return the path of the JSON results file for *run_id*."""
    filename = f"{run_id}.json"
    return _RESULTS_DIR / filename


# ── GET /models ────────────────────────────────────────────────────────────────

@router.get("/models")
def get_models() -> dict:
    """Return model shortcuts, the live cf-orch catalog, and rubric labels.

    The catalog is fetched from the configured coordinator with a 5 second
    timeout.  Any failure while fetching or decoding it is logged and yields
    an empty ``cforch_models`` list rather than an error response.

    Returns:
        A dict with the keys ``registry``, ``cforch_models``,
        ``coordinator_url`` and ``rubric_labels``.
    """
    settings = _load_config()

    catalog_entries: list[dict] = []
    try:
        response = httpx.get(
            f"{settings['coordinator_url']}/api/services/cf-text/catalog",
            timeout=5.0,
        )
        response.raise_for_status()
        catalog = response.json()
        # Entries that are not dicts are skipped.
        catalog_entries = [
            {
                "id":          model_id,
                "name":        model_id,
                "vram_mb":     entry.get("vram_mb"),
                "description": entry.get("description", ""),
            }
            for model_id, entry in catalog.items()
            if isinstance(entry, dict)
        ]
    except Exception as exc:
        logger.warning("Failed to fetch cf-orch catalog: %s", exc)

    registry = [
        {"key": shortcut, "description": blurb}
        for shortcut, blurb in MODEL_REGISTRY.items()
    ]
    return {
        "registry": registry,
        "cforch_models": catalog_entries,
        "coordinator_url": settings["coordinator_url"],
        "rubric_labels": RUBRIC_LABELS,
    }


# ── GET /run ───────────────────────────────────────────────────────────────────

@router.get("/run")
def run_plans_benchmark(
    models: str = Query(..., description="Comma-separated model IDs (registry keys or cf-orch model names)"),
    prompt_ids: str = Query("", description="Comma-separated prompt IDs to run (empty = all 10)"),
    use_cforch: bool = Query(True, description="Route inference through cf-orch coordinator"),
    api_base: str = Query("", description="Direct API base URL when not using cf-orch"),
    workers: int = Query(1, ge=1, le=8, description="Number of models to benchmark concurrently"),
) -> StreamingResponse:
    """Spawn benchmark_plans.py and stream stdout as SSE progress events.

    Every non-empty output line of the script (stderr is merged into stdout)
    is emitted as a `type: progress` event.  On successful completion a
    `type: result` event with the parsed JSON is emitted, followed by
    `type: complete`; the script writes its results to
    data/plans_bench_results/<run_id>.json.  A non-zero exit, a missing
    output file, or a failure to start the process yields a `type: error`
    event instead.

    If the stream stops early (for example because the client disconnected),
    the subprocess is terminated so it does not keep running unattended.

    Raises:
        HTTPException: 409 when a benchmark is already running; 400 when no
            model key is given or a model key / prompt ID starts with "-".
    """
    global _BENCH_RUNNING, _bench_proc

    if _BENCH_RUNNING:
        raise HTTPException(409, "A planning benchmark is already running")

    cfg = _load_config()
    python_bin = cfg["python_bin"]
    coordinator_url = cfg["coordinator_url"]

    model_keys = [m.strip() for m in models.split(",") if m.strip()]
    if not model_keys:
        raise HTTPException(400, "At least one model key is required")
    prompt_keys = [p.strip() for p in prompt_ids.split(",") if p.strip()]

    # These values come from the query string and are passed to the script as
    # argv entries; a leading "-" would be read as a command-line option.
    if any(key.startswith("-") for key in model_keys):
        raise HTTPException(400, "Model keys must not start with '-'")
    if any(key.startswith("-") for key in prompt_keys):
        raise HTTPException(400, "Prompt IDs must not start with '-'")

    run_id = datetime.now(tz=timezone.utc).strftime("plans_%Y-%m-%d_%H%M%S")
    output_path = _results_file(run_id)
    _RESULTS_DIR.mkdir(parents=True, exist_ok=True)

    def sse(payload: dict) -> str:
        # Frame one JSON payload as a server-sent event.
        return f"data: {json.dumps(payload)}\n\n"

    def generate():
        global _BENCH_RUNNING, _bench_proc

        if not _BENCH_SCRIPT.exists():
            yield sse({"type": "error", "message": f"benchmark_plans.py not found at {_BENCH_SCRIPT}"})
            return

        cmd = [python_bin, str(_BENCH_SCRIPT)]
        if len(model_keys) > 1:
            cmd.extend(["--compare"] + model_keys)
        else:
            cmd.extend(["--model", model_keys[0]])

        if use_cforch:
            cmd.extend(["--cforch", "--cforch-url", coordinator_url])
        elif api_base.strip():
            cmd.extend(["--api-base", api_base.strip()])

        cmd.extend(["--verbose", "--output", str(output_path)])
        if workers > 1:
            cmd.extend(["--workers", str(workers)])

        if prompt_keys:
            cmd.extend(["--prompts"] + prompt_keys)

        _BENCH_RUNNING = True
        try:
            proc = _subprocess.Popen(
                cmd,
                stdout=_subprocess.PIPE,
                stderr=_subprocess.STDOUT,
                text=True,
                bufsize=1,
                cwd=str(_ROOT),
            )
            _bench_proc = proc
            try:
                for line in proc.stdout:
                    line = line.rstrip()
                    if line:
                        yield sse({"type": "progress", "message": line})
                proc.wait()
                if proc.returncode == 0 and output_path.exists():
                    try:
                        results = json.loads(output_path.read_text(encoding="utf-8"))
                        yield sse({"type": "result", "run_id": run_id, "results": results})
                    except Exception as exc:
                        logger.warning("Failed to read plans benchmark output: %s", exc)
                    yield sse({"type": "complete"})
                else:
                    yield sse({"type": "error", "message": f"Process exited with code {proc.returncode}"})
            finally:
                # Also reached when the generator is closed early (client
                # disconnect) or reading stdout failed.  The child may still
                # be alive then; stop it so it is not orphaned while the
                # module state below claims nothing is running.
                try:
                    if proc.poll() is None:
                        proc.terminate()
                        try:
                            proc.wait(timeout=5)
                        except _subprocess.TimeoutExpired:
                            proc.kill()
                            proc.wait()
                    if proc.stdout is not None:
                        proc.stdout.close()
                except OSError as exc:
                    logger.warning("Failed to clean up plans benchmark subprocess: %s", exc)
                _bench_proc = None
        except Exception as exc:
            yield sse({"type": "error", "message": str(exc)})
        finally:
            _BENCH_RUNNING = False

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
    )


# ── GET /results ───────────────────────────────────────────────────────────────

@router.get("/results")
def list_results() -> list[dict]:
    """List past planning benchmark runs, newest first.

    Result files are ordered by filename, descending, which is newest first
    for timestamp-named runs.  A file that cannot be read or summarised is
    still listed, with an empty model list and an average score of 0.0.

    Returns:
        One dict per run with the keys ``run_id``, ``filename``, ``date``,
        ``models`` and ``avg_score``.
    """
    summaries: list[dict] = []
    if not _RESULTS_DIR.exists():
        return summaries

    for path in sorted(_RESULTS_DIR.glob("plans_*.json"), reverse=True):
        stem = path.stem

        try:
            payload: dict = json.loads(path.read_text(encoding="utf-8"))
            models = list(payload.keys())
            # Mean total_score over every non-errored row of every model.
            scores = []
            for per_model in payload.values():
                for row in per_model:
                    if not row.get("error"):
                        scores.append(row["total_score"])
            mean_score = round(sum(scores) / len(scores), 3) if scores else 0.0
        except Exception:
            models = []
            mean_score = 0.0

        # Turn "plans_2026-04-27_143022" into "2026-04-27 14:30"; fall back
        # to the raw run id when the name has a different shape.
        try:
            day, clock = stem.removeprefix("plans_").split("_")
            shown_date = f"{day} {clock[:2]}:{clock[2:4]}"
        except Exception:
            shown_date = stem

        summaries.append({
            "run_id":      stem,
            "filename":    path.name,
            "date":        shown_date,
            "models":      models,
            "avg_score":   mean_score,
        })

    return summaries


@router.get("/results/latest")
def get_latest_results() -> dict:
    """Return the parsed results of the most recent planning benchmark run.

    "Most recent" is the ``plans_*.json`` file that sorts last by filename.

    Raises:
        HTTPException: 404 when the results directory is missing or holds no
            result files; 500 when the newest file cannot be read or parsed.
    """
    candidates = sorted(_RESULTS_DIR.glob("plans_*.json")) if _RESULTS_DIR.exists() else []
    if not candidates:
        raise HTTPException(404, "No benchmark results found")

    newest = candidates[-1]
    try:
        return json.loads(newest.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc


@router.get("/results/{run_id}")
def get_results_by_run_id(run_id: str) -> dict:
    """Return planning benchmark results for a specific run.

    Args:
        run_id: Identifier of the run; must start with ``plans_`` and name a
            file directly inside the results directory.

    Raises:
        HTTPException: 400 when *run_id* lacks the ``plans_`` prefix or
            contains a path separator or ``..``; 404 when no results file
            exists for it; 500 when the file cannot be read or parsed.
    """
    # run_id comes from the URL and is turned into a filesystem path, so
    # anything that could step outside the results directory is refused.
    escapes_results_dir = "/" in run_id or "\\" in run_id or ".." in run_id
    if not run_id.startswith("plans_") or escapes_results_dir:
        raise HTTPException(400, "Invalid run_id — expected plans_YYYY-MM-DD_HHMMSS")
    f = _results_file(run_id)
    if not f.exists():
        raise HTTPException(404, f"Results not found: {run_id}")
    try:
        return json.loads(f.read_text(encoding="utf-8"))
    except Exception as exc:
        raise HTTPException(500, f"Failed to read results: {exc}") from exc


# ── POST /cancel ───────────────────────────────────────────────────────────────

@router.post("/cancel")
def cancel_plans_benchmark() -> dict:
    """Terminate the running planning benchmark subprocess.

    Calls ``terminate()`` on the tracked process (when there is one), then
    clears the module-level running state.  A failure to terminate is logged
    and does not change the response.

    Returns:
        ``{"status": "cancelled"}``.

    Raises:
        HTTPException: 404 when no benchmark is marked as running.
    """
    global _BENCH_RUNNING, _bench_proc

    if not _BENCH_RUNNING:
        raise HTTPException(404, "No planning benchmark is currently running")

    running_proc = _bench_proc
    if running_proc is not None:
        try:
            running_proc.terminate()
        except Exception as exc:
            logger.warning("Failed to terminate plans benchmark: %s", exc)

    _bench_proc = None
    _BENCH_RUNNING = False
    return {"status": "cancelled"}