feat: build app/eval/cforch.py aggregating eval benchmark routers

2026-05-01 22:23:06 -07:00 · 2026-05-01 22:23:06 -07:00 · bccb385f61
commit bccb385f61
parent d74ad3f972
6 changed files with 80 additions and 181 deletions
--- a/app/api.py
+++ b/app/api.py
@ -147,15 +147,12 @@ from app.models import router as models_router
 import app.models as _models_module
 app.include_router(models_router, prefix="/api/models")

-from app.cforch import router as cforch_router
-app.include_router(cforch_router, prefix="/api/cforch")
+from app.eval.cforch import router as eval_router
+app.include_router(eval_router, prefix="/api")

 from app.imitate import router as imitate_router
 app.include_router(imitate_router, prefix="/api/imitate")

-from app.style import router as style_router
-app.include_router(style_router, prefix="/api/style")
-
 from app.data.fetch import router as fetch_router
 app.include_router(fetch_router, prefix="/api")

@ -163,99 +160,6 @@ app.include_router(fetch_router, prefix="/api")
 from fastapi.responses import StreamingResponse


-# ---------------------------------------------------------------------------
-# Benchmark endpoints
-# ---------------------------------------------------------------------------
-
-@app.get("/api/benchmark/models")
-def get_benchmark_models() -> dict:
-    """Return installed models grouped by adapter_type category."""
-    models_dir: Path = _models_module._MODELS_DIR
-    categories: dict[str, list[dict]] = {
-        "ZeroShotAdapter": [],
-        "RerankerAdapter": [],
-        "GenerationAdapter": [],
-        "Unknown": [],
-    }
-    if models_dir.exists():
-        for sub in models_dir.iterdir():
-            if not sub.is_dir():
-                continue
-            info_path = sub / "model_info.json"
-            adapter_type = "Unknown"
-            repo_id: str | None = None
-            if info_path.exists():
-                try:
-                    info = json.loads(info_path.read_text(encoding="utf-8"))
-                    adapter_type = info.get("adapter_type") or info.get("adapter_recommendation") or "Unknown"
-                    repo_id = info.get("repo_id")
-                except Exception:
-                    pass
-            bucket = adapter_type if adapter_type in categories else "Unknown"
-            entry: dict = {"name": sub.name, "repo_id": repo_id, "adapter_type": adapter_type}
-            categories[bucket].append(entry)
-    return {"categories": categories}
-
-
-@app.get("/api/benchmark/results")
-def get_benchmark_results():
-    """Return the most recently saved benchmark results, or an empty envelope."""
-    path = _DATA_DIR / "benchmark_results.json"
-    if not path.exists():
-        return {"models": {}, "sample_count": 0, "timestamp": None}
-    return json.loads(path.read_text())
-
-
-@app.get("/api/benchmark/run")
-def run_benchmark(include_slow: bool = False, model_names: str = ""):
-    """Spawn the benchmark script and stream stdout as SSE progress events."""
-    python_bin = "/devl/miniconda3/envs/job-seeker-classifiers/bin/python"
-    script = str(_ROOT / "scripts" / "benchmark_classifier.py")
-    cmd = [python_bin, script, "--score", "--save"]
-    if include_slow:
-        cmd.append("--include-slow")
-    if model_names:
-        names = [n.strip() for n in model_names.split(",") if n.strip()]
-        if names:
-            cmd.extend(["--models"] + names)
-
-    def generate():
-        try:
-            proc = _subprocess.Popen(
-                cmd,
-                stdout=_subprocess.PIPE,
-                stderr=_subprocess.STDOUT,
-                text=True,
-                bufsize=1,
-                cwd=str(_ROOT),
-            )
-            _running_procs["benchmark"] = proc
-            _cancelled_jobs.discard("benchmark")  # clear any stale flag from a prior run
-            try:
-                for line in proc.stdout:
-                    line = line.rstrip()
-                    if line:
-                        yield f"data: {json.dumps({'type': 'progress', 'message': line})}\n\n"
-                proc.wait()
-                if proc.returncode == 0:
-                    yield f"data: {json.dumps({'type': 'complete'})}\n\n"
-                elif "benchmark" in _cancelled_jobs:
-                    _cancelled_jobs.discard("benchmark")
-                    yield f"data: {json.dumps({'type': 'cancelled'})}\n\n"
-                else:
-                    yield f"data: {json.dumps({'type': 'error', 'message': f'Process exited with code {proc.returncode}'})}\n\n"
-            finally:
-                _running_procs.pop("benchmark", None)
-        except Exception as exc:
-            yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
-
-    return StreamingResponse(
-        generate(),
-        media_type="text/event-stream",
-        headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
-    )
-
-
 # ---------------------------------------------------------------------------
 # Finetune endpoints
 # ---------------------------------------------------------------------------
@ -347,20 +251,6 @@ def run_finetune_endpoint(
    )


-@app.post("/api/benchmark/cancel")
-def cancel_benchmark():
-    """Kill the running benchmark subprocess. 404 if none is running."""
-    proc = _running_procs.get("benchmark")
-    if proc is None:
-        raise HTTPException(404, "No benchmark is running")
-    _cancelled_jobs.add("benchmark")
-    proc.terminate()
-    try:
-        proc.wait(timeout=3)
-    except _subprocess.TimeoutExpired:
-        proc.kill()
-    return {"status": "cancelled"}
-

@app.post("/api/finetune/cancel")
 def cancel_finetune():
--- a/app/eval/init.py
+++ b/app/eval/init.py
--- a/app/eval/cforch.py
+++ b/app/eval/cforch.py
@ -0,0 +1,38 @@
+"""Avocet -- eval router aggregator.
+
+Collects benchmark sub-routers into a single importable `router`
+for the api.py factory. Each sub-router retains its established prefix
+so no frontend URL changes are needed.
+
+Route prefixes when mounted at /api in api.py:
+  /api/cforch/*      -- cf-orch benchmark routes
+  /api/style/*       -- writing style benchmark routes
+  /api/voice/*       -- voice benchmark routes
+  /api/plans-bench/* -- plans benchmark routes
+"""
+from __future__ import annotations
+
+from fastapi import APIRouter
+
+from app.cforch import router as _cforch_router
+from app.style import router as _style_router
+from app.voice import router as _voice_router
+from app.plans_bench import router as _plans_router
+
+router = APIRouter()
+router.include_router(_cforch_router, prefix="/cforch")
+router.include_router(_style_router, prefix="/style")
+router.include_router(_voice_router, prefix="/voice")
+router.include_router(_plans_router, prefix="/plans-bench")
+
+
+def set_config_dir(path) -> None:
+    """Propagate config dir override to all sub-modules -- used by tests."""
+    import app.cforch as _cforch_mod
+    import app.style as _style_mod
+    import app.voice as _voice_mod
+    import app.plans_bench as _plans_mod
+    _cforch_mod.set_config_dir(path)
+    _style_mod.set_config_dir(path)
+    _voice_mod.set_config_dir(path)
+    _plans_mod.set_config_dir(path)
--- a/app/plans_bench.py
+++ b/app/plans_bench.py
@ -0,0 +1,30 @@
+"""Avocet -- Plans benchmark integration API (stub).
+
+Placeholder module so that app/eval/cforch.py can import and include
+this router. Full implementation follows in a subsequent task.
+
+All endpoints are registered on `router` (a FastAPI APIRouter).
+api.py (via the eval aggregator) includes this router at
+prefix="/api/plans-bench".
+"""
+from __future__ import annotations
+
+from pathlib import Path
+
+from fastapi import APIRouter
+
+router = APIRouter()
+
+_CONFIG_DIR: Path | None = None   # override in tests via set_config_dir()
+
+
+def set_config_dir(path: Path | None) -> None:
+    """Override config directory -- used by tests."""
+    global _CONFIG_DIR
+    _CONFIG_DIR = path
+
+
+@router.get("/status")
+def get_plans_bench_status() -> dict:
+    """Return placeholder status for the plans benchmark module."""
+    return {"status": "not_implemented"}
--- a/tests/test_api.py
+++ b/tests/test_api.py
@ -443,13 +443,6 @@ def test_finetune_run_passes_score_files_to_subprocess(client):

 # ---- Cancel endpoint tests ----

-def test_benchmark_cancel_returns_404_when_not_running(client):
-    """POST /api/benchmark/cancel must return 404 if no benchmark is running."""
-    from app import api as api_module
-    api_module._running_procs.pop("benchmark", None)
-    r = client.post("/api/benchmark/cancel")
-    assert r.status_code == 404
-

 def test_finetune_cancel_returns_404_when_not_running(client):
    """POST /api/finetune/cancel must return 404 if no finetune is running."""
@ -459,24 +452,6 @@ def test_finetune_cancel_returns_404_when_not_running(client):
    assert r.status_code == 404


-def test_benchmark_cancel_terminates_running_process(client):
-    """POST /api/benchmark/cancel must call terminate() on the running process."""
-    from unittest.mock import MagicMock
-    from app import api as api_module
-
-    mock_proc = MagicMock()
-    mock_proc.wait = MagicMock()
-    api_module._running_procs["benchmark"] = mock_proc
-
-    try:
-        r = client.post("/api/benchmark/cancel")
-        assert r.status_code == 200
-        assert r.json()["status"] == "cancelled"
-        mock_proc.terminate.assert_called_once()
-    finally:
-        api_module._running_procs.pop("benchmark", None)
-        api_module._cancelled_jobs.discard("benchmark")
-

 def test_finetune_cancel_terminates_running_process(client):
    """POST /api/finetune/cancel must call terminate() on the running process."""
@ -497,24 +472,6 @@ def test_finetune_cancel_terminates_running_process(client):
        api_module._cancelled_jobs.discard("finetune")


-def test_benchmark_cancel_kills_process_on_timeout(client):
-    """POST /api/benchmark/cancel must call kill() if the process does not exit within 3 s."""
-    import subprocess
-    from unittest.mock import MagicMock
-    from app import api as api_module
-
-    mock_proc = MagicMock()
-    mock_proc.wait.side_effect = subprocess.TimeoutExpired(cmd="benchmark", timeout=3)
-    api_module._running_procs["benchmark"] = mock_proc
-
-    try:
-        r = client.post("/api/benchmark/cancel")
-        assert r.status_code == 200
-        mock_proc.kill.assert_called_once()
-    finally:
-        api_module._running_procs.pop("benchmark", None)
-        api_module._cancelled_jobs.discard("benchmark")
-

 def test_finetune_run_emits_cancelled_event(client):
    """GET /api/finetune/run must emit cancelled (not error) when job was cancelled."""
@ -542,29 +499,3 @@ def test_finetune_run_emits_cancelled_event(client):
    finally:
        api_module._cancelled_jobs.discard("finetune")

-
-def test_benchmark_run_emits_cancelled_event(client):
-    """GET /api/benchmark/run must emit cancelled (not error) when job was cancelled."""
-    from unittest.mock import patch, MagicMock
-    from app import api as api_module
-
-    mock_proc = MagicMock()
-    mock_proc.stdout = iter([])
-    mock_proc.returncode = -15
-
-    def mock_wait():
-        # Simulate cancel being called while the process is running (after discard clears stale flag)
-        api_module._cancelled_jobs.add("benchmark")
-
-    mock_proc.wait = mock_wait
-
-    def mock_popen(cmd, **kwargs):
-        return mock_proc
-
-    try:
-        with patch("app.api._subprocess.Popen",side_effect=mock_popen):
-            r = client.get("/api/benchmark/run")
-        assert '{"type": "cancelled"}' in r.text
-        assert '"type": "error"' not in r.text
-    finally:
-        api_module._cancelled_jobs.discard("benchmark")
--- a/tests/test_cforch.py
+++ b/tests/test_cforch.py
@ -367,3 +367,13 @@ def test_run_passes_license_key_env_to_subprocess(client, config_dir, tmp_path,
        client.get("/api/cforch/run")

    assert captured_env.get("CF_LICENSE_KEY") == "CFG-AVCT-ENV-ONLY-KEY"
+
+
+def test_eval_cforch_router_includes_all_sub_routers():
+    """eval/cforch.py router must include routes from all four sub-routers."""
+    from app.eval.cforch import router
+    paths = {r.path for r in router.routes}
+    assert any("/cforch/" in p for p in paths), f"no /cforch/ routes found in {paths}"
+    assert any("/style/" in p for p in paths), f"no /style/ routes found in {paths}"
+    assert any("/voice/" in p for p in paths), f"no /voice/ routes found in {paths}"
+    assert any("/plans-bench/" in p for p in paths), f"no /plans-bench/ routes found in {paths}"