From bce932461a762f66bf4064243d98b1e3af2f1c48 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Sat, 2 May 2026 23:36:04 -0700
Subject: [PATCH] =?UTF-8?q?feat:=20plans=20benchmark=20harness=20=E2=80=94?=
 =?UTF-8?q?=20model=20scoring=20for=20CF=20planning=20prompts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds benchmark_plans.py script, plans_bench API router, PlansBenchTab Vue
component, and registers /api/plans-bench in api.py. Also extends models
registry (cf-text catalog integration), cforch client, LlmEvalTab, and
ModelsView with cf-orch fleet support. Wires Planning mode into BenchmarkView.
---
 .env.example                    |    4 +
 app/api.py                      |    6 +
 app/cforch.py                   |  244 +++++++-
 app/models.py                   |  151 ++++-
 config/label_tool.yaml.example  |    6 +-
 manage.sh                       |   35 ++
 scripts/benchmark_plans.py      |  719 +++++++++++++++++++++
 scripts/export_plans.py         |  458 ++++++++++++++
 tests/test_cforch.py            |   36 +-
 tests/test_models.py            |   81 +++
 web/src/views/BenchmarkView.vue |   21 +-
 web/src/views/LlmEvalTab.vue    |  242 +++++++
 web/src/views/ModelsView.vue    |  150 ++++-
 web/src/views/PlansBenchTab.vue | 1043 +++++++++++++++++++++++++++++++
 14 files changed, 3137 insertions(+), 59 deletions(-)
 create mode 100644 scripts/benchmark_plans.py
 create mode 100644 scripts/export_plans.py
 create mode 100644 web/src/views/PlansBenchTab.vue

diff --git a/.env.example b/.env.example
index 03c3c9f..cf441a2 100644
--- a/.env.example
+++ b/.env.example
@@ -17,3 +17,7 @@ CF_LICENSE_KEY=CFG-AVCT-xxxx-xxxx-xxxx
 # Set one of these to use a cloud LLM instead of a local model.
 # ANTHROPIC_API_KEY=sk-ant-...
 # OPENAI_API_KEY=sk-...
+
+# ── HuggingFace (required for gated/terms-restricted model downloads) ─────────
+# Generate at https://huggingface.co/settings/tokens and accept model terms first.
+# HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
diff --git a/app/api.py b/app/api.py
index 9709115..eb2e151 100644
--- a/app/api.py
+++ b/app/api.py
@@ -34,6 +34,12 @@ app.include_router(eval_router, prefix="/api")
 from app.train.train import router as train_router
 app.include_router(train_router, prefix="/api/train")
 
+from app.plans_bench import router as plans_bench_router
+app.include_router(plans_bench_router, prefix="/api/plans-bench")
+
+# In-memory last-action store (single user, local tool — in-memory is fine)
+_last_action: dict | None = None
+
 from app.dashboard import router as dashboard_router
 app.include_router(dashboard_router, prefix="/api")
 
diff --git a/app/cforch.py b/app/cforch.py
index ccab705..60c8221 100644
--- a/app/cforch.py
+++ b/app/cforch.py
@@ -17,9 +17,12 @@ import logging
 import os
 import re
 import subprocess as _subprocess
+import tempfile
 from pathlib import Path
 from typing import Any
 
+import urllib.parse
+
 import yaml
 from fastapi import APIRouter, HTTPException
 from fastapi.responses import StreamingResponse
@@ -75,9 +78,31 @@ def _load_cforch_config() -> dict:
         "license_key":     _coalesce(file_cfg.get("license_key", ""),     "CF_LICENSE_KEY"),
         "ollama_url":      _coalesce(file_cfg.get("ollama_url", ""),       "OLLAMA_HOST"),
         "ollama_model":    _coalesce(file_cfg.get("ollama_model", ""),     "OLLAMA_MODEL"),
+        "judge_url":       _coalesce(file_cfg.get("judge_url", ""),        "CF_JUDGE_URL"),
+        "hf_token":        _coalesce(file_cfg.get("hf_token", ""),         "HF_TOKEN"),
     }
 
 
+def _validate_service_url(url: str, param_name: str) -> str:
+    """Validate that a URL is a well-formed http/https URL with a hostname.
+
+    Guards against SSRF: only http/https is allowed; the URL must have a
+    non-empty host. Does not enforce an allowlist — call sites are internal
+    tooling, not a public API.
+    """
+    if not url:
+        return url
+    try:
+        parsed = urllib.parse.urlparse(url)
+    except Exception:
+        raise HTTPException(400, f"{param_name}: not a valid URL")
+    if parsed.scheme not in ("http", "https"):
+        raise HTTPException(400, f"{param_name}: URL must start with http:// or https://")
+    if not parsed.hostname:
+        raise HTTPException(400, f"{param_name}: URL has no hostname")
+    return url
+
+
 def _strip_ansi(text: str) -> str:
     """Remove ANSI escape codes from a string."""
     return re.sub(r'\x1b\[[0-9;]*m', '', text)
@@ -147,48 +172,141 @@ def get_tasks() -> dict:
 
 # ── GET /models ────────────────────────────────────────────────────────────────
 
+# Services and roles surfaced in the benchmark model picker.
+# Covers all cf-orch service types that benchmark.py can route tasks to.
+_BENCH_SERVICES = frozenset({
+    "cf-text", "vllm",         # LLM text generation
+    "cf-stt",                  # speech-to-text
+    "cf-tts",                  # text-to-speech
+    "cf-vision",               # image classification / embedding
+    "cf-voice",                # audio context classification
+})
+_BENCH_ROLES = frozenset({
+    "generator", "vlm",        # LLM roles
+    "stt", "alm",              # speech recognition
+    "tts",                     # speech synthesis
+    "vision", "embedding",     # image understanding
+    "classifier",              # audio classification (cf-voice)
+})
+
+
 @router.get("/models")
 def get_models() -> dict:
-    """Return model list from bench_models.yaml."""
+    """Return model list from bench_models.yaml merged with locally installed models.
+
+    bench_models.yaml entries are listed first and take precedence; any installed
+    model whose repo_id is already present in the YAML is skipped.  Only models
+    whose service is in _BENCH_SERVICES (cf-text, vllm, cf-stt, cf-tts, cf-vision,
+    cf-voice) are surfaced from the installed registry.
+    """
     cfg = _load_cforch_config()
     models_path = cfg.get("bench_models", "")
-    if not models_path:
-        return {"models": []}
 
-    p = Path(models_path)
-    if not p.exists():
-        return {"models": []}
-
-    try:
-        raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
-    except yaml.YAMLError as exc:
-        logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
-        return {"models": []}
-
-    models_raw = raw.get("models", []) or []
     models: list[dict] = []
-    for m in models_raw:
-        if not isinstance(m, dict):
-            continue
-        models.append({
-            "name": m.get("name", ""),
-            "id": m.get("id", ""),
-            "service": m.get("service", "ollama"),
-            "tags": m.get("tags", []) or [],
-            "vram_estimate_mb": m.get("vram_estimate_mb", 0),
-        })
+    bench_ids: set[str] = set()
+
+    if models_path:
+        p = Path(models_path)
+        if p.exists():
+            try:
+                raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
+            except yaml.YAMLError as exc:
+                logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
+                raw = {}
+            for m in (raw.get("models", []) or []):
+                if not isinstance(m, dict):
+                    continue
+                model_id = m.get("id", "")
+                models.append({
+                    "name": m.get("name", ""),
+                    "id": model_id,
+                    "service": m.get("service", "ollama"),
+                    "tags": m.get("tags", []) or [],
+                    "vram_estimate_mb": m.get("vram_estimate_mb", 0),
+                })
+                if model_id:
+                    bench_ids.add(model_id)
+
+    # Merge installed generator models not already in bench_models.yaml.
+    try:
+        from app.models import list_installed  # local import avoids circular dependency at module load
+        for installed in list_installed():
+            model_id: str = installed.get("model_id") or ""
+            service: str = installed.get("service") or ""
+            role: str = installed.get("role") or ""
+            if not model_id:
+                continue
+            if service not in _BENCH_SERVICES or role not in _BENCH_ROLES:
+                continue
+            if model_id in bench_ids:
+                continue
+            display_name = model_id.split("/", 1)[-1] if "/" in model_id else model_id
+            models.append({
+                "name": display_name,
+                "id": model_id,
+                "service": service,
+                "tags": [role],
+                "vram_estimate_mb": installed.get("vram_mb") or 0,
+            })
+            bench_ids.add(model_id)
+    except Exception as exc:
+        logger.warning("Could not merge installed models into model list: %s", exc)
 
     return {"models": models}
 
 
 # ── GET /run ───────────────────────────────────────────────────────────────────
 
+@router.get("/nodes")
+def get_nodes() -> dict:
+    """Proxy the coordinator's /api/nodes list, returning node_id + online status.
+
+    Online is inferred from last_heartbeat: any node with a recent heartbeat is online.
+    Returns an empty list if the coordinator is unreachable.
+    """
+    cfg = _load_cforch_config()
+    coordinator_url = cfg.get("coordinator_url", "").rstrip("/")
+    if not coordinator_url:
+        return {"nodes": []}
+    try:
+        import httpx as _httpx
+        resp = _httpx.get(f"{coordinator_url}/api/nodes", timeout=5.0)
+        resp.raise_for_status()
+        raw_nodes = resp.json().get("nodes", [])
+        return {
+            "nodes": [
+                {
+                    "node_id": n.get("node_id", ""),
+                    "online": n.get("last_heartbeat") is not None,
+                    "gpus": [
+                        {
+                            "gpu_id": g.get("gpu_id"),
+                            "name": g.get("name", ""),
+                            "vram_total_mb": g.get("vram_total_mb", 0),
+                            "vram_free_mb": g.get("vram_free_mb", 0),
+                        }
+                        for g in n.get("gpus", [])
+                    ],
+                }
+                for n in raw_nodes
+            ]
+        }
+    except Exception as exc:
+        logger.warning("Could not fetch nodes from coordinator: %s", exc)
+        return {"nodes": []}
+
+
 @router.get("/run")
 def run_benchmark(
     task_ids: str = "",
+    model_ids: str = "",
     model_tags: str = "",
     coordinator_url: str = "",
     ollama_url: str = "",
+    judge_url: str = "",
+    judge_backend: str = "chat",
+    workers: int = 1,
+    node_ids: str = "",
 ) -> StreamingResponse:
     """Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
     global _BENCH_RUNNING, _bench_proc
@@ -205,6 +323,13 @@ def run_benchmark(
     cfg_coordinator = cfg.get("coordinator_url", "")
     cfg_ollama      = cfg.get("ollama_url", "")
     cfg_license_key = cfg.get("license_key", "")
+    cfg_judge_url   = cfg.get("judge_url", "")
+
+    # Validate URL params before spawning the subprocess.
+    # _validate_service_url raises HTTPException on bad input (caught by FastAPI before streaming starts).
+    _validate_service_url(coordinator_url, "coordinator_url")
+    _validate_service_url(ollama_url, "ollama_url")
+    _validate_service_url(judge_url, "judge_url")
 
     def generate():
         global _BENCH_RUNNING, _bench_proc
@@ -213,16 +338,68 @@ def run_benchmark(
             yield f"data: {json.dumps({'type': 'error', 'message': 'bench_script not configured or not found'})}\n\n"
             return
 
+        # Build effective models file: bench_models.yaml + any installed models
+        # whose IDs were selected but are absent from the YAML (e.g. downloaded
+        # via the Models view). Written to a temp file so benchmark.py sees one
+        # unified list; cleaned up in the finally block.
+        effective_models_file = bench_models
+        _tmp_models_path: str | None = None
+
+        if model_ids and bench_models and Path(bench_models).exists():
+            requested_ids = set(model_ids.split(","))
+            try:
+                raw_bench = yaml.safe_load(Path(bench_models).read_text(encoding="utf-8")) or {}
+                bench_entries: list[dict] = raw_bench.get("models", []) or []
+                bench_id_set = {m.get("id", "") for m in bench_entries if isinstance(m, dict)}
+                missing_ids = requested_ids - bench_id_set
+                if missing_ids:
+                    from app.models import list_installed
+                    installed_map = {
+                        m["model_id"]: m
+                        for m in list_installed()
+                        if m.get("model_id") and m.get("service") in _BENCH_SERVICES
+                    }
+                    extra: list[dict] = []
+                    for mid in missing_ids:
+                        if mid in installed_map:
+                            inst = installed_map[mid]
+                            entry: dict[str, Any] = {
+                                "id": mid,
+                                "name": mid.split("/", 1)[-1] if "/" in mid else mid,
+                                "service": inst.get("service", "cf-text"),
+                                "vram_estimate_mb": inst.get("vram_mb") or 0,
+                                "tags": [inst.get("role", "generator")],
+                                "temperature": 0.0,
+                            }
+                            local_path = inst.get("path", "") or inst.get("local_path", "")
+                            if local_path:
+                                entry["model_path"] = local_path
+                            extra.append(entry)
+                    if extra:
+                        merged = {"models": bench_entries + extra}
+                        tf = tempfile.NamedTemporaryFile(
+                            mode="w", suffix=".yaml", delete=False,
+                            prefix="avocet_bench_models_",
+                        )
+                        yaml.dump(merged, tf)
+                        tf.close()
+                        _tmp_models_path = tf.name
+                        effective_models_file = _tmp_models_path
+            except Exception as exc:
+                logger.warning("Could not merge installed models into temp bench file: %s", exc)
+
         cmd = [
             python_bin,
             bench_script,
             "--tasks", bench_tasks,
-            "--models", bench_models,
+            "--models", effective_models_file,
             "--output", results_dir,
         ]
 
         if task_ids:
             cmd.extend(["--filter-tasks"] + task_ids.split(","))
+        if model_ids:
+            cmd.extend(["--filter-models"] + model_ids.split(","))
         if model_tags:
             cmd.extend(["--filter-tags"] + model_tags.split(","))
 
@@ -233,6 +410,15 @@ def run_benchmark(
             cmd.extend(["--coordinator", effective_coordinator])
         if effective_ollama:
             cmd.extend(["--ollama-url", effective_ollama])
+        effective_judge = judge_url if judge_url else cfg_judge_url
+        if effective_judge:
+            cmd.extend(["--judge-url", effective_judge])
+        if judge_backend and judge_backend != "chat":
+            cmd.extend(["--judge-backend", judge_backend])
+        if workers > 1:
+            cmd.extend(["--workers", str(workers)])
+        if node_ids:
+            cmd.extend(["--nodes"] + node_ids.split(","))
 
         # Pass license key as env var so subprocess can authenticate with cf-orch
         proc_env = {**os.environ}
@@ -273,6 +459,11 @@ def run_benchmark(
             yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
         finally:
             _BENCH_RUNNING = False
+            if _tmp_models_path:
+                try:
+                    os.unlink(_tmp_models_path)
+                except OSError:
+                    pass
 
     return StreamingResponse(
         generate(),
@@ -295,6 +486,7 @@ def get_cforch_config() -> dict:
         "coordinator_url": cfg.get("coordinator_url", ""),
         "ollama_url":      cfg.get("ollama_url", ""),
         "ollama_model":    cfg.get("ollama_model", ""),
+        "judge_url":       cfg.get("judge_url", ""),
         "license_key_set": bool(cfg.get("license_key", "")),
         "source": "env" if not _config_file().exists() else "yaml+env",
     }
@@ -303,7 +495,7 @@ def get_cforch_config() -> dict:
 # ── GET /results ───────────────────────────────────────────────────────────────
 
 @router.get("/results")
-def get_results() -> dict:
+def get_results() -> list:
     """Return the latest benchmark summary.json from results_dir."""
     cfg = _load_cforch_config()
     results_dir = cfg.get("results_dir", "")
diff --git a/app/models.py b/app/models.py
index c332a34..b4c05ae 100644
--- a/app/models.py
+++ b/app/models.py
@@ -15,6 +15,7 @@ from __future__ import annotations
 import json
 import logging
 import os
+import re
 import shutil
 import threading
 from datetime import datetime, timezone
@@ -60,6 +61,30 @@ _CF_ORCH_PROFILES_DIR: Path = Path(
 
 router = APIRouter()
 
+# ── HuggingFace auth ─────────────────────────────────────────────────────────
+
+def _get_hf_token() -> str | None:
+    """Return HF token from label_tool.yaml, then HF_TOKEN / HUGGING_FACE_HUB_TOKEN env vars."""
+    config_file = _ROOT / "config" / "label_tool.yaml"
+    if config_file.exists():
+        try:
+            import yaml as _yaml
+            raw = _yaml.safe_load(config_file.read_text(encoding="utf-8")) or {}
+            token = (raw.get("hf_token") or raw.get("cforch", {}).get("hf_token") or "").strip()
+            if token:
+                return token
+        except Exception:
+            pass
+    return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
+
+
+# ── GGUF quantization detection ───────────────────────────────────────────────
+# Matches quant identifiers in GGUF filenames: Q4_K_M, Q8_0, F16, IQ3_M, etc.
+_QUANT_RE = re.compile(
+    r'[._-]((?:IQ\d|Q\d)[A-Z0-9_]*|F16|BF16)\.gguf$',
+    re.IGNORECASE,
+)
+
 # ── Download progress shared state ────────────────────────────────────────────
 # Updated by the background download thread; read by GET /download/stream.
 _download_progress: dict[str, Any] = {}
@@ -91,12 +116,15 @@ _TAG_TO_INFO: dict[str, _TagInfo] = {
     "audio-classification":           {"adapter": None, "role": "classifier", "service": "cf-voice"},
     # TTS — cf-tts text-to-speech service
     "text-to-speech":                 {"adapter": None, "role": "tts",       "service": "cf-tts"},
-    # Vision — cf-vision image classification / embedding / VLM service
+    # Vision classifiers / embedders — cf-vision (SigLIP/CLIP-style models)
     "image-classification":           {"adapter": None, "role": "vision",    "service": "cf-vision"},
     "zero-shot-image-classification": {"adapter": None, "role": "vision",    "service": "cf-vision"},
     "image-feature-extraction":       {"adapter": None, "role": "embedding", "service": "cf-vision"},
-    "image-text-to-text":             {"adapter": None, "role": "vlm",       "service": "cf-vision"},
-    "visual-question-answering":      {"adapter": None, "role": "vlm",       "service": "cf-vision"},
+    # Generative VLMs (image+text → text) — run under vllm, not cf-vision.
+    # cf-vision is a classifier/embedder service; generative VLMs like Qwen-VL,
+    # LLaVA, and InternVL are textgen models that happen to accept image inputs.
+    "image-text-to-text":             {"adapter": None, "role": "vlm",       "service": "vllm"},
+    "visual-question-answering":      {"adapter": None, "role": "vlm",       "service": "vllm"},
     # Image generation — cf-image (text → image; distinct from cf-vision image understanding)
     "text-to-image":                  {"adapter": None, "role": "image-gen", "service": "cf-image"},
     # Embedding — cf-core shared embedding layer
@@ -195,10 +223,17 @@ def _get_queue_entry(entry_id: str) -> dict | None:
 def _catalog_key(repo_id: str) -> str:
     """Derive a readable catalog key from repo_id.
 
-    ibm-granite/granite-4.1-8b  →  granite-4.1-8b
-    facebook/bart-large-cnn     →  bart-large-cnn
+    ibm-granite/granite-4.1-8b              →  granite-4.1-8b
+    facebook/bart-large-cnn                 →  bart-large-cnn
+    WithinUsAI/Opus4.7-GODs.Ghost.Codex-4B.GGuF  →  opus4.7-gods.ghost.codex-4b
+
+    The coordinator skips catalog lookup for keys ending in ".gguf" (treats them
+    as direct file paths). Strip the suffix so GGUF repo names produce valid keys.
     """
-    return repo_id.split("/", 1)[-1].lower()
+    key = repo_id.split("/", 1)[-1].lower()
+    if key.endswith(".gguf"):
+        key = key[:-5]
+    return key
 
 
 def _insert_catalog_entry(content: str, entry_lines: str) -> str:
@@ -290,6 +325,15 @@ def _register_in_node_catalogs(
             max_mb: int = cf_text.get("max_mb", 0)
             catalog: dict = cf_text.get("catalog") or {}
 
+            # If the node has a different local model dir, remap the NFS path.
+            model_base = cf_text.get("model_base_path", "").rstrip("/")
+            if model_base:
+                nfs_base = str(_CF_TEXT_MODELS_DIR).rstrip("/")
+                model_name = local_path.name
+                effective_path_str = f"{model_base}/{model_name}"
+            else:
+                effective_path_str = local_path_str
+
             # Skip if key already exists
             if model_key in catalog:
                 logger.debug("Key %r already in %s — skipping", model_key, yaml_file.name)
@@ -301,10 +345,10 @@ def _register_in_node_catalogs(
                 for entry in catalog.values()
                 if isinstance(entry, dict)
             }
-            if local_path_str in registered_paths or any(
-                p.startswith(local_path_str + "/") for p in registered_paths
+            if effective_path_str in registered_paths or any(
+                p.startswith(effective_path_str + "/") for p in registered_paths
             ):
-                logger.debug("Path %s already registered in %s — skipping", local_path_str, yaml_file.name)
+                logger.debug("Path %s already registered in %s — skipping", effective_path_str, yaml_file.name)
                 continue
 
             # Determine whether model fits at FP16 or needs 4-bit
@@ -330,12 +374,18 @@ def _register_in_node_catalogs(
                 if needs_4bit
                 else f"  # FP16 file-size estimate"
             )
+            env_block = (
+                f"        env:\n"
+                f"          CF_TEXT_4BIT: \"1\"\n"
+                if needs_4bit else ""
+            )
             entry_block = (
                 f"      # auto-registered by avocet on download\n"
                 f"      {model_key}:\n"
-                f"        path: {local_path_str}\n"
+                f"        path: {effective_path_str}\n"
                 f"        vram_mb: {vram_for_node}{vram_comment}\n"
                 f"        description: \"{desc}\"\n"
+                f"{env_block}"
             )
 
             new_content = _insert_catalog_entry(content, entry_block)
@@ -388,12 +438,17 @@ def _run_download(
     role: str | None = None,
     service: str | None = None,
     model_size_bytes: int = 0,
+    quant_pattern: str | None = None,
 ) -> None:
     """Background thread: download model via huggingface_hub.snapshot_download.
 
     model_size_bytes is the sum of file sizes reported by the HF API (siblings).
     It is used to estimate vram_mb and written to model_info.json so cf-orch can
     budget VRAM when allocating a cf-text instance for this model.
+
+    quant_pattern: when set, restricts snapshot_download to only files matching
+    *{quant_pattern}*.gguf (plus metadata). Avoids downloading every quant variant
+    from GGUF-only repos like bartowski/*.
     """
     global _download_progress
     local_dir = _model_dir_for(repo_id, service)
@@ -422,10 +477,20 @@ def _run_download(
 
         local_dir.mkdir(parents=True, exist_ok=True)
         poll_thread.start()
-        snapshot_download(
-            repo_id=repo_id,
-            local_dir=str(local_dir),
-        )
+
+        dl_kwargs: dict[str, Any] = {"repo_id": repo_id, "local_dir": str(local_dir)}
+        hf_token = _get_hf_token()
+        if hf_token:
+            dl_kwargs["token"] = hf_token
+        if quant_pattern:
+            # Include both cases: repos use mixed conventions (Q6_K vs q6_k).
+            dl_kwargs["allow_patterns"] = [
+                f"*{quant_pattern.upper()}*.gguf",
+                f"*{quant_pattern.lower()}*.gguf",
+                "*.json",
+                "README.md",
+            ]
+        snapshot_download(**dl_kwargs)
 
         # Estimate VRAM from reported file size.
         # HF siblings sizes are pre-quantisation file sizes; add 10% for KV cache
@@ -531,9 +596,31 @@ def lookup_model(repo_id: str) -> dict:
         )
         logger.warning("Unsupported pipeline_tag %r for %s", pipeline_tag, repo_id)
 
-    # Estimate model size from siblings list
+    # Detect GGUF files and parse quant names from siblings list.
+    # For GGUF-only repos (bartowski, TheBloke, etc.) this lets the UI show
+    # a per-quant size picker instead of downloading every variant.
     siblings = data.get("siblings") or []
-    model_size_bytes: int = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))
+    gguf_files: list[dict] = []
+    for s in siblings:
+        if not isinstance(s, dict):
+            continue
+        fname: str = s.get("rfilename", "")
+        if not fname.lower().endswith(".gguf"):
+            continue
+        m = _QUANT_RE.search(fname)
+        gguf_files.append({
+            "filename": fname,
+            "size": s.get("size", 0) or 0,
+            "quant_name": m.group(1).upper() if m else None,
+        })
+    gguf_files.sort(key=lambda f: f["size"])
+
+    # model_size_bytes: total of all siblings (for non-GGUF repos) or all GGUFs only.
+    # For GGUF repos the frontend will substitute the selected quant's size on submit.
+    if gguf_files:
+        model_size_bytes: int = sum(f["size"] for f in gguf_files)
+    else:
+        model_size_bytes = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))
 
     # Description: first 300 chars of card data (modelId field used as fallback)
     card_data = data.get("cardData") or {}
@@ -549,6 +636,7 @@ def lookup_model(repo_id: str) -> dict:
         "compatible": compatible,
         "warning": warning,
         "model_size_bytes": model_size_bytes,
+        "gguf_files": gguf_files if gguf_files else None,
         "description": description,
         "tags": data.get("tags") or [],
         "downloads": data.get("downloads") or 0,
@@ -579,6 +667,9 @@ class QueueAddRequest(BaseModel):
     # Stored in the queue entry so approve can pass it to _run_download
     # without a second HF API round-trip.
     model_size_bytes: int = 0
+    # GGUF quantization pattern (e.g. "Q5_K_M"). When set, snapshot_download
+    # restricts to *{quant_pattern}*.gguf instead of fetching all variants.
+    quant_pattern: str | None = None
 
 
 @router.post("/queue", status_code=201)
@@ -597,6 +688,7 @@ def add_to_queue(req: QueueAddRequest) -> dict:
         "role": req.role,
         "service": req.service,
         "model_size_bytes": req.model_size_bytes,
+        "quant_pattern": req.quant_pattern,
         "status": "pending",
         "queued_at": datetime.now(timezone.utc).isoformat(),
     }
@@ -629,6 +721,7 @@ def approve_queue_entry(entry_id: str) -> dict:
             entry.get("role"),
             entry.get("service"),
             entry.get("model_size_bytes", 0),
+            entry.get("quant_pattern"),
         ),
         daemon=True,
         name=f"model-download-{entry_id}",
@@ -638,6 +731,32 @@ def approve_queue_entry(entry_id: str) -> dict:
     return {"ok": True}
 
 
+# ── PATCH /queue/{id} ─────────────────────────────────────────────────────────
+
+class QueuePatchRequest(BaseModel):
+    service: str | None = None
+    role: str | None = None
+
+
+@router.patch("/queue/{entry_id}")
+def patch_queue_entry(entry_id: str, body: QueuePatchRequest) -> dict:
+    """Update mutable fields (service, role) on a pending queue entry."""
+    entry = _get_queue_entry(entry_id)
+    if entry is None:
+        raise HTTPException(404, f"Queue entry {entry_id!r} not found")
+    if entry.get("status") != "pending":
+        raise HTTPException(409, f"Only pending entries can be patched (current: {entry.get('status')!r})")
+
+    updates: dict = {}
+    if body.service is not None:
+        updates["service"] = body.service
+    if body.role is not None:
+        updates["role"] = body.role
+
+    updated = _update_queue_entry(entry_id, updates)
+    return updated or {}
+
+
 # ── DELETE /queue/{id} ─────────────────────────────────────────────────────────
 
 @router.delete("/queue/{entry_id}")
diff --git a/config/label_tool.yaml.example b/config/label_tool.yaml.example
index 3407a0f..3542e9d 100644
--- a/config/label_tool.yaml.example
+++ b/config/label_tool.yaml.example
@@ -41,11 +41,15 @@ cforch:
   # Python interpreter with cf-orch installed
   python_bin:   /devl/miniconda3/envs/cf/bin/python
 
-  # Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST
+  # Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST / CF_JUDGE_URL / HF_TOKEN
   # coordinator_url: http://localhost:7700
   # license_key:     CFG-AVCT-xxxx-xxxx-xxxx
   # ollama_url:      http://localhost:11434
   # ollama_model:    llama3.2:3b
+  # judge_url:       http://10.1.10.158:8008  # Sif cf-text — LLM-as-judge secondary scorer
+  # judge_url:       http://10.1.10.71:8008   # Heimdall cf-text (alternative)
+  # Or set CF_JUDGE_URL. Populates the Judge URL field in the LLM Eval UI automatically.
+  # hf_token:        hf_xxxxxxxxxxxxxxxxxxxx  # HuggingFace token — required for gated/terms-restricted models
 
 # Imitate tab — pull real samples from sibling CF product APIs and run them
 # through local LLMs to build a corrections dataset.
diff --git a/manage.sh b/manage.sh
index daf147d..0a02d1f 100755
--- a/manage.sh
+++ b/manage.sh
@@ -90,6 +90,12 @@ usage() {
     echo -e "    ${GREEN}score [args]${NC}             Shortcut: --score [args]"
     echo -e "    ${GREEN}compare [args]${NC}           Shortcut: --compare [args]"
     echo ""
+    echo "  Planning Benchmark:"
+    echo -e "    ${GREEN}plans-bench [args]${NC}       Run benchmark_plans.py (args passed through)"
+    echo -e "    ${GREEN}plans-list${NC}               Shortcut: --list-models"
+    echo -e "    ${GREEN}plans-run <model> [args]${NC} Run a single model (--verbose auto-added)"
+    echo -e "    ${GREEN}plans-compare <m1> <m2> [more]${NC}  Compare models side-by-side"
+    echo ""
     echo "  Writing Style Benchmark:"
     echo -e "    ${GREEN}style-bench [args]${NC}       Run benchmark_style.py (args passed through)"
     echo -e "    ${GREEN}style-list${NC}               List available ollama models for style bench"
@@ -127,6 +133,8 @@ case "$CMD" in
         fi
         mkdir -p "$LOG_DIR"
         API_LOG="${LOG_DIR}/api.log"
+        # Load .env if present — sets HF_TOKEN and other optional overrides.
+        [[ -f .env ]] && set -a && source .env && set +a
         info "Building Vue SPA…"
         (cd web && npm run build) >> "$API_LOG" 2>&1
         info "Starting FastAPI on port ${API_PORT}…"
@@ -179,6 +187,9 @@ case "$CMD" in
         mkdir -p "$LOG_DIR"
         DEV_API_LOG="${LOG_DIR}/dev-api.log"
 
+        # Load .env if present — sets HF_TOKEN and other optional overrides.
+        [[ -f .env ]] && set -a && source .env && set +a
+
         if [[ -f "$DEV_API_PID_FILE" ]] && kill -0 "$(<"$DEV_API_PID_FILE")" 2>/dev/null; then
             warn "Dev API already running (PID $(<"$DEV_API_PID_FILE"))"
         else
@@ -255,6 +266,30 @@ case "$CMD" in
         exec "$0" benchmark --compare "$@"
         ;;
 
+    plans-bench)
+        info "Running planning benchmark (${ENV_UI})…"
+        "$PYTHON_UI" scripts/benchmark_plans.py "$@"
+        ;;
+
+    plans-list)
+        exec "$0" plans-bench --list-models
+        ;;
+
+    plans-run)
+        if [[ $# -lt 1 ]]; then
+            error "Usage: ./manage.sh plans-run <model-key> [extra args]"
+        fi
+        MODEL="$1"; shift
+        exec "$0" plans-bench --model "$MODEL" --verbose "$@"
+        ;;
+
+    plans-compare)
+        if [[ $# -lt 2 ]]; then
+            error "Usage: ./manage.sh plans-compare <model1> <model2> [more…]"
+        fi
+        exec "$0" plans-bench --compare "$@" --verbose
+        ;;
+
     style-bench)
         info "Running writing style benchmark (${ENV_BM})…"
         if [[ ! -x "$PYTHON_BM" ]]; then
diff --git a/scripts/benchmark_plans.py b/scripts/benchmark_plans.py
new file mode 100644
index 0000000..0b02fa7
--- /dev/null
+++ b/scripts/benchmark_plans.py
@@ -0,0 +1,719 @@
+#!/usr/bin/env python
+"""CF-specific planning benchmark — compare base models before fine-tuning.
+
+Sends held-out CircuitForge planning prompts to one or more models via the
+cf-text (local) or cf-orch API, then scores responses against CF-specific
+rubrics. Use this to select the best base model for SFT.
+
+Scoring rubrics (each 0-1, summed to total/N):
+  - task_structure    : uses checkbox syntax (- [ ]), git commit steps
+  - tier_awareness    : mentions Free/Paid/Premium/Ultra tiers
+  - privacy_pillar    : mentions privacy/local-inference/no-logging
+  - safety_pillar     : mentions safety, human approval, or reversibility
+  - accessibility     : mentions ND/accessibility/adaptive needs
+  - license_split     : mentions MIT vs BSL or open-core model
+  - file_paths        : uses plausible file path references
+  - cf_conventions    : uses conda run -n cf, /Library/Development/, or known CF dirs
+  - paired_coherence  : (paired only) plan references the design doc's feature name
+  - length_ok         : 300–2500 words (under-short = hallucination risk; over-long = padding)
+
+Usage
+-----
+    # List available model targets
+    python scripts/benchmark_plans.py --list-models
+
+    # Run all held-out prompts against a single model, print report
+    python scripts/benchmark_plans.py --model llama3.2-3b
+
+    # Compare two models side-by-side
+    python scripts/benchmark_plans.py --compare llama3.2-3b mistral-7b
+
+    # Run with a custom API base (cf-text default: http://localhost:8080/v1)
+    python scripts/benchmark_plans.py --model llama3.2-3b --api-base http://localhost:8080/v1
+
+    # Export detailed results JSON
+    python scripts/benchmark_plans.py --model llama3.2-3b --output data/bench_results.json
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+# ── Paths ──────────────────────────────────────────────────────────────────────
+
+_ROOT = Path(__file__).parent.parent
+_DATA_DIR = _ROOT / "data"
+
+CF_TEXT_BASE   = "http://localhost:8080/v1"
+CF_ORCH_BASE   = "http://localhost:8090/v1"
+CF_COORD_URL   = "http://10.1.10.71:7700"   # cf-orch coordinator (LAN)
+
+# ── Held-out prompts ───────────────────────────────────────────────────────────
+# These are NOT in the training export (no matching docs in circuitforge-plans/).
+# Each prompt exercises a different CF planning domain.
+
+HELD_OUT_PROMPTS: list[dict[str, Any]] = [
+    {
+        "id": "ho_001",
+        "name": "kiwi_barcode_ocr",
+        "domain": "feature_plan",
+        "prompt": (
+            "You are a senior engineer on Kiwi, a CircuitForge pantry-tracking product. "
+            "Write a detailed implementation plan for adding barcode scanning via device camera "
+            "and receipt OCR to the item-add flow.\n\n"
+            "The plan should include: file structure (create/modify), step-by-step task checklist "
+            "with checkboxes, any DB migrations, and git commit steps."
+        ),
+        "expected_signals": ["task_structure", "file_paths", "cf_conventions"],
+    },
+    {
+        "id": "ho_002",
+        "name": "peregrine_ats_scoring",
+        "domain": "feature_design",
+        "prompt": (
+            "Write a design document for Peregrine: ATS keyword scoring for job applications.\n\n"
+            "Context: Peregrine users paste job descriptions and their resume. "
+            "We want to score how well the resume keywords match the JD and suggest rewrites. "
+            "Describe the architecture, data flow, and key design decisions."
+        ),
+        "expected_signals": ["privacy_pillar", "tier_awareness", "license_split"],
+    },
+    {
+        "id": "ho_003",
+        "name": "tier_gate_local_llm",
+        "domain": "architecture",
+        "prompt": (
+            "Design the tier-gating architecture for a new CircuitForge product. "
+            "The product should:\n"
+            "- Default to local LLM inference for all tiers\n"
+            "- Unlock cloud LLM for Paid tier and above\n"
+            "- Keep fine-tuned model weights for Premium/Ultra only\n\n"
+            "Describe how the tier check integrates with the LLM router, "
+            "what happens when a Free user tries a Paid-tier feature, "
+            "and how BYOK (bring-your-own-key) fits in."
+        ),
+        "expected_signals": ["tier_awareness", "privacy_pillar", "license_split"],
+    },
+    {
+        "id": "ho_004",
+        "name": "heimdall_webhook_plan",
+        "domain": "feature_plan",
+        "prompt": (
+            "Break the following Heimdall feature into a detailed implementation plan with "
+            "file structure and task checkboxes — Stripe webhook handler for subscription lifecycle.\n\n"
+            "Heimdall is the CircuitForge license server (FastAPI + SQLite). "
+            "The webhook needs to handle checkout.session.completed, "
+            "customer.subscription.updated, and customer.subscription.deleted events."
+        ),
+        "expected_signals": ["task_structure", "file_paths", "safety_pillar"],
+    },
+    {
+        "id": "ho_005",
+        "name": "nd_accessible_onboarding",
+        "domain": "ux_design",
+        "prompt": (
+            "You are a product designer working on Harrier, a CircuitForge tool for "
+            "helping people navigate government benefits applications.\n\n"
+            "Design the onboarding flow for neurodivergent (ND) users. "
+            "Consider: ADHD time-blindness, executive function challenges, demand avoidance, "
+            "and rejection sensitivity. The flow should reduce cognitive load and "
+            "never use urgency or panic patterns."
+        ),
+        "expected_signals": ["accessibility", "safety_pillar", "privacy_pillar"],
+    },
+    {
+        "id": "ho_006",
+        "name": "circuitforge_core_extraction",
+        "domain": "architecture",
+        "prompt": (
+            "Produce a CircuitForge-style design document for the following circuitforge-core "
+            "feature — shared ActivityPub federation module.\n\n"
+            "Background: Multiple CF products (Kiwi, Rook, Snipe) want to publish updates "
+            "to ActivityPub. Build it once in cf-core (MIT licensed) so all products can use it. "
+            "Design the module API, describe what belongs in MIT vs BSL, and note federation "
+            "privacy constraints."
+        ),
+        "expected_signals": ["license_split", "privacy_pillar", "cf_conventions"],
+    },
+    {
+        "id": "ho_007",
+        "name": "snipe_trust_score_plan",
+        "domain": "feature_plan",
+        "prompt": (
+            "You are a senior engineer on Snipe, a CircuitForge eBay trust-scoring tool. "
+            "Write a step-by-step engineering plan for: seller trust score calculation.\n\n"
+            "The score should combine: feedback ratio, account age, item-specifics completeness, "
+            "listing photo quality, and shipping time accuracy. "
+            "Include file structure, test plan, and migration steps."
+        ),
+        "expected_signals": ["task_structure", "file_paths", "safety_pillar"],
+    },
+    {
+        "id": "ho_008",
+        "name": "avocet_training_pipeline",
+        "domain": "feature_plan",
+        "prompt": (
+            "Break the following Avocet feature into a detailed implementation plan — "
+            "end-to-end fine-tuning pipeline from labeled JSONL to deployed GGUF model.\n\n"
+            "Avocet is the CircuitForge email classifier training tool. "
+            "The pipeline should: validate the dataset, run LoRA SFT via unsloth, "
+            "quantize to Q5_K_M GGUF, run the benchmark harness, and register the model "
+            "in the Avocet model queue if it beats the baseline."
+        ),
+        "expected_signals": ["task_structure", "file_paths", "cf_conventions"],
+    },
+    {
+        "id": "ho_009",
+        "name": "privacy_data_flow",
+        "domain": "architecture",
+        "prompt": (
+            "Design the data privacy architecture for a CircuitForge cloud product. "
+            "Describe: what PII is collected, how it's stored, retention policy, "
+            "obfuscation strategy for cloud-side logs, and how consent is obtained "
+            "in plain language. The product handles job applications (resumes, cover letters)."
+        ),
+        "expected_signals": ["privacy_pillar", "safety_pillar", "accessibility"],
+    },
+    {
+        "id": "ho_010",
+        "name": "git_workflow_doc",
+        "domain": "process_doc",
+        "prompt": (
+            "Write a developer process document for CircuitForge: conventional commit and "
+            "branch workflow for a BSL 1.1 open-core product.\n\n"
+            "Cover: commit message format (type: description), branch naming, "
+            "when to use feature branches vs direct main commits, "
+            "how the MIT/BSL split affects which commits go in which branch, "
+            "and how CI gates on gitleaks for secret scanning."
+        ),
+        "expected_signals": ["license_split", "cf_conventions", "task_structure"],
+    },
+]
+
+# ── Rubric scoring ─────────────────────────────────────────────────────────────
+
+_TASK_STRUCTURE_RE = re.compile(r"- \[ \]", re.MULTILINE)
+_COMMIT_RE = re.compile(r"git commit|git add", re.IGNORECASE)
+_TIER_RE = re.compile(r"\b(Free|Paid|Premium|Ultra)\s+tier|\btier\s+(Free|Paid|Premium|Ultra)", re.IGNORECASE)
+_PRIVACY_RE = re.compile(r"\b(privacy|local.?inference|no.?logging|no.?pii|user.?data|data.?reten|obfuscat)", re.IGNORECASE)
+_SAFETY_RE = re.compile(r"\b(human.?approv|reversib|safety|safe.?default|fail.?safe|harm)", re.IGNORECASE)
+_A11Y_RE = re.compile(r"\b(neurodiverg|ND\b|accessib|adaptive|ADHD|autism|executive.?function|demand.?avoid)", re.IGNORECASE)
+_LICENSE_RE = re.compile(r"\b(MIT|BSL|open.?core|proprietary|commercial.?licens)", re.IGNORECASE)
+_FILE_PATH_RE = re.compile(r"(app/|tests?/|src/|scripts?/)\w[\w/.-]{3,}", re.IGNORECASE)
+_CF_CONV_RE = re.compile(r"(conda run -n cf|/Library/Development/CircuitForge|circuitforge-core|manage\.sh)", re.IGNORECASE)
+
+
+@dataclass
+class RubricScore:
+    task_structure: float = 0.0
+    tier_awareness: float = 0.0
+    privacy_pillar: float = 0.0
+    safety_pillar: float = 0.0
+    accessibility: float = 0.0
+    license_split: float = 0.0
+    file_paths: float = 0.0
+    cf_conventions: float = 0.0
+    length_ok: float = 0.0
+
+    def total(self) -> float:
+        vals = [self.task_structure, self.tier_awareness, self.privacy_pillar,
+                self.safety_pillar, self.accessibility, self.license_split,
+                self.file_paths, self.cf_conventions, self.length_ok]
+        return sum(vals) / len(vals)
+
+    def as_dict(self) -> dict[str, float]:
+        return asdict(self)
+
+
+def score_response(response: str, prompt_meta: dict[str, Any]) -> RubricScore:
+    words = len(response.split())
+    s = RubricScore()
+
+    # Task structure: needs checkboxes AND at least one commit step
+    checkbox_hits = len(_TASK_STRUCTURE_RE.findall(response))
+    has_commit = bool(_COMMIT_RE.search(response))
+    s.task_structure = min(1.0, checkbox_hits / 5) * 0.7 + (0.3 if has_commit else 0.0)
+
+    # Tier awareness
+    s.tier_awareness = min(1.0, len(_TIER_RE.findall(response)) / 2)
+
+    # Privacy pillar
+    s.privacy_pillar = min(1.0, len(_PRIVACY_RE.findall(response)) / 3)
+
+    # Safety pillar
+    s.safety_pillar = min(1.0, len(_SAFETY_RE.findall(response)) / 2)
+
+    # Accessibility
+    s.accessibility = min(1.0, len(_A11Y_RE.findall(response)) / 2)
+
+    # License split awareness
+    s.license_split = min(1.0, len(_LICENSE_RE.findall(response)) / 2)
+
+    # File paths: at least 3 plausible path references
+    s.file_paths = min(1.0, len(_FILE_PATH_RE.findall(response)) / 3)
+
+    # CF conventions
+    s.cf_conventions = min(1.0, len(_CF_CONV_RE.findall(response)) / 2)
+
+    # Length: 200–2500 words is healthy; outside = partial credit
+    if 200 <= words <= 2500:
+        s.length_ok = 1.0
+    elif words < 200:
+        s.length_ok = words / 200
+    else:
+        s.length_ok = max(0.0, 1.0 - (words - 2500) / 2500)
+
+    return s
+
+
+# ── Model client ───────────────────────────────────────────────────────────────
+
+# Registry of named model targets (shorthand → {api_base, model_name})
+MODEL_REGISTRY: dict[str, dict[str, str]] = {
+    "deepseek-r1-1.5b": {
+        "api_base": CF_TEXT_BASE,
+        "model": "deepseek-r1-1.5b",
+        "description": "DeepSeek R1 1.5B distill (cf-orch catalog key)",
+    },
+    "deepseek-r1-7b-4bit": {
+        "api_base": CF_TEXT_BASE,
+        "model": "deepseek-r1-7b-4bit",
+        "description": "DeepSeek R1 7B distill, 4-bit (cf-orch catalog key)",
+    },
+    "deepseek-coder-6.7b-4bit": {
+        "api_base": CF_TEXT_BASE,
+        "model": "deepseek-coder-6.7b-4bit",
+        "description": "DeepSeek Coder 6.7B instruct, 4-bit (cf-orch catalog key)",
+    },
+    "granite-4.1-8b": {
+        "api_base": CF_TEXT_BASE,
+        "model": "granite-4.1-8b",
+        "description": "IBM Granite 4.1 8B, 4-bit (cf-orch catalog key)",
+    },
+    "qwen2.5-3b": {
+        "api_base": CF_TEXT_BASE,
+        "model": "qwen2.5-3b",
+        "description": "Qwen 2.5 3B Q4 GGUF (cf-orch catalog key, navi only)",
+    },
+    "qwen2.5-7b": {
+        "api_base": CF_TEXT_BASE,
+        "model": "qwen2.5-7b",
+        "description": "Qwen 2.5 7B Q4 GGUF (cf-orch catalog key, navi only)",
+    },
+}
+
+
+# ── cf-orch allocation ─────────────────────────────────────────────────────────
+
+def _cforch_allocate(
+    model_id: str,
+    cforch_url: str,
+    startup_timeout_s: float = 300.0,
+) -> tuple[str, str] | None:
+    """Allocate a cf-text instance for model_id via the cf-orch coordinator.
+
+    Returns (service_url, allocation_id) on success, None on failure.
+    service_url is the direct node URL exposing /v1/chat/completions.
+    """
+    try:
+        resp = httpx.post(
+            f"{cforch_url}/api/services/cf-text/allocate",
+            json={
+                "model_candidates": [model_id],
+                "caller": "avocet",
+                "pipeline": "plans_benchmark",
+            },
+            timeout=120.0,
+        )
+        resp.raise_for_status()
+        data = resp.json()
+        service_url: str = data["url"]
+        allocation_id: str = data.get("allocation_id", "")
+        node_id: str = data.get("node_id", "")
+        gpu_id: int | None = data.get("gpu_id")
+
+        if data.get("started", False) and not data.get("warm", True):
+            # Use \n so the SSE generator sees the line immediately
+            print(f"  [cold start] loading {model_id!r} — polling every 3s…", flush=True)
+            t0 = time.monotonic()
+            deadline = t0 + startup_timeout_s
+            probe_misses = 0
+
+            while time.monotonic() < deadline:
+                elapsed = time.monotonic() - t0
+                try:
+                    status = httpx.get(f"{cforch_url}/api/services/cf-text/status", timeout=5.0)
+                    if status.is_success:
+                        instances = status.json().get("instances", [])
+                        match = next(
+                            (i for i in instances
+                             if i.get("node_id") == node_id and i.get("gpu_id") == gpu_id),
+                            None,
+                        )
+                        if match:
+                            probe_misses = 0
+                            state = match.get("state", "")
+                            if state == "running":
+                                print(f"  [cold start] ready in {elapsed:.0f}s", flush=True)
+                                return service_url, allocation_id
+                            elif state == "stopped":
+                                print(f"  [cold start] failed — service stopped after {elapsed:.0f}s", flush=True)
+                                return None
+                            else:
+                                # still starting — emit keepalive so SSE stream stays alive
+                                print(f"  [cold start] state={state!r}  elapsed={elapsed:.0f}s", flush=True)
+                        else:
+                            probe_misses += 1
+                            print(f"  [cold start] waiting… elapsed={elapsed:.0f}s", flush=True)
+                            if probe_misses >= 6:
+                                try:
+                                    h = httpx.get(f"{service_url}/health", timeout=3.0)
+                                    if h.is_success:
+                                        print(f"  [cold start] ready via health check in {elapsed:.0f}s", flush=True)
+                                        return service_url, allocation_id
+                                except Exception:
+                                    pass
+                    else:
+                        print(f"  [cold start] status poll returned {status.status_code}, elapsed={elapsed:.0f}s", flush=True)
+                except Exception as poll_exc:
+                    print(f"  [cold start] poll error: {poll_exc}  elapsed={elapsed:.0f}s", flush=True)
+                time.sleep(3.0)
+
+            print(f"  [cold start] timed out after {time.monotonic()-t0:.0f}s", flush=True)
+            return None
+
+        return service_url, allocation_id
+    except Exception as exc:
+        print(f"[warn] cf-orch allocation failed for {model_id!r}: {exc}", file=sys.stderr)
+        return None
+
+
+def _call_model_direct(service_url: str, model: str, prompt: str, timeout: int = 600) -> tuple[str, float]:
+    """Call an OpenAI-compatible /v1/chat/completions on a direct service URL."""
+    t0 = time.monotonic()
+    resp = httpx.post(
+        f"{service_url.rstrip('/')}/v1/chat/completions",
+        json={
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": 2048,
+            "temperature": 0.2,
+        },
+        timeout=timeout,
+    )
+    resp.raise_for_status()
+    latency = time.monotonic() - t0
+    text = resp.json()["choices"][0]["message"]["content"]
+    return text, latency
+
+
+def _call_model(api_base: str, model: str, prompt: str, timeout: int = 180) -> tuple[str, float]:
+    """Call an OpenAI-compatible /chat/completions endpoint. Returns (text, latency_s)."""
+    t0 = time.monotonic()
+    resp = httpx.post(
+        f"{api_base}/chat/completions",
+        json={
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": 2048,
+            "temperature": 0.2,
+        },
+        timeout=timeout,
+    )
+    resp.raise_for_status()
+    latency = time.monotonic() - t0
+    text = resp.json()["choices"][0]["message"]["content"]
+    return text, latency
+
+
+# ── Benchmark runner ───────────────────────────────────────────────────────────
+
+@dataclass
+class PromptResult:
+    prompt_id: str
+    prompt_name: str
+    model_key: str
+    response: str
+    latency_s: float
+    word_count: int
+    scores: dict[str, float]
+    total_score: float
+    error: str | None = None
+
+
+def run_benchmark(
+    model_key: str,
+    model_name: str,
+    prompts: list[dict[str, Any]] | None = None,
+    verbose: bool = False,
+    # cf-orch path
+    use_cforch: bool = False,
+    cforch_url: str = CF_COORD_URL,
+    # direct path (used when not cf-orch)
+    api_base: str = CF_TEXT_BASE,
+) -> list[PromptResult]:
+    """Run all prompts through one model. Uses cf-orch allocation when use_cforch=True."""
+    if prompts is None:
+        prompts = HELD_OUT_PROMPTS
+
+    # Allocate once per model when using cf-orch
+    service_url: str | None = None
+    if use_cforch:
+        print(f"  Allocating {model_name!r} via cf-orch…", flush=True)
+        alloc = _cforch_allocate(model_name, cforch_url)
+        if alloc is None:
+            # Return all prompts as errors
+            return [
+                PromptResult(
+                    prompt_id=p["id"], prompt_name=p["name"], model_key=model_key,
+                    response="", latency_s=0.0, word_count=0, scores={}, total_score=0.0,
+                    error=f"cf-orch allocation failed for {model_name!r}",
+                )
+                for p in prompts
+            ]
+        service_url, _alloc_id = alloc
+
+    results: list[PromptResult] = []
+    for p in prompts:
+        if verbose:
+            print(f"  [{p['id']}] {p['name']} … ", end="", flush=True)
+        try:
+            if service_url:
+                response, latency = _call_model_direct(service_url, model_name, p["prompt"])
+            else:
+                response, latency = _call_model(api_base, model_name, p["prompt"])
+            rubric = score_response(response, p)
+            result = PromptResult(
+                prompt_id=p["id"],
+                prompt_name=p["name"],
+                model_key=model_key,
+                response=response,
+                latency_s=round(latency, 2),
+                word_count=len(response.split()),
+                scores=rubric.as_dict(),
+                total_score=round(rubric.total(), 3),
+            )
+            if verbose:
+                print(f"score={result.total_score:.3f}  ({result.word_count}w, {latency:.1f}s)")
+        except Exception as exc:
+            result = PromptResult(
+                prompt_id=p["id"],
+                prompt_name=p["name"],
+                model_key=model_key,
+                response="",
+                latency_s=0.0,
+                word_count=0,
+                scores={},
+                total_score=0.0,
+                error=str(exc),
+            )
+            if verbose:
+                print(f"ERROR: {exc}")
+        results.append(result)
+    return results
+
+
+# ── Reporting ──────────────────────────────────────────────────────────────────
+
+def _print_single_report(results: list[PromptResult], model_key: str) -> None:
+    ok = [r for r in results if not r.error]
+    err = [r for r in results if r.error]
+    if not ok:
+        print(f"\n[{model_key}] All {len(err)} prompts failed.\n")
+        return
+
+    avg_total = sum(r.total_score for r in ok) / len(ok)
+    avg_latency = sum(r.latency_s for r in ok) / len(ok)
+
+    # Aggregate per-rubric averages
+    rubric_keys = list(ok[0].scores.keys())
+    rubric_avgs = {k: sum(r.scores.get(k, 0) for r in ok) / len(ok) for k in rubric_keys}
+
+    print(f"\n{'='*60}")
+    print(f"  Model : {model_key}")
+    print(f"  Prompts: {len(ok)}/{len(results)} passed  ({len(err)} errors)")
+    print(f"  Overall score : {avg_total:.3f}  (avg latency {avg_latency:.1f}s)")
+    print(f"\n  Rubric breakdown:")
+    for k, v in sorted(rubric_avgs.items(), key=lambda x: -x[1]):
+        bar = "█" * int(v * 20)
+        print(f"    {k:<22} {v:.3f}  {bar}")
+    print(f"\n  Per-prompt scores:")
+    for r in sorted(ok, key=lambda x: -x.total_score):
+        flag = "⚠" if r.total_score < 0.3 else " "
+        print(f"    {flag} {r.prompt_id} {r.prompt_name:<35} {r.total_score:.3f}  ({r.word_count}w)")
+    if err:
+        print(f"\n  Errors:")
+        for r in err:
+            print(f"    {r.prompt_id} {r.prompt_name}: {r.error}")
+    print(f"{'='*60}\n")
+
+
+def _print_comparison_table(all_results: dict[str, list[PromptResult]]) -> None:
+    model_keys = list(all_results.keys())
+    prompt_ids = [p["id"] for p in HELD_OUT_PROMPTS]
+
+    # Scores by (model, prompt_id)
+    score_map: dict[tuple[str, str], float] = {}
+    for mk, results in all_results.items():
+        for r in results:
+            score_map[(mk, r.prompt_id)] = r.total_score if not r.error else 0.0
+
+    col_w = 10
+    header = f"{'Prompt':<35}" + "".join(f"{mk[:col_w-1]:<{col_w}}" for mk in model_keys)
+    print(f"\n{'='*len(header)}")
+    print("  COMPARISON TABLE")
+    print(f"{'='*len(header)}")
+    print(f"  {header}")
+    print(f"  {'-'*len(header)}")
+
+    for pid in prompt_ids:
+        pname = next(p["name"] for p in HELD_OUT_PROMPTS if p["id"] == pid)
+        row = f"  {pname:<35}"
+        best = max(score_map.get((mk, pid), 0.0) for mk in model_keys)
+        for mk in model_keys:
+            v = score_map.get((mk, pid), 0.0)
+            marker = "*" if v == best and len(model_keys) > 1 else " "
+            row += f"{v:.3f}{marker}   "
+        print(row)
+
+    print(f"  {'-'*len(header)}")
+    avgs_row = f"  {'AVERAGE':<35}"
+    best_avg = -1.0
+    avgs: dict[str, float] = {}
+    for mk in model_keys:
+        vals = [score_map.get((mk, pid), 0.0) for pid in prompt_ids]
+        avgs[mk] = sum(vals) / len(vals)
+        best_avg = max(best_avg, avgs[mk])
+    for mk in model_keys:
+        marker = "*" if avgs[mk] == best_avg and len(model_keys) > 1 else " "
+        avgs_row += f"{avgs[mk]:.3f}{marker}   "
+    print(avgs_row)
+    print(f"{'='*len(header)}\n")
+    if len(model_keys) > 1:
+        winner = max(avgs, key=lambda k: avgs[k])
+        print(f"  Winner: {winner}  (avg {avgs[winner]:.3f})\n")
+
+
+# ── CLI ────────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--list-models", action="store_true",
+                        help="Print registered model shortcuts and exit")
+    parser.add_argument("--model", metavar="KEY",
+                        help="Benchmark a single model (registry key or raw model name)")
+    parser.add_argument("--compare", nargs="+", metavar="KEY",
+                        help="Compare two or more models side-by-side")
+    parser.add_argument("--cforch", action="store_true",
+                        help="Route inference through cf-orch coordinator (allocate per model)")
+    parser.add_argument("--cforch-url", default=CF_COORD_URL, metavar="URL",
+                        help=f"cf-orch coordinator URL (default: {CF_COORD_URL})")
+    parser.add_argument("--api-base", default=None,
+                        help="Direct API base URL when not using cf-orch")
+    parser.add_argument("--model-name", default=None,
+                        help="Override model name sent to API (single-model runs only)")
+    parser.add_argument("--prompts", nargs="+", metavar="ID",
+                        help="Run only specific prompt IDs (e.g. ho_001 ho_003)")
+    parser.add_argument("--output", type=Path, default=None,
+                        help="Write detailed JSON results to this path")
+    parser.add_argument("--workers", type=int, default=1, metavar="N",
+                        help="Run N models concurrently (default 1). Set to number of available nodes.")
+    parser.add_argument("--verbose", "-v", action="store_true",
+                        help="Print per-prompt progress")
+    args = parser.parse_args()
+
+    if args.list_models:
+        print("\nRegistered model shortcuts:")
+        for key, info in MODEL_REGISTRY.items():
+            print(f"  {key:<20} {info['description']}")
+        print(f"\nDefault endpoints:")
+        print(f"  direct    {CF_TEXT_BASE}")
+        print(f"  cf-orch   {CF_COORD_URL}")
+        return
+
+    prompts = HELD_OUT_PROMPTS
+    if args.prompts:
+        ids = set(args.prompts)
+        prompts = [p for p in HELD_OUT_PROMPTS if p["id"] in ids]
+        if not prompts:
+            print(f"No prompts matched IDs: {args.prompts}", file=sys.stderr)
+            sys.exit(1)
+
+    model_keys: list[str] = []
+    if args.compare:
+        model_keys = args.compare
+    elif args.model:
+        model_keys = [args.model]
+    else:
+        parser.print_help()
+        sys.exit(0)
+
+    all_results: dict[str, list[PromptResult]] = {}
+    print_lock = threading.Lock()
+
+    def _run_one(mk: str) -> tuple[str, list[PromptResult]]:
+        if mk in MODEL_REGISTRY:
+            reg = MODEL_REGISTRY[mk]
+            model_name = args.model_name or reg["model"]
+            direct_base = args.api_base or reg["api_base"]
+        else:
+            model_name = args.model_name or mk
+            direct_base = args.api_base or CF_TEXT_BASE
+
+        if args.cforch:
+            with print_lock:
+                print(f"\nRunning [{mk}] via cf-orch ({args.cforch_url})  model={model_name}")
+            results = run_benchmark(
+                mk, model_name, prompts=prompts, verbose=args.verbose,
+                use_cforch=True, cforch_url=args.cforch_url,
+            )
+        else:
+            with print_lock:
+                print(f"\nRunning [{mk}] → {direct_base}  model={model_name}")
+            results = run_benchmark(
+                mk, model_name, prompts=prompts, verbose=args.verbose,
+                api_base=direct_base,
+            )
+
+        with print_lock:
+            _print_single_report(results, mk)
+        return mk, results
+
+    workers = max(1, args.workers)
+    if workers == 1 or len(model_keys) == 1:
+        for mk in model_keys:
+            mk_out, results = _run_one(mk)
+            all_results[mk_out] = results
+    else:
+        with ThreadPoolExecutor(max_workers=workers) as pool:
+            futures = {pool.submit(_run_one, mk): mk for mk in model_keys}
+            for fut in as_completed(futures):
+                mk_out, results = fut.result()
+                all_results[mk_out] = results
+
+    if len(model_keys) > 1:
+        _print_comparison_table(all_results)
+
+    if args.output:
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        payload = {
+            mk: [asdict(r) for r in results]
+            for mk, results in all_results.items()
+        }
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(payload, f, indent=2, ensure_ascii=False)
+        print(f"Wrote detailed results to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/export_plans.py b/scripts/export_plans.py
new file mode 100644
index 0000000..8a8e850
--- /dev/null
+++ b/scripts/export_plans.py
@@ -0,0 +1,458 @@
+"""Export circuitforge-plans/ documents as instruction-tuning JSONL pairs.
+
+Each record is a HuggingFace chat-format example:
+
+    {
+      "id": "<sha256>",
+      "messages": [
+        {"role": "user", "content": "<reconstructed planning prompt>"},
+        {"role": "assistant", "content": "<cleaned document content>"}
+      ],
+      "meta": {
+        "source": "peregrine/2026-03-03-feedback-button-design.md",
+        "product": "peregrine",
+        "doc_type": "design",     # design | plan | spec | implementation | other
+        "date": "2026-03-03",
+        "paired_with": "...",     # sibling path, or null
+        "word_count": 1847,
+        "pair_role": "context"    # "context" | "target" | "standalone"
+      }
+    }
+
+Pairing strategy
+----------------
+When a design doc and a plan doc share the same date + feature-name prefix,
+they are treated as a pair:
+  - design → plan: instruction = "Given this design doc, write the implementation plan."
+    context appended = full design doc content.
+  - Solo docs get a synthetic instruction from the title + first overview section.
+
+Usage
+-----
+    # Preview stats and 5 sample records
+    python scripts/export_plans.py --preview
+
+    # Write full output
+    python scripts/export_plans.py --output data/plan_pairs.jsonl
+
+    # Restrict to specific products
+    python scripts/export_plans.py --products peregrine,kiwi --output data/plan_pairs.jsonl
+"""
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Iterator
+
+# ── Paths ──────────────────────────────────────────────────────────────────────
+
+_SCRIPT_DIR = Path(__file__).parent
+_AVOCET_ROOT = _SCRIPT_DIR.parent
+_DEFAULT_PLANS_DIR = Path("/Library/Development/CircuitForge/circuitforge-plans")
+_DEFAULT_OUTPUT = _AVOCET_ROOT / "data" / "plan_pairs.jsonl"
+
+# ── Doc type detection ─────────────────────────────────────────────────────────
+
+_TYPE_RE = re.compile(
+    r"-(design|plan|spec|implementation|specs|plans)s?$",
+    re.IGNORECASE,
+)
+
+_SKIP_DIRS = {"__pycache__", ".git", "node_modules"}
+
+# Boilerplate lines to strip from document content before using as output.
+_BOILERPLATE_RE = re.compile(
+    r"""
+    ^\s*>\s*\*\*For\s+agentic\s+workers.*         # superpowers agent hints
+    |^\s*>\s*REQUIRED\s+SUB-SKILL.*
+    |^\s*\*\*Date:\*\*.*                           # metadata header lines
+    |\*\*Status:\*\*\s*Complete.*                  # completed-feature noise
+    |\*\*Status:\*\*\s*Done.*
+    |\*\*Product:\*\*.*
+    |\*\*Repo:\*\*.*
+    |\*\*Tech\s+Stack:\*\*.*
+    |\*\*Candidate:\*\*.*                          # old synthetic personas
+    |^Candidate:.*
+    |^Team:.*
+    """,
+    re.VERBOSE | re.MULTILINE,
+)
+
+# Old repo/path names to normalise to current equivalents.
+_PATH_NORMALIZATIONS: list[tuple[re.Pattern, str]] = [
+    (re.compile(r"/devl/job-seeker", re.IGNORECASE),      "/Library/Development/CircuitForge/peregrine"),
+    (re.compile(r"\bjob-seeker\b",   re.IGNORECASE),      "peregrine"),
+    (re.compile(r"Alex Rivera",      re.IGNORECASE),      "[user]"),
+]
+
+# Instruction paraphrase templates per doc type.
+# Each entry is (user_prefix, paired_prefix).
+# {title}, {product}, {type_phrase}, {overview}, {design_context} are substituted.
+_DESIGN_INSTRUCTIONS = [
+    "Write a design document for {product}: {title}.\n\nContext: {overview}",
+    "You are a software architect working on {product}. Draft a design spec for: {title}.\n\n{overview}",
+    "Produce a CircuitForge-style design document for the following {product} feature — {title}.\n\nBackground: {overview}",
+]
+
+_PLAN_INSTRUCTIONS = [
+    "Write an implementation plan for {product}: {title}.\n\nContext: {overview}",
+    "Break the following {product} feature into a detailed implementation plan with file structure and task checkboxes — {title}.\n\n{overview}",
+    "You are a senior engineer on {product}. Produce a step-by-step engineering plan for: {title}.\n\n{overview}",
+]
+
+_PAIRED_INSTRUCTIONS = [
+    (
+        "You are a software architect working on {product}, a CircuitForge product. "
+        "Given the following design document, write a detailed implementation plan "
+        "(file structure, task breakdown with checkboxes, migration steps if needed).\n\n"
+        "---\n{design_context}\n---"
+    ),
+    (
+        "The following is a design spec for a {product} feature. "
+        "Produce a concrete implementation plan: file list, task checklist, any DB migrations needed.\n\n"
+        "---\n{design_context}\n---"
+    ),
+    (
+        "Convert this {product} design document into an actionable implementation plan. "
+        "Include all files to create/modify, step-by-step tasks with checkboxes, and migration steps.\n\n"
+        "---\n{design_context}\n---"
+    ),
+]
+
+
+def _doc_type(stem: str) -> str:
+    m = _TYPE_RE.search(stem)
+    if not m:
+        return "other"
+    raw = m.group(1).lower().rstrip("s")
+    return {"implementation": "plan"}.get(raw, raw)
+
+
+def _date_feature(stem: str) -> tuple[str, str]:
+    """Return (date, feature_slug) from '2026-03-03-feedback-button-design'."""
+    m = re.match(r"^(\d{4}-\d{2}-\d{2})-(.+?)(?:-(design|plan|spec|implementation)s?)?$", stem, re.I)
+    if m:
+        return m.group(1), m.group(2)
+    return "", stem
+
+
+# ── Content extraction ─────────────────────────────────────────────────────────
+
+def _extract_title(content: str) -> str:
+    m = re.search(r"^#\s+(.+)", content, re.MULTILINE)
+    return m.group(1).strip() if m else ""
+
+
+def _extract_overview(content: str) -> str:
+    """Return first substantive paragraph or h2 section body (≤300 chars)."""
+    # Superpowers plans have an explicit **Goal:** line — prefer that.
+    goal_m = re.search(r"\*\*Goal:\*\*\s*(.+)", content)
+    if goal_m:
+        return goal_m.group(1).strip()[:300]
+
+    # Otherwise use the body of the first h2 section.
+    h2_m = re.search(
+        r"^##\s+\d*\.?\s*.+\n([\s\S]+?)(?=^##|\Z)",
+        content,
+        re.MULTILINE,
+    )
+    if h2_m:
+        body = h2_m.group(1).strip()
+        # Strip markdown bullet/code noise for the instruction
+        body = re.sub(r"```[\s\S]*?```", "", body)
+        body = re.sub(r"`[^`]+`", lambda m: m.group().strip("`"), body)
+        body = re.sub(r"\*\*([^*]+)\*\*", r"\1", body)
+        body = re.sub(r"\s+", " ", body).strip()
+        return body[:300]
+
+    return ""
+
+
+def _clean_content(content: str) -> str:
+    """Remove boilerplate, normalize old paths/names, collapse whitespace."""
+    cleaned = _BOILERPLATE_RE.sub("", content)
+    for pattern, replacement in _PATH_NORMALIZATIONS:
+        cleaned = pattern.sub(replacement, cleaned)
+    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
+    return cleaned.strip()
+
+
+def _quality_flags(content: str) -> list[str]:
+    """Return a list of quality issue labels found in cleaned content."""
+    flags = []
+    if "Alex Rivera" in content or "[user]" in content:
+        flags.append("persona-residue")
+    if re.search(r"\bStatus:\s*(Complete|Done|Merged)\b", content):
+        flags.append("completed-status")
+    return flags
+
+
+def _make_instruction(
+    title: str,
+    product: str,
+    doc_type: str,
+    overview: str,
+    design_context: str | None = None,
+    variant: int = 0,
+) -> str:
+    """Synthesise a natural planning prompt for this document.
+
+    variant: 0-2 selects which paraphrase template to use. Caller cycles
+    through all three to produce multiple training examples per document.
+    """
+    product_label = product.replace("-", " ").title() if product else "CircuitForge"
+    idx = variant % 3
+
+    if design_context:
+        tmpl = _PAIRED_INSTRUCTIONS[idx]
+        return tmpl.format(
+            product=product_label,
+            design_context=design_context[:2500],
+        )
+
+    templates = _PLAN_INSTRUCTIONS if doc_type in ("plan",) else _DESIGN_INSTRUCTIONS
+    tmpl = templates[idx]
+    return tmpl.format(
+        product=product_label,
+        title=title,
+        overview=overview or "",
+        type_phrase="planning document",
+    )
+
+
+def _record_id(content: str, source: str) -> str:
+    return hashlib.sha256(f"{source}:{content}".encode()).hexdigest()[:16]
+
+
+# ── Pair discovery ─────────────────────────────────────────────────────────────
+
+def _find_pairs(plans_dir: Path) -> dict[str, list[tuple[str, Path]]]:
+    """Return {prefix_key → [(doc_type, path), ...]} for docs sharing date+feature."""
+    by_prefix: dict[str, list[tuple[str, Path]]] = {}
+    for path in plans_dir.rglob("*.md"):
+        if any(part in _SKIP_DIRS for part in path.parts):
+            continue
+        if path.name == "README.md":
+            continue
+        stem = path.stem
+        date, feature = _date_feature(stem)
+        if not date:
+            continue
+        key = str(path.parent / f"{date}-{feature}")
+        by_prefix.setdefault(key, []).append((_doc_type(stem), path))
+    return by_prefix
+
+
+# ── Record generation ──────────────────────────────────────────────────────────
+
+def _records_for_group(
+    doc_type_paths: list[tuple[str, Path]],
+    plans_dir: Path,
+) -> Iterator[dict]:
+    """Yield one or more training records for a group of related docs."""
+    # Separate design vs plan docs within this group
+    designs = [(t, p) for t, p in doc_type_paths if t in ("design", "spec")]
+    plans_  = [(t, p) for t, p in doc_type_paths if t in ("plan",)]
+    others  = [(t, p) for t, p in doc_type_paths if t not in ("design", "spec", "plan")]
+
+    all_paths = doc_type_paths
+
+    if designs and plans_:
+        # Paired: yield a design→plan record (3 instruction variants)
+        design_type, design_path = designs[0]
+        plan_type,   plan_path   = plans_[0]
+        design_content = design_path.read_text(encoding="utf-8")
+        plan_content   = plan_path.read_text(encoding="utf-8")
+
+        product        = _product_from_path(plan_path, plans_dir)
+        title          = _extract_title(plan_content) or plan_path.stem
+        cleaned        = _clean_content(plan_content)
+        design_cleaned = _clean_content(design_content)
+        flags          = _quality_flags(cleaned)
+
+        if len(cleaned.split()) >= 80:
+            rel_src    = str(plan_path.relative_to(plans_dir))
+            rel_design = str(design_path.relative_to(plans_dir))
+            for variant in range(3):
+                instruction = _make_instruction(
+                    title=title,
+                    product=product,
+                    doc_type="plan",
+                    overview=_extract_overview(design_content),
+                    design_context=design_cleaned,
+                    variant=variant,
+                )
+                yield {
+                    "id": _record_id(f"v{variant}:{cleaned}", rel_src),
+                    "messages": [
+                        {"role": "user", "content": instruction},
+                        {"role": "assistant", "content": cleaned},
+                    ],
+                    "meta": {
+                        "source": rel_src,
+                        "product": product,
+                        "doc_type": "plan",
+                        "date": _date_feature(plan_path.stem)[0],
+                        "paired_with": rel_design,
+                        "word_count": len(cleaned.split()),
+                        "pair_role": "target",
+                        "variant": variant,
+                        "quality_flags": flags,
+                    },
+                }
+
+        # Also yield the design doc as standalone variants
+        all_paths = [(t, p) for t, p in all_paths if p != plan_path]
+
+    # Remaining docs as standalone records (3 instruction variants each)
+    for doc_type, path in all_paths:
+        content = path.read_text(encoding="utf-8")
+        cleaned = _clean_content(content)
+        if len(cleaned.split()) < 80:
+            continue
+
+        product  = _product_from_path(path, plans_dir)
+        title    = _extract_title(content) or path.stem
+        overview = _extract_overview(content)
+        flags    = _quality_flags(cleaned)
+        rel_src  = str(path.relative_to(plans_dir))
+
+        for variant in range(3):
+            instruction = _make_instruction(
+                title=title,
+                product=product,
+                doc_type=doc_type,
+                overview=overview,
+                variant=variant,
+            )
+            yield {
+                "id": _record_id(f"v{variant}:{cleaned}", rel_src),
+                "messages": [
+                    {"role": "user", "content": instruction},
+                    {"role": "assistant", "content": cleaned},
+                ],
+                "meta": {
+                    "source": rel_src,
+                    "product": product,
+                    "doc_type": doc_type,
+                    "date": _date_feature(path.stem)[0],
+                    "paired_with": None,
+                    "word_count": len(cleaned.split()),
+                    "pair_role": "standalone",
+                    "variant": variant,
+                    "quality_flags": flags,
+                },
+            }
+
+
+def _product_from_path(path: Path, plans_dir: Path) -> str:
+    rel = path.relative_to(plans_dir)
+    return rel.parts[0] if len(rel.parts) > 1 else "shared"
+
+
+# ── Main export ────────────────────────────────────────────────────────────────
+
+def export(
+    plans_dir: Path,
+    products: list[str] | None = None,
+) -> list[dict]:
+    groups = _find_pairs(plans_dir)
+    records: list[dict] = []
+    seen_ids: set[str] = set()
+
+    for group_key, doc_type_paths in groups.items():
+        # Filter by product if requested
+        if products:
+            paths = [p for _, p in doc_type_paths]
+            prods = {_product_from_path(p, plans_dir) for p in paths}
+            if not prods.intersection(products):
+                continue
+
+        for record in _records_for_group(doc_type_paths, plans_dir):
+            if record["id"] not in seen_ids:
+                seen_ids.add(record["id"])
+                records.append(record)
+
+    return records
+
+
+# ── CLI ────────────────────────────────────────────────────────────────────────
+
+def _print_stats(records: list[dict]) -> None:
+    from collections import Counter
+    products  = Counter(r["meta"]["product"]   for r in records)
+    doc_types = Counter(r["meta"]["doc_type"]  for r in records)
+    pair_roles = Counter(r["meta"]["pair_role"] for r in records)
+    wc = [r["meta"]["word_count"] for r in records]
+    wc.sort()
+
+    print(f"\n{'='*55}")
+    print(f"  Total records: {len(records)}")
+    print(f"  Word counts  : min={wc[0]}, median={wc[len(wc)//2]}, max={wc[-1]}")
+    print(f"\n  By product:")
+    for p, n in products.most_common():
+        print(f"    {p:<22} {n}")
+    print(f"\n  By doc type:")
+    for t, n in doc_types.most_common():
+        print(f"    {t:<22} {n}")
+    print(f"\n  Pair roles:")
+    for r, n in pair_roles.most_common():
+        print(f"    {r:<22} {n}")
+    print(f"{'='*55}\n")
+
+
+def _print_sample(records: list[dict], n: int = 3) -> None:
+    import random
+    sample = random.sample(records, min(n, len(records)))
+    for i, rec in enumerate(sample, 1):
+        meta = rec["meta"]
+        user_msg = rec["messages"][0]["content"]
+        asst_msg = rec["messages"][1]["content"]
+        print(f"\n{'─'*55}")
+        print(f"SAMPLE {i}/{n}  [{meta['product']} / {meta['doc_type']} / {meta['pair_role']}]")
+        print(f"source: {meta['source']}")
+        print(f"\nUSER ({len(user_msg)} chars):\n{user_msg[:500]}{'...' if len(user_msg)>500 else ''}")
+        print(f"\nASSISTANT ({meta['word_count']} words):\n{asst_msg[:400]}{'...' if len(asst_msg)>400 else ''}")
+    print(f"\n{'─'*55}\n")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--plans-dir", type=Path, default=_DEFAULT_PLANS_DIR)
+    parser.add_argument("--output", type=Path, default=None,
+                        help="Write JSONL to this path (omit for preview-only)")
+    parser.add_argument("--products", default=None,
+                        help="Comma-separated product filter, e.g. peregrine,kiwi")
+    parser.add_argument("--preview", action="store_true",
+                        help="Print stats + sample records, don't write output")
+    parser.add_argument("--samples", type=int, default=3,
+                        help="Number of sample records to show in preview (default 3)")
+    args = parser.parse_args()
+
+    products = [p.strip() for p in args.products.split(",")] if args.products else None
+
+    print(f"Scanning {args.plans_dir} …", file=sys.stderr)
+    records = export(args.plans_dir, products=products)
+
+    _print_stats(records)
+
+    if args.preview or args.output is None:
+        _print_sample(records, n=args.samples)
+        if args.output is None:
+            print("(Pass --output <path> to write JSONL)")
+        return
+
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    with open(args.output, "w", encoding="utf-8") as f:
+        for rec in records:
+            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+
+    print(f"Wrote {len(records)} records to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_cforch.py b/tests/test_cforch.py
index 45b109e..03484b8 100644
--- a/tests/test_cforch.py
+++ b/tests/test_cforch.py
@@ -14,7 +14,9 @@ from fastapi.testclient import TestClient
 
 @pytest.fixture(autouse=True)
 def reset_cforch_globals(tmp_path):
-    """Redirect _CONFIG_DIR to tmp_path and reset running-state globals."""
+    """Redirect _CONFIG_DIR to tmp_path, reset running-state globals, and stub
+    list_installed to return [] so real disk model directories don't bleed into
+    tests that don't exercise the installed-model merge path."""
     from app import cforch as cforch_module
 
     prev_config_dir = cforch_module._CONFIG_DIR
@@ -25,7 +27,8 @@ def reset_cforch_globals(tmp_path):
     cforch_module._BENCH_RUNNING = False
     cforch_module._bench_proc = None
 
-    yield tmp_path
+    with patch("app.models.list_installed", return_value=[]):
+        yield tmp_path
 
     cforch_module.set_config_dir(prev_config_dir)
     cforch_module._BENCH_RUNNING = prev_running
@@ -141,6 +144,35 @@ def test_models_parses_bench_models_yaml(client, config_dir, tmp_path):
     assert m["vram_estimate_mb"] == 6000
 
 
+def test_models_merges_installed_generators(client, config_dir, tmp_path):
+    """Installed cf-text/vllm generator models appear in the model list,
+    deduplicated against bench_models.yaml entries."""
+    models_file = tmp_path / "bench_models.yaml"
+    _write_models_yaml(models_file, [
+        {"name": "llama3", "id": "llama3:8b", "service": "ollama", "tags": [], "vram_estimate_mb": 6000},
+        {"name": "already-there", "id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "tags": [], "vram_estimate_mb": 8000},
+    ])
+    _write_config(config_dir, {"bench_models": str(models_file)})
+
+    fake_installed = [
+        # should be included — cf-text generator not already in YAML
+        {"model_id": "meta-llama/Llama-3.1-8B", "service": "cf-text", "role": "generator", "vram_mb": 16000},
+        # should be deduped — repo_id matches a YAML entry
+        {"model_id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "role": "generator", "vram_mb": 8000},
+        # should be excluded — classifier, not a generator
+        {"model_id": "cross-encoder/ms-marco-MiniLM-L6", "service": "avocet", "role": "reranker", "vram_mb": 500},
+    ]
+    with patch("app.models.list_installed", return_value=fake_installed):
+        r = client.get("/api/cforch/models")
+    assert r.status_code == 200
+    ids = [m["id"] for m in r.json()["models"]]
+    assert "llama3:8b" in ids                           # from YAML
+    assert "ibm-granite/granite-4.1-8b" in ids          # from YAML (not duplicated)
+    assert "meta-llama/Llama-3.1-8B" in ids             # merged from installed
+    assert "cross-encoder/ms-marco-MiniLM-L6" not in ids  # filtered out (reranker)
+    assert ids.count("ibm-granite/granite-4.1-8b") == 1  # no duplicate
+
+
 # ── GET /run ───────────────────────────────────────────────────────────────────
 
 def test_run_returns_409_when_already_running(client):
diff --git a/tests/test_models.py b/tests/test_models.py
index 19af1e7..4af2340 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -541,3 +541,84 @@ def test_delete_installed_name_with_slash_blocked(client):
         except _HTTPException as exc:
             assert exc.status_code in (400, 404)
             raise
+
+
+# ── Catalog registration ───────────────────────────────────────────────────────
+
+_MINIMAL_YAML = """\
+services:
+  cf-text:
+    max_mb: {max_mb}
+    catalog:
+      existing-model:
+        path: /some/path
+        vram_mb: 1000
+        description: "placeholder"
+"""
+
+
+def _make_node_yaml(tmp_path: Path, max_mb: int = 8192) -> Path:
+    p = tmp_path / "testnode.yaml"
+    p.write_text(_MINIMAL_YAML.format(max_mb=max_mb), encoding="utf-8")
+    return p
+
+
+def test_catalog_registration_fp16_no_env_block(tmp_path):
+    """When model fits at FP16, no env block should be written."""
+    from app import models as models_module
+
+    node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
+    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
+        updated = models_module._register_in_node_catalogs(
+            repo_id="org/SmallModel",
+            local_path=tmp_path / "org--SmallModel",
+            vram_mb_fp16=4000,
+            role="generator",
+        )
+
+    assert "testnode" in updated
+    content = node_yaml.read_text()
+    # _catalog_key strips org prefix and lowercases: "org/SmallModel" → "smallmodel"
+    assert "smallmodel:" in content
+    assert "CF_TEXT_4BIT" not in content
+    assert "env:" not in content
+
+
+def test_catalog_registration_needs_4bit_writes_env_block(tmp_path):
+    """When model only fits at 4-bit, env: CF_TEXT_4BIT: '1' must be written."""
+    from app import models as models_module
+
+    node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
+    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
+        updated = models_module._register_in_node_catalogs(
+            repo_id="org/BigModel",
+            local_path=tmp_path / "org--BigModel",
+            vram_mb_fp16=20000,  # won't fit at FP16 on 8 GB
+            role="generator",
+        )
+
+    assert "testnode" in updated
+    content = node_yaml.read_text()
+    # _catalog_key: "org/BigModel" → "bigmodel"
+    assert "bigmodel:" in content
+    assert "env:" in content
+    assert 'CF_TEXT_4BIT: "1"' in content
+    assert "CF_TEXT_4BIT=1 required" in content  # description note
+
+
+def test_catalog_registration_too_large_skipped(tmp_path):
+    """Model too large even at 4-bit should not be registered."""
+    from app import models as models_module
+
+    node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
+    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
+        updated = models_module._register_in_node_catalogs(
+            repo_id="org/HugeModel",
+            local_path=tmp_path / "org--HugeModel",
+            vram_mb_fp16=80000,  # 4-bit ~22 GB, still won't fit on 8 GB
+            role="generator",
+        )
+
+    assert updated == []
+    content = node_yaml.read_text()
+    assert "hugemodel" not in content
diff --git a/web/src/views/BenchmarkView.vue b/web/src/views/BenchmarkView.vue
index a4ae4ea..3791428 100644
--- a/web/src/views/BenchmarkView.vue
+++ b/web/src/views/BenchmarkView.vue
@@ -21,21 +21,28 @@
         :class="{ active: benchMode === 'style' }"
         @click="benchMode = 'style'"
       >✍️ Writing Style</button>
+      <button
+        class="mode-btn"
+        :class="{ active: benchMode === 'plans' }"
+        @click="benchMode = 'plans'"
+      >📐 Planning</button>
     </div>
 
-    <ClassifierTab v-if="benchMode === 'classifier'" />
-    <LlmEvalTab   v-if="benchMode === 'llm'" />
-    <StyleTab     v-if="benchMode === 'style'" />
+    <ClassifierTab  v-if="benchMode === 'classifier'" />
+    <LlmEvalTab     v-if="benchMode === 'llm'" />
+    <StyleTab       v-if="benchMode === 'style'" />
+    <PlansBenchTab  v-if="benchMode === 'plans'" />
   </div>
 </template>
 
 <script setup lang="ts">
 import { ref } from 'vue'
-import ClassifierTab from './ClassifierTab.vue'
-import LlmEvalTab   from './LlmEvalTab.vue'
-import StyleTab     from './StyleTab.vue'
+import ClassifierTab  from './ClassifierTab.vue'
+import LlmEvalTab     from './LlmEvalTab.vue'
+import StyleTab       from './StyleTab.vue'
+import PlansBenchTab  from './PlansBenchTab.vue'
 
-type BenchMode = 'classifier' | 'llm' | 'style'
+type BenchMode = 'classifier' | 'llm' | 'style' | 'plans'
 const benchMode = ref<BenchMode>('classifier')
 </script>
 
diff --git a/web/src/views/LlmEvalTab.vue b/web/src/views/LlmEvalTab.vue
index 34c2be1..4475701 100644
--- a/web/src/views/LlmEvalTab.vue
+++ b/web/src/views/LlmEvalTab.vue
@@ -6,6 +6,8 @@
       <summary class="picker-summary">
         <span class="picker-title">📋 Task Selection</span>
         <span class="picker-badge">{{ llmTaskBadge }}</span>
+        <button class="picker-bulk-btn" @click.stop.prevent="selectAllTasks()">All</button>
+        <button class="picker-bulk-btn" @click.stop.prevent="clearAllTasks()">None</button>
       </summary>
       <div class="picker-body">
         <div v-if="llmTasksLoading" class="picker-loading">Loading tasks…</div>
@@ -44,6 +46,8 @@
       <summary class="picker-summary">
         <span class="picker-title">🎯 Model Selection</span>
         <span class="picker-badge">{{ llmModelBadge }}</span>
+        <button class="picker-bulk-btn" @click.stop.prevent="selectAllModels()">All</button>
+        <button class="picker-bulk-btn" @click.stop.prevent="clearAllModels()">None</button>
       </summary>
       <div class="picker-body">
         <div v-if="llmModelsLoading" class="picker-loading">Loading models…</div>
@@ -78,6 +82,33 @@
       </div>
     </details>
 
+    <!-- Node Selection -->
+    <div class="node-picker" v-if="llmNodes.length > 0">
+      <span class="node-picker-label">Nodes:</span>
+      <label
+        v-for="node in llmNodes"
+        :key="node.node_id"
+        class="node-chip"
+        :class="{ 'node-chip--off': !enabledNodes.has(node.node_id), 'node-chip--offline': !node.online }"
+        :title="node.online ? `${node.node_id} — ${node.gpus.length} GPU(s)` : `${node.node_id} — offline`"
+      >
+        <input
+          type="checkbox"
+          class="node-chip-check"
+          :checked="enabledNodes.has(node.node_id)"
+          :disabled="!node.online || llmRunning"
+          @change="toggleNode(node.node_id, ($event.target as HTMLInputElement).checked)"
+        />
+        {{ node.node_id }}
+        <span class="node-chip-status" v-if="!node.online">offline</span>
+      </label>
+      <span class="node-picker-hint">
+        {{ enabledNodeIds.length === llmNodes.filter(n => n.online).length
+            ? 'auto-routing (all nodes)'
+            : `restricted to: ${enabledNodeIds.join(', ')}` }}
+      </span>
+    </div>
+
     <!-- Run Controls -->
     <div class="run-controls">
       <button
@@ -88,6 +119,24 @@
         {{ llmRunning ? '⏳ Running…' : '▶ Run LLM Eval' }}
       </button>
       <button v-if="llmRunning" class="btn-cancel" @click="cancelLlmBenchmark">✕ Cancel</button>
+      <input
+        v-model="llmJudgeUrl"
+        class="judge-url-input"
+        placeholder="Judge URL — leave empty to skip LLM judge scoring"
+        :disabled="llmRunning"
+        title="Optional: URL of a running cf-text service (e.g. http://10.1.10.158:8008). When set, each LLM response gets a secondary score from the judge model — adds a 'judge' column to results. Empty = primary quality scoring only."
+      />
+      <label class="workers-label" title="Run this many models concurrently (requires multiple GPUs)">
+        <span class="workers-prefix">workers</span>
+        <input
+          v-model.number="llmWorkers"
+          type="number"
+          min="1"
+          max="8"
+          class="workers-input"
+          :disabled="llmRunning"
+        />
+      </label>
       <span v-if="selectedLlmTasks.size === 0 || selectedLlmModels.size === 0" class="run-hint">
         Select at least one task and one model to run.
       </span>
@@ -119,6 +168,7 @@
             <tr>
               <th class="hm-label-col">Model</th>
               <th class="hm-model-col">overall</th>
+              <th v-if="llmHasJudge" class="hm-model-col hm-judge-col">judge</th>
               <th v-for="col in llmTaskTypeCols" :key="col" class="hm-model-col">{{ col }}</th>
               <th class="hm-model-col">tok/s</th>
             </tr>
@@ -130,6 +180,12 @@
                 class="hm-value-cell"
                 :class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }"
               >{{ pct(row.avg_quality_score) }}</td>
+              <td
+                v-if="llmHasJudge"
+                class="hm-value-cell hm-judge-cell"
+                :class="{ 'bt-best': llmBestByCol['judge'] === row.model_id }"
+                title="LLM-as-judge secondary score"
+              >{{ row.avg_judge_score != null ? pct(row.avg_judge_score) : '—' }}</td>
               <td
                 v-for="col in llmTaskTypeCols"
                 :key="col"
@@ -168,6 +224,12 @@ interface CfOrchModel {
   vram_estimate_mb?: number
 }
 
+interface CfOrchNode {
+  node_id: string
+  online: boolean
+  gpus: { gpu_id: number; name: string; vram_total_mb: number; vram_free_mb: number }[]
+}
+
 interface LlmModelResult {
   model_name: string
   model_id: string
@@ -175,9 +237,11 @@ interface LlmModelResult {
   avg_tokens_per_sec: number
   avg_completion_ms: number
   avg_quality_score: number
+  avg_judge_score: number | null
   finetune_candidates: number
   error_count: number
   quality_by_task_type: Record<string, number>
+  judge_score_by_task_type?: Record<string, number>
 }
 
 // ── State ───────────────────────────────────────────────────────────────────
@@ -195,6 +259,10 @@ const llmError       = ref('')
 const llmResults     = ref<LlmModelResult[]>([])
 const llmEventSource = ref<EventSource | null>(null)
 const llmLogEl       = ref<HTMLElement | null>(null)
+const llmJudgeUrl    = ref('')
+const llmWorkers     = ref(1)
+const llmNodes       = ref<CfOrchNode[]>([])
+const enabledNodes   = ref<Set<string>>(new Set())
 
 // ── Computed ────────────────────────────────────────────────────────────────
 const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
@@ -239,6 +307,14 @@ const llmTaskTypeCols = computed(() => {
   return [...types].sort()
 })
 
+const llmHasJudge = computed(() =>
+  llmResults.value.some(r => r.avg_judge_score != null)
+)
+
+const enabledNodeIds = computed(() =>
+  llmNodes.value.filter(n => n.online && enabledNodes.value.has(n.node_id)).map(n => n.node_id)
+)
+
 const llmBestByCol = computed((): Record<string, string> => {
   const best: Record<string, string> = {}
   if (llmResults.value.length === 0) return best
@@ -249,6 +325,16 @@ const llmBestByCol = computed((): Record<string, string> => {
   }
   best['overall'] = bestId
 
+  if (llmHasJudge.value) {
+    bestId = ''; bestVal = -Infinity
+    for (const r of llmResults.value) {
+      if (r.avg_judge_score != null && r.avg_judge_score > bestVal) {
+        bestVal = r.avg_judge_score; bestId = r.model_id
+      }
+    }
+    best['judge'] = bestId
+  }
+
   for (const col of llmTaskTypeCols.value) {
     bestId = ''; bestVal = -Infinity
     for (const r of llmResults.value) {
@@ -306,6 +392,15 @@ function toggleService(models: CfOrchModel[], checked: boolean) {
   }
   selectedLlmModels.value = next
 }
+function selectAllTasks()  { selectedLlmTasks.value  = new Set(llmTasks.value.map(t => t.id)) }
+function clearAllTasks()   { selectedLlmTasks.value  = new Set() }
+function selectAllModels() { selectedLlmModels.value = new Set(llmModels.value.map(m => m.id)) }
+function clearAllModels()  { selectedLlmModels.value = new Set() }
+function toggleNode(id: string, checked: boolean) {
+  const next = new Set(enabledNodes.value)
+  checked ? next.add(id) : next.delete(id)
+  enabledNodes.value = next
+}
 
 // ── Data loaders ─────────────────────────────────────────────────────────────
 async function loadLlmTasks() {
@@ -335,6 +430,21 @@ async function loadLlmResults() {
   }
 }
 
+async function loadLlmConfig() {
+  const { data } = await useApiFetch<{ judge_url?: string }>('/api/cforch/config')
+  if (data?.judge_url && !llmJudgeUrl.value) {
+    llmJudgeUrl.value = data.judge_url
+  }
+}
+
+async function loadLlmNodes() {
+  const { data } = await useApiFetch<{ nodes: CfOrchNode[] }>('/api/cforch/nodes')
+  if (data?.nodes) {
+    llmNodes.value = data.nodes
+    enabledNodes.value = new Set(data.nodes.filter(n => n.online).map(n => n.node_id))
+  }
+}
+
 // ── Run / cancel ──────────────────────────────────────────────────────────────
 function startLlmBenchmark() {
   llmRunning.value = true
@@ -344,6 +454,15 @@ function startLlmBenchmark() {
   const params = new URLSearchParams()
   const taskIds = [...selectedLlmTasks.value].join(',')
   if (taskIds) params.set('task_ids', taskIds)
+  const modelIds = [...selectedLlmModels.value].join(',')
+  if (modelIds) params.set('model_ids', modelIds)
+  if (llmJudgeUrl.value.trim()) params.set('judge_url', llmJudgeUrl.value.trim())
+  if (llmWorkers.value > 1) params.set('workers', String(llmWorkers.value))
+  const onlineNodeIds = llmNodes.value.filter(n => n.online).map(n => n.node_id)
+  const isRestricted = enabledNodeIds.value.length < onlineNodeIds.length
+  if (isRestricted && enabledNodeIds.value.length > 0) {
+    params.set('node_ids', enabledNodeIds.value.join(','))
+  }
 
   const es = new EventSource(`/api/cforch/run?${params}`)
   llmEventSource.value = es
@@ -387,6 +506,8 @@ onMounted(() => {
   loadLlmTasks()
   loadLlmModels()
   loadLlmResults()
+  loadLlmConfig()
+  loadLlmNodes()
 })
 </script>
 
@@ -451,6 +572,43 @@ onMounted(() => {
   color: var(--color-text-secondary, #6b7a99);
 }
 
+.judge-url-input {
+  flex: 1;
+  min-width: 14rem;
+  max-width: 24rem;
+  padding: 0.35rem 0.6rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.375rem;
+  background: var(--color-surface, #fff);
+  color: var(--color-text, #1a2338);
+  font-size: 0.8rem;
+  font-family: var(--font-mono, monospace);
+}
+.judge-url-input:disabled { opacity: 0.5; }
+.judge-url-input::placeholder { color: var(--color-text-secondary, #6b7a99); }
+
+.workers-label {
+  display: flex;
+  align-items: center;
+  gap: 0.3rem;
+  font-size: 0.8rem;
+  color: var(--color-text-secondary, #6b7a99);
+  white-space: nowrap;
+}
+.workers-prefix { font-family: var(--font-mono, monospace); }
+.workers-input {
+  width: 3.2rem;
+  padding: 0.35rem 0.4rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.375rem;
+  background: var(--color-surface, #fff);
+  color: var(--color-text, #1a2338);
+  font-size: 0.8rem;
+  font-family: var(--font-mono, monospace);
+  text-align: center;
+}
+.workers-input:disabled { opacity: 0.5; }
+
 /* ── Run log ────────────────────────────────────────────── */
 .run-log {
   border: 1px solid var(--color-border, #d0d7e8);
@@ -592,6 +750,15 @@ onMounted(() => {
   white-space: nowrap;
 }
 
+.hm-judge-col {
+  background: color-mix(in srgb, var(--color-surface-raised, #e4ebf5) 80%, #c6d5f5);
+}
+.hm-judge-cell {
+  background: color-mix(in srgb, var(--color-surface, #fff) 85%, #c6d5f5);
+  font-style: italic;
+  opacity: 0.9;
+}
+
 /* ── Model Picker ───────────────────────────────────────── */
 .model-picker {
   border: 1px solid var(--color-border, #d0d7e8);
@@ -630,6 +797,24 @@ details[open] .picker-summary::before { content: '▼  '; }
   margin-left: auto;
 }
 
+.picker-bulk-btn {
+  padding: 0.1rem 0.45rem;
+  font-size: 0.7rem;
+  font-family: var(--font-mono, monospace);
+  background: var(--color-surface, #fff);
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.25rem;
+  color: var(--color-text-secondary, #6b7a99);
+  cursor: pointer;
+  transition: background 0.12s, color 0.12s;
+  flex-shrink: 0;
+}
+.picker-bulk-btn:hover {
+  background: var(--app-primary, #2A6080);
+  color: #fff;
+  border-color: var(--app-primary, #2A6080);
+}
+
 .picker-body {
   padding: 0.75rem;
   border-top: 1px solid var(--color-border, #d0d7e8);
@@ -712,4 +897,61 @@ details[open] .picker-summary::before { content: '▼  '; }
   .picker-model-list { padding-left: 0; }
   .picker-model-name { max-width: 14ch; }
 }
+
+/* ── Node picker ────────────────────────────────────── */
+.node-picker {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  flex-wrap: wrap;
+  padding: 0.5rem 0.75rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.5rem;
+  background: var(--color-surface-raised, #e4ebf5);
+}
+
+.node-picker-label {
+  font-size: 0.78rem;
+  font-weight: 600;
+  color: var(--color-text-secondary, #6b7a99);
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+  white-space: nowrap;
+}
+
+.node-chip {
+  display: inline-flex;
+  align-items: center;
+  gap: 0.3rem;
+  padding: 0.2rem 0.55rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 1rem;
+  background: var(--color-surface, #fff);
+  font-size: 0.78rem;
+  font-family: var(--font-mono, monospace);
+  color: var(--color-text, #1a2338);
+  cursor: pointer;
+  transition: background 0.12s, opacity 0.12s;
+}
+.node-chip--off {
+  opacity: 0.45;
+  background: transparent;
+}
+.node-chip--offline {
+  opacity: 0.35;
+  cursor: not-allowed;
+  font-style: italic;
+}
+.node-chip-check { accent-color: var(--app-primary, #2A6080); }
+.node-chip-status {
+  font-size: 0.66rem;
+  color: var(--color-text-secondary, #6b7a99);
+}
+
+.node-picker-hint {
+  font-size: 0.72rem;
+  color: var(--color-text-secondary, #6b7a99);
+  font-family: var(--font-mono, monospace);
+  margin-left: auto;
+}
 </style>
diff --git a/web/src/views/ModelsView.vue b/web/src/views/ModelsView.vue
index 34cba78..364f4cb 100644
--- a/web/src/views/ModelsView.vue
+++ b/web/src/views/ModelsView.vue
@@ -51,8 +51,31 @@
           <span v-if="lookupResult.adapter_recommendation" class="chip chip-adapter">
             {{ lookupResult.adapter_recommendation }}
           </span>
-          <span v-if="lookupResult.size != null" class="preview-size">
-            {{ humanBytes(lookupResult.size) }}
+          <span v-if="selectedQuantSize > 0" class="preview-size">
+            {{ humanBytes(selectedQuantSize) }}
+          </span>
+        </div>
+
+        <!-- GGUF quantization picker — only shown for GGUF repos -->
+        <div v-if="lookupResult.gguf_files?.length" class="quant-picker">
+          <label class="quant-label" for="quant-select">Quantization</label>
+          <select
+            id="quant-select"
+            v-model="selectedQuant"
+            class="quant-select"
+            aria-label="Select quantization variant"
+          >
+            <option :value="null" disabled>Select quantization…</option>
+            <option
+              v-for="f in lookupResult.gguf_files"
+              :key="f.filename"
+              :value="f.quant_name ?? f.filename"
+            >
+              {{ f.quant_name ?? f.filename }} — {{ humanBytes(f.size) }}
+            </option>
+          </select>
+          <span class="quant-hint">
+            Q5_K_M or Q6_K recommended for 8 GB GPUs. Q8_0 for max quality.
           </span>
         </div>
 
@@ -67,7 +90,7 @@
 
         <button
           class="btn-primary btn-add-queue"
-          :disabled="lookupResult.already_installed || lookupResult.already_queued || addingToQueue"
+          :disabled="!canAddToQueue"
           @click="addToQueue"
         >
           {{ addingToQueue ? 'Adding…' : 'Add to queue' }}
@@ -99,9 +122,39 @@
           <span v-if="model.role"                  class="chip chip-role">{{ model.role }}</span>
           <span v-if="model.service"               class="chip" :class="serviceChipClass(model.service)">{{ model.service }}</span>
           <span v-if="model.adapter_recommendation" class="chip chip-adapter">{{ model.adapter_recommendation }}</span>
+          <span v-if="model.quant_pattern"          class="chip chip-quant">{{ model.quant_pattern }}</span>
+        </div>
+        <!-- Allow manual service/role assignment for unrecognized pipeline tags -->
+        <div v-if="!model.service" class="classify-row queue-classify">
+          <select
+            class="classify-select"
+            :value="classifyDraft[model.id]?.service ?? ''"
+            @change="onServiceChange(model.id, ($event.target as HTMLSelectElement).value)"
+            aria-label="Assign service"
+          >
+            <option value="" disabled>Service…</option>
+            <option v-for="svc in CLASSIFIABLE_SERVICES" :key="svc.value" :value="svc.value">{{ svc.label }}</option>
+          </select>
+          <select
+            class="classify-select"
+            :value="classifyDraft[model.id]?.role ?? ''"
+            :disabled="!classifyDraft[model.id]?.service"
+            @change="(e) => setClassifyRole(model.id, (e.target as HTMLSelectElement).value)"
+            aria-label="Assign role"
+          >
+            <option value="" disabled>Role…</option>
+            <option
+              v-for="role in rolesForService(classifyDraft[model.id]?.service ?? '')"
+              :key="role"
+              :value="role"
+            >{{ role }}</option>
+          </select>
         </div>
         <div class="model-card-actions">
-          <button class="btn-primary btn-sm" @click="approveModel(model.id)">
+          <button
+            class="btn-primary btn-sm"
+            @click="approveModel(model.id, classifyDraft[model.id])"
+          >
             Approve download
           </button>
         </div>
@@ -252,6 +305,12 @@ import { ref, computed, onMounted, onUnmounted } from 'vue'
 
 // ── Type definitions ──────────────────────────────────
 
+interface GgufFile {
+  filename: string
+  size: number
+  quant_name: string | null
+}
+
 interface LookupResult {
   repo_id: string
   pipeline_tag: string | null
@@ -260,7 +319,8 @@ interface LookupResult {
   service: string | null
   compatible: boolean
   warning: string | null
-  size: number | null
+  model_size_bytes: number
+  gguf_files: GgufFile[] | null
   description: string | null
   already_installed: boolean
   already_queued: boolean
@@ -274,6 +334,7 @@ interface QueuedModel {
   adapter_recommendation: string | null
   role: string | null
   service: string | null
+  quant_pattern: string | null
 }
 
 interface InstalledModel {
@@ -302,6 +363,26 @@ const lookupLoading = ref(false)
 const lookupError   = ref<string | null>(null)
 const lookupResult  = ref<LookupResult | null>(null)
 const addingToQueue = ref(false)
+const selectedQuant = ref<string | null>(null)
+
+// Size of the selected GGUF file, or total model size for non-GGUF repos.
+const selectedQuantSize = computed<number>(() => {
+  const r = lookupResult.value
+  if (!r) return 0
+  if (r.gguf_files?.length && selectedQuant.value) {
+    const f = r.gguf_files.find(f => (f.quant_name ?? f.filename) === selectedQuant.value)
+    return f?.size ?? r.model_size_bytes
+  }
+  return r.model_size_bytes
+})
+
+// Disable "Add to queue" when a GGUF repo but no quant chosen yet.
+const canAddToQueue = computed(() => {
+  const r = lookupResult.value
+  if (!r || r.already_installed || r.already_queued || addingToQueue.value) return false
+  if (r.gguf_files?.length && !selectedQuant.value) return false
+  return true
+})
 
 const queuedModels    = ref<QueuedModel[]>([])
 const installedModels = ref<InstalledModel[]>([])
@@ -411,6 +492,7 @@ async function doLookup() {
   lookupLoading.value = true
   lookupError.value   = null
   lookupResult.value  = null
+  selectedQuant.value = null
 
   try {
     const res = await fetch(`/api/models/lookup?repo_id=${encodeURIComponent(repoId)}`)
@@ -442,7 +524,15 @@ async function addToQueue() {
     const res = await fetch('/api/models/queue', {
       method: 'POST',
       headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ repo_id, pipeline_tag, adapter_recommendation, role, service }),
+      body: JSON.stringify({
+        repo_id,
+        pipeline_tag,
+        adapter_recommendation,
+        role,
+        service,
+        model_size_bytes: selectedQuantSize.value,
+        quant_pattern: selectedQuant.value,
+      }),
     })
     if (res.ok) {
       lookupResult.value = { ...lookupResult.value, already_queued: true }
@@ -454,8 +544,16 @@ async function addToQueue() {
   }
 }
 
-async function approveModel(id: string) {
+async function approveModel(id: string, draft?: { service: string; role: string }) {
   try {
+    // If the user picked a service/role for an unrecognized model, patch it first.
+    if (draft?.service && draft?.role) {
+      await fetch(`/api/models/queue/${encodeURIComponent(id)}`, {
+        method: 'PATCH',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ service: draft.service, role: draft.role }),
+      })
+    }
     const res = await fetch(`/api/models/queue/${encodeURIComponent(id)}/approve`, { method: 'POST' })
     if (res.ok) {
       await loadQueue()
@@ -774,6 +872,44 @@ onUnmounted(() => {
   align-self: flex-start;
 }
 
+/* ── Quant picker ── */
+.quant-picker {
+  display: flex;
+  flex-direction: column;
+  gap: 0.35rem;
+}
+
+.quant-label {
+  font-size: 0.8rem;
+  font-weight: 600;
+  color: var(--color-text-muted, #4a5c7a);
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+}
+
+.quant-select {
+  padding: 0.4rem 0.6rem;
+  border: 1px solid var(--color-border, #a8b8d0);
+  border-radius: var(--radius-md, 0.5rem);
+  background: var(--color-surface, #f0f4fb);
+  color: var(--color-text, #1a2338);
+  font-size: 0.9rem;
+  font-family: var(--font-mono, monospace);
+  cursor: pointer;
+}
+
+.quant-hint {
+  font-size: 0.78rem;
+  color: var(--color-text-muted, #4a5c7a);
+}
+
+.chip-quant {
+  background: color-mix(in srgb, var(--color-primary, #2A6080) 15%, transparent);
+  color: var(--color-primary, #2A6080);
+  font-family: var(--font-mono, monospace);
+  font-size: 0.75rem;
+}
+
 /* ── Model cards (queue + downloads) ── */
 .model-card {
   border: 1px solid var(--color-border, #a8b8d0);
diff --git a/web/src/views/PlansBenchTab.vue b/web/src/views/PlansBenchTab.vue
new file mode 100644
index 0000000..9368203
--- /dev/null
+++ b/web/src/views/PlansBenchTab.vue
@@ -0,0 +1,1043 @@
+<template>
+  <div class="plans-bench-tab">
+
+    <!-- ── Model selection ──────────────────────────────────────────────── -->
+    <details class="model-picker" open>
+      <summary class="picker-summary">
+        <span class="picker-title">🤖 Models</span>
+        <span class="picker-badge">{{ allSelectedModels.length }} selected</span>
+        <button class="btn-refresh" :disabled="loadingMeta" @click.stop="loadMeta" title="Refresh">
+          {{ loadingMeta ? '⏳' : '🔄' }}
+        </button>
+      </summary>
+      <div class="picker-body">
+        <div v-if="loadingMeta" class="picker-loading">Loading…</div>
+        <div v-else-if="loadError" class="picker-error">{{ loadError }}</div>
+        <template v-else>
+
+          <!-- cf-orch catalog (live from coordinator) -->
+          <div class="picker-group" v-if="cfOrchModels.length">
+            <div class="group-header">
+              <label class="group-check">
+                <input
+                  type="checkbox"
+                  :checked="isGroupAllSelected('cforch')"
+                  :indeterminate.prop="isGroupIndeterminate('cforch')"
+                  @change="toggleGroup('cforch', ($event.target as HTMLInputElement).checked)"
+                />
+                <span class="group-label">cf-orch catalog</span>
+                <span class="group-count">({{ cfOrchModels.length }})</span>
+              </label>
+              <span class="group-note">live from coordinator — auto-allocated per run</span>
+            </div>
+            <div class="model-list">
+              <label v-for="m in cfOrchModels" :key="m.id" class="model-item">
+                <input type="checkbox" :value="m.id" v-model="selectedModels" />
+                <span class="model-key">{{ m.name }}</span>
+                <span v-if="m.vram_mb" class="model-meta">{{ formatMb(m.vram_mb) }} VRAM</span>
+                <span v-if="m.description" class="model-desc">{{ m.description }}</span>
+              </label>
+            </div>
+          </div>
+
+          <!-- Registered shortcuts -->
+          <div class="picker-group">
+            <div class="group-header">
+              <span class="group-label">Registry shortcuts</span>
+              <span class="group-count">({{ registeredModels.length }})</span>
+            </div>
+            <div class="model-list">
+              <label v-for="m in registeredModels" :key="m.key" class="model-item">
+                <input type="checkbox" :value="m.key" v-model="selectedModels" />
+                <span class="model-key">{{ m.key }}</span>
+                <span class="model-desc">{{ m.description }}</span>
+              </label>
+            </div>
+          </div>
+
+          <!-- Custom -->
+          <div class="custom-model-row">
+            <label class="option-label">Custom model ID:</label>
+            <input
+              v-model="customModel"
+              class="custom-model-input"
+              placeholder="e.g. granite-4.1-8b"
+              @keydown.enter="addCustomModel"
+            />
+            <button class="btn-add-custom" :disabled="!customModel.trim()" @click="addCustomModel">Add</button>
+          </div>
+          <div v-if="customModels.length" class="custom-chips">
+            <span v-for="m in customModels" :key="m" class="custom-chip">
+              {{ m }}
+              <button class="chip-remove" @click="removeCustomModel(m)">×</button>
+            </span>
+          </div>
+
+        </template>
+      </div>
+    </details>
+
+    <!-- ── Options ──────────────────────────────────────────────────────── -->
+    <details class="options-panel">
+      <summary class="picker-summary">
+        <span class="picker-title">⚙️ Options</span>
+      </summary>
+      <div class="options-body">
+        <label class="option-row">
+          <input type="checkbox" v-model="useCfOrch" :disabled="running" />
+          <span class="option-label">Use cf-orch coordinator</span>
+          <span class="option-hint">Allocates each model via cf-orch (required for catalog models)</span>
+        </label>
+        <label class="option-row" :class="{ dimmed: useCfOrch }">
+          <span class="option-label">Direct API base</span>
+          <input
+            v-model="apiBase"
+            class="option-text"
+            placeholder="http://localhost:8080/v1"
+            :disabled="useCfOrch || running"
+          />
+          <span class="option-hint">Only used when cf-orch is disabled</span>
+        </label>
+        <div class="option-row">
+          <span class="option-label">Prompts</span>
+          <div class="prompt-filter">
+            <label v-for="p in allPromptIds" :key="p" class="prompt-check-item">
+              <input type="checkbox" :value="p" v-model="selectedPromptIds" />
+              <span>{{ p }}</span>
+            </label>
+          </div>
+          <span class="option-hint">Leave all checked to run all 10 held-out prompts</span>
+        </div>
+        <label class="option-row">
+          <span class="option-label">Workers</span>
+          <input
+            v-model.number="workers"
+            type="number"
+            min="1"
+            max="8"
+            class="option-number"
+            :disabled="running"
+          />
+          <span class="option-hint">Run N models in parallel (1 = sequential)</span>
+        </label>
+      </div>
+    </details>
+
+    <!-- ── Run controls ─────────────────────────────────────────────────── -->
+    <div class="run-bar">
+      <button
+        class="btn-run"
+        :disabled="running || allSelectedModels.length === 0"
+        @click="startBenchmark"
+      >
+        {{ running ? '⏳ Running…' : results ? '🔄 Re-run' : '▶ Run Benchmark' }}
+      </button>
+      <button v-if="running" class="btn-cancel" @click="cancelBenchmark">✕ Cancel</button>
+      <span v-if="allSelectedModels.length === 0 && !running" class="run-hint">
+        Select at least one model above
+      </span>
+    </div>
+
+    <!-- ── Progress log ─────────────────────────────────────────────────── -->
+    <div v-if="runLog.length" class="run-log">
+      <div class="run-log-header">
+        <span class="run-log-title">Run log</span>
+        <div class="run-log-actions">
+          <button class="btn-log-action" @click="copyLog" :title="copyTooltip">{{ copyLabel }}</button>
+          <button class="btn-log-action" @click="runLog = []">Clear</button>
+        </div>
+      </div>
+      <pre class="run-log-body" ref="logEl">{{ runLog.join('\n') }}</pre>
+    </div>
+
+    <!-- ── Past runs ────────────────────────────────────────────────────── -->
+    <div class="history-bar" v-if="pastRuns.length">
+      <label class="history-label">📂 Past runs:</label>
+      <select class="history-select" v-model="selectedRun" @change="loadRun(selectedRun)">
+        <option value="">— select a past run —</option>
+        <option v-for="r in pastRuns" :key="r.run_id" :value="r.run_id">
+          {{ r.date }} · {{ r.models.join(', ') }} · avg {{ r.avg_score.toFixed(3) }}
+        </option>
+      </select>
+    </div>
+
+    <!-- ── Results ──────────────────────────────────────────────────────── -->
+    <div v-if="results" class="results-section">
+
+      <!-- Comparison table (multi-model) -->
+      <template v-if="modelKeys.length > 1">
+        <h2 class="results-title">Comparison</h2>
+        <div class="table-wrap">
+          <table class="results-table">
+            <thead>
+              <tr>
+                <th class="col-prompt">Prompt</th>
+                <th v-for="mk in modelKeys" :key="mk" class="col-model">{{ mk }}</th>
+              </tr>
+            </thead>
+            <tbody>
+              <tr v-for="pid in promptOrder" :key="pid">
+                <td class="cell-prompt">
+                  <span class="prompt-name">{{ promptName(pid) }}</span>
+                  <span class="prompt-id">{{ pid }}</span>
+                </td>
+                <td
+                  v-for="mk in modelKeys"
+                  :key="mk"
+                  class="cell-score"
+                  :class="isBestForPrompt(mk, pid) ? 'best-score' : ''"
+                >
+                  {{ scoreFor(mk, pid).toFixed(3) }}
+                </td>
+              </tr>
+              <tr class="avg-row">
+                <td class="cell-prompt avg-label">Average</td>
+                <td
+                  v-for="mk in modelKeys"
+                  :key="mk"
+                  class="cell-score avg-score"
+                  :class="isBestAvg(mk) ? 'best-score' : ''"
+                >
+                  {{ modelAvg(mk).toFixed(3) }}
+                </td>
+              </tr>
+            </tbody>
+          </table>
+        </div>
+
+        <!-- Rubric breakdown per model -->
+        <h2 class="results-title" style="margin-top:1.5rem">Rubric breakdown</h2>
+        <div class="rubric-grid">
+          <div v-for="mk in modelKeys" :key="mk" class="rubric-card">
+            <div class="rubric-card-title">{{ mk }}</div>
+            <div v-for="(label, key) in rubricLabels" :key="key" class="rubric-row">
+              <span class="rubric-label">{{ label }}</span>
+              <div class="rubric-bar-wrap">
+                <div class="rubric-bar" :style="{ width: (rubricAvg(mk, key) * 100).toFixed(1) + '%' }"></div>
+              </div>
+              <span class="rubric-val">{{ rubricAvg(mk, key).toFixed(2) }}</span>
+            </div>
+          </div>
+        </div>
+      </template>
+
+      <!-- Single model results -->
+      <template v-else>
+        <h2 class="results-title">Results — {{ modelKeys[0] }}</h2>
+        <div class="table-wrap">
+          <table class="results-table">
+            <thead>
+              <tr>
+                <th class="col-prompt">Prompt</th>
+                <th class="col-score">Score</th>
+                <th class="col-words">Words</th>
+                <th class="col-latency">Latency</th>
+              </tr>
+            </thead>
+            <tbody>
+              <tr
+                v-for="r in singleResults"
+                :key="r.prompt_id"
+                class="result-row"
+                :class="r.error ? 'error-row' : r.total_score >= 0.6 ? 'good-row' : r.total_score < 0.3 ? 'warn-row' : ''"
+                @click="togglePromptDetail(r.prompt_id)"
+              >
+                <td class="cell-prompt">
+                  <span class="prompt-name">{{ r.prompt_name }}</span>
+                  <span class="prompt-id">{{ r.prompt_id }}</span>
+                </td>
+                <td class="cell-score">
+                  <span class="score-pill" :style="scorePillStyle(r.total_score)">
+                    {{ r.error ? 'ERR' : r.total_score.toFixed(3) }}
+                  </span>
+                </td>
+                <td class="cell-words">{{ r.word_count || '—' }}</td>
+                <td class="cell-latency">{{ r.latency_s ? r.latency_s.toFixed(1) + 's' : '—' }}</td>
+              </tr>
+            </tbody>
+          </table>
+        </div>
+
+        <!-- Rubric breakdown for single model -->
+        <div class="rubric-card single-rubric">
+          <div class="rubric-card-title">Rubric breakdown</div>
+          <div v-for="(label, key) in rubricLabels" :key="key" class="rubric-row">
+            <span class="rubric-label">{{ label }}</span>
+            <div class="rubric-bar-wrap">
+              <div class="rubric-bar" :style="{ width: (rubricAvg(modelKeys[0], key) * 100).toFixed(1) + '%' }"></div>
+            </div>
+            <span class="rubric-val">{{ rubricAvg(modelKeys[0], key).toFixed(2) }}</span>
+          </div>
+        </div>
+
+        <!-- Prompt detail panel -->
+        <div v-if="expandedPrompt" class="prompt-detail">
+          <div class="prompt-detail-header">
+            <span>{{ expandedPrompt.prompt_name }} ({{ expandedPrompt.prompt_id }})</span>
+            <button class="btn-close" @click="expandedPrompt = null">✕</button>
+          </div>
+          <div class="detail-grid">
+            <div class="detail-section">
+              <div class="detail-label">Rubric scores</div>
+              <div v-for="(label, key) in rubricLabels" :key="key" class="rubric-row">
+                <span class="rubric-label">{{ label }}</span>
+                <div class="rubric-bar-wrap">
+                  <div class="rubric-bar" :style="{ width: ((expandedPrompt.scores[key] ?? 0) * 100).toFixed(1) + '%' }"></div>
+                </div>
+                <span class="rubric-val">{{ (expandedPrompt.scores[key] ?? 0).toFixed(2) }}</span>
+              </div>
+            </div>
+            <div class="detail-section">
+              <div class="detail-label">Response</div>
+              <pre class="detail-response">{{ expandedPrompt.response }}</pre>
+            </div>
+          </div>
+        </div>
+      </template>
+    </div>
+
+  </div>
+</template>
+
+<script setup lang="ts">
+import { ref, computed, onMounted, nextTick, watch } from 'vue'
+
+interface ModelEntry    { key: string; description: string }
+interface CfOrchModel  { id: string; name: string; vram_mb: number | null; description: string }
+interface PastRun { run_id: string; filename: string; date: string; models: string[]; avg_score: number }
+interface PromptResult {
+  prompt_id: string
+  prompt_name: string
+  model_key: string
+  response: string
+  latency_s: number
+  word_count: number
+  scores: Record<string, number>
+  total_score: number
+  error?: string
+}
+type BenchResults = Record<string, PromptResult[]>
+
+const ALL_PROMPT_IDS = ['ho_001','ho_002','ho_003','ho_004','ho_005','ho_006','ho_007','ho_008','ho_009','ho_010']
+
+// ── State ────────────────────────────────────────────────────────────────────
+const loadingMeta      = ref(false)
+const loadError        = ref('')
+const registeredModels = ref<ModelEntry[]>([])
+const cfOrchModels     = ref<CfOrchModel[]>([])
+const rubricLabels     = ref<Record<string, string>>({})
+
+const selectedModels   = ref<string[]>([])
+const customModel      = ref('')
+const customModels     = ref<string[]>([])
+const useCfOrch        = ref(true)
+const apiBase          = ref('')
+const workers          = ref(1)
+const allPromptIds     = ref<string[]>(ALL_PROMPT_IDS)
+const selectedPromptIds = ref<string[]>([...ALL_PROMPT_IDS])
+
+const running   = ref(false)
+const runLog    = ref<string[]>([])
+const logEl     = ref<HTMLPreElement | null>(null)
+const copyLabel = ref('Copy')
+const copyTooltip = ref('Copy log to clipboard')
+
+async function copyLog() {
+  const text = runLog.value.join('\n')
+  await navigator.clipboard.writeText(text)
+  copyLabel.value = 'Copied!'
+  copyTooltip.value = 'Copied!'
+  setTimeout(() => {
+    copyLabel.value = 'Copy'
+    copyTooltip.value = 'Copy log to clipboard'
+  }, 2000)
+}
+const results   = ref<BenchResults | null>(null)
+const pastRuns  = ref<PastRun[]>([])
+const selectedRun = ref('')
+
+const expandedPromptId = ref<string | null>(null)
+
+// ── Computed ─────────────────────────────────────────────────────────────────
+const allSelectedModels = computed(() => [...selectedModels.value, ...customModels.value])
+
+const modelKeys = computed(() => results.value ? Object.keys(results.value) : [])
+
+const singleResults = computed((): PromptResult[] => {
+  if (!results.value || modelKeys.value.length !== 1) return []
+  return results.value[modelKeys.value[0]] ?? []
+})
+
+const promptOrder = computed((): string[] => {
+  if (!results.value) return []
+  const first = Object.values(results.value)[0] ?? []
+  return first.map(r => r.prompt_id)
+})
+
+const expandedPrompt = computed((): PromptResult | null => {
+  if (!expandedPromptId.value || !results.value) return null
+  return singleResults.value.find(r => r.prompt_id === expandedPromptId.value) ?? null
+})
+
+// ── Helpers ──────────────────────────────────────────────────────────────────
+
+function formatMb(mb: number | null): string {
+  if (!mb) return ''
+  return mb >= 1024 ? `${(mb / 1024).toFixed(1)} GB` : `${mb} MB`
+}
+
+function isGroupAllSelected(group: 'cforch'): boolean {
+  const ids = cfOrchModels.value.map(m => m.id)
+  return ids.length > 0 && ids.every(id => selectedModels.value.includes(id))
+}
+
+function isGroupIndeterminate(group: 'cforch'): boolean {
+  const ids = cfOrchModels.value.map(m => m.id)
+  const selected = ids.filter(id => selectedModels.value.includes(id))
+  return selected.length > 0 && selected.length < ids.length
+}
+
+function toggleGroup(group: 'cforch', checked: boolean) {
+  const ids = cfOrchModels.value.map(m => m.id)
+  if (checked) {
+    selectedModels.value = [...new Set([...selectedModels.value, ...ids])]
+  } else {
+    selectedModels.value = selectedModels.value.filter(m => !ids.includes(m))
+  }
+}
+
+function promptName(pid: string): string {
+  const r = Object.values(results.value ?? {})[0]?.find(x => x.prompt_id === pid)
+  return r?.prompt_name ?? pid
+}
+
+function scoreFor(mk: string, pid: string): number {
+  return results.value?.[mk]?.find(r => r.prompt_id === pid)?.total_score ?? 0
+}
+
+function modelAvg(mk: string): number {
+  const rows = results.value?.[mk] ?? []
+  const ok = rows.filter(r => !r.error)
+  if (!ok.length) return 0
+  return ok.reduce((s, r) => s + r.total_score, 0) / ok.length
+}
+
+function isBestForPrompt(mk: string, pid: string): boolean {
+  const best = Math.max(...modelKeys.value.map(k => scoreFor(k, pid)))
+  return scoreFor(mk, pid) === best && modelKeys.value.length > 1
+}
+
+function isBestAvg(mk: string): boolean {
+  const best = Math.max(...modelKeys.value.map(k => modelAvg(k)))
+  return modelAvg(mk) === best && modelKeys.value.length > 1
+}
+
+function rubricAvg(mk: string, key: string): number {
+  const rows = results.value?.[mk] ?? []
+  const ok = rows.filter(r => !r.error && r.scores)
+  if (!ok.length) return 0
+  return ok.reduce((s, r) => s + (r.scores[key] ?? 0), 0) / ok.length
+}
+
+function scorePillStyle(score: number): string {
+  const h = Math.round(score * 120)  // 0 = red, 120 = green
+  return `background: hsl(${h}, 60%, 42%); color: #fff`
+}
+
+function togglePromptDetail(pid: string) {
+  expandedPromptId.value = expandedPromptId.value === pid ? null : pid
+}
+
+// ── API calls ─────────────────────────────────────────────────────────────────
+async function loadMeta() {
+  loadingMeta.value = true
+  loadError.value = ''
+  try {
+    const r = await fetch('/api/plans-bench/models')
+    if (!r.ok) throw new Error(`HTTP ${r.status}`)
+    const data = await r.json()
+    registeredModels.value = data.registry ?? []
+    cfOrchModels.value     = data.cforch_models ?? []
+    rubricLabels.value     = data.rubric_labels ?? {}
+  } catch (e: unknown) {
+    loadError.value = e instanceof Error ? e.message : String(e)
+  } finally {
+    loadingMeta.value = false
+  }
+}
+
+async function loadPastRuns() {
+  try {
+    const r = await fetch('/api/plans-bench/results')
+    if (!r.ok) return
+    pastRuns.value = await r.json()
+  } catch {
+    // non-critical
+  }
+}
+
+async function loadRun(runId: string) {
+  if (!runId) return
+  try {
+    const r = await fetch(`/api/plans-bench/results/${runId}`)
+    if (!r.ok) throw new Error(`HTTP ${r.status}`)
+    results.value = await r.json()
+  } catch (e: unknown) {
+    runLog.value.push(`Error loading run: ${e instanceof Error ? e.message : String(e)}`)
+  }
+}
+
+function addCustomModel() {
+  const key = customModel.value.trim()
+  if (key && !customModels.value.includes(key)) {
+    customModels.value = [...customModels.value, key]
+  }
+  customModel.value = ''
+}
+
+function removeCustomModel(key: string) {
+  customModels.value = customModels.value.filter(m => m !== key)
+}
+
+function startBenchmark() {
+  if (running.value || allSelectedModels.value.length === 0) return
+  running.value = true
+  runLog.value = []
+  results.value = null
+  expandedPromptId.value = null
+
+  const params = new URLSearchParams({
+    models: allSelectedModels.value.join(','),
+    use_cforch: String(useCfOrch.value),
+    api_base: apiBase.value.trim(),
+    workers: String(workers.value),
+    prompt_ids: selectedPromptIds.value.length < allPromptIds.value.length
+      ? selectedPromptIds.value.join(',')
+      : '',
+  })
+
+  const es = new EventSource(`/api/plans-bench/run?${params}`)
+  es.onmessage = async (ev) => {
+    const msg = JSON.parse(ev.data)
+    if (msg.type === 'progress') {
+      runLog.value.push(msg.message)
+      await nextTick()
+      if (logEl.value) logEl.value.scrollTop = logEl.value.scrollHeight
+    } else if (msg.type === 'result') {
+      results.value = msg.results
+      await loadPastRuns()
+    } else if (msg.type === 'complete') {
+      running.value = false
+      es.close()
+    } else if (msg.type === 'error') {
+      runLog.value.push(`ERROR: ${msg.message}`)
+      running.value = false
+      es.close()
+    } else if (msg.type === 'cancelled') {
+      runLog.value.push('Benchmark cancelled.')
+      running.value = false
+      es.close()
+    }
+  }
+  es.onerror = () => {
+    runLog.value.push('Connection error — benchmark may have ended unexpectedly.')
+    running.value = false
+    es.close()
+  }
+}
+
+async function cancelBenchmark() {
+  try {
+    await fetch('/api/plans-bench/cancel', { method: 'POST' })
+  } catch {
+    // ignore
+  }
+}
+
+// ── Lifecycle ─────────────────────────────────────────────────────────────────
+onMounted(() => {
+  loadMeta()
+  loadPastRuns()
+})
+</script>
+
+<style scoped>
+.plans-bench-tab {
+  display: flex;
+  flex-direction: column;
+  gap: 1.25rem;
+}
+
+/* ── Picker / options shared ─────────────────────────────────────────── */
+.model-picker,
+.options-panel {
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.5rem;
+  overflow: hidden;
+}
+
+.picker-summary {
+  display: flex;
+  align-items: center;
+  gap: 0.6rem;
+  padding: 0.6rem 0.9rem;
+  background: var(--color-surface-alt, #f5f7fc);
+  cursor: pointer;
+  user-select: none;
+  font-size: 0.875rem;
+  font-weight: 600;
+  color: var(--color-text, #1a2540);
+}
+
+.picker-title { flex: 1 }
+
+.picker-badge {
+  background: var(--app-primary, #2A6080);
+  color: #fff;
+  border-radius: 1rem;
+  padding: 0.15rem 0.55rem;
+  font-size: 0.75rem;
+}
+
+.btn-refresh {
+  background: none;
+  border: none;
+  cursor: pointer;
+  font-size: 0.875rem;
+  padding: 0.1rem 0.3rem;
+  border-radius: 0.25rem;
+}
+.btn-refresh:hover { background: var(--color-hover, #e8ecf7) }
+
+.picker-body,
+.options-body {
+  padding: 0.75rem 1rem;
+  display: flex;
+  flex-direction: column;
+  gap: 0.5rem;
+}
+
+.picker-loading { color: var(--color-muted, #8a98b4); font-size: 0.85rem }
+.picker-error   { color: var(--color-danger, #c0392b); font-size: 0.85rem }
+
+/* ── Group headers ───────────────────────────────────────────────────── */
+.picker-group { display: flex; flex-direction: column; gap: 0.4rem }
+
+.group-header {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  padding: 0.2rem 0;
+  border-bottom: 1px solid var(--color-border, #d0d7e8);
+  margin-bottom: 0.2rem;
+}
+.group-check  { display: flex; align-items: center; gap: 0.35rem; cursor: pointer }
+.group-label  { font-weight: 600; font-size: 0.82rem }
+.group-count  { color: var(--color-muted, #8a98b4); font-size: 0.78rem }
+.group-note   { color: var(--color-muted, #8a98b4); font-size: 0.75rem; margin-left: auto }
+
+.model-meta   { color: var(--color-muted, #8a98b4); font-size: 0.78rem; margin-left: auto }
+
+.dimmed { opacity: 0.45 }
+
+/* ── Model list ──────────────────────────────────────────────────────── */
+.model-list {
+  display: flex;
+  flex-direction: column;
+  gap: 0.35rem;
+}
+
+.model-item {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  font-size: 0.875rem;
+  cursor: pointer;
+}
+.model-item:hover { color: var(--app-primary, #2A6080) }
+.model-key  { font-weight: 600; min-width: 10rem }
+.model-desc { color: var(--color-muted, #8a98b4); font-size: 0.8rem }
+
+/* ── Custom model row ────────────────────────────────────────────────── */
+.custom-model-row {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  margin-top: 0.25rem;
+  flex-wrap: wrap;
+}
+.custom-model-input {
+  flex: 1;
+  min-width: 10rem;
+  padding: 0.3rem 0.5rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.3rem;
+  font-size: 0.85rem;
+  background: var(--color-surface, #fff);
+  color: var(--color-text, #1a2540);
+}
+.btn-add-custom {
+  padding: 0.3rem 0.75rem;
+  background: var(--app-primary, #2A6080);
+  color: #fff;
+  border: none;
+  border-radius: 0.3rem;
+  font-size: 0.8rem;
+  cursor: pointer;
+}
+.btn-add-custom:disabled { opacity: 0.4; cursor: not-allowed }
+
+.custom-chips {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 0.35rem;
+}
+.custom-chip {
+  display: inline-flex;
+  align-items: center;
+  gap: 0.25rem;
+  background: var(--color-surface-alt, #f5f7fc);
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 1rem;
+  padding: 0.2rem 0.55rem;
+  font-size: 0.8rem;
+}
+.chip-remove {
+  background: none;
+  border: none;
+  cursor: pointer;
+  font-size: 0.9rem;
+  color: var(--color-muted, #8a98b4);
+  padding: 0;
+  line-height: 1;
+}
+
+/* ── Options ─────────────────────────────────────────────────────────── */
+.option-row {
+  display: flex;
+  align-items: flex-start;
+  gap: 0.6rem;
+  font-size: 0.85rem;
+  flex-wrap: wrap;
+}
+.option-label {
+  min-width: 8rem;
+  font-weight: 500;
+  color: var(--color-text, #1a2540);
+  padding-top: 0.25rem;
+}
+.option-text {
+  flex: 1;
+  min-width: 14rem;
+  padding: 0.3rem 0.5rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.3rem;
+  font-size: 0.85rem;
+  background: var(--color-surface, #fff);
+  color: var(--color-text, #1a2540);
+}
+.option-hint { color: var(--color-muted, #8a98b4); font-size: 0.78rem; align-self: center }
+.option-number {
+  width: 4.5rem;
+  padding: 0.3rem 0.5rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.3rem;
+  font-size: 0.85rem;
+  background: var(--color-surface, #fff);
+  color: var(--color-text, #1a2540);
+  text-align: center;
+}
+
+.prompt-filter {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 0.35rem 0.75rem;
+}
+.prompt-check-item {
+  display: flex;
+  align-items: center;
+  gap: 0.3rem;
+  font-size: 0.8rem;
+  cursor: pointer;
+}
+
+/* ── Run bar ─────────────────────────────────────────────────────────── */
+.run-bar {
+  display: flex;
+  align-items: center;
+  gap: 0.75rem;
+  flex-wrap: wrap;
+}
+
+.btn-run {
+  padding: 0.5rem 1.25rem;
+  background: var(--app-primary, #2A6080);
+  color: #fff;
+  border: none;
+  border-radius: 0.4rem;
+  font-size: 0.9rem;
+  font-weight: 600;
+  cursor: pointer;
+  transition: opacity 0.15s;
+}
+.btn-run:disabled { opacity: 0.45; cursor: not-allowed }
+.btn-run:not(:disabled):hover { opacity: 0.88 }
+
+.btn-cancel {
+  padding: 0.5rem 1rem;
+  background: transparent;
+  border: 1px solid var(--color-danger, #c0392b);
+  color: var(--color-danger, #c0392b);
+  border-radius: 0.4rem;
+  font-size: 0.85rem;
+  cursor: pointer;
+}
+
+.run-hint { font-size: 0.82rem; color: var(--color-muted, #8a98b4) }
+
+/* ── Run log ─────────────────────────────────────────────────────────── */
+.run-log {
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.5rem;
+  overflow: hidden;
+}
+.run-log-header {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  padding: 0.4rem 0.8rem;
+  background: var(--color-surface-alt, #f5f7fc);
+  font-size: 0.8rem;
+  font-weight: 600;
+}
+.run-log-actions {
+  display: flex;
+  gap: 0.5rem;
+}
+.btn-log-action {
+  background: none;
+  border: none;
+  font-size: 0.78rem;
+  color: var(--color-muted, #8a98b4);
+  cursor: pointer;
+  padding: 0;
+}
+.btn-log-action:hover {
+  color: var(--color-text, #1a2540);
+}
+.run-log-body {
+  margin: 0;
+  padding: 0.7rem 0.9rem;
+  font-size: 0.78rem;
+  line-height: 1.5;
+  max-height: 14rem;
+  overflow-y: auto;
+  white-space: pre-wrap;
+  color: var(--color-text, #1a2540);
+  background: var(--color-surface, #fff);
+}
+
+/* ── History bar ─────────────────────────────────────────────────────── */
+.history-bar {
+  display: flex;
+  align-items: center;
+  gap: 0.6rem;
+  flex-wrap: wrap;
+}
+.history-label  { font-size: 0.85rem; font-weight: 500 }
+.history-select {
+  flex: 1;
+  min-width: 14rem;
+  padding: 0.35rem 0.6rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.3rem;
+  font-size: 0.85rem;
+  background: var(--color-surface, #fff);
+  color: var(--color-text, #1a2540);
+}
+
+/* ── Results section ─────────────────────────────────────────────────── */
+.results-section { display: flex; flex-direction: column; gap: 1rem }
+
+.results-title {
+  font-size: 1rem;
+  font-weight: 700;
+  color: var(--app-primary, #2A6080);
+  margin: 0;
+}
+
+.table-wrap { overflow-x: auto }
+
+.results-table {
+  width: 100%;
+  border-collapse: collapse;
+  font-size: 0.85rem;
+}
+.results-table th {
+  text-align: left;
+  padding: 0.45rem 0.6rem;
+  background: var(--color-surface-alt, #f5f7fc);
+  border-bottom: 2px solid var(--color-border, #d0d7e8);
+  font-weight: 600;
+  white-space: nowrap;
+}
+.results-table td {
+  padding: 0.4rem 0.6rem;
+  border-bottom: 1px solid var(--color-border, #d0d7e8);
+  vertical-align: middle;
+}
+
+.col-prompt { min-width: 12rem }
+.col-model  { min-width: 6rem; text-align: right }
+.col-score, .col-words, .col-latency { min-width: 5rem; text-align: right }
+
+.cell-prompt {
+  display: flex;
+  flex-direction: column;
+  gap: 0.1rem;
+}
+.prompt-name { font-weight: 500 }
+.prompt-id   { font-size: 0.75rem; color: var(--color-muted, #8a98b4) }
+
+.cell-score, .col-model { text-align: right }
+
+.best-score {
+  background: color-mix(in srgb, var(--app-primary, #2A6080) 8%, transparent);
+  font-weight: 700;
+}
+
+.avg-row td { font-weight: 700; background: var(--color-surface-alt, #f5f7fc) }
+.avg-label  { font-size: 0.82rem; text-transform: uppercase; letter-spacing: 0.04em }
+
+.score-pill {
+  display: inline-block;
+  min-width: 3.5rem;
+  text-align: center;
+  padding: 0.15rem 0.4rem;
+  border-radius: 0.3rem;
+  font-size: 0.8rem;
+  font-weight: 700;
+}
+
+.result-row { cursor: pointer }
+.result-row:hover td { background: var(--color-hover, #e8ecf7) }
+.good-row td { background: color-mix(in srgb, #27ae60 6%, transparent) }
+.warn-row td { background: color-mix(in srgb, #e74c3c 6%, transparent) }
+.error-row td { background: color-mix(in srgb, #e74c3c 10%, transparent); opacity: 0.7 }
+
+/* ── Rubric breakdown ────────────────────────────────────────────────── */
+.rubric-grid {
+  display: flex;
+  flex-wrap: wrap;
+  gap: 1rem;
+}
+.rubric-card {
+  flex: 1;
+  min-width: 18rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.5rem;
+  padding: 0.75rem 1rem;
+  display: flex;
+  flex-direction: column;
+  gap: 0.45rem;
+}
+.single-rubric { max-width: 38rem }
+
+.rubric-card-title {
+  font-weight: 700;
+  font-size: 0.875rem;
+  color: var(--app-primary, #2A6080);
+  margin-bottom: 0.25rem;
+}
+
+.rubric-row {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  font-size: 0.8rem;
+}
+.rubric-label {
+  flex: 0 0 13rem;
+  color: var(--color-text, #1a2540);
+  overflow: hidden;
+  text-overflow: ellipsis;
+  white-space: nowrap;
+}
+.rubric-bar-wrap {
+  flex: 1;
+  height: 0.5rem;
+  background: var(--color-surface-alt, #f5f7fc);
+  border-radius: 0.25rem;
+  overflow: hidden;
+  min-width: 4rem;
+}
+.rubric-bar {
+  height: 100%;
+  background: var(--app-primary, #2A6080);
+  border-radius: 0.25rem;
+  transition: width 0.3s ease;
+}
+.rubric-val {
+  min-width: 2.5rem;
+  text-align: right;
+  font-weight: 600;
+  font-variant-numeric: tabular-nums;
+}
+
+/* ── Prompt detail panel ─────────────────────────────────────────────── */
+.prompt-detail {
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.5rem;
+  overflow: hidden;
+}
+.prompt-detail-header {
+  display: flex;
+  justify-content: space-between;
+  align-items: center;
+  padding: 0.5rem 0.9rem;
+  background: var(--color-surface-alt, #f5f7fc);
+  font-weight: 600;
+  font-size: 0.875rem;
+}
+.btn-close {
+  background: none;
+  border: none;
+  cursor: pointer;
+  font-size: 1rem;
+  color: var(--color-muted, #8a98b4);
+}
+
+.detail-grid {
+  display: flex;
+  gap: 1rem;
+  padding: 0.75rem 1rem;
+  flex-wrap: wrap;
+}
+.detail-section {
+  flex: 1;
+  min-width: 14rem;
+  display: flex;
+  flex-direction: column;
+  gap: 0.4rem;
+}
+.detail-label {
+  font-weight: 600;
+  font-size: 0.8rem;
+  color: var(--color-muted, #8a98b4);
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+}
+.detail-response {
+  font-size: 0.78rem;
+  line-height: 1.55;
+  max-height: 20rem;
+  overflow-y: auto;
+  white-space: pre-wrap;
+  color: var(--color-text, #1a2540);
+  margin: 0;
+  background: var(--color-surface, #fff);
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.3rem;
+  padding: 0.5rem 0.75rem;
+}
+</style>