feat: plans benchmark harness — model scoring for CF planning prompts

Adds benchmark_plans.py script, plans_bench API router, PlansBenchTab Vue component, and registers /api/plans-bench in api.py. Also extends models registry (cf-text catalog integration), cforch client, LlmEvalTab, and ModelsView with cf-orch fleet support. Wires Planning mode into BenchmarkView.
2026-05-02 23:36:04 -07:00 · 2026-05-02 23:36:04 -07:00 · bce932461a
commit bce932461a
parent e11db5ccd9
14 changed files with 3137 additions and 59 deletions
--- a/.env.example
+++ b/.env.example
@ -17,3 +17,7 @@ CF_LICENSE_KEY=CFG-AVCT-xxxx-xxxx-xxxx
 # Set one of these to use a cloud LLM instead of a local model.
 # ANTHROPIC_API_KEY=sk-ant-...
 # OPENAI_API_KEY=sk-...
+
+# ── HuggingFace (required for gated/terms-restricted model downloads) ─────────
+# Generate at https://huggingface.co/settings/tokens and accept model terms first.
+# HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
--- a/app/api.py
+++ b/app/api.py
@ -34,6 +34,12 @@ app.include_router(eval_router, prefix="/api")
 from app.train.train import router as train_router
 app.include_router(train_router, prefix="/api/train")

+from app.plans_bench import router as plans_bench_router
+app.include_router(plans_bench_router, prefix="/api/plans-bench")
+
+# In-memory last-action store (single user, local tool — in-memory is fine)
+_last_action: dict | None = None
+
 from app.dashboard import router as dashboard_router
 app.include_router(dashboard_router, prefix="/api")

--- a/app/cforch.py
+++ b/app/cforch.py
@ -17,9 +17,12 @@ import logging
 import os
 import re
 import subprocess as _subprocess
+import tempfile
 from pathlib import Path
 from typing import Any

+import urllib.parse
+
 import yaml
 from fastapi import APIRouter, HTTPException
 from fastapi.responses import StreamingResponse
@ -75,9 +78,31 @@ def _load_cforch_config() -> dict:
        "license_key":     _coalesce(file_cfg.get("license_key", ""),     "CF_LICENSE_KEY"),
        "ollama_url":      _coalesce(file_cfg.get("ollama_url", ""),       "OLLAMA_HOST"),
        "ollama_model":    _coalesce(file_cfg.get("ollama_model", ""),     "OLLAMA_MODEL"),
+        "judge_url":       _coalesce(file_cfg.get("judge_url", ""),        "CF_JUDGE_URL"),
+        "hf_token":        _coalesce(file_cfg.get("hf_token", ""),         "HF_TOKEN"),
    }


+def _validate_service_url(url: str, param_name: str) -> str:
+    """Validate that a URL is a well-formed http/https URL with a hostname.
+
+    Guards against SSRF: only http/https is allowed; the URL must have a
+    non-empty host. Does not enforce an allowlist — call sites are internal
+    tooling, not a public API.
+    """
+    if not url:
+        return url
+    try:
+        parsed = urllib.parse.urlparse(url)
+    except Exception:
+        raise HTTPException(400, f"{param_name}: not a valid URL")
+    if parsed.scheme not in ("http", "https"):
+        raise HTTPException(400, f"{param_name}: URL must start with http:// or https://")
+    if not parsed.hostname:
+        raise HTTPException(400, f"{param_name}: URL has no hostname")
+    return url
+
+
 def _strip_ansi(text: str) -> str:
    """Remove ANSI escape codes from a string."""
    return re.sub(r'\x1b\[[0-9;]*m', '', text)
@ -147,48 +172,141 @@ def get_tasks() -> dict:

 # ── GET /models ────────────────────────────────────────────────────────────────

+# Services and roles surfaced in the benchmark model picker.
+# Covers all cf-orch service types that benchmark.py can route tasks to.
+_BENCH_SERVICES = frozenset({
+    "cf-text", "vllm",         # LLM text generation
+    "cf-stt",                  # speech-to-text
+    "cf-tts",                  # text-to-speech
+    "cf-vision",               # image classification / embedding
+    "cf-voice",                # audio context classification
+})
+_BENCH_ROLES = frozenset({
+    "generator", "vlm",        # LLM roles
+    "stt", "alm",              # speech recognition
+    "tts",                     # speech synthesis
+    "vision", "embedding",     # image understanding
+    "classifier",              # audio classification (cf-voice)
+})
+
+
@router.get("/models")
 def get_models() -> dict:
-    """Return model list from bench_models.yaml."""
+    """Return model list from bench_models.yaml merged with locally installed models.
+
+    bench_models.yaml entries are listed first and take precedence; any installed
+    model whose repo_id is already present in the YAML is skipped.  Only models
+    whose service is in _BENCH_SERVICES (cf-text, vllm, cf-stt, cf-tts, cf-vision,
+    cf-voice) are surfaced from the installed registry.
+    """
    cfg = _load_cforch_config()
    models_path = cfg.get("bench_models", "")
-    if not models_path:
-        return {"models": []}

+    models: list[dict] = []
+    bench_ids: set[str] = set()
+
+    if models_path:
        p = Path(models_path)
-    if not p.exists():
-        return {"models": []}
-
+        if p.exists():
            try:
                raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
            except yaml.YAMLError as exc:
                logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
-        return {"models": []}
-
-    models_raw = raw.get("models", []) or []
-    models: list[dict] = []
-    for m in models_raw:
+                raw = {}
+            for m in (raw.get("models", []) or []):
                if not isinstance(m, dict):
                    continue
+                model_id = m.get("id", "")
                models.append({
                    "name": m.get("name", ""),
-            "id": m.get("id", ""),
+                    "id": model_id,
                    "service": m.get("service", "ollama"),
                    "tags": m.get("tags", []) or [],
                    "vram_estimate_mb": m.get("vram_estimate_mb", 0),
                })
+                if model_id:
+                    bench_ids.add(model_id)
+
+    # Merge installed generator models not already in bench_models.yaml.
+    try:
+        from app.models import list_installed  # local import avoids circular dependency at module load
+        for installed in list_installed():
+            model_id: str = installed.get("model_id") or ""
+            service: str = installed.get("service") or ""
+            role: str = installed.get("role") or ""
+            if not model_id:
+                continue
+            if service not in _BENCH_SERVICES or role not in _BENCH_ROLES:
+                continue
+            if model_id in bench_ids:
+                continue
+            display_name = model_id.split("/", 1)[-1] if "/" in model_id else model_id
+            models.append({
+                "name": display_name,
+                "id": model_id,
+                "service": service,
+                "tags": [role],
+                "vram_estimate_mb": installed.get("vram_mb") or 0,
+            })
+            bench_ids.add(model_id)
+    except Exception as exc:
+        logger.warning("Could not merge installed models into model list: %s", exc)

    return {"models": models}


 # ── GET /run ───────────────────────────────────────────────────────────────────

+@router.get("/nodes")
+def get_nodes() -> dict:
+    """Proxy the coordinator's /api/nodes list, returning node_id + online status.
+
+    Online is inferred from last_heartbeat: any node with a recent heartbeat is online.
+    Returns an empty list if the coordinator is unreachable.
+    """
+    cfg = _load_cforch_config()
+    coordinator_url = cfg.get("coordinator_url", "").rstrip("/")
+    if not coordinator_url:
+        return {"nodes": []}
+    try:
+        import httpx as _httpx
+        resp = _httpx.get(f"{coordinator_url}/api/nodes", timeout=5.0)
+        resp.raise_for_status()
+        raw_nodes = resp.json().get("nodes", [])
+        return {
+            "nodes": [
+                {
+                    "node_id": n.get("node_id", ""),
+                    "online": n.get("last_heartbeat") is not None,
+                    "gpus": [
+                        {
+                            "gpu_id": g.get("gpu_id"),
+                            "name": g.get("name", ""),
+                            "vram_total_mb": g.get("vram_total_mb", 0),
+                            "vram_free_mb": g.get("vram_free_mb", 0),
+                        }
+                        for g in n.get("gpus", [])
+                    ],
+                }
+                for n in raw_nodes
+            ]
+        }
+    except Exception as exc:
+        logger.warning("Could not fetch nodes from coordinator: %s", exc)
+        return {"nodes": []}
+
+
@router.get("/run")
 def run_benchmark(
    task_ids: str = "",
+    model_ids: str = "",
    model_tags: str = "",
    coordinator_url: str = "",
    ollama_url: str = "",
+    judge_url: str = "",
+    judge_backend: str = "chat",
+    workers: int = 1,
+    node_ids: str = "",
 ) -> StreamingResponse:
    """Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
    global _BENCH_RUNNING, _bench_proc
@ -205,6 +323,13 @@ def run_benchmark(
    cfg_coordinator = cfg.get("coordinator_url", "")
    cfg_ollama      = cfg.get("ollama_url", "")
    cfg_license_key = cfg.get("license_key", "")
+    cfg_judge_url   = cfg.get("judge_url", "")
+
+    # Validate URL params before spawning the subprocess.
+    # _validate_service_url raises HTTPException on bad input (caught by FastAPI before streaming starts).
+    _validate_service_url(coordinator_url, "coordinator_url")
+    _validate_service_url(ollama_url, "ollama_url")
+    _validate_service_url(judge_url, "judge_url")

    def generate():
        global _BENCH_RUNNING, _bench_proc
@ -213,16 +338,68 @@ def run_benchmark(
            yield f"data: {json.dumps({'type': 'error', 'message': 'bench_script not configured or not found'})}\n\n"
            return

+        # Build effective models file: bench_models.yaml + any installed models
+        # whose IDs were selected but are absent from the YAML (e.g. downloaded
+        # via the Models view). Written to a temp file so benchmark.py sees one
+        # unified list; cleaned up in the finally block.
+        effective_models_file = bench_models
+        _tmp_models_path: str | None = None
+
+        if model_ids and bench_models and Path(bench_models).exists():
+            requested_ids = set(model_ids.split(","))
+            try:
+                raw_bench = yaml.safe_load(Path(bench_models).read_text(encoding="utf-8")) or {}
+                bench_entries: list[dict] = raw_bench.get("models", []) or []
+                bench_id_set = {m.get("id", "") for m in bench_entries if isinstance(m, dict)}
+                missing_ids = requested_ids - bench_id_set
+                if missing_ids:
+                    from app.models import list_installed
+                    installed_map = {
+                        m["model_id"]: m
+                        for m in list_installed()
+                        if m.get("model_id") and m.get("service") in _BENCH_SERVICES
+                    }
+                    extra: list[dict] = []
+                    for mid in missing_ids:
+                        if mid in installed_map:
+                            inst = installed_map[mid]
+                            entry: dict[str, Any] = {
+                                "id": mid,
+                                "name": mid.split("/", 1)[-1] if "/" in mid else mid,
+                                "service": inst.get("service", "cf-text"),
+                                "vram_estimate_mb": inst.get("vram_mb") or 0,
+                                "tags": [inst.get("role", "generator")],
+                                "temperature": 0.0,
+                            }
+                            local_path = inst.get("path", "") or inst.get("local_path", "")
+                            if local_path:
+                                entry["model_path"] = local_path
+                            extra.append(entry)
+                    if extra:
+                        merged = {"models": bench_entries + extra}
+                        tf = tempfile.NamedTemporaryFile(
+                            mode="w", suffix=".yaml", delete=False,
+                            prefix="avocet_bench_models_",
+                        )
+                        yaml.dump(merged, tf)
+                        tf.close()
+                        _tmp_models_path = tf.name
+                        effective_models_file = _tmp_models_path
+            except Exception as exc:
+                logger.warning("Could not merge installed models into temp bench file: %s", exc)
+
        cmd = [
            python_bin,
            bench_script,
            "--tasks", bench_tasks,
-            "--models", bench_models,
+            "--models", effective_models_file,
            "--output", results_dir,
        ]

        if task_ids:
            cmd.extend(["--filter-tasks"] + task_ids.split(","))
+        if model_ids:
+            cmd.extend(["--filter-models"] + model_ids.split(","))
        if model_tags:
            cmd.extend(["--filter-tags"] + model_tags.split(","))

@ -233,6 +410,15 @@ def run_benchmark(
            cmd.extend(["--coordinator", effective_coordinator])
        if effective_ollama:
            cmd.extend(["--ollama-url", effective_ollama])
+        effective_judge = judge_url if judge_url else cfg_judge_url
+        if effective_judge:
+            cmd.extend(["--judge-url", effective_judge])
+        if judge_backend and judge_backend != "chat":
+            cmd.extend(["--judge-backend", judge_backend])
+        if workers > 1:
+            cmd.extend(["--workers", str(workers)])
+        if node_ids:
+            cmd.extend(["--nodes"] + node_ids.split(","))

        # Pass license key as env var so subprocess can authenticate with cf-orch
        proc_env = {**os.environ}
@ -273,6 +459,11 @@ def run_benchmark(
            yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
        finally:
            _BENCH_RUNNING = False
+            if _tmp_models_path:
+                try:
+                    os.unlink(_tmp_models_path)
+                except OSError:
+                    pass

    return StreamingResponse(
        generate(),
@ -295,6 +486,7 @@ def get_cforch_config() -> dict:
        "coordinator_url": cfg.get("coordinator_url", ""),
        "ollama_url":      cfg.get("ollama_url", ""),
        "ollama_model":    cfg.get("ollama_model", ""),
+        "judge_url":       cfg.get("judge_url", ""),
        "license_key_set": bool(cfg.get("license_key", "")),
        "source": "env" if not _config_file().exists() else "yaml+env",
    }
@ -303,7 +495,7 @@ def get_cforch_config() -> dict:
 # ── GET /results ───────────────────────────────────────────────────────────────

@router.get("/results")
-def get_results() -> dict:
+def get_results() -> list:
    """Return the latest benchmark summary.json from results_dir."""
    cfg = _load_cforch_config()
    results_dir = cfg.get("results_dir", "")
--- a/app/models.py
+++ b/app/models.py
@ -15,6 +15,7 @@ from __future__ import annotations
 import json
 import logging
 import os
+import re
 import shutil
 import threading
 from datetime import datetime, timezone
@ -60,6 +61,30 @@ _CF_ORCH_PROFILES_DIR: Path = Path(

 router = APIRouter()

+# ── HuggingFace auth ─────────────────────────────────────────────────────────
+
+def _get_hf_token() -> str | None:
+    """Return HF token from label_tool.yaml, then HF_TOKEN / HUGGING_FACE_HUB_TOKEN env vars."""
+    config_file = _ROOT / "config" / "label_tool.yaml"
+    if config_file.exists():
+        try:
+            import yaml as _yaml
+            raw = _yaml.safe_load(config_file.read_text(encoding="utf-8")) or {}
+            token = (raw.get("hf_token") or raw.get("cforch", {}).get("hf_token") or "").strip()
+            if token:
+                return token
+        except Exception:
+            pass
+    return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
+
+
+# ── GGUF quantization detection ───────────────────────────────────────────────
+# Matches quant identifiers in GGUF filenames: Q4_K_M, Q8_0, F16, IQ3_M, etc.
+_QUANT_RE = re.compile(
+    r'[._-]((?:IQ\d|Q\d)[A-Z0-9_]*|F16|BF16)\.gguf$',
+    re.IGNORECASE,
+)
+
 # ── Download progress shared state ────────────────────────────────────────────
 # Updated by the background download thread; read by GET /download/stream.
 _download_progress: dict[str, Any] = {}
@ -91,12 +116,15 @@ _TAG_TO_INFO: dict[str, _TagInfo] = {
    "audio-classification":           {"adapter": None, "role": "classifier", "service": "cf-voice"},
    # TTS — cf-tts text-to-speech service
    "text-to-speech":                 {"adapter": None, "role": "tts",       "service": "cf-tts"},
-    # Vision — cf-vision image classification / embedding / VLM service
+    # Vision classifiers / embedders — cf-vision (SigLIP/CLIP-style models)
    "image-classification":           {"adapter": None, "role": "vision",    "service": "cf-vision"},
    "zero-shot-image-classification": {"adapter": None, "role": "vision",    "service": "cf-vision"},
    "image-feature-extraction":       {"adapter": None, "role": "embedding", "service": "cf-vision"},
-    "image-text-to-text":             {"adapter": None, "role": "vlm",       "service": "cf-vision"},
-    "visual-question-answering":      {"adapter": None, "role": "vlm",       "service": "cf-vision"},
+    # Generative VLMs (image+text → text) — run under vllm, not cf-vision.
+    # cf-vision is a classifier/embedder service; generative VLMs like Qwen-VL,
+    # LLaVA, and InternVL are textgen models that happen to accept image inputs.
+    "image-text-to-text":             {"adapter": None, "role": "vlm",       "service": "vllm"},
+    "visual-question-answering":      {"adapter": None, "role": "vlm",       "service": "vllm"},
    # Image generation — cf-image (text → image; distinct from cf-vision image understanding)
    "text-to-image":                  {"adapter": None, "role": "image-gen", "service": "cf-image"},
    # Embedding — cf-core shared embedding layer
@ -197,8 +225,15 @@ def _catalog_key(repo_id: str) -> str:

    ibm-granite/granite-4.1-8b              →  granite-4.1-8b
    facebook/bart-large-cnn                 →  bart-large-cnn
+    WithinUsAI/Opus4.7-GODs.Ghost.Codex-4B.GGuF  →  opus4.7-gods.ghost.codex-4b
+
+    The coordinator skips catalog lookup for keys ending in ".gguf" (treats them
+    as direct file paths). Strip the suffix so GGUF repo names produce valid keys.
    """
-    return repo_id.split("/", 1)[-1].lower()
+    key = repo_id.split("/", 1)[-1].lower()
+    if key.endswith(".gguf"):
+        key = key[:-5]
+    return key


 def _insert_catalog_entry(content: str, entry_lines: str) -> str:
@ -290,6 +325,15 @@ def _register_in_node_catalogs(
            max_mb: int = cf_text.get("max_mb", 0)
            catalog: dict = cf_text.get("catalog") or {}

+            # If the node has a different local model dir, remap the NFS path.
+            model_base = cf_text.get("model_base_path", "").rstrip("/")
+            if model_base:
+                nfs_base = str(_CF_TEXT_MODELS_DIR).rstrip("/")
+                model_name = local_path.name
+                effective_path_str = f"{model_base}/{model_name}"
+            else:
+                effective_path_str = local_path_str
+
            # Skip if key already exists
            if model_key in catalog:
                logger.debug("Key %r already in %s — skipping", model_key, yaml_file.name)
@ -301,10 +345,10 @@ def _register_in_node_catalogs(
                for entry in catalog.values()
                if isinstance(entry, dict)
            }
-            if local_path_str in registered_paths or any(
-                p.startswith(local_path_str + "/") for p in registered_paths
+            if effective_path_str in registered_paths or any(
+                p.startswith(effective_path_str + "/") for p in registered_paths
            ):
-                logger.debug("Path %s already registered in %s — skipping", local_path_str, yaml_file.name)
+                logger.debug("Path %s already registered in %s — skipping", effective_path_str, yaml_file.name)
                continue

            # Determine whether model fits at FP16 or needs 4-bit
@ -330,12 +374,18 @@ def _register_in_node_catalogs(
                if needs_4bit
                else f"  # FP16 file-size estimate"
            )
+            env_block = (
+                f"        env:\n"
+                f"          CF_TEXT_4BIT: \"1\"\n"
+                if needs_4bit else ""
+            )
            entry_block = (
                f"      # auto-registered by avocet on download\n"
                f"      {model_key}:\n"
-                f"        path: {local_path_str}\n"
+                f"        path: {effective_path_str}\n"
                f"        vram_mb: {vram_for_node}{vram_comment}\n"
                f"        description: \"{desc}\"\n"
+                f"{env_block}"
            )

            new_content = _insert_catalog_entry(content, entry_block)
@ -388,12 +438,17 @@ def _run_download(
    role: str | None = None,
    service: str | None = None,
    model_size_bytes: int = 0,
+    quant_pattern: str | None = None,
 ) -> None:
    """Background thread: download model via huggingface_hub.snapshot_download.

    model_size_bytes is the sum of file sizes reported by the HF API (siblings).
    It is used to estimate vram_mb and written to model_info.json so cf-orch can
    budget VRAM when allocating a cf-text instance for this model.
+
+    quant_pattern: when set, restricts snapshot_download to only files matching
+    *{quant_pattern}*.gguf (plus metadata). Avoids downloading every quant variant
+    from GGUF-only repos like bartowski/*.
    """
    global _download_progress
    local_dir = _model_dir_for(repo_id, service)
@ -422,10 +477,20 @@ def _run_download(

        local_dir.mkdir(parents=True, exist_ok=True)
        poll_thread.start()
-        snapshot_download(
-            repo_id=repo_id,
-            local_dir=str(local_dir),
-        )
+
+        dl_kwargs: dict[str, Any] = {"repo_id": repo_id, "local_dir": str(local_dir)}
+        hf_token = _get_hf_token()
+        if hf_token:
+            dl_kwargs["token"] = hf_token
+        if quant_pattern:
+            # Include both cases: repos use mixed conventions (Q6_K vs q6_k).
+            dl_kwargs["allow_patterns"] = [
+                f"*{quant_pattern.upper()}*.gguf",
+                f"*{quant_pattern.lower()}*.gguf",
+                "*.json",
+                "README.md",
+            ]
+        snapshot_download(**dl_kwargs)

        # Estimate VRAM from reported file size.
        # HF siblings sizes are pre-quantisation file sizes; add 10% for KV cache
@ -531,9 +596,31 @@ def lookup_model(repo_id: str) -> dict:
        )
        logger.warning("Unsupported pipeline_tag %r for %s", pipeline_tag, repo_id)

-    # Estimate model size from siblings list
+    # Detect GGUF files and parse quant names from siblings list.
+    # For GGUF-only repos (bartowski, TheBloke, etc.) this lets the UI show
+    # a per-quant size picker instead of downloading every variant.
    siblings = data.get("siblings") or []
-    model_size_bytes: int = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))
+    gguf_files: list[dict] = []
+    for s in siblings:
+        if not isinstance(s, dict):
+            continue
+        fname: str = s.get("rfilename", "")
+        if not fname.lower().endswith(".gguf"):
+            continue
+        m = _QUANT_RE.search(fname)
+        gguf_files.append({
+            "filename": fname,
+            "size": s.get("size", 0) or 0,
+            "quant_name": m.group(1).upper() if m else None,
+        })
+    gguf_files.sort(key=lambda f: f["size"])
+
+    # model_size_bytes: total of all siblings (for non-GGUF repos) or all GGUFs only.
+    # For GGUF repos the frontend will substitute the selected quant's size on submit.
+    if gguf_files:
+        model_size_bytes: int = sum(f["size"] for f in gguf_files)
+    else:
+        model_size_bytes = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))

    # Description: first 300 chars of card data (modelId field used as fallback)
    card_data = data.get("cardData") or {}
@ -549,6 +636,7 @@ def lookup_model(repo_id: str) -> dict:
        "compatible": compatible,
        "warning": warning,
        "model_size_bytes": model_size_bytes,
+        "gguf_files": gguf_files if gguf_files else None,
        "description": description,
        "tags": data.get("tags") or [],
        "downloads": data.get("downloads") or 0,
@ -579,6 +667,9 @@ class QueueAddRequest(BaseModel):
    # Stored in the queue entry so approve can pass it to _run_download
    # without a second HF API round-trip.
    model_size_bytes: int = 0
+    # GGUF quantization pattern (e.g. "Q5_K_M"). When set, snapshot_download
+    # restricts to *{quant_pattern}*.gguf instead of fetching all variants.
+    quant_pattern: str | None = None


@router.post("/queue", status_code=201)
@ -597,6 +688,7 @@ def add_to_queue(req: QueueAddRequest) -> dict:
        "role": req.role,
        "service": req.service,
        "model_size_bytes": req.model_size_bytes,
+        "quant_pattern": req.quant_pattern,
        "status": "pending",
        "queued_at": datetime.now(timezone.utc).isoformat(),
    }
@ -629,6 +721,7 @@ def approve_queue_entry(entry_id: str) -> dict:
            entry.get("role"),
            entry.get("service"),
            entry.get("model_size_bytes", 0),
+            entry.get("quant_pattern"),
        ),
        daemon=True,
        name=f"model-download-{entry_id}",
@ -638,6 +731,32 @@ def approve_queue_entry(entry_id: str) -> dict:
    return {"ok": True}


+# ── PATCH /queue/{id} ─────────────────────────────────────────────────────────
+
+class QueuePatchRequest(BaseModel):
+    service: str | None = None
+    role: str | None = None
+
+
+@router.patch("/queue/{entry_id}")
+def patch_queue_entry(entry_id: str, body: QueuePatchRequest) -> dict:
+    """Update mutable fields (service, role) on a pending queue entry."""
+    entry = _get_queue_entry(entry_id)
+    if entry is None:
+        raise HTTPException(404, f"Queue entry {entry_id!r} not found")
+    if entry.get("status") != "pending":
+        raise HTTPException(409, f"Only pending entries can be patched (current: {entry.get('status')!r})")
+
+    updates: dict = {}
+    if body.service is not None:
+        updates["service"] = body.service
+    if body.role is not None:
+        updates["role"] = body.role
+
+    updated = _update_queue_entry(entry_id, updates)
+    return updated or {}
+
+
 # ── DELETE /queue/{id} ─────────────────────────────────────────────────────────

@router.delete("/queue/{entry_id}")
--- a/config/label_tool.yaml.example
+++ b/config/label_tool.yaml.example
@ -41,11 +41,15 @@ cforch:
  # Python interpreter with cf-orch installed
  python_bin:   /devl/miniconda3/envs/cf/bin/python

-  # Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST
+  # Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST / CF_JUDGE_URL / HF_TOKEN
  # coordinator_url: http://localhost:7700
  # license_key:     CFG-AVCT-xxxx-xxxx-xxxx
  # ollama_url:      http://localhost:11434
  # ollama_model:    llama3.2:3b
+  # judge_url:       http://10.1.10.158:8008  # Sif cf-text — LLM-as-judge secondary scorer
+  # judge_url:       http://10.1.10.71:8008   # Heimdall cf-text (alternative)
+  # Or set CF_JUDGE_URL. Populates the Judge URL field in the LLM Eval UI automatically.
+  # hf_token:        hf_xxxxxxxxxxxxxxxxxxxx  # HuggingFace token — required for gated/terms-restricted models

 # Imitate tab — pull real samples from sibling CF product APIs and run them
 # through local LLMs to build a corrections dataset.
--- a/manage.sh
+++ b/manage.sh
@ -90,6 +90,12 @@ usage() {
    echo -e "    ${GREEN}score [args]${NC}             Shortcut: --score [args]"
    echo -e "    ${GREEN}compare [args]${NC}           Shortcut: --compare [args]"
    echo ""
+    echo "  Planning Benchmark:"
+    echo -e "    ${GREEN}plans-bench [args]${NC}       Run benchmark_plans.py (args passed through)"
+    echo -e "    ${GREEN}plans-list${NC}               Shortcut: --list-models"
+    echo -e "    ${GREEN}plans-run <model> [args]${NC} Run a single model (--verbose auto-added)"
+    echo -e "    ${GREEN}plans-compare <m1> <m2> [more]${NC}  Compare models side-by-side"
+    echo ""
    echo "  Writing Style Benchmark:"
    echo -e "    ${GREEN}style-bench [args]${NC}       Run benchmark_style.py (args passed through)"
    echo -e "    ${GREEN}style-list${NC}               List available ollama models for style bench"
@ -127,6 +133,8 @@ case "$CMD" in
        fi
        mkdir -p "$LOG_DIR"
        API_LOG="${LOG_DIR}/api.log"
+        # Load .env if present — sets HF_TOKEN and other optional overrides.
+        [[ -f .env ]] && set -a && source .env && set +a
        info "Building Vue SPA…"
        (cd web && npm run build) >> "$API_LOG" 2>&1
        info "Starting FastAPI on port ${API_PORT}…"
@ -179,6 +187,9 @@ case "$CMD" in
        mkdir -p "$LOG_DIR"
        DEV_API_LOG="${LOG_DIR}/dev-api.log"

+        # Load .env if present — sets HF_TOKEN and other optional overrides.
+        [[ -f .env ]] && set -a && source .env && set +a
+
        if [[ -f "$DEV_API_PID_FILE" ]] && kill -0 "$(<"$DEV_API_PID_FILE")" 2>/dev/null; then
            warn "Dev API already running (PID $(<"$DEV_API_PID_FILE"))"
        else
@ -255,6 +266,30 @@ case "$CMD" in
        exec "$0" benchmark --compare "$@"
        ;;

+    plans-bench)
+        info "Running planning benchmark (${ENV_UI})…"
+        "$PYTHON_UI" scripts/benchmark_plans.py "$@"
+        ;;
+
+    plans-list)
+        exec "$0" plans-bench --list-models
+        ;;
+
+    plans-run)
+        if [[ $# -lt 1 ]]; then
+            error "Usage: ./manage.sh plans-run <model-key> [extra args]"
+        fi
+        MODEL="$1"; shift
+        exec "$0" plans-bench --model "$MODEL" --verbose "$@"
+        ;;
+
+    plans-compare)
+        if [[ $# -lt 2 ]]; then
+            error "Usage: ./manage.sh plans-compare <model1> <model2> [more…]"
+        fi
+        exec "$0" plans-bench --compare "$@" --verbose
+        ;;
+
    style-bench)
        info "Running writing style benchmark (${ENV_BM})…"
        if [[ ! -x "$PYTHON_BM" ]]; then
--- a/scripts/benchmark_plans.py
+++ b/scripts/benchmark_plans.py
@ -0,0 +1,719 @@
+#!/usr/bin/env python
+"""CF-specific planning benchmark — compare base models before fine-tuning.
+
+Sends held-out CircuitForge planning prompts to one or more models via the
+cf-text (local) or cf-orch API, then scores responses against CF-specific
+rubrics. Use this to select the best base model for SFT.
+
+Scoring rubrics (each 0-1, summed to total/N):
+  - task_structure    : uses checkbox syntax (- [ ]), git commit steps
+  - tier_awareness    : mentions Free/Paid/Premium/Ultra tiers
+  - privacy_pillar    : mentions privacy/local-inference/no-logging
+  - safety_pillar     : mentions safety, human approval, or reversibility
+  - accessibility     : mentions ND/accessibility/adaptive needs
+  - license_split     : mentions MIT vs BSL or open-core model
+  - file_paths        : uses plausible file path references
+  - cf_conventions    : uses conda run -n cf, /Library/Development/, or known CF dirs
+  - paired_coherence  : (paired only) plan references the design doc's feature name
+  - length_ok         : 300–2500 words (under-short = hallucination risk; over-long = padding)
+
+Usage
+-----
+    # List available model targets
+    python scripts/benchmark_plans.py --list-models
+
+    # Run all held-out prompts against a single model, print report
+    python scripts/benchmark_plans.py --model llama3.2-3b
+
+    # Compare two models side-by-side
+    python scripts/benchmark_plans.py --compare llama3.2-3b mistral-7b
+
+    # Run with a custom API base (cf-text default: http://localhost:8080/v1)
+    python scripts/benchmark_plans.py --model llama3.2-3b --api-base http://localhost:8080/v1
+
+    # Export detailed results JSON
+    python scripts/benchmark_plans.py --model llama3.2-3b --output data/bench_results.json
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from dataclasses import dataclass, field, asdict
+from pathlib import Path
+from typing import Any
+
+import httpx
+
+# ── Paths ──────────────────────────────────────────────────────────────────────
+
+_ROOT = Path(__file__).parent.parent
+_DATA_DIR = _ROOT / "data"
+
+CF_TEXT_BASE   = "http://localhost:8080/v1"
+CF_ORCH_BASE   = "http://localhost:8090/v1"
+CF_COORD_URL   = "http://10.1.10.71:7700"   # cf-orch coordinator (LAN)
+
+# ── Held-out prompts ───────────────────────────────────────────────────────────
+# These are NOT in the training export (no matching docs in circuitforge-plans/).
+# Each prompt exercises a different CF planning domain.
+
+HELD_OUT_PROMPTS: list[dict[str, Any]] = [
+    {
+        "id": "ho_001",
+        "name": "kiwi_barcode_ocr",
+        "domain": "feature_plan",
+        "prompt": (
+            "You are a senior engineer on Kiwi, a CircuitForge pantry-tracking product. "
+            "Write a detailed implementation plan for adding barcode scanning via device camera "
+            "and receipt OCR to the item-add flow.\n\n"
+            "The plan should include: file structure (create/modify), step-by-step task checklist "
+            "with checkboxes, any DB migrations, and git commit steps."
+        ),
+        "expected_signals": ["task_structure", "file_paths", "cf_conventions"],
+    },
+    {
+        "id": "ho_002",
+        "name": "peregrine_ats_scoring",
+        "domain": "feature_design",
+        "prompt": (
+            "Write a design document for Peregrine: ATS keyword scoring for job applications.\n\n"
+            "Context: Peregrine users paste job descriptions and their resume. "
+            "We want to score how well the resume keywords match the JD and suggest rewrites. "
+            "Describe the architecture, data flow, and key design decisions."
+        ),
+        "expected_signals": ["privacy_pillar", "tier_awareness", "license_split"],
+    },
+    {
+        "id": "ho_003",
+        "name": "tier_gate_local_llm",
+        "domain": "architecture",
+        "prompt": (
+            "Design the tier-gating architecture for a new CircuitForge product. "
+            "The product should:\n"
+            "- Default to local LLM inference for all tiers\n"
+            "- Unlock cloud LLM for Paid tier and above\n"
+            "- Keep fine-tuned model weights for Premium/Ultra only\n\n"
+            "Describe how the tier check integrates with the LLM router, "
+            "what happens when a Free user tries a Paid-tier feature, "
+            "and how BYOK (bring-your-own-key) fits in."
+        ),
+        "expected_signals": ["tier_awareness", "privacy_pillar", "license_split"],
+    },
+    {
+        "id": "ho_004",
+        "name": "heimdall_webhook_plan",
+        "domain": "feature_plan",
+        "prompt": (
+            "Break the following Heimdall feature into a detailed implementation plan with "
+            "file structure and task checkboxes — Stripe webhook handler for subscription lifecycle.\n\n"
+            "Heimdall is the CircuitForge license server (FastAPI + SQLite). "
+            "The webhook needs to handle checkout.session.completed, "
+            "customer.subscription.updated, and customer.subscription.deleted events."
+        ),
+        "expected_signals": ["task_structure", "file_paths", "safety_pillar"],
+    },
+    {
+        "id": "ho_005",
+        "name": "nd_accessible_onboarding",
+        "domain": "ux_design",
+        "prompt": (
+            "You are a product designer working on Harrier, a CircuitForge tool for "
+            "helping people navigate government benefits applications.\n\n"
+            "Design the onboarding flow for neurodivergent (ND) users. "
+            "Consider: ADHD time-blindness, executive function challenges, demand avoidance, "
+            "and rejection sensitivity. The flow should reduce cognitive load and "
+            "never use urgency or panic patterns."
+        ),
+        "expected_signals": ["accessibility", "safety_pillar", "privacy_pillar"],
+    },
+    {
+        "id": "ho_006",
+        "name": "circuitforge_core_extraction",
+        "domain": "architecture",
+        "prompt": (
+            "Produce a CircuitForge-style design document for the following circuitforge-core "
+            "feature — shared ActivityPub federation module.\n\n"
+            "Background: Multiple CF products (Kiwi, Rook, Snipe) want to publish updates "
+            "to ActivityPub. Build it once in cf-core (MIT licensed) so all products can use it. "
+            "Design the module API, describe what belongs in MIT vs BSL, and note federation "
+            "privacy constraints."
+        ),
+        "expected_signals": ["license_split", "privacy_pillar", "cf_conventions"],
+    },
+    {
+        "id": "ho_007",
+        "name": "snipe_trust_score_plan",
+        "domain": "feature_plan",
+        "prompt": (
+            "You are a senior engineer on Snipe, a CircuitForge eBay trust-scoring tool. "
+            "Write a step-by-step engineering plan for: seller trust score calculation.\n\n"
+            "The score should combine: feedback ratio, account age, item-specifics completeness, "
+            "listing photo quality, and shipping time accuracy. "
+            "Include file structure, test plan, and migration steps."
+        ),
+        "expected_signals": ["task_structure", "file_paths", "safety_pillar"],
+    },
+    {
+        "id": "ho_008",
+        "name": "avocet_training_pipeline",
+        "domain": "feature_plan",
+        "prompt": (
+            "Break the following Avocet feature into a detailed implementation plan — "
+            "end-to-end fine-tuning pipeline from labeled JSONL to deployed GGUF model.\n\n"
+            "Avocet is the CircuitForge email classifier training tool. "
+            "The pipeline should: validate the dataset, run LoRA SFT via unsloth, "
+            "quantize to Q5_K_M GGUF, run the benchmark harness, and register the model "
+            "in the Avocet model queue if it beats the baseline."
+        ),
+        "expected_signals": ["task_structure", "file_paths", "cf_conventions"],
+    },
+    {
+        "id": "ho_009",
+        "name": "privacy_data_flow",
+        "domain": "architecture",
+        "prompt": (
+            "Design the data privacy architecture for a CircuitForge cloud product. "
+            "Describe: what PII is collected, how it's stored, retention policy, "
+            "obfuscation strategy for cloud-side logs, and how consent is obtained "
+            "in plain language. The product handles job applications (resumes, cover letters)."
+        ),
+        "expected_signals": ["privacy_pillar", "safety_pillar", "accessibility"],
+    },
+    {
+        "id": "ho_010",
+        "name": "git_workflow_doc",
+        "domain": "process_doc",
+        "prompt": (
+            "Write a developer process document for CircuitForge: conventional commit and "
+            "branch workflow for a BSL 1.1 open-core product.\n\n"
+            "Cover: commit message format (type: description), branch naming, "
+            "when to use feature branches vs direct main commits, "
+            "how the MIT/BSL split affects which commits go in which branch, "
+            "and how CI gates on gitleaks for secret scanning."
+        ),
+        "expected_signals": ["license_split", "cf_conventions", "task_structure"],
+    },
+]
+
+# ── Rubric scoring ─────────────────────────────────────────────────────────────
+
+_TASK_STRUCTURE_RE = re.compile(r"- \[ \]", re.MULTILINE)
+_COMMIT_RE = re.compile(r"git commit|git add", re.IGNORECASE)
+_TIER_RE = re.compile(r"\b(Free|Paid|Premium|Ultra)\s+tier|\btier\s+(Free|Paid|Premium|Ultra)", re.IGNORECASE)
+_PRIVACY_RE = re.compile(r"\b(privacy|local.?inference|no.?logging|no.?pii|user.?data|data.?reten|obfuscat)", re.IGNORECASE)
+_SAFETY_RE = re.compile(r"\b(human.?approv|reversib|safety|safe.?default|fail.?safe|harm)", re.IGNORECASE)
+_A11Y_RE = re.compile(r"\b(neurodiverg|ND\b|accessib|adaptive|ADHD|autism|executive.?function|demand.?avoid)", re.IGNORECASE)
+_LICENSE_RE = re.compile(r"\b(MIT|BSL|open.?core|proprietary|commercial.?licens)", re.IGNORECASE)
+_FILE_PATH_RE = re.compile(r"(app/|tests?/|src/|scripts?/)\w[\w/.-]{3,}", re.IGNORECASE)
+_CF_CONV_RE = re.compile(r"(conda run -n cf|/Library/Development/CircuitForge|circuitforge-core|manage\.sh)", re.IGNORECASE)
+
+
+@dataclass
+class RubricScore:
+    task_structure: float = 0.0
+    tier_awareness: float = 0.0
+    privacy_pillar: float = 0.0
+    safety_pillar: float = 0.0
+    accessibility: float = 0.0
+    license_split: float = 0.0
+    file_paths: float = 0.0
+    cf_conventions: float = 0.0
+    length_ok: float = 0.0
+
+    def total(self) -> float:
+        vals = [self.task_structure, self.tier_awareness, self.privacy_pillar,
+                self.safety_pillar, self.accessibility, self.license_split,
+                self.file_paths, self.cf_conventions, self.length_ok]
+        return sum(vals) / len(vals)
+
+    def as_dict(self) -> dict[str, float]:
+        return asdict(self)
+
+
+def score_response(response: str, prompt_meta: dict[str, Any]) -> RubricScore:
+    words = len(response.split())
+    s = RubricScore()
+
+    # Task structure: needs checkboxes AND at least one commit step
+    checkbox_hits = len(_TASK_STRUCTURE_RE.findall(response))
+    has_commit = bool(_COMMIT_RE.search(response))
+    s.task_structure = min(1.0, checkbox_hits / 5) * 0.7 + (0.3 if has_commit else 0.0)
+
+    # Tier awareness
+    s.tier_awareness = min(1.0, len(_TIER_RE.findall(response)) / 2)
+
+    # Privacy pillar
+    s.privacy_pillar = min(1.0, len(_PRIVACY_RE.findall(response)) / 3)
+
+    # Safety pillar
+    s.safety_pillar = min(1.0, len(_SAFETY_RE.findall(response)) / 2)
+
+    # Accessibility
+    s.accessibility = min(1.0, len(_A11Y_RE.findall(response)) / 2)
+
+    # License split awareness
+    s.license_split = min(1.0, len(_LICENSE_RE.findall(response)) / 2)
+
+    # File paths: at least 3 plausible path references
+    s.file_paths = min(1.0, len(_FILE_PATH_RE.findall(response)) / 3)
+
+    # CF conventions
+    s.cf_conventions = min(1.0, len(_CF_CONV_RE.findall(response)) / 2)
+
+    # Length: 200–2500 words is healthy; outside = partial credit
+    if 200 <= words <= 2500:
+        s.length_ok = 1.0
+    elif words < 200:
+        s.length_ok = words / 200
+    else:
+        s.length_ok = max(0.0, 1.0 - (words - 2500) / 2500)
+
+    return s
+
+
+# ── Model client ───────────────────────────────────────────────────────────────
+
+# Registry of named model targets (shorthand → {api_base, model_name})
+MODEL_REGISTRY: dict[str, dict[str, str]] = {
+    "deepseek-r1-1.5b": {
+        "api_base": CF_TEXT_BASE,
+        "model": "deepseek-r1-1.5b",
+        "description": "DeepSeek R1 1.5B distill (cf-orch catalog key)",
+    },
+    "deepseek-r1-7b-4bit": {
+        "api_base": CF_TEXT_BASE,
+        "model": "deepseek-r1-7b-4bit",
+        "description": "DeepSeek R1 7B distill, 4-bit (cf-orch catalog key)",
+    },
+    "deepseek-coder-6.7b-4bit": {
+        "api_base": CF_TEXT_BASE,
+        "model": "deepseek-coder-6.7b-4bit",
+        "description": "DeepSeek Coder 6.7B instruct, 4-bit (cf-orch catalog key)",
+    },
+    "granite-4.1-8b": {
+        "api_base": CF_TEXT_BASE,
+        "model": "granite-4.1-8b",
+        "description": "IBM Granite 4.1 8B, 4-bit (cf-orch catalog key)",
+    },
+    "qwen2.5-3b": {
+        "api_base": CF_TEXT_BASE,
+        "model": "qwen2.5-3b",
+        "description": "Qwen 2.5 3B Q4 GGUF (cf-orch catalog key, navi only)",
+    },
+    "qwen2.5-7b": {
+        "api_base": CF_TEXT_BASE,
+        "model": "qwen2.5-7b",
+        "description": "Qwen 2.5 7B Q4 GGUF (cf-orch catalog key, navi only)",
+    },
+}
+
+
+# ── cf-orch allocation ─────────────────────────────────────────────────────────
+
+def _cforch_allocate(
+    model_id: str,
+    cforch_url: str,
+    startup_timeout_s: float = 300.0,
+) -> tuple[str, str] | None:
+    """Allocate a cf-text instance for model_id via the cf-orch coordinator.
+
+    Returns (service_url, allocation_id) on success, None on failure.
+    service_url is the direct node URL exposing /v1/chat/completions.
+    """
+    try:
+        resp = httpx.post(
+            f"{cforch_url}/api/services/cf-text/allocate",
+            json={
+                "model_candidates": [model_id],
+                "caller": "avocet",
+                "pipeline": "plans_benchmark",
+            },
+            timeout=120.0,
+        )
+        resp.raise_for_status()
+        data = resp.json()
+        service_url: str = data["url"]
+        allocation_id: str = data.get("allocation_id", "")
+        node_id: str = data.get("node_id", "")
+        gpu_id: int | None = data.get("gpu_id")
+
+        if data.get("started", False) and not data.get("warm", True):
+            # Use \n so the SSE generator sees the line immediately
+            print(f"  [cold start] loading {model_id!r} — polling every 3s…", flush=True)
+            t0 = time.monotonic()
+            deadline = t0 + startup_timeout_s
+            probe_misses = 0
+
+            while time.monotonic() < deadline:
+                elapsed = time.monotonic() - t0
+                try:
+                    status = httpx.get(f"{cforch_url}/api/services/cf-text/status", timeout=5.0)
+                    if status.is_success:
+                        instances = status.json().get("instances", [])
+                        match = next(
+                            (i for i in instances
+                             if i.get("node_id") == node_id and i.get("gpu_id") == gpu_id),
+                            None,
+                        )
+                        if match:
+                            probe_misses = 0
+                            state = match.get("state", "")
+                            if state == "running":
+                                print(f"  [cold start] ready in {elapsed:.0f}s", flush=True)
+                                return service_url, allocation_id
+                            elif state == "stopped":
+                                print(f"  [cold start] failed — service stopped after {elapsed:.0f}s", flush=True)
+                                return None
+                            else:
+                                # still starting — emit keepalive so SSE stream stays alive
+                                print(f"  [cold start] state={state!r}  elapsed={elapsed:.0f}s", flush=True)
+                        else:
+                            probe_misses += 1
+                            print(f"  [cold start] waiting… elapsed={elapsed:.0f}s", flush=True)
+                            if probe_misses >= 6:
+                                try:
+                                    h = httpx.get(f"{service_url}/health", timeout=3.0)
+                                    if h.is_success:
+                                        print(f"  [cold start] ready via health check in {elapsed:.0f}s", flush=True)
+                                        return service_url, allocation_id
+                                except Exception:
+                                    pass
+                    else:
+                        print(f"  [cold start] status poll returned {status.status_code}, elapsed={elapsed:.0f}s", flush=True)
+                except Exception as poll_exc:
+                    print(f"  [cold start] poll error: {poll_exc}  elapsed={elapsed:.0f}s", flush=True)
+                time.sleep(3.0)
+
+            print(f"  [cold start] timed out after {time.monotonic()-t0:.0f}s", flush=True)
+            return None
+
+        return service_url, allocation_id
+    except Exception as exc:
+        print(f"[warn] cf-orch allocation failed for {model_id!r}: {exc}", file=sys.stderr)
+        return None
+
+
+def _call_model_direct(service_url: str, model: str, prompt: str, timeout: int = 600) -> tuple[str, float]:
+    """Call an OpenAI-compatible /v1/chat/completions on a direct service URL."""
+    t0 = time.monotonic()
+    resp = httpx.post(
+        f"{service_url.rstrip('/')}/v1/chat/completions",
+        json={
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": 2048,
+            "temperature": 0.2,
+        },
+        timeout=timeout,
+    )
+    resp.raise_for_status()
+    latency = time.monotonic() - t0
+    text = resp.json()["choices"][0]["message"]["content"]
+    return text, latency
+
+
+def _call_model(api_base: str, model: str, prompt: str, timeout: int = 180) -> tuple[str, float]:
+    """Call an OpenAI-compatible /chat/completions endpoint. Returns (text, latency_s)."""
+    t0 = time.monotonic()
+    resp = httpx.post(
+        f"{api_base}/chat/completions",
+        json={
+            "model": model,
+            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": 2048,
+            "temperature": 0.2,
+        },
+        timeout=timeout,
+    )
+    resp.raise_for_status()
+    latency = time.monotonic() - t0
+    text = resp.json()["choices"][0]["message"]["content"]
+    return text, latency
+
+
+# ── Benchmark runner ───────────────────────────────────────────────────────────
+
+@dataclass
+class PromptResult:
+    prompt_id: str
+    prompt_name: str
+    model_key: str
+    response: str
+    latency_s: float
+    word_count: int
+    scores: dict[str, float]
+    total_score: float
+    error: str | None = None
+
+
+def run_benchmark(
+    model_key: str,
+    model_name: str,
+    prompts: list[dict[str, Any]] | None = None,
+    verbose: bool = False,
+    # cf-orch path
+    use_cforch: bool = False,
+    cforch_url: str = CF_COORD_URL,
+    # direct path (used when not cf-orch)
+    api_base: str = CF_TEXT_BASE,
+) -> list[PromptResult]:
+    """Run all prompts through one model. Uses cf-orch allocation when use_cforch=True."""
+    if prompts is None:
+        prompts = HELD_OUT_PROMPTS
+
+    # Allocate once per model when using cf-orch
+    service_url: str | None = None
+    if use_cforch:
+        print(f"  Allocating {model_name!r} via cf-orch…", flush=True)
+        alloc = _cforch_allocate(model_name, cforch_url)
+        if alloc is None:
+            # Return all prompts as errors
+            return [
+                PromptResult(
+                    prompt_id=p["id"], prompt_name=p["name"], model_key=model_key,
+                    response="", latency_s=0.0, word_count=0, scores={}, total_score=0.0,
+                    error=f"cf-orch allocation failed for {model_name!r}",
+                )
+                for p in prompts
+            ]
+        service_url, _alloc_id = alloc
+
+    results: list[PromptResult] = []
+    for p in prompts:
+        if verbose:
+            print(f"  [{p['id']}] {p['name']} … ", end="", flush=True)
+        try:
+            if service_url:
+                response, latency = _call_model_direct(service_url, model_name, p["prompt"])
+            else:
+                response, latency = _call_model(api_base, model_name, p["prompt"])
+            rubric = score_response(response, p)
+            result = PromptResult(
+                prompt_id=p["id"],
+                prompt_name=p["name"],
+                model_key=model_key,
+                response=response,
+                latency_s=round(latency, 2),
+                word_count=len(response.split()),
+                scores=rubric.as_dict(),
+                total_score=round(rubric.total(), 3),
+            )
+            if verbose:
+                print(f"score={result.total_score:.3f}  ({result.word_count}w, {latency:.1f}s)")
+        except Exception as exc:
+            result = PromptResult(
+                prompt_id=p["id"],
+                prompt_name=p["name"],
+                model_key=model_key,
+                response="",
+                latency_s=0.0,
+                word_count=0,
+                scores={},
+                total_score=0.0,
+                error=str(exc),
+            )
+            if verbose:
+                print(f"ERROR: {exc}")
+        results.append(result)
+    return results
+
+
+# ── Reporting ──────────────────────────────────────────────────────────────────
+
+def _print_single_report(results: list[PromptResult], model_key: str) -> None:
+    ok = [r for r in results if not r.error]
+    err = [r for r in results if r.error]
+    if not ok:
+        print(f"\n[{model_key}] All {len(err)} prompts failed.\n")
+        return
+
+    avg_total = sum(r.total_score for r in ok) / len(ok)
+    avg_latency = sum(r.latency_s for r in ok) / len(ok)
+
+    # Aggregate per-rubric averages
+    rubric_keys = list(ok[0].scores.keys())
+    rubric_avgs = {k: sum(r.scores.get(k, 0) for r in ok) / len(ok) for k in rubric_keys}
+
+    print(f"\n{'='*60}")
+    print(f"  Model : {model_key}")
+    print(f"  Prompts: {len(ok)}/{len(results)} passed  ({len(err)} errors)")
+    print(f"  Overall score : {avg_total:.3f}  (avg latency {avg_latency:.1f}s)")
+    print(f"\n  Rubric breakdown:")
+    for k, v in sorted(rubric_avgs.items(), key=lambda x: -x[1]):
+        bar = "█" * int(v * 20)
+        print(f"    {k:<22} {v:.3f}  {bar}")
+    print(f"\n  Per-prompt scores:")
+    for r in sorted(ok, key=lambda x: -x.total_score):
+        flag = "⚠" if r.total_score < 0.3 else " "
+        print(f"    {flag} {r.prompt_id} {r.prompt_name:<35} {r.total_score:.3f}  ({r.word_count}w)")
+    if err:
+        print(f"\n  Errors:")
+        for r in err:
+            print(f"    {r.prompt_id} {r.prompt_name}: {r.error}")
+    print(f"{'='*60}\n")
+
+
+def _print_comparison_table(all_results: dict[str, list[PromptResult]]) -> None:
+    model_keys = list(all_results.keys())
+    prompt_ids = [p["id"] for p in HELD_OUT_PROMPTS]
+
+    # Scores by (model, prompt_id)
+    score_map: dict[tuple[str, str], float] = {}
+    for mk, results in all_results.items():
+        for r in results:
+            score_map[(mk, r.prompt_id)] = r.total_score if not r.error else 0.0
+
+    col_w = 10
+    header = f"{'Prompt':<35}" + "".join(f"{mk[:col_w-1]:<{col_w}}" for mk in model_keys)
+    print(f"\n{'='*len(header)}")
+    print("  COMPARISON TABLE")
+    print(f"{'='*len(header)}")
+    print(f"  {header}")
+    print(f"  {'-'*len(header)}")
+
+    for pid in prompt_ids:
+        pname = next(p["name"] for p in HELD_OUT_PROMPTS if p["id"] == pid)
+        row = f"  {pname:<35}"
+        best = max(score_map.get((mk, pid), 0.0) for mk in model_keys)
+        for mk in model_keys:
+            v = score_map.get((mk, pid), 0.0)
+            marker = "*" if v == best and len(model_keys) > 1 else " "
+            row += f"{v:.3f}{marker}   "
+        print(row)
+
+    print(f"  {'-'*len(header)}")
+    avgs_row = f"  {'AVERAGE':<35}"
+    best_avg = -1.0
+    avgs: dict[str, float] = {}
+    for mk in model_keys:
+        vals = [score_map.get((mk, pid), 0.0) for pid in prompt_ids]
+        avgs[mk] = sum(vals) / len(vals)
+        best_avg = max(best_avg, avgs[mk])
+    for mk in model_keys:
+        marker = "*" if avgs[mk] == best_avg and len(model_keys) > 1 else " "
+        avgs_row += f"{avgs[mk]:.3f}{marker}   "
+    print(avgs_row)
+    print(f"{'='*len(header)}\n")
+    if len(model_keys) > 1:
+        winner = max(avgs, key=lambda k: avgs[k])
+        print(f"  Winner: {winner}  (avg {avgs[winner]:.3f})\n")
+
+
+# ── CLI ────────────────────────────────────────────────────────────────────────
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--list-models", action="store_true",
+                        help="Print registered model shortcuts and exit")
+    parser.add_argument("--model", metavar="KEY",
+                        help="Benchmark a single model (registry key or raw model name)")
+    parser.add_argument("--compare", nargs="+", metavar="KEY",
+                        help="Compare two or more models side-by-side")
+    parser.add_argument("--cforch", action="store_true",
+                        help="Route inference through cf-orch coordinator (allocate per model)")
+    parser.add_argument("--cforch-url", default=CF_COORD_URL, metavar="URL",
+                        help=f"cf-orch coordinator URL (default: {CF_COORD_URL})")
+    parser.add_argument("--api-base", default=None,
+                        help="Direct API base URL when not using cf-orch")
+    parser.add_argument("--model-name", default=None,
+                        help="Override model name sent to API (single-model runs only)")
+    parser.add_argument("--prompts", nargs="+", metavar="ID",
+                        help="Run only specific prompt IDs (e.g. ho_001 ho_003)")
+    parser.add_argument("--output", type=Path, default=None,
+                        help="Write detailed JSON results to this path")
+    parser.add_argument("--workers", type=int, default=1, metavar="N",
+                        help="Run N models concurrently (default 1). Set to number of available nodes.")
+    parser.add_argument("--verbose", "-v", action="store_true",
+                        help="Print per-prompt progress")
+    args = parser.parse_args()
+
+    if args.list_models:
+        print("\nRegistered model shortcuts:")
+        for key, info in MODEL_REGISTRY.items():
+            print(f"  {key:<20} {info['description']}")
+        print(f"\nDefault endpoints:")
+        print(f"  direct    {CF_TEXT_BASE}")
+        print(f"  cf-orch   {CF_COORD_URL}")
+        return
+
+    prompts = HELD_OUT_PROMPTS
+    if args.prompts:
+        ids = set(args.prompts)
+        prompts = [p for p in HELD_OUT_PROMPTS if p["id"] in ids]
+        if not prompts:
+            print(f"No prompts matched IDs: {args.prompts}", file=sys.stderr)
+            sys.exit(1)
+
+    model_keys: list[str] = []
+    if args.compare:
+        model_keys = args.compare
+    elif args.model:
+        model_keys = [args.model]
+    else:
+        parser.print_help()
+        sys.exit(0)
+
+    all_results: dict[str, list[PromptResult]] = {}
+    print_lock = threading.Lock()
+
+    def _run_one(mk: str) -> tuple[str, list[PromptResult]]:
+        if mk in MODEL_REGISTRY:
+            reg = MODEL_REGISTRY[mk]
+            model_name = args.model_name or reg["model"]
+            direct_base = args.api_base or reg["api_base"]
+        else:
+            model_name = args.model_name or mk
+            direct_base = args.api_base or CF_TEXT_BASE
+
+        if args.cforch:
+            with print_lock:
+                print(f"\nRunning [{mk}] via cf-orch ({args.cforch_url})  model={model_name}")
+            results = run_benchmark(
+                mk, model_name, prompts=prompts, verbose=args.verbose,
+                use_cforch=True, cforch_url=args.cforch_url,
+            )
+        else:
+            with print_lock:
+                print(f"\nRunning [{mk}] → {direct_base}  model={model_name}")
+            results = run_benchmark(
+                mk, model_name, prompts=prompts, verbose=args.verbose,
+                api_base=direct_base,
+            )
+
+        with print_lock:
+            _print_single_report(results, mk)
+        return mk, results
+
+    workers = max(1, args.workers)
+    if workers == 1 or len(model_keys) == 1:
+        for mk in model_keys:
+            mk_out, results = _run_one(mk)
+            all_results[mk_out] = results
+    else:
+        with ThreadPoolExecutor(max_workers=workers) as pool:
+            futures = {pool.submit(_run_one, mk): mk for mk in model_keys}
+            for fut in as_completed(futures):
+                mk_out, results = fut.result()
+                all_results[mk_out] = results
+
+    if len(model_keys) > 1:
+        _print_comparison_table(all_results)
+
+    if args.output:
+        args.output.parent.mkdir(parents=True, exist_ok=True)
+        payload = {
+            mk: [asdict(r) for r in results]
+            for mk, results in all_results.items()
+        }
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(payload, f, indent=2, ensure_ascii=False)
+        print(f"Wrote detailed results to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/export_plans.py
+++ b/scripts/export_plans.py
@ -0,0 +1,458 @@
+"""Export circuitforge-plans/ documents as instruction-tuning JSONL pairs.
+
+Each record is a HuggingFace chat-format example:
+
+    {
+      "id": "<sha256>",
+      "messages": [
+        {"role": "user", "content": "<reconstructed planning prompt>"},
+        {"role": "assistant", "content": "<cleaned document content>"}
+      ],
+      "meta": {
+        "source": "peregrine/2026-03-03-feedback-button-design.md",
+        "product": "peregrine",
+        "doc_type": "design",     # design | plan | spec | implementation | other
+        "date": "2026-03-03",
+        "paired_with": "...",     # sibling path, or null
+        "word_count": 1847,
+        "pair_role": "context"    # "context" | "target" | "standalone"
+      }
+    }
+
+Pairing strategy
+----------------
+When a design doc and a plan doc share the same date + feature-name prefix,
+they are treated as a pair:
+  - design → plan: instruction = "Given this design doc, write the implementation plan."
+    context appended = full design doc content.
+  - Solo docs get a synthetic instruction from the title + first overview section.
+
+Usage
+-----
+    # Preview stats and 5 sample records
+    python scripts/export_plans.py --preview
+
+    # Write full output
+    python scripts/export_plans.py --output data/plan_pairs.jsonl
+
+    # Restrict to specific products
+    python scripts/export_plans.py --products peregrine,kiwi --output data/plan_pairs.jsonl
+"""
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import re
+import sys
+from pathlib import Path
+from typing import Iterator
+
+# ── Paths ──────────────────────────────────────────────────────────────────────
+
+_SCRIPT_DIR = Path(__file__).parent
+_AVOCET_ROOT = _SCRIPT_DIR.parent
+_DEFAULT_PLANS_DIR = Path("/Library/Development/CircuitForge/circuitforge-plans")
+_DEFAULT_OUTPUT = _AVOCET_ROOT / "data" / "plan_pairs.jsonl"
+
+# ── Doc type detection ─────────────────────────────────────────────────────────
+
+_TYPE_RE = re.compile(
+    r"-(design|plan|spec|implementation|specs|plans)s?$",
+    re.IGNORECASE,
+)
+
+_SKIP_DIRS = {"__pycache__", ".git", "node_modules"}
+
+# Boilerplate lines to strip from document content before using as output.
+_BOILERPLATE_RE = re.compile(
+    r"""
+    ^\s*>\s*\*\*For\s+agentic\s+workers.*         # superpowers agent hints
+    |^\s*>\s*REQUIRED\s+SUB-SKILL.*
+    |^\s*\*\*Date:\*\*.*                           # metadata header lines
+    |\*\*Status:\*\*\s*Complete.*                  # completed-feature noise
+    |\*\*Status:\*\*\s*Done.*
+    |\*\*Product:\*\*.*
+    |\*\*Repo:\*\*.*
+    |\*\*Tech\s+Stack:\*\*.*
+    |\*\*Candidate:\*\*.*                          # old synthetic personas
+    |^Candidate:.*
+    |^Team:.*
+    """,
+    re.VERBOSE | re.MULTILINE,
+)
+
+# Old repo/path names to normalise to current equivalents.
+_PATH_NORMALIZATIONS: list[tuple[re.Pattern, str]] = [
+    (re.compile(r"/devl/job-seeker", re.IGNORECASE),      "/Library/Development/CircuitForge/peregrine"),
+    (re.compile(r"\bjob-seeker\b",   re.IGNORECASE),      "peregrine"),
+    (re.compile(r"Alex Rivera",      re.IGNORECASE),      "[user]"),
+]
+
+# Instruction paraphrase templates per doc type.
+# Each entry is (user_prefix, paired_prefix).
+# {title}, {product}, {type_phrase}, {overview}, {design_context} are substituted.
+_DESIGN_INSTRUCTIONS = [
+    "Write a design document for {product}: {title}.\n\nContext: {overview}",
+    "You are a software architect working on {product}. Draft a design spec for: {title}.\n\n{overview}",
+    "Produce a CircuitForge-style design document for the following {product} feature — {title}.\n\nBackground: {overview}",
+]
+
+_PLAN_INSTRUCTIONS = [
+    "Write an implementation plan for {product}: {title}.\n\nContext: {overview}",
+    "Break the following {product} feature into a detailed implementation plan with file structure and task checkboxes — {title}.\n\n{overview}",
+    "You are a senior engineer on {product}. Produce a step-by-step engineering plan for: {title}.\n\n{overview}",
+]
+
+_PAIRED_INSTRUCTIONS = [
+    (
+        "You are a software architect working on {product}, a CircuitForge product. "
+        "Given the following design document, write a detailed implementation plan "
+        "(file structure, task breakdown with checkboxes, migration steps if needed).\n\n"
+        "---\n{design_context}\n---"
+    ),
+    (
+        "The following is a design spec for a {product} feature. "
+        "Produce a concrete implementation plan: file list, task checklist, any DB migrations needed.\n\n"
+        "---\n{design_context}\n---"
+    ),
+    (
+        "Convert this {product} design document into an actionable implementation plan. "
+        "Include all files to create/modify, step-by-step tasks with checkboxes, and migration steps.\n\n"
+        "---\n{design_context}\n---"
+    ),
+]
+
+
+def _doc_type(stem: str) -> str:
+    m = _TYPE_RE.search(stem)
+    if not m:
+        return "other"
+    raw = m.group(1).lower().rstrip("s")
+    return {"implementation": "plan"}.get(raw, raw)
+
+
+def _date_feature(stem: str) -> tuple[str, str]:
+    """Return (date, feature_slug) from '2026-03-03-feedback-button-design'."""
+    m = re.match(r"^(\d{4}-\d{2}-\d{2})-(.+?)(?:-(design|plan|spec|implementation)s?)?$", stem, re.I)
+    if m:
+        return m.group(1), m.group(2)
+    return "", stem
+
+
+# ── Content extraction ─────────────────────────────────────────────────────────
+
+def _extract_title(content: str) -> str:
+    m = re.search(r"^#\s+(.+)", content, re.MULTILINE)
+    return m.group(1).strip() if m else ""
+
+
+def _extract_overview(content: str) -> str:
+    """Return first substantive paragraph or h2 section body (≤300 chars)."""
+    # Superpowers plans have an explicit **Goal:** line — prefer that.
+    goal_m = re.search(r"\*\*Goal:\*\*\s*(.+)", content)
+    if goal_m:
+        return goal_m.group(1).strip()[:300]
+
+    # Otherwise use the body of the first h2 section.
+    h2_m = re.search(
+        r"^##\s+\d*\.?\s*.+\n([\s\S]+?)(?=^##|\Z)",
+        content,
+        re.MULTILINE,
+    )
+    if h2_m:
+        body = h2_m.group(1).strip()
+        # Strip markdown bullet/code noise for the instruction
+        body = re.sub(r"```[\s\S]*?```", "", body)
+        body = re.sub(r"`[^`]+`", lambda m: m.group().strip("`"), body)
+        body = re.sub(r"\*\*([^*]+)\*\*", r"\1", body)
+        body = re.sub(r"\s+", " ", body).strip()
+        return body[:300]
+
+    return ""
+
+
+def _clean_content(content: str) -> str:
+    """Remove boilerplate, normalize old paths/names, collapse whitespace."""
+    cleaned = _BOILERPLATE_RE.sub("", content)
+    for pattern, replacement in _PATH_NORMALIZATIONS:
+        cleaned = pattern.sub(replacement, cleaned)
+    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
+    return cleaned.strip()
+
+
+def _quality_flags(content: str) -> list[str]:
+    """Return a list of quality issue labels found in cleaned content."""
+    flags = []
+    if "Alex Rivera" in content or "[user]" in content:
+        flags.append("persona-residue")
+    if re.search(r"\bStatus:\s*(Complete|Done|Merged)\b", content):
+        flags.append("completed-status")
+    return flags
+
+
+def _make_instruction(
+    title: str,
+    product: str,
+    doc_type: str,
+    overview: str,
+    design_context: str | None = None,
+    variant: int = 0,
+) -> str:
+    """Synthesise a natural planning prompt for this document.
+
+    variant: 0-2 selects which paraphrase template to use. Caller cycles
+    through all three to produce multiple training examples per document.
+    """
+    product_label = product.replace("-", " ").title() if product else "CircuitForge"
+    idx = variant % 3
+
+    if design_context:
+        tmpl = _PAIRED_INSTRUCTIONS[idx]
+        return tmpl.format(
+            product=product_label,
+            design_context=design_context[:2500],
+        )
+
+    templates = _PLAN_INSTRUCTIONS if doc_type in ("plan",) else _DESIGN_INSTRUCTIONS
+    tmpl = templates[idx]
+    return tmpl.format(
+        product=product_label,
+        title=title,
+        overview=overview or "",
+        type_phrase="planning document",
+    )
+
+
+def _record_id(content: str, source: str) -> str:
+    return hashlib.sha256(f"{source}:{content}".encode()).hexdigest()[:16]
+
+
+# ── Pair discovery ─────────────────────────────────────────────────────────────
+
+def _find_pairs(plans_dir: Path) -> dict[str, list[tuple[str, Path]]]:
+    """Return {prefix_key → [(doc_type, path), ...]} for docs sharing date+feature."""
+    by_prefix: dict[str, list[tuple[str, Path]]] = {}
+    for path in plans_dir.rglob("*.md"):
+        if any(part in _SKIP_DIRS for part in path.parts):
+            continue
+        if path.name == "README.md":
+            continue
+        stem = path.stem
+        date, feature = _date_feature(stem)
+        if not date:
+            continue
+        key = str(path.parent / f"{date}-{feature}")
+        by_prefix.setdefault(key, []).append((_doc_type(stem), path))
+    return by_prefix
+
+
+# ── Record generation ──────────────────────────────────────────────────────────
+
+def _records_for_group(
+    doc_type_paths: list[tuple[str, Path]],
+    plans_dir: Path,
+) -> Iterator[dict]:
+    """Yield one or more training records for a group of related docs."""
+    # Separate design vs plan docs within this group
+    designs = [(t, p) for t, p in doc_type_paths if t in ("design", "spec")]
+    plans_  = [(t, p) for t, p in doc_type_paths if t in ("plan",)]
+    others  = [(t, p) for t, p in doc_type_paths if t not in ("design", "spec", "plan")]
+
+    all_paths = doc_type_paths
+
+    if designs and plans_:
+        # Paired: yield a design→plan record (3 instruction variants)
+        design_type, design_path = designs[0]
+        plan_type,   plan_path   = plans_[0]
+        design_content = design_path.read_text(encoding="utf-8")
+        plan_content   = plan_path.read_text(encoding="utf-8")
+
+        product        = _product_from_path(plan_path, plans_dir)
+        title          = _extract_title(plan_content) or plan_path.stem
+        cleaned        = _clean_content(plan_content)
+        design_cleaned = _clean_content(design_content)
+        flags          = _quality_flags(cleaned)
+
+        if len(cleaned.split()) >= 80:
+            rel_src    = str(plan_path.relative_to(plans_dir))
+            rel_design = str(design_path.relative_to(plans_dir))
+            for variant in range(3):
+                instruction = _make_instruction(
+                    title=title,
+                    product=product,
+                    doc_type="plan",
+                    overview=_extract_overview(design_content),
+                    design_context=design_cleaned,
+                    variant=variant,
+                )
+                yield {
+                    "id": _record_id(f"v{variant}:{cleaned}", rel_src),
+                    "messages": [
+                        {"role": "user", "content": instruction},
+                        {"role": "assistant", "content": cleaned},
+                    ],
+                    "meta": {
+                        "source": rel_src,
+                        "product": product,
+                        "doc_type": "plan",
+                        "date": _date_feature(plan_path.stem)[0],
+                        "paired_with": rel_design,
+                        "word_count": len(cleaned.split()),
+                        "pair_role": "target",
+                        "variant": variant,
+                        "quality_flags": flags,
+                    },
+                }
+
+        # Also yield the design doc as standalone variants
+        all_paths = [(t, p) for t, p in all_paths if p != plan_path]
+
+    # Remaining docs as standalone records (3 instruction variants each)
+    for doc_type, path in all_paths:
+        content = path.read_text(encoding="utf-8")
+        cleaned = _clean_content(content)
+        if len(cleaned.split()) < 80:
+            continue
+
+        product  = _product_from_path(path, plans_dir)
+        title    = _extract_title(content) or path.stem
+        overview = _extract_overview(content)
+        flags    = _quality_flags(cleaned)
+        rel_src  = str(path.relative_to(plans_dir))
+
+        for variant in range(3):
+            instruction = _make_instruction(
+                title=title,
+                product=product,
+                doc_type=doc_type,
+                overview=overview,
+                variant=variant,
+            )
+            yield {
+                "id": _record_id(f"v{variant}:{cleaned}", rel_src),
+                "messages": [
+                    {"role": "user", "content": instruction},
+                    {"role": "assistant", "content": cleaned},
+                ],
+                "meta": {
+                    "source": rel_src,
+                    "product": product,
+                    "doc_type": doc_type,
+                    "date": _date_feature(path.stem)[0],
+                    "paired_with": None,
+                    "word_count": len(cleaned.split()),
+                    "pair_role": "standalone",
+                    "variant": variant,
+                    "quality_flags": flags,
+                },
+            }
+
+
+def _product_from_path(path: Path, plans_dir: Path) -> str:
+    rel = path.relative_to(plans_dir)
+    return rel.parts[0] if len(rel.parts) > 1 else "shared"
+
+
+# ── Main export ────────────────────────────────────────────────────────────────
+
+def export(
+    plans_dir: Path,
+    products: list[str] | None = None,
+) -> list[dict]:
+    groups = _find_pairs(plans_dir)
+    records: list[dict] = []
+    seen_ids: set[str] = set()
+
+    for group_key, doc_type_paths in groups.items():
+        # Filter by product if requested
+        if products:
+            paths = [p for _, p in doc_type_paths]
+            prods = {_product_from_path(p, plans_dir) for p in paths}
+            if not prods.intersection(products):
+                continue
+
+        for record in _records_for_group(doc_type_paths, plans_dir):
+            if record["id"] not in seen_ids:
+                seen_ids.add(record["id"])
+                records.append(record)
+
+    return records
+
+
+# ── CLI ────────────────────────────────────────────────────────────────────────
+
+def _print_stats(records: list[dict]) -> None:
+    from collections import Counter
+    products  = Counter(r["meta"]["product"]   for r in records)
+    doc_types = Counter(r["meta"]["doc_type"]  for r in records)
+    pair_roles = Counter(r["meta"]["pair_role"] for r in records)
+    wc = [r["meta"]["word_count"] for r in records]
+    wc.sort()
+
+    print(f"\n{'='*55}")
+    print(f"  Total records: {len(records)}")
+    print(f"  Word counts  : min={wc[0]}, median={wc[len(wc)//2]}, max={wc[-1]}")
+    print(f"\n  By product:")
+    for p, n in products.most_common():
+        print(f"    {p:<22} {n}")
+    print(f"\n  By doc type:")
+    for t, n in doc_types.most_common():
+        print(f"    {t:<22} {n}")
+    print(f"\n  Pair roles:")
+    for r, n in pair_roles.most_common():
+        print(f"    {r:<22} {n}")
+    print(f"{'='*55}\n")
+
+
+def _print_sample(records: list[dict], n: int = 3) -> None:
+    import random
+    sample = random.sample(records, min(n, len(records)))
+    for i, rec in enumerate(sample, 1):
+        meta = rec["meta"]
+        user_msg = rec["messages"][0]["content"]
+        asst_msg = rec["messages"][1]["content"]
+        print(f"\n{'─'*55}")
+        print(f"SAMPLE {i}/{n}  [{meta['product']} / {meta['doc_type']} / {meta['pair_role']}]")
+        print(f"source: {meta['source']}")
+        print(f"\nUSER ({len(user_msg)} chars):\n{user_msg[:500]}{'...' if len(user_msg)>500 else ''}")
+        print(f"\nASSISTANT ({meta['word_count']} words):\n{asst_msg[:400]}{'...' if len(asst_msg)>400 else ''}")
+    print(f"\n{'─'*55}\n")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser.add_argument("--plans-dir", type=Path, default=_DEFAULT_PLANS_DIR)
+    parser.add_argument("--output", type=Path, default=None,
+                        help="Write JSONL to this path (omit for preview-only)")
+    parser.add_argument("--products", default=None,
+                        help="Comma-separated product filter, e.g. peregrine,kiwi")
+    parser.add_argument("--preview", action="store_true",
+                        help="Print stats + sample records, don't write output")
+    parser.add_argument("--samples", type=int, default=3,
+                        help="Number of sample records to show in preview (default 3)")
+    args = parser.parse_args()
+
+    products = [p.strip() for p in args.products.split(",")] if args.products else None
+
+    print(f"Scanning {args.plans_dir} …", file=sys.stderr)
+    records = export(args.plans_dir, products=products)
+
+    _print_stats(records)
+
+    if args.preview or args.output is None:
+        _print_sample(records, n=args.samples)
+        if args.output is None:
+            print("(Pass --output <path> to write JSONL)")
+        return
+
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    with open(args.output, "w", encoding="utf-8") as f:
+        for rec in records:
+            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
+
+    print(f"Wrote {len(records)} records to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
--- a/tests/test_cforch.py
+++ b/tests/test_cforch.py
@ -14,7 +14,9 @@ from fastapi.testclient import TestClient

@pytest.fixture(autouse=True)
 def reset_cforch_globals(tmp_path):
-    """Redirect _CONFIG_DIR to tmp_path and reset running-state globals."""
+    """Redirect _CONFIG_DIR to tmp_path, reset running-state globals, and stub
+    list_installed to return [] so real disk model directories don't bleed into
+    tests that don't exercise the installed-model merge path."""
    from app import cforch as cforch_module

    prev_config_dir = cforch_module._CONFIG_DIR
@ -25,6 +27,7 @@ def reset_cforch_globals(tmp_path):
    cforch_module._BENCH_RUNNING = False
    cforch_module._bench_proc = None

+    with patch("app.models.list_installed", return_value=[]):
        yield tmp_path

    cforch_module.set_config_dir(prev_config_dir)
@ -141,6 +144,35 @@ def test_models_parses_bench_models_yaml(client, config_dir, tmp_path):
    assert m["vram_estimate_mb"] == 6000


+def test_models_merges_installed_generators(client, config_dir, tmp_path):
+    """Installed cf-text/vllm generator models appear in the model list,
+    deduplicated against bench_models.yaml entries."""
+    models_file = tmp_path / "bench_models.yaml"
+    _write_models_yaml(models_file, [
+        {"name": "llama3", "id": "llama3:8b", "service": "ollama", "tags": [], "vram_estimate_mb": 6000},
+        {"name": "already-there", "id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "tags": [], "vram_estimate_mb": 8000},
+    ])
+    _write_config(config_dir, {"bench_models": str(models_file)})
+
+    fake_installed = [
+        # should be included — cf-text generator not already in YAML
+        {"model_id": "meta-llama/Llama-3.1-8B", "service": "cf-text", "role": "generator", "vram_mb": 16000},
+        # should be deduped — repo_id matches a YAML entry
+        {"model_id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "role": "generator", "vram_mb": 8000},
+        # should be excluded — classifier, not a generator
+        {"model_id": "cross-encoder/ms-marco-MiniLM-L6", "service": "avocet", "role": "reranker", "vram_mb": 500},
+    ]
+    with patch("app.models.list_installed", return_value=fake_installed):
+        r = client.get("/api/cforch/models")
+    assert r.status_code == 200
+    ids = [m["id"] for m in r.json()["models"]]
+    assert "llama3:8b" in ids                           # from YAML
+    assert "ibm-granite/granite-4.1-8b" in ids          # from YAML (not duplicated)
+    assert "meta-llama/Llama-3.1-8B" in ids             # merged from installed
+    assert "cross-encoder/ms-marco-MiniLM-L6" not in ids  # filtered out (reranker)
+    assert ids.count("ibm-granite/granite-4.1-8b") == 1  # no duplicate
+
+
 # ── GET /run ───────────────────────────────────────────────────────────────────

 def test_run_returns_409_when_already_running(client):
--- a/tests/test_models.py
+++ b/tests/test_models.py
@ -541,3 +541,84 @@ def test_delete_installed_name_with_slash_blocked(client):
        except _HTTPException as exc:
            assert exc.status_code in (400, 404)
            raise
+
+
+# ── Catalog registration ───────────────────────────────────────────────────────
+
+_MINIMAL_YAML = """\
+services:
+  cf-text:
+    max_mb: {max_mb}
+    catalog:
+      existing-model:
+        path: /some/path
+        vram_mb: 1000
+        description: "placeholder"
+"""
+
+
+def _make_node_yaml(tmp_path: Path, max_mb: int = 8192) -> Path:
+    p = tmp_path / "testnode.yaml"
+    p.write_text(_MINIMAL_YAML.format(max_mb=max_mb), encoding="utf-8")
+    return p
+
+
+def test_catalog_registration_fp16_no_env_block(tmp_path):
+    """When model fits at FP16, no env block should be written."""
+    from app import models as models_module
+
+    node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
+    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
+        updated = models_module._register_in_node_catalogs(
+            repo_id="org/SmallModel",
+            local_path=tmp_path / "org--SmallModel",
+            vram_mb_fp16=4000,
+            role="generator",
+        )
+
+    assert "testnode" in updated
+    content = node_yaml.read_text()
+    # _catalog_key strips org prefix and lowercases: "org/SmallModel" → "smallmodel"
+    assert "smallmodel:" in content
+    assert "CF_TEXT_4BIT" not in content
+    assert "env:" not in content
+
+
+def test_catalog_registration_needs_4bit_writes_env_block(tmp_path):
+    """When model only fits at 4-bit, env: CF_TEXT_4BIT: '1' must be written."""
+    from app import models as models_module
+
+    node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
+    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
+        updated = models_module._register_in_node_catalogs(
+            repo_id="org/BigModel",
+            local_path=tmp_path / "org--BigModel",
+            vram_mb_fp16=20000,  # won't fit at FP16 on 8 GB
+            role="generator",
+        )
+
+    assert "testnode" in updated
+    content = node_yaml.read_text()
+    # _catalog_key: "org/BigModel" → "bigmodel"
+    assert "bigmodel:" in content
+    assert "env:" in content
+    assert 'CF_TEXT_4BIT: "1"' in content
+    assert "CF_TEXT_4BIT=1 required" in content  # description note
+
+
+def test_catalog_registration_too_large_skipped(tmp_path):
+    """Model too large even at 4-bit should not be registered."""
+    from app import models as models_module
+
+    node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
+    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
+        updated = models_module._register_in_node_catalogs(
+            repo_id="org/HugeModel",
+            local_path=tmp_path / "org--HugeModel",
+            vram_mb_fp16=80000,  # 4-bit ~22 GB, still won't fit on 8 GB
+            role="generator",
+        )
+
+    assert updated == []
+    content = node_yaml.read_text()
+    assert "hugemodel" not in content
--- a/web/src/views/BenchmarkView.vue
+++ b/web/src/views/BenchmarkView.vue
@ -21,11 +21,17 @@
        :class="{ active: benchMode === 'style' }"
        @click="benchMode = 'style'"
      >✍️ Writing Style</button>
+      <button
+        class="mode-btn"
+        :class="{ active: benchMode === 'plans' }"
+        @click="benchMode = 'plans'"
+      >📐 Planning</button>
    </div>

    <ClassifierTab  v-if="benchMode === 'classifier'" />
    <LlmEvalTab     v-if="benchMode === 'llm'" />
    <StyleTab       v-if="benchMode === 'style'" />
+    <PlansBenchTab  v-if="benchMode === 'plans'" />
  </div>
 </template>

@ -34,8 +40,9 @@ import { ref } from 'vue'
 import ClassifierTab  from './ClassifierTab.vue'
 import LlmEvalTab     from './LlmEvalTab.vue'
 import StyleTab       from './StyleTab.vue'
+import PlansBenchTab  from './PlansBenchTab.vue'

-type BenchMode = 'classifier' | 'llm' | 'style'
+type BenchMode = 'classifier' | 'llm' | 'style' | 'plans'
 const benchMode = ref<BenchMode>('classifier')
 </script>

--- a/web/src/views/LlmEvalTab.vue
+++ b/web/src/views/LlmEvalTab.vue
@ -6,6 +6,8 @@
      <summary class="picker-summary">
        <span class="picker-title">📋 Task Selection</span>
        <span class="picker-badge">{{ llmTaskBadge }}</span>
+        <button class="picker-bulk-btn" @click.stop.prevent="selectAllTasks()">All</button>
+        <button class="picker-bulk-btn" @click.stop.prevent="clearAllTasks()">None</button>
      </summary>
      <div class="picker-body">
        <div v-if="llmTasksLoading" class="picker-loading">Loading tasks…</div>
@ -44,6 +46,8 @@
      <summary class="picker-summary">
        <span class="picker-title">🎯 Model Selection</span>
        <span class="picker-badge">{{ llmModelBadge }}</span>
+        <button class="picker-bulk-btn" @click.stop.prevent="selectAllModels()">All</button>
+        <button class="picker-bulk-btn" @click.stop.prevent="clearAllModels()">None</button>
      </summary>
      <div class="picker-body">
        <div v-if="llmModelsLoading" class="picker-loading">Loading models…</div>
@ -78,6 +82,33 @@
      </div>
    </details>

+    <!-- Node Selection -->
+    <div class="node-picker" v-if="llmNodes.length > 0">
+      <span class="node-picker-label">Nodes:</span>
+      <label
+        v-for="node in llmNodes"
+        :key="node.node_id"
+        class="node-chip"
+        :class="{ 'node-chip--off': !enabledNodes.has(node.node_id), 'node-chip--offline': !node.online }"
+        :title="node.online ? `${node.node_id} — ${node.gpus.length} GPU(s)` : `${node.node_id} — offline`"
+      >
+        <input
+          type="checkbox"
+          class="node-chip-check"
+          :checked="enabledNodes.has(node.node_id)"
+          :disabled="!node.online || llmRunning"
+          @change="toggleNode(node.node_id, ($event.target as HTMLInputElement).checked)"
+        />
+        {{ node.node_id }}
+        <span class="node-chip-status" v-if="!node.online">offline</span>
+      </label>
+      <span class="node-picker-hint">
+        {{ enabledNodeIds.length === llmNodes.filter(n => n.online).length
+            ? 'auto-routing (all nodes)'
+            : `restricted to: ${enabledNodeIds.join(', ')}` }}
+      </span>
+    </div>
+
    <!-- Run Controls -->
    <div class="run-controls">
      <button
@ -88,6 +119,24 @@
        {{ llmRunning ? '⏳ Running…' : '▶ Run LLM Eval' }}
      </button>
      <button v-if="llmRunning" class="btn-cancel" @click="cancelLlmBenchmark">✕ Cancel</button>
+      <input
+        v-model="llmJudgeUrl"
+        class="judge-url-input"
+        placeholder="Judge URL — leave empty to skip LLM judge scoring"
+        :disabled="llmRunning"
+        title="Optional: URL of a running cf-text service (e.g. http://10.1.10.158:8008). When set, each LLM response gets a secondary score from the judge model — adds a 'judge' column to results. Empty = primary quality scoring only."
+      />
+      <label class="workers-label" title="Run this many models concurrently (requires multiple GPUs)">
+        <span class="workers-prefix">workers</span>
+        <input
+          v-model.number="llmWorkers"
+          type="number"
+          min="1"
+          max="8"
+          class="workers-input"
+          :disabled="llmRunning"
+        />
+      </label>
      <span v-if="selectedLlmTasks.size === 0 || selectedLlmModels.size === 0" class="run-hint">
        Select at least one task and one model to run.
      </span>
@ -119,6 +168,7 @@
            <tr>
              <th class="hm-label-col">Model</th>
              <th class="hm-model-col">overall</th>
+              <th v-if="llmHasJudge" class="hm-model-col hm-judge-col">judge</th>
              <th v-for="col in llmTaskTypeCols" :key="col" class="hm-model-col">{{ col }}</th>
              <th class="hm-model-col">tok/s</th>
            </tr>
@ -130,6 +180,12 @@
                class="hm-value-cell"
                :class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }"
              >{{ pct(row.avg_quality_score) }}</td>
+              <td
+                v-if="llmHasJudge"
+                class="hm-value-cell hm-judge-cell"
+                :class="{ 'bt-best': llmBestByCol['judge'] === row.model_id }"
+                title="LLM-as-judge secondary score"
+              >{{ row.avg_judge_score != null ? pct(row.avg_judge_score) : '—' }}</td>
              <td
                v-for="col in llmTaskTypeCols"
                :key="col"
@ -168,6 +224,12 @@ interface CfOrchModel {
  vram_estimate_mb?: number
 }

+interface CfOrchNode {
+  node_id: string
+  online: boolean
+  gpus: { gpu_id: number; name: string; vram_total_mb: number; vram_free_mb: number }[]
+}
+
 interface LlmModelResult {
  model_name: string
  model_id: string
@ -175,9 +237,11 @@ interface LlmModelResult {
  avg_tokens_per_sec: number
  avg_completion_ms: number
  avg_quality_score: number
+  avg_judge_score: number | null
  finetune_candidates: number
  error_count: number
  quality_by_task_type: Record<string, number>
+  judge_score_by_task_type?: Record<string, number>
 }

 // ── State ───────────────────────────────────────────────────────────────────
@ -195,6 +259,10 @@ const llmError       = ref('')
 const llmResults     = ref<LlmModelResult[]>([])
 const llmEventSource = ref<EventSource | null>(null)
 const llmLogEl       = ref<HTMLElement | null>(null)
+const llmJudgeUrl    = ref('')
+const llmWorkers     = ref(1)
+const llmNodes       = ref<CfOrchNode[]>([])
+const enabledNodes   = ref<Set<string>>(new Set())

 // ── Computed ────────────────────────────────────────────────────────────────
 const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
@ -239,6 +307,14 @@ const llmTaskTypeCols = computed(() => {
  return [...types].sort()
 })

+const llmHasJudge = computed(() =>
+  llmResults.value.some(r => r.avg_judge_score != null)
+)
+
+const enabledNodeIds = computed(() =>
+  llmNodes.value.filter(n => n.online && enabledNodes.value.has(n.node_id)).map(n => n.node_id)
+)
+
 const llmBestByCol = computed((): Record<string, string> => {
  const best: Record<string, string> = {}
  if (llmResults.value.length === 0) return best
@ -249,6 +325,16 @@ const llmBestByCol = computed((): Record<string, string> => {
  }
  best['overall'] = bestId

+  if (llmHasJudge.value) {
+    bestId = ''; bestVal = -Infinity
+    for (const r of llmResults.value) {
+      if (r.avg_judge_score != null && r.avg_judge_score > bestVal) {
+        bestVal = r.avg_judge_score; bestId = r.model_id
+      }
+    }
+    best['judge'] = bestId
+  }
+
  for (const col of llmTaskTypeCols.value) {
    bestId = ''; bestVal = -Infinity
    for (const r of llmResults.value) {
@ -306,6 +392,15 @@ function toggleService(models: CfOrchModel[], checked: boolean) {
  }
  selectedLlmModels.value = next
 }
+function selectAllTasks()  { selectedLlmTasks.value  = new Set(llmTasks.value.map(t => t.id)) }
+function clearAllTasks()   { selectedLlmTasks.value  = new Set() }
+function selectAllModels() { selectedLlmModels.value = new Set(llmModels.value.map(m => m.id)) }
+function clearAllModels()  { selectedLlmModels.value = new Set() }
+function toggleNode(id: string, checked: boolean) {
+  const next = new Set(enabledNodes.value)
+  checked ? next.add(id) : next.delete(id)
+  enabledNodes.value = next
+}

 // ── Data loaders ─────────────────────────────────────────────────────────────
 async function loadLlmTasks() {
@ -335,6 +430,21 @@ async function loadLlmResults() {
  }
 }

+async function loadLlmConfig() {
+  const { data } = await useApiFetch<{ judge_url?: string }>('/api/cforch/config')
+  if (data?.judge_url && !llmJudgeUrl.value) {
+    llmJudgeUrl.value = data.judge_url
+  }
+}
+
+async function loadLlmNodes() {
+  const { data } = await useApiFetch<{ nodes: CfOrchNode[] }>('/api/cforch/nodes')
+  if (data?.nodes) {
+    llmNodes.value = data.nodes
+    enabledNodes.value = new Set(data.nodes.filter(n => n.online).map(n => n.node_id))
+  }
+}
+
 // ── Run / cancel ──────────────────────────────────────────────────────────────
 function startLlmBenchmark() {
  llmRunning.value = true
@ -344,6 +454,15 @@ function startLlmBenchmark() {
  const params = new URLSearchParams()
  const taskIds = [...selectedLlmTasks.value].join(',')
  if (taskIds) params.set('task_ids', taskIds)
+  const modelIds = [...selectedLlmModels.value].join(',')
+  if (modelIds) params.set('model_ids', modelIds)
+  if (llmJudgeUrl.value.trim()) params.set('judge_url', llmJudgeUrl.value.trim())
+  if (llmWorkers.value > 1) params.set('workers', String(llmWorkers.value))
+  const onlineNodeIds = llmNodes.value.filter(n => n.online).map(n => n.node_id)
+  const isRestricted = enabledNodeIds.value.length < onlineNodeIds.length
+  if (isRestricted && enabledNodeIds.value.length > 0) {
+    params.set('node_ids', enabledNodeIds.value.join(','))
+  }

  const es = new EventSource(`/api/cforch/run?${params}`)
  llmEventSource.value = es
@ -387,6 +506,8 @@ onMounted(() => {
  loadLlmTasks()
  loadLlmModels()
  loadLlmResults()
+  loadLlmConfig()
+  loadLlmNodes()
 })
 </script>

@ -451,6 +572,43 @@ onMounted(() => {
  color: var(--color-text-secondary, #6b7a99);
 }

+.judge-url-input {
+  flex: 1;
+  min-width: 14rem;
+  max-width: 24rem;
+  padding: 0.35rem 0.6rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.375rem;
+  background: var(--color-surface, #fff);
+  color: var(--color-text, #1a2338);
+  font-size: 0.8rem;
+  font-family: var(--font-mono, monospace);
+}
+.judge-url-input:disabled { opacity: 0.5; }
+.judge-url-input::placeholder { color: var(--color-text-secondary, #6b7a99); }
+
+.workers-label {
+  display: flex;
+  align-items: center;
+  gap: 0.3rem;
+  font-size: 0.8rem;
+  color: var(--color-text-secondary, #6b7a99);
+  white-space: nowrap;
+}
+.workers-prefix { font-family: var(--font-mono, monospace); }
+.workers-input {
+  width: 3.2rem;
+  padding: 0.35rem 0.4rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.375rem;
+  background: var(--color-surface, #fff);
+  color: var(--color-text, #1a2338);
+  font-size: 0.8rem;
+  font-family: var(--font-mono, monospace);
+  text-align: center;
+}
+.workers-input:disabled { opacity: 0.5; }
+
 /* ── Run log ────────────────────────────────────────────── */
 .run-log {
  border: 1px solid var(--color-border, #d0d7e8);
@ -592,6 +750,15 @@ onMounted(() => {
  white-space: nowrap;
 }

+.hm-judge-col {
+  background: color-mix(in srgb, var(--color-surface-raised, #e4ebf5) 80%, #c6d5f5);
+}
+.hm-judge-cell {
+  background: color-mix(in srgb, var(--color-surface, #fff) 85%, #c6d5f5);
+  font-style: italic;
+  opacity: 0.9;
+}
+
 /* ── Model Picker ───────────────────────────────────────── */
 .model-picker {
  border: 1px solid var(--color-border, #d0d7e8);
@ -630,6 +797,24 @@ details[open] .picker-summary::before { content: '▼  '; }
  margin-left: auto;
 }

+.picker-bulk-btn {
+  padding: 0.1rem 0.45rem;
+  font-size: 0.7rem;
+  font-family: var(--font-mono, monospace);
+  background: var(--color-surface, #fff);
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.25rem;
+  color: var(--color-text-secondary, #6b7a99);
+  cursor: pointer;
+  transition: background 0.12s, color 0.12s;
+  flex-shrink: 0;
+}
+.picker-bulk-btn:hover {
+  background: var(--app-primary, #2A6080);
+  color: #fff;
+  border-color: var(--app-primary, #2A6080);
+}
+
 .picker-body {
  padding: 0.75rem;
  border-top: 1px solid var(--color-border, #d0d7e8);
@ -712,4 +897,61 @@ details[open] .picker-summary::before { content: '▼  '; }
  .picker-model-list { padding-left: 0; }
  .picker-model-name { max-width: 14ch; }
 }
+
+/* ── Node picker ────────────────────────────────────── */
+.node-picker {
+  display: flex;
+  align-items: center;
+  gap: 0.5rem;
+  flex-wrap: wrap;
+  padding: 0.5rem 0.75rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 0.5rem;
+  background: var(--color-surface-raised, #e4ebf5);
+}
+
+.node-picker-label {
+  font-size: 0.78rem;
+  font-weight: 600;
+  color: var(--color-text-secondary, #6b7a99);
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+  white-space: nowrap;
+}
+
+.node-chip {
+  display: inline-flex;
+  align-items: center;
+  gap: 0.3rem;
+  padding: 0.2rem 0.55rem;
+  border: 1px solid var(--color-border, #d0d7e8);
+  border-radius: 1rem;
+  background: var(--color-surface, #fff);
+  font-size: 0.78rem;
+  font-family: var(--font-mono, monospace);
+  color: var(--color-text, #1a2338);
+  cursor: pointer;
+  transition: background 0.12s, opacity 0.12s;
+}
+.node-chip--off {
+  opacity: 0.45;
+  background: transparent;
+}
+.node-chip--offline {
+  opacity: 0.35;
+  cursor: not-allowed;
+  font-style: italic;
+}
+.node-chip-check { accent-color: var(--app-primary, #2A6080); }
+.node-chip-status {
+  font-size: 0.66rem;
+  color: var(--color-text-secondary, #6b7a99);
+}
+
+.node-picker-hint {
+  font-size: 0.72rem;
+  color: var(--color-text-secondary, #6b7a99);
+  font-family: var(--font-mono, monospace);
+  margin-left: auto;
+}
 </style>
--- a/web/src/views/ModelsView.vue
+++ b/web/src/views/ModelsView.vue
@ -51,8 +51,31 @@
          <span v-if="lookupResult.adapter_recommendation" class="chip chip-adapter">
            {{ lookupResult.adapter_recommendation }}
          </span>
-          <span v-if="lookupResult.size != null" class="preview-size">
-            {{ humanBytes(lookupResult.size) }}
+          <span v-if="selectedQuantSize > 0" class="preview-size">
+            {{ humanBytes(selectedQuantSize) }}
+          </span>
+        </div>
+
+        <!-- GGUF quantization picker — only shown for GGUF repos -->
+        <div v-if="lookupResult.gguf_files?.length" class="quant-picker">
+          <label class="quant-label" for="quant-select">Quantization</label>
+          <select
+            id="quant-select"
+            v-model="selectedQuant"
+            class="quant-select"
+            aria-label="Select quantization variant"
+          >
+            <option :value="null" disabled>Select quantization…</option>
+            <option
+              v-for="f in lookupResult.gguf_files"
+              :key="f.filename"
+              :value="f.quant_name ?? f.filename"
+            >
+              {{ f.quant_name ?? f.filename }} — {{ humanBytes(f.size) }}
+            </option>
+          </select>
+          <span class="quant-hint">
+            Q5_K_M or Q6_K recommended for 8 GB GPUs. Q8_0 for max quality.
          </span>
        </div>

@ -67,7 +90,7 @@

        <button
          class="btn-primary btn-add-queue"
-          :disabled="lookupResult.already_installed || lookupResult.already_queued || addingToQueue"
+          :disabled="!canAddToQueue"
          @click="addToQueue"
        >
          {{ addingToQueue ? 'Adding…' : 'Add to queue' }}
@ -99,9 +122,39 @@
          <span v-if="model.role"                  class="chip chip-role">{{ model.role }}</span>
          <span v-if="model.service"               class="chip" :class="serviceChipClass(model.service)">{{ model.service }}</span>
          <span v-if="model.adapter_recommendation" class="chip chip-adapter">{{ model.adapter_recommendation }}</span>
+          <span v-if="model.quant_pattern"          class="chip chip-quant">{{ model.quant_pattern }}</span>
+        </div>
+        <!-- Allow manual service/role assignment for unrecognized pipeline tags -->
+        <div v-if="!model.service" class="classify-row queue-classify">
+          <select
+            class="classify-select"
+            :value="classifyDraft[model.id]?.service ?? ''"
+            @change="onServiceChange(model.id, ($event.target as HTMLSelectElement).value)"
+            aria-label="Assign service"
+          >
+            <option value="" disabled>Service…</option>
+            <option v-for="svc in CLASSIFIABLE_SERVICES" :key="svc.value" :value="svc.value">{{ svc.label }}</option>
+          </select>
+          <select
+            class="classify-select"
+            :value="classifyDraft[model.id]?.role ?? ''"
+            :disabled="!classifyDraft[model.id]?.service"
+            @change="(e) => setClassifyRole(model.id, (e.target as HTMLSelectElement).value)"
+            aria-label="Assign role"
+          >
+            <option value="" disabled>Role…</option>
+            <option
+              v-for="role in rolesForService(classifyDraft[model.id]?.service ?? '')"
+              :key="role"
+              :value="role"
+            >{{ role }}</option>
+          </select>
        </div>
        <div class="model-card-actions">
-          <button class="btn-primary btn-sm" @click="approveModel(model.id)">
+          <button
+            class="btn-primary btn-sm"
+            @click="approveModel(model.id, classifyDraft[model.id])"
+          >
            Approve download
          </button>
        </div>
@ -252,6 +305,12 @@ import { ref, computed, onMounted, onUnmounted } from 'vue'

 // ── Type definitions ──────────────────────────────────

+interface GgufFile {
+  filename: string
+  size: number
+  quant_name: string | null
+}
+
 interface LookupResult {
  repo_id: string
  pipeline_tag: string | null
@ -260,7 +319,8 @@ interface LookupResult {
  service: string | null
  compatible: boolean
  warning: string | null
-  size: number | null
+  model_size_bytes: number
+  gguf_files: GgufFile[] | null
  description: string | null
  already_installed: boolean
  already_queued: boolean
@ -274,6 +334,7 @@ interface QueuedModel {
  adapter_recommendation: string | null
  role: string | null
  service: string | null
+  quant_pattern: string | null
 }

 interface InstalledModel {
@ -302,6 +363,26 @@ const lookupLoading = ref(false)
 const lookupError   = ref<string | null>(null)
 const lookupResult  = ref<LookupResult | null>(null)
 const addingToQueue = ref(false)
+const selectedQuant = ref<string | null>(null)
+
+// Size of the selected GGUF file, or total model size for non-GGUF repos.
+const selectedQuantSize = computed<number>(() => {
+  const r = lookupResult.value
+  if (!r) return 0
+  if (r.gguf_files?.length && selectedQuant.value) {
+    const f = r.gguf_files.find(f => (f.quant_name ?? f.filename) === selectedQuant.value)
+    return f?.size ?? r.model_size_bytes
+  }
+  return r.model_size_bytes
+})
+
+// Disable "Add to queue" when a GGUF repo but no quant chosen yet.
+const canAddToQueue = computed(() => {
+  const r = lookupResult.value
+  if (!r || r.already_installed || r.already_queued || addingToQueue.value) return false
+  if (r.gguf_files?.length && !selectedQuant.value) return false
+  return true
+})

 const queuedModels    = ref<QueuedModel[]>([])
 const installedModels = ref<InstalledModel[]>([])
@ -411,6 +492,7 @@ async function doLookup() {
  lookupLoading.value = true
  lookupError.value   = null
  lookupResult.value  = null
+  selectedQuant.value = null

  try {
    const res = await fetch(`/api/models/lookup?repo_id=${encodeURIComponent(repoId)}`)
@ -442,7 +524,15 @@ async function addToQueue() {
    const res = await fetch('/api/models/queue', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ repo_id, pipeline_tag, adapter_recommendation, role, service }),
+      body: JSON.stringify({
+        repo_id,
+        pipeline_tag,
+        adapter_recommendation,
+        role,
+        service,
+        model_size_bytes: selectedQuantSize.value,
+        quant_pattern: selectedQuant.value,
+      }),
    })
    if (res.ok) {
      lookupResult.value = { ...lookupResult.value, already_queued: true }
@ -454,8 +544,16 @@ async function addToQueue() {
  }
 }

-async function approveModel(id: string) {
+async function approveModel(id: string, draft?: { service: string; role: string }) {
  try {
+    // If the user picked a service/role for an unrecognized model, patch it first.
+    if (draft?.service && draft?.role) {
+      await fetch(`/api/models/queue/${encodeURIComponent(id)}`, {
+        method: 'PATCH',
+        headers: { 'Content-Type': 'application/json' },
+        body: JSON.stringify({ service: draft.service, role: draft.role }),
+      })
+    }
    const res = await fetch(`/api/models/queue/${encodeURIComponent(id)}/approve`, { method: 'POST' })
    if (res.ok) {
      await loadQueue()
@ -774,6 +872,44 @@ onUnmounted(() => {
  align-self: flex-start;
 }

+/* ── Quant picker ── */
+.quant-picker {
+  display: flex;
+  flex-direction: column;
+  gap: 0.35rem;
+}
+
+.quant-label {
+  font-size: 0.8rem;
+  font-weight: 600;
+  color: var(--color-text-muted, #4a5c7a);
+  text-transform: uppercase;
+  letter-spacing: 0.04em;
+}
+
+.quant-select {
+  padding: 0.4rem 0.6rem;
+  border: 1px solid var(--color-border, #a8b8d0);
+  border-radius: var(--radius-md, 0.5rem);
+  background: var(--color-surface, #f0f4fb);
+  color: var(--color-text, #1a2338);
+  font-size: 0.9rem;
+  font-family: var(--font-mono, monospace);
+  cursor: pointer;
+}
+
+.quant-hint {
+  font-size: 0.78rem;
+  color: var(--color-text-muted, #4a5c7a);
+}
+
+.chip-quant {
+  background: color-mix(in srgb, var(--color-primary, #2A6080) 15%, transparent);
+  color: var(--color-primary, #2A6080);
+  font-family: var(--font-mono, monospace);
+  font-size: 0.75rem;
+}
+
 /* ── Model cards (queue + downloads) ── */
 .model-card {
  border: 1px solid var(--color-border, #a8b8d0);
--- a/web/src/views/PlansBenchTab.vue
+++ b/web/src/views/PlansBenchTab.vue