feat: plans benchmark harness — model scoring for CF planning prompts

Adds benchmark_plans.py script, plans_bench API router, PlansBenchTab Vue component, and registers /api/plans-bench in api.py. Also extends models registry (cf-text catalog integration), cforch client, LlmEvalTab, and ModelsView with cf-orch fleet support. Wires Planning mode into BenchmarkView.
2026-05-02 23:36:04 -07:00 · 2026-05-02 23:36:04 -07:00 · bce932461a
commit bce932461a
parent e11db5ccd9
14 changed files with 3137 additions and 59 deletions
--- a/.env.example
+++ b/.env.example
@ -17,3 +17,7 @@ CF_LICENSE_KEY=CFG-AVCT-xxxx-xxxx-xxxx
 # Set one of these to use a cloud LLM instead of a local model.
 # ANTHROPIC_API_KEY=sk-ant-...
 # OPENAI_API_KEY=sk-...
 # ── HuggingFace (required for gated/terms-restricted model downloads) ─────────
 # Generate at https://huggingface.co/settings/tokens and accept model terms first.
 # HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
--- a/app/api.py
+++ b/app/api.py
@ -34,6 +34,12 @@ app.include_router(eval_router, prefix="/api")
 from app.train.train import router as train_router
 app.include_router(train_router, prefix="/api/train")
 from app.plans_bench import router as plans_bench_router
 app.include_router(plans_bench_router, prefix="/api/plans-bench")
 # In-memory last-action store (single user, local tool — in-memory is fine)
 _last_action: dict | None = None
 from app.dashboard import router as dashboard_router
 app.include_router(dashboard_router, prefix="/api")
--- a/app/cforch.py
+++ b/app/cforch.py
@ -17,9 +17,12 @@ import logging
 import os
 import re
 import subprocess as _subprocess
 import tempfile
 from pathlib import Path
 from typing import Any
 import urllib.parse
 import yaml
 from fastapi import APIRouter, HTTPException
 from fastapi.responses import StreamingResponse
@ -75,9 +78,31 @@ def _load_cforch_config() -> dict:
        "license_key":     _coalesce(file_cfg.get("license_key", ""),     "CF_LICENSE_KEY"),
        "ollama_url":      _coalesce(file_cfg.get("ollama_url", ""),       "OLLAMA_HOST"),
        "ollama_model":    _coalesce(file_cfg.get("ollama_model", ""),     "OLLAMA_MODEL"),
        "judge_url":       _coalesce(file_cfg.get("judge_url", ""),        "CF_JUDGE_URL"),
        "hf_token":        _coalesce(file_cfg.get("hf_token", ""),         "HF_TOKEN"),
    }
 def _validate_service_url(url: str, param_name: str) -> str:
    """Validate that a URL is a well-formed http/https URL with a hostname.
    Guards against SSRF: only http/https is allowed; the URL must have a
    non-empty host. Does not enforce an allowlist — call sites are internal
    tooling, not a public API.
    """
    if not url:
        return url
    try:
        parsed = urllib.parse.urlparse(url)
    except Exception:
        raise HTTPException(400, f"{param_name}: not a valid URL")
    if parsed.scheme not in ("http", "https"):
        raise HTTPException(400, f"{param_name}: URL must start with http:// or https://")
    if not parsed.hostname:
        raise HTTPException(400, f"{param_name}: URL has no hostname")
    return url
 def _strip_ansi(text: str) -> str:
    """Remove ANSI escape codes from a string."""
    return re.sub(r'\x1b\[[0-9;]*m', '', text)
@ -147,48 +172,141 @@ def get_tasks() -> dict:
 # ── GET /models ────────────────────────────────────────────────────────────────
 # Services and roles surfaced in the benchmark model picker.
 # Covers all cf-orch service types that benchmark.py can route tasks to.
 _BENCH_SERVICES = frozenset({
    "cf-text", "vllm",         # LLM text generation
    "cf-stt",                  # speech-to-text
    "cf-tts",                  # text-to-speech
    "cf-vision",               # image classification / embedding
    "cf-voice",                # audio context classification
 })
 _BENCH_ROLES = frozenset({
    "generator", "vlm",        # LLM roles
    "stt", "alm",              # speech recognition
    "tts",                     # speech synthesis
    "vision", "embedding",     # image understanding
    "classifier",              # audio classification (cf-voice)
 })
@router.get("/models")
 def get_models() -> dict:
-    """Return model list from bench_models.yaml."""
+    """Return model list from bench_models.yaml merged with locally installed models.
    bench_models.yaml entries are listed first and take precedence; any installed
    model whose repo_id is already present in the YAML is skipped.  Only models
    whose service is in _BENCH_SERVICES (cf-text, vllm, cf-stt, cf-tts, cf-vision,
    cf-voice) are surfaced from the installed registry.
    """
    cfg = _load_cforch_config()
    models_path = cfg.get("bench_models", "")
    if not models_path:
        return {"models": []}
    p = Path(models_path)
    if not p.exists():
        return {"models": []}
    try:
        raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
    except yaml.YAMLError as exc:
        logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
        return {"models": []}
    models_raw = raw.get("models", []) or []
    models: list[dict] = []
-    for m in models_raw:
+    bench_ids: set[str] = set()
-        if not isinstance(m, dict):
+
-            continue
+    if models_path:
-        models.append({
+        p = Path(models_path)
-            "name": m.get("name", ""),
+        if p.exists():
-            "id": m.get("id", ""),
+            try:
-            "service": m.get("service", "ollama"),
+                raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {}
-            "tags": m.get("tags", []) or [],
+            except yaml.YAMLError as exc:
-            "vram_estimate_mb": m.get("vram_estimate_mb", 0),
+                logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc)
-        })
+                raw = {}
            for m in (raw.get("models", []) or []):
                if not isinstance(m, dict):
                    continue
                model_id = m.get("id", "")
                models.append({
                    "name": m.get("name", ""),
                    "id": model_id,
                    "service": m.get("service", "ollama"),
                    "tags": m.get("tags", []) or [],
                    "vram_estimate_mb": m.get("vram_estimate_mb", 0),
                })
                if model_id:
                    bench_ids.add(model_id)
    # Merge installed generator models not already in bench_models.yaml.
    try:
        from app.models import list_installed  # local import avoids circular dependency at module load
        for installed in list_installed():
            model_id: str = installed.get("model_id") or ""
            service: str = installed.get("service") or ""
            role: str = installed.get("role") or ""
            if not model_id:
                continue
            if service not in _BENCH_SERVICES or role not in _BENCH_ROLES:
                continue
            if model_id in bench_ids:
                continue
            display_name = model_id.split("/", 1)[-1] if "/" in model_id else model_id
            models.append({
                "name": display_name,
                "id": model_id,
                "service": service,
                "tags": [role],
                "vram_estimate_mb": installed.get("vram_mb") or 0,
            })
            bench_ids.add(model_id)
    except Exception as exc:
        logger.warning("Could not merge installed models into model list: %s", exc)
    return {"models": models}
 # ── GET /run ───────────────────────────────────────────────────────────────────
@router.get("/nodes")
 def get_nodes() -> dict:
    """Proxy the coordinator's /api/nodes list, returning node_id + online status.
    Online is inferred from last_heartbeat: any node with a recent heartbeat is online.
    Returns an empty list if the coordinator is unreachable.
    """
    cfg = _load_cforch_config()
    coordinator_url = cfg.get("coordinator_url", "").rstrip("/")
    if not coordinator_url:
        return {"nodes": []}
    try:
        import httpx as _httpx
        resp = _httpx.get(f"{coordinator_url}/api/nodes", timeout=5.0)
        resp.raise_for_status()
        raw_nodes = resp.json().get("nodes", [])
        return {
            "nodes": [
                {
                    "node_id": n.get("node_id", ""),
                    "online": n.get("last_heartbeat") is not None,
                    "gpus": [
                        {
                            "gpu_id": g.get("gpu_id"),
                            "name": g.get("name", ""),
                            "vram_total_mb": g.get("vram_total_mb", 0),
                            "vram_free_mb": g.get("vram_free_mb", 0),
                        }
                        for g in n.get("gpus", [])
                    ],
                }
                for n in raw_nodes
            ]
        }
    except Exception as exc:
        logger.warning("Could not fetch nodes from coordinator: %s", exc)
        return {"nodes": []}
@router.get("/run")
 def run_benchmark(
    task_ids: str = "",
    model_ids: str = "",
    model_tags: str = "",
    coordinator_url: str = "",
    ollama_url: str = "",
    judge_url: str = "",
    judge_backend: str = "chat",
    workers: int = 1,
    node_ids: str = "",
 ) -> StreamingResponse:
    """Spawn cf-orch benchmark.py and stream stdout as SSE progress events."""
    global _BENCH_RUNNING, _bench_proc
@ -205,6 +323,13 @@ def run_benchmark(
    cfg_coordinator = cfg.get("coordinator_url", "")
    cfg_ollama      = cfg.get("ollama_url", "")
    cfg_license_key = cfg.get("license_key", "")
    cfg_judge_url   = cfg.get("judge_url", "")
    # Validate URL params before spawning the subprocess.
    # _validate_service_url raises HTTPException on bad input (caught by FastAPI before streaming starts).
    _validate_service_url(coordinator_url, "coordinator_url")
    _validate_service_url(ollama_url, "ollama_url")
    _validate_service_url(judge_url, "judge_url")
    def generate():
        global _BENCH_RUNNING, _bench_proc
@ -213,16 +338,68 @@ def run_benchmark(
            yield f"data: {json.dumps({'type': 'error', 'message': 'bench_script not configured or not found'})}\n\n"
            return
        # Build effective models file: bench_models.yaml + any installed models
        # whose IDs were selected but are absent from the YAML (e.g. downloaded
        # via the Models view). Written to a temp file so benchmark.py sees one
        # unified list; cleaned up in the finally block.
        effective_models_file = bench_models
        _tmp_models_path: str | None = None
        if model_ids and bench_models and Path(bench_models).exists():
            requested_ids = set(model_ids.split(","))
            try:
                raw_bench = yaml.safe_load(Path(bench_models).read_text(encoding="utf-8")) or {}
                bench_entries: list[dict] = raw_bench.get("models", []) or []
                bench_id_set = {m.get("id", "") for m in bench_entries if isinstance(m, dict)}
                missing_ids = requested_ids - bench_id_set
                if missing_ids:
                    from app.models import list_installed
                    installed_map = {
                        m["model_id"]: m
                        for m in list_installed()
                        if m.get("model_id") and m.get("service") in _BENCH_SERVICES
                    }
                    extra: list[dict] = []
                    for mid in missing_ids:
                        if mid in installed_map:
                            inst = installed_map[mid]
                            entry: dict[str, Any] = {
                                "id": mid,
                                "name": mid.split("/", 1)[-1] if "/" in mid else mid,
                                "service": inst.get("service", "cf-text"),
                                "vram_estimate_mb": inst.get("vram_mb") or 0,
                                "tags": [inst.get("role", "generator")],
                                "temperature": 0.0,
                            }
                            local_path = inst.get("path", "") or inst.get("local_path", "")
                            if local_path:
                                entry["model_path"] = local_path
                            extra.append(entry)
                    if extra:
                        merged = {"models": bench_entries + extra}
                        tf = tempfile.NamedTemporaryFile(
                            mode="w", suffix=".yaml", delete=False,
                            prefix="avocet_bench_models_",
                        )
                        yaml.dump(merged, tf)
                        tf.close()
                        _tmp_models_path = tf.name
                        effective_models_file = _tmp_models_path
            except Exception as exc:
                logger.warning("Could not merge installed models into temp bench file: %s", exc)
        cmd = [
            python_bin,
            bench_script,
            "--tasks", bench_tasks,
-            "--models", bench_models,
+            "--models", effective_models_file,
            "--output", results_dir,
        ]
        if task_ids:
            cmd.extend(["--filter-tasks"] + task_ids.split(","))
        if model_ids:
            cmd.extend(["--filter-models"] + model_ids.split(","))
        if model_tags:
            cmd.extend(["--filter-tags"] + model_tags.split(","))
@ -233,6 +410,15 @@ def run_benchmark(
            cmd.extend(["--coordinator", effective_coordinator])
        if effective_ollama:
            cmd.extend(["--ollama-url", effective_ollama])
        effective_judge = judge_url if judge_url else cfg_judge_url
        if effective_judge:
            cmd.extend(["--judge-url", effective_judge])
        if judge_backend and judge_backend != "chat":
            cmd.extend(["--judge-backend", judge_backend])
        if workers > 1:
            cmd.extend(["--workers", str(workers)])
        if node_ids:
            cmd.extend(["--nodes"] + node_ids.split(","))
        # Pass license key as env var so subprocess can authenticate with cf-orch
        proc_env = {**os.environ}
@ -273,6 +459,11 @@ def run_benchmark(
            yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n"
        finally:
            _BENCH_RUNNING = False
            if _tmp_models_path:
                try:
                    os.unlink(_tmp_models_path)
                except OSError:
                    pass
    return StreamingResponse(
        generate(),
@ -295,6 +486,7 @@ def get_cforch_config() -> dict:
        "coordinator_url": cfg.get("coordinator_url", ""),
        "ollama_url":      cfg.get("ollama_url", ""),
        "ollama_model":    cfg.get("ollama_model", ""),
        "judge_url":       cfg.get("judge_url", ""),
        "license_key_set": bool(cfg.get("license_key", "")),
        "source": "env" if not _config_file().exists() else "yaml+env",
    }
@ -303,7 +495,7 @@ def get_cforch_config() -> dict:
 # ── GET /results ───────────────────────────────────────────────────────────────
@router.get("/results")
-def get_results() -> dict:
+def get_results() -> list:
    """Return the latest benchmark summary.json from results_dir."""
    cfg = _load_cforch_config()
    results_dir = cfg.get("results_dir", "")
--- a/app/models.py
+++ b/app/models.py
@ -15,6 +15,7 @@ from __future__ import annotations
 import json
 import logging
 import os
 import re
 import shutil
 import threading
 from datetime import datetime, timezone
@ -60,6 +61,30 @@ _CF_ORCH_PROFILES_DIR: Path = Path(
 router = APIRouter()
 # ── HuggingFace auth ─────────────────────────────────────────────────────────
 def _get_hf_token() -> str | None:
    """Return HF token from label_tool.yaml, then HF_TOKEN / HUGGING_FACE_HUB_TOKEN env vars."""
    config_file = _ROOT / "config" / "label_tool.yaml"
    if config_file.exists():
        try:
            import yaml as _yaml
            raw = _yaml.safe_load(config_file.read_text(encoding="utf-8")) or {}
            token = (raw.get("hf_token") or raw.get("cforch", {}).get("hf_token") or "").strip()
            if token:
                return token
        except Exception:
            pass
    return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None
 # ── GGUF quantization detection ───────────────────────────────────────────────
 # Matches quant identifiers in GGUF filenames: Q4_K_M, Q8_0, F16, IQ3_M, etc.
 _QUANT_RE = re.compile(
    r'[._-]((?:IQ\d|Q\d)[A-Z0-9_]*|F16|BF16)\.gguf$',
    re.IGNORECASE,
 )
 # ── Download progress shared state ────────────────────────────────────────────
 # Updated by the background download thread; read by GET /download/stream.
 _download_progress: dict[str, Any] = {}
@ -91,12 +116,15 @@ _TAG_TO_INFO: dict[str, _TagInfo] = {
    "audio-classification":           {"adapter": None, "role": "classifier", "service": "cf-voice"},
    # TTS — cf-tts text-to-speech service
    "text-to-speech":                 {"adapter": None, "role": "tts",       "service": "cf-tts"},
-    # Vision — cf-vision image classification / embedding / VLM service
+    # Vision classifiers / embedders — cf-vision (SigLIP/CLIP-style models)
    "image-classification":           {"adapter": None, "role": "vision",    "service": "cf-vision"},
    "zero-shot-image-classification": {"adapter": None, "role": "vision",    "service": "cf-vision"},
    "image-feature-extraction":       {"adapter": None, "role": "embedding", "service": "cf-vision"},
-    "image-text-to-text":             {"adapter": None, "role": "vlm",       "service": "cf-vision"},
+    # Generative VLMs (image+text → text) — run under vllm, not cf-vision.
-    "visual-question-answering":      {"adapter": None, "role": "vlm",       "service": "cf-vision"},
+    # cf-vision is a classifier/embedder service; generative VLMs like Qwen-VL,
    # LLaVA, and InternVL are textgen models that happen to accept image inputs.
    "image-text-to-text":             {"adapter": None, "role": "vlm",       "service": "vllm"},
    "visual-question-answering":      {"adapter": None, "role": "vlm",       "service": "vllm"},
    # Image generation — cf-image (text → image; distinct from cf-vision image understanding)
    "text-to-image":                  {"adapter": None, "role": "image-gen", "service": "cf-image"},
    # Embedding — cf-core shared embedding layer
@ -195,10 +223,17 @@ def _get_queue_entry(entry_id: str) -> dict | None:
 def _catalog_key(repo_id: str) -> str:
    """Derive a readable catalog key from repo_id.
-    ibm-granite/granite-4.1-8b  →  granite-4.1-8b
+    ibm-granite/granite-4.1-8b              →  granite-4.1-8b
-    facebook/bart-large-cnn     →  bart-large-cnn
+    facebook/bart-large-cnn                 →  bart-large-cnn
    WithinUsAI/Opus4.7-GODs.Ghost.Codex-4B.GGuF  →  opus4.7-gods.ghost.codex-4b
    The coordinator skips catalog lookup for keys ending in ".gguf" (treats them
    as direct file paths). Strip the suffix so GGUF repo names produce valid keys.
    """
-    return repo_id.split("/", 1)[-1].lower()
+    key = repo_id.split("/", 1)[-1].lower()
    if key.endswith(".gguf"):
        key = key[:-5]
    return key
 def _insert_catalog_entry(content: str, entry_lines: str) -> str:
@ -290,6 +325,15 @@ def _register_in_node_catalogs(
            max_mb: int = cf_text.get("max_mb", 0)
            catalog: dict = cf_text.get("catalog") or {}
            # If the node has a different local model dir, remap the NFS path.
            model_base = cf_text.get("model_base_path", "").rstrip("/")
            if model_base:
                nfs_base = str(_CF_TEXT_MODELS_DIR).rstrip("/")
                model_name = local_path.name
                effective_path_str = f"{model_base}/{model_name}"
            else:
                effective_path_str = local_path_str
            # Skip if key already exists
            if model_key in catalog:
                logger.debug("Key %r already in %s — skipping", model_key, yaml_file.name)
@ -301,10 +345,10 @@ def _register_in_node_catalogs(
                for entry in catalog.values()
                if isinstance(entry, dict)
            }
-            if local_path_str in registered_paths or any(
+            if effective_path_str in registered_paths or any(
-                p.startswith(local_path_str + "/") for p in registered_paths
+                p.startswith(effective_path_str + "/") for p in registered_paths
            ):
-                logger.debug("Path %s already registered in %s — skipping", local_path_str, yaml_file.name)
+                logger.debug("Path %s already registered in %s — skipping", effective_path_str, yaml_file.name)
                continue
            # Determine whether model fits at FP16 or needs 4-bit
@ -330,12 +374,18 @@ def _register_in_node_catalogs(
                if needs_4bit
                else f"  # FP16 file-size estimate"
            )
            env_block = (
                f"        env:\n"
                f"          CF_TEXT_4BIT: \"1\"\n"
                if needs_4bit else ""
            )
            entry_block = (
                f"      # auto-registered by avocet on download\n"
                f"      {model_key}:\n"
-                f"        path: {local_path_str}\n"
+                f"        path: {effective_path_str}\n"
                f"        vram_mb: {vram_for_node}{vram_comment}\n"
                f"        description: \"{desc}\"\n"
                f"{env_block}"
            )
            new_content = _insert_catalog_entry(content, entry_block)
@ -388,12 +438,17 @@ def _run_download(
    role: str | None = None,
    service: str | None = None,
    model_size_bytes: int = 0,
    quant_pattern: str | None = None,
 ) -> None:
    """Background thread: download model via huggingface_hub.snapshot_download.
    model_size_bytes is the sum of file sizes reported by the HF API (siblings).
    It is used to estimate vram_mb and written to model_info.json so cf-orch can
    budget VRAM when allocating a cf-text instance for this model.
    quant_pattern: when set, restricts snapshot_download to only files matching
    *{quant_pattern}*.gguf (plus metadata). Avoids downloading every quant variant
    from GGUF-only repos like bartowski/*.
    """
    global _download_progress
    local_dir = _model_dir_for(repo_id, service)
@ -422,10 +477,20 @@ def _run_download(
        local_dir.mkdir(parents=True, exist_ok=True)
        poll_thread.start()
-        snapshot_download(
+
-            repo_id=repo_id,
+        dl_kwargs: dict[str, Any] = {"repo_id": repo_id, "local_dir": str(local_dir)}
-            local_dir=str(local_dir),
+        hf_token = _get_hf_token()
-        )
+        if hf_token:
            dl_kwargs["token"] = hf_token
        if quant_pattern:
            # Include both cases: repos use mixed conventions (Q6_K vs q6_k).
            dl_kwargs["allow_patterns"] = [
                f"*{quant_pattern.upper()}*.gguf",
                f"*{quant_pattern.lower()}*.gguf",
                "*.json",
                "README.md",
            ]
        snapshot_download(**dl_kwargs)
        # Estimate VRAM from reported file size.
        # HF siblings sizes are pre-quantisation file sizes; add 10% for KV cache
@ -531,9 +596,31 @@ def lookup_model(repo_id: str) -> dict:
        )
        logger.warning("Unsupported pipeline_tag %r for %s", pipeline_tag, repo_id)
-    # Estimate model size from siblings list
+    # Detect GGUF files and parse quant names from siblings list.
    # For GGUF-only repos (bartowski, TheBloke, etc.) this lets the UI show
    # a per-quant size picker instead of downloading every variant.
    siblings = data.get("siblings") or []
-    model_size_bytes: int = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))
+    gguf_files: list[dict] = []
    for s in siblings:
        if not isinstance(s, dict):
            continue
        fname: str = s.get("rfilename", "")
        if not fname.lower().endswith(".gguf"):
            continue
        m = _QUANT_RE.search(fname)
        gguf_files.append({
            "filename": fname,
            "size": s.get("size", 0) or 0,
            "quant_name": m.group(1).upper() if m else None,
        })
    gguf_files.sort(key=lambda f: f["size"])
    # model_size_bytes: total of all siblings (for non-GGUF repos) or all GGUFs only.
    # For GGUF repos the frontend will substitute the selected quant's size on submit.
    if gguf_files:
        model_size_bytes: int = sum(f["size"] for f in gguf_files)
    else:
        model_size_bytes = sum(s.get("size", 0) for s in siblings if isinstance(s, dict))
    # Description: first 300 chars of card data (modelId field used as fallback)
    card_data = data.get("cardData") or {}
@ -549,6 +636,7 @@ def lookup_model(repo_id: str) -> dict:
        "compatible": compatible,
        "warning": warning,
        "model_size_bytes": model_size_bytes,
        "gguf_files": gguf_files if gguf_files else None,
        "description": description,
        "tags": data.get("tags") or [],
        "downloads": data.get("downloads") or 0,
@ -579,6 +667,9 @@ class QueueAddRequest(BaseModel):
    # Stored in the queue entry so approve can pass it to _run_download
    # without a second HF API round-trip.
    model_size_bytes: int = 0
    # GGUF quantization pattern (e.g. "Q5_K_M"). When set, snapshot_download
    # restricts to *{quant_pattern}*.gguf instead of fetching all variants.
    quant_pattern: str | None = None
@router.post("/queue", status_code=201)
@ -597,6 +688,7 @@ def add_to_queue(req: QueueAddRequest) -> dict:
        "role": req.role,
        "service": req.service,
        "model_size_bytes": req.model_size_bytes,
        "quant_pattern": req.quant_pattern,
        "status": "pending",
        "queued_at": datetime.now(timezone.utc).isoformat(),
    }
@ -629,6 +721,7 @@ def approve_queue_entry(entry_id: str) -> dict:
            entry.get("role"),
            entry.get("service"),
            entry.get("model_size_bytes", 0),
            entry.get("quant_pattern"),
        ),
        daemon=True,
        name=f"model-download-{entry_id}",
@ -638,6 +731,32 @@ def approve_queue_entry(entry_id: str) -> dict:
    return {"ok": True}
 # ── PATCH /queue/{id} ─────────────────────────────────────────────────────────
 class QueuePatchRequest(BaseModel):
    service: str | None = None
    role: str | None = None
@router.patch("/queue/{entry_id}")
 def patch_queue_entry(entry_id: str, body: QueuePatchRequest) -> dict:
    """Update mutable fields (service, role) on a pending queue entry."""
    entry = _get_queue_entry(entry_id)
    if entry is None:
        raise HTTPException(404, f"Queue entry {entry_id!r} not found")
    if entry.get("status") != "pending":
        raise HTTPException(409, f"Only pending entries can be patched (current: {entry.get('status')!r})")
    updates: dict = {}
    if body.service is not None:
        updates["service"] = body.service
    if body.role is not None:
        updates["role"] = body.role
    updated = _update_queue_entry(entry_id, updates)
    return updated or {}
 # ── DELETE /queue/{id} ─────────────────────────────────────────────────────────
@router.delete("/queue/{entry_id}")
--- a/config/label_tool.yaml.example
+++ b/config/label_tool.yaml.example
@ -41,11 +41,15 @@ cforch:
  # Python interpreter with cf-orch installed
  python_bin:   /devl/miniconda3/envs/cf/bin/python
-  # Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST
+  # Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST / CF_JUDGE_URL / HF_TOKEN
  # coordinator_url: http://localhost:7700
  # license_key:     CFG-AVCT-xxxx-xxxx-xxxx
  # ollama_url:      http://localhost:11434
  # ollama_model:    llama3.2:3b
  # judge_url:       http://10.1.10.158:8008  # Sif cf-text — LLM-as-judge secondary scorer
  # judge_url:       http://10.1.10.71:8008   # Heimdall cf-text (alternative)
  # Or set CF_JUDGE_URL. Populates the Judge URL field in the LLM Eval UI automatically.
  # hf_token:        hf_xxxxxxxxxxxxxxxxxxxx  # HuggingFace token — required for gated/terms-restricted models
 # Imitate tab — pull real samples from sibling CF product APIs and run them
 # through local LLMs to build a corrections dataset.
--- a/manage.sh
+++ b/manage.sh
@ -90,6 +90,12 @@ usage() {
    echo -e "    ${GREEN}score [args]${NC}             Shortcut: --score [args]"
    echo -e "    ${GREEN}compare [args]${NC}           Shortcut: --compare [args]"
    echo ""
    echo "  Planning Benchmark:"
    echo -e "    ${GREEN}plans-bench [args]${NC}       Run benchmark_plans.py (args passed through)"
    echo -e "    ${GREEN}plans-list${NC}               Shortcut: --list-models"
    echo -e "    ${GREEN}plans-run <model> [args]${NC} Run a single model (--verbose auto-added)"
    echo -e "    ${GREEN}plans-compare <m1> <m2> [more]${NC}  Compare models side-by-side"
    echo ""
    echo "  Writing Style Benchmark:"
    echo -e "    ${GREEN}style-bench [args]${NC}       Run benchmark_style.py (args passed through)"
    echo -e "    ${GREEN}style-list${NC}               List available ollama models for style bench"
@ -127,6 +133,8 @@ case "$CMD" in
        fi
        mkdir -p "$LOG_DIR"
        API_LOG="${LOG_DIR}/api.log"
        # Load .env if present — sets HF_TOKEN and other optional overrides.
        [[ -f .env ]] && set -a && source .env && set +a
        info "Building Vue SPA…"
        (cd web && npm run build) >> "$API_LOG" 2>&1
        info "Starting FastAPI on port ${API_PORT}…"
@ -179,6 +187,9 @@ case "$CMD" in
        mkdir -p "$LOG_DIR"
        DEV_API_LOG="${LOG_DIR}/dev-api.log"
        # Load .env if present — sets HF_TOKEN and other optional overrides.
        [[ -f .env ]] && set -a && source .env && set +a
        if [[ -f "$DEV_API_PID_FILE" ]] && kill -0 "$(<"$DEV_API_PID_FILE")" 2>/dev/null; then
            warn "Dev API already running (PID $(<"$DEV_API_PID_FILE"))"
        else
@ -255,6 +266,30 @@ case "$CMD" in
        exec "$0" benchmark --compare "$@"
        ;;
    plans-bench)
        info "Running planning benchmark (${ENV_UI})…"
        "$PYTHON_UI" scripts/benchmark_plans.py "$@"
        ;;
    plans-list)
        exec "$0" plans-bench --list-models
        ;;
    plans-run)
        if [[ $# -lt 1 ]]; then
            error "Usage: ./manage.sh plans-run <model-key> [extra args]"
        fi
        MODEL="$1"; shift
        exec "$0" plans-bench --model "$MODEL" --verbose "$@"
        ;;
    plans-compare)
        if [[ $# -lt 2 ]]; then
            error "Usage: ./manage.sh plans-compare <model1> <model2> [more…]"
        fi
        exec "$0" plans-bench --compare "$@" --verbose
        ;;
    style-bench)
        info "Running writing style benchmark (${ENV_BM})…"
        if [[ ! -x "$PYTHON_BM" ]]; then
--- a/scripts/benchmark_plans.py
+++ b/scripts/benchmark_plans.py
@ -0,0 +1,719 @@
 #!/usr/bin/env python
 """CF-specific planning benchmark — compare base models before fine-tuning.
 Sends held-out CircuitForge planning prompts to one or more models via the
 cf-text (local) or cf-orch API, then scores responses against CF-specific
 rubrics. Use this to select the best base model for SFT.
 Scoring rubrics (each 0-1, summed to total/N):
  - task_structure    : uses checkbox syntax (- [ ]), git commit steps
  - tier_awareness    : mentions Free/Paid/Premium/Ultra tiers
  - privacy_pillar    : mentions privacy/local-inference/no-logging
  - safety_pillar     : mentions safety, human approval, or reversibility
  - accessibility     : mentions ND/accessibility/adaptive needs
  - license_split     : mentions MIT vs BSL or open-core model
  - file_paths        : uses plausible file path references
  - cf_conventions    : uses conda run -n cf, /Library/Development/, or known CF dirs
  - paired_coherence  : (paired only) plan references the design doc's feature name
  - length_ok         : 300–2500 words (under-short = hallucination risk; over-long = padding)
 Usage
 -----
    # List available model targets
    python scripts/benchmark_plans.py --list-models
    # Run all held-out prompts against a single model, print report
    python scripts/benchmark_plans.py --model llama3.2-3b
    # Compare two models side-by-side
    python scripts/benchmark_plans.py --compare llama3.2-3b mistral-7b
    # Run with a custom API base (cf-text default: http://localhost:8080/v1)
    python scripts/benchmark_plans.py --model llama3.2-3b --api-base http://localhost:8080/v1
    # Export detailed results JSON
    python scripts/benchmark_plans.py --model llama3.2-3b --output data/bench_results.json
 """
 from __future__ import annotations
 import argparse
 import json
 import re
 import sys
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass, field, asdict
 from pathlib import Path
 from typing import Any
 import httpx
 # ── Paths ──────────────────────────────────────────────────────────────────────
 _ROOT = Path(__file__).parent.parent
 _DATA_DIR = _ROOT / "data"
 CF_TEXT_BASE   = "http://localhost:8080/v1"
 CF_ORCH_BASE   = "http://localhost:8090/v1"
 CF_COORD_URL   = "http://10.1.10.71:7700"   # cf-orch coordinator (LAN)
 # ── Held-out prompts ───────────────────────────────────────────────────────────
 # These are NOT in the training export (no matching docs in circuitforge-plans/).
 # Each prompt exercises a different CF planning domain.
 HELD_OUT_PROMPTS: list[dict[str, Any]] = [
    {
        "id": "ho_001",
        "name": "kiwi_barcode_ocr",
        "domain": "feature_plan",
        "prompt": (
            "You are a senior engineer on Kiwi, a CircuitForge pantry-tracking product. "
            "Write a detailed implementation plan for adding barcode scanning via device camera "
            "and receipt OCR to the item-add flow.\n\n"
            "The plan should include: file structure (create/modify), step-by-step task checklist "
            "with checkboxes, any DB migrations, and git commit steps."
        ),
        "expected_signals": ["task_structure", "file_paths", "cf_conventions"],
    },
    {
        "id": "ho_002",
        "name": "peregrine_ats_scoring",
        "domain": "feature_design",
        "prompt": (
            "Write a design document for Peregrine: ATS keyword scoring for job applications.\n\n"
            "Context: Peregrine users paste job descriptions and their resume. "
            "We want to score how well the resume keywords match the JD and suggest rewrites. "
            "Describe the architecture, data flow, and key design decisions."
        ),
        "expected_signals": ["privacy_pillar", "tier_awareness", "license_split"],
    },
    {
        "id": "ho_003",
        "name": "tier_gate_local_llm",
        "domain": "architecture",
        "prompt": (
            "Design the tier-gating architecture for a new CircuitForge product. "
            "The product should:\n"
            "- Default to local LLM inference for all tiers\n"
            "- Unlock cloud LLM for Paid tier and above\n"
            "- Keep fine-tuned model weights for Premium/Ultra only\n\n"
            "Describe how the tier check integrates with the LLM router, "
            "what happens when a Free user tries a Paid-tier feature, "
            "and how BYOK (bring-your-own-key) fits in."
        ),
        "expected_signals": ["tier_awareness", "privacy_pillar", "license_split"],
    },
    {
        "id": "ho_004",
        "name": "heimdall_webhook_plan",
        "domain": "feature_plan",
        "prompt": (
            "Break the following Heimdall feature into a detailed implementation plan with "
            "file structure and task checkboxes — Stripe webhook handler for subscription lifecycle.\n\n"
            "Heimdall is the CircuitForge license server (FastAPI + SQLite). "
            "The webhook needs to handle checkout.session.completed, "
            "customer.subscription.updated, and customer.subscription.deleted events."
        ),
        "expected_signals": ["task_structure", "file_paths", "safety_pillar"],
    },
    {
        "id": "ho_005",
        "name": "nd_accessible_onboarding",
        "domain": "ux_design",
        "prompt": (
            "You are a product designer working on Harrier, a CircuitForge tool for "
            "helping people navigate government benefits applications.\n\n"
            "Design the onboarding flow for neurodivergent (ND) users. "
            "Consider: ADHD time-blindness, executive function challenges, demand avoidance, "
            "and rejection sensitivity. The flow should reduce cognitive load and "
            "never use urgency or panic patterns."
        ),
        "expected_signals": ["accessibility", "safety_pillar", "privacy_pillar"],
    },
    {
        "id": "ho_006",
        "name": "circuitforge_core_extraction",
        "domain": "architecture",
        "prompt": (
            "Produce a CircuitForge-style design document for the following circuitforge-core "
            "feature — shared ActivityPub federation module.\n\n"
            "Background: Multiple CF products (Kiwi, Rook, Snipe) want to publish updates "
            "to ActivityPub. Build it once in cf-core (MIT licensed) so all products can use it. "
            "Design the module API, describe what belongs in MIT vs BSL, and note federation "
            "privacy constraints."
        ),
        "expected_signals": ["license_split", "privacy_pillar", "cf_conventions"],
    },
    {
        "id": "ho_007",
        "name": "snipe_trust_score_plan",
        "domain": "feature_plan",
        "prompt": (
            "You are a senior engineer on Snipe, a CircuitForge eBay trust-scoring tool. "
            "Write a step-by-step engineering plan for: seller trust score calculation.\n\n"
            "The score should combine: feedback ratio, account age, item-specifics completeness, "
            "listing photo quality, and shipping time accuracy. "
            "Include file structure, test plan, and migration steps."
        ),
        "expected_signals": ["task_structure", "file_paths", "safety_pillar"],
    },
    {
        "id": "ho_008",
        "name": "avocet_training_pipeline",
        "domain": "feature_plan",
        "prompt": (
            "Break the following Avocet feature into a detailed implementation plan — "
            "end-to-end fine-tuning pipeline from labeled JSONL to deployed GGUF model.\n\n"
            "Avocet is the CircuitForge email classifier training tool. "
            "The pipeline should: validate the dataset, run LoRA SFT via unsloth, "
            "quantize to Q5_K_M GGUF, run the benchmark harness, and register the model "
            "in the Avocet model queue if it beats the baseline."
        ),
        "expected_signals": ["task_structure", "file_paths", "cf_conventions"],
    },
    {
        "id": "ho_009",
        "name": "privacy_data_flow",
        "domain": "architecture",
        "prompt": (
            "Design the data privacy architecture for a CircuitForge cloud product. "
            "Describe: what PII is collected, how it's stored, retention policy, "
            "obfuscation strategy for cloud-side logs, and how consent is obtained "
            "in plain language. The product handles job applications (resumes, cover letters)."
        ),
        "expected_signals": ["privacy_pillar", "safety_pillar", "accessibility"],
    },
    {
        "id": "ho_010",
        "name": "git_workflow_doc",
        "domain": "process_doc",
        "prompt": (
            "Write a developer process document for CircuitForge: conventional commit and "
            "branch workflow for a BSL 1.1 open-core product.\n\n"
            "Cover: commit message format (type: description), branch naming, "
            "when to use feature branches vs direct main commits, "
            "how the MIT/BSL split affects which commits go in which branch, "
            "and how CI gates on gitleaks for secret scanning."
        ),
        "expected_signals": ["license_split", "cf_conventions", "task_structure"],
    },
 ]
 # ── Rubric scoring ─────────────────────────────────────────────────────────────
 _TASK_STRUCTURE_RE = re.compile(r"- \[ \]", re.MULTILINE)
 _COMMIT_RE = re.compile(r"git commit|git add", re.IGNORECASE)
 _TIER_RE = re.compile(r"\b(Free|Paid|Premium|Ultra)\s+tier|\btier\s+(Free|Paid|Premium|Ultra)", re.IGNORECASE)
 _PRIVACY_RE = re.compile(r"\b(privacy|local.?inference|no.?logging|no.?pii|user.?data|data.?reten|obfuscat)", re.IGNORECASE)
 _SAFETY_RE = re.compile(r"\b(human.?approv|reversib|safety|safe.?default|fail.?safe|harm)", re.IGNORECASE)
 _A11Y_RE = re.compile(r"\b(neurodiverg|ND\b|accessib|adaptive|ADHD|autism|executive.?function|demand.?avoid)", re.IGNORECASE)
 _LICENSE_RE = re.compile(r"\b(MIT|BSL|open.?core|proprietary|commercial.?licens)", re.IGNORECASE)
 _FILE_PATH_RE = re.compile(r"(app/|tests?/|src/|scripts?/)\w[\w/.-]{3,}", re.IGNORECASE)
 _CF_CONV_RE = re.compile(r"(conda run -n cf|/Library/Development/CircuitForge|circuitforge-core|manage\.sh)", re.IGNORECASE)
@dataclass
 class RubricScore:
    task_structure: float = 0.0
    tier_awareness: float = 0.0
    privacy_pillar: float = 0.0
    safety_pillar: float = 0.0
    accessibility: float = 0.0
    license_split: float = 0.0
    file_paths: float = 0.0
    cf_conventions: float = 0.0
    length_ok: float = 0.0
    def total(self) -> float:
        vals = [self.task_structure, self.tier_awareness, self.privacy_pillar,
                self.safety_pillar, self.accessibility, self.license_split,
                self.file_paths, self.cf_conventions, self.length_ok]
        return sum(vals) / len(vals)
    def as_dict(self) -> dict[str, float]:
        return asdict(self)
 def score_response(response: str, prompt_meta: dict[str, Any]) -> RubricScore:
    words = len(response.split())
    s = RubricScore()
    # Task structure: needs checkboxes AND at least one commit step
    checkbox_hits = len(_TASK_STRUCTURE_RE.findall(response))
    has_commit = bool(_COMMIT_RE.search(response))
    s.task_structure = min(1.0, checkbox_hits / 5) * 0.7 + (0.3 if has_commit else 0.0)
    # Tier awareness
    s.tier_awareness = min(1.0, len(_TIER_RE.findall(response)) / 2)
    # Privacy pillar
    s.privacy_pillar = min(1.0, len(_PRIVACY_RE.findall(response)) / 3)
    # Safety pillar
    s.safety_pillar = min(1.0, len(_SAFETY_RE.findall(response)) / 2)
    # Accessibility
    s.accessibility = min(1.0, len(_A11Y_RE.findall(response)) / 2)
    # License split awareness
    s.license_split = min(1.0, len(_LICENSE_RE.findall(response)) / 2)
    # File paths: at least 3 plausible path references
    s.file_paths = min(1.0, len(_FILE_PATH_RE.findall(response)) / 3)
    # CF conventions
    s.cf_conventions = min(1.0, len(_CF_CONV_RE.findall(response)) / 2)
    # Length: 200–2500 words is healthy; outside = partial credit
    if 200 <= words <= 2500:
        s.length_ok = 1.0
    elif words < 200:
        s.length_ok = words / 200
    else:
        s.length_ok = max(0.0, 1.0 - (words - 2500) / 2500)
    return s
 # ── Model client ───────────────────────────────────────────────────────────────
 # Registry of named model targets (shorthand → {api_base, model_name})
 MODEL_REGISTRY: dict[str, dict[str, str]] = {
    "deepseek-r1-1.5b": {
        "api_base": CF_TEXT_BASE,
        "model": "deepseek-r1-1.5b",
        "description": "DeepSeek R1 1.5B distill (cf-orch catalog key)",
    },
    "deepseek-r1-7b-4bit": {
        "api_base": CF_TEXT_BASE,
        "model": "deepseek-r1-7b-4bit",
        "description": "DeepSeek R1 7B distill, 4-bit (cf-orch catalog key)",
    },
    "deepseek-coder-6.7b-4bit": {
        "api_base": CF_TEXT_BASE,
        "model": "deepseek-coder-6.7b-4bit",
        "description": "DeepSeek Coder 6.7B instruct, 4-bit (cf-orch catalog key)",
    },
    "granite-4.1-8b": {
        "api_base": CF_TEXT_BASE,
        "model": "granite-4.1-8b",
        "description": "IBM Granite 4.1 8B, 4-bit (cf-orch catalog key)",
    },
    "qwen2.5-3b": {
        "api_base": CF_TEXT_BASE,
        "model": "qwen2.5-3b",
        "description": "Qwen 2.5 3B Q4 GGUF (cf-orch catalog key, navi only)",
    },
    "qwen2.5-7b": {
        "api_base": CF_TEXT_BASE,
        "model": "qwen2.5-7b",
        "description": "Qwen 2.5 7B Q4 GGUF (cf-orch catalog key, navi only)",
    },
 }
 # ── cf-orch allocation ─────────────────────────────────────────────────────────
 def _cforch_allocate(
    model_id: str,
    cforch_url: str,
    startup_timeout_s: float = 300.0,
 ) -> tuple[str, str] | None:
    """Allocate a cf-text instance for model_id via the cf-orch coordinator.
    Returns (service_url, allocation_id) on success, None on failure.
    service_url is the direct node URL exposing /v1/chat/completions.
    """
    try:
        resp = httpx.post(
            f"{cforch_url}/api/services/cf-text/allocate",
            json={
                "model_candidates": [model_id],
                "caller": "avocet",
                "pipeline": "plans_benchmark",
            },
            timeout=120.0,
        )
        resp.raise_for_status()
        data = resp.json()
        service_url: str = data["url"]
        allocation_id: str = data.get("allocation_id", "")
        node_id: str = data.get("node_id", "")
        gpu_id: int | None = data.get("gpu_id")
        if data.get("started", False) and not data.get("warm", True):
            # Use \n so the SSE generator sees the line immediately
            print(f"  [cold start] loading {model_id!r} — polling every 3s…", flush=True)
            t0 = time.monotonic()
            deadline = t0 + startup_timeout_s
            probe_misses = 0
            while time.monotonic() < deadline:
                elapsed = time.monotonic() - t0
                try:
                    status = httpx.get(f"{cforch_url}/api/services/cf-text/status", timeout=5.0)
                    if status.is_success:
                        instances = status.json().get("instances", [])
                        match = next(
                            (i for i in instances
                             if i.get("node_id") == node_id and i.get("gpu_id") == gpu_id),
                            None,
                        )
                        if match:
                            probe_misses = 0
                            state = match.get("state", "")
                            if state == "running":
                                print(f"  [cold start] ready in {elapsed:.0f}s", flush=True)
                                return service_url, allocation_id
                            elif state == "stopped":
                                print(f"  [cold start] failed — service stopped after {elapsed:.0f}s", flush=True)
                                return None
                            else:
                                # still starting — emit keepalive so SSE stream stays alive
                                print(f"  [cold start] state={state!r}  elapsed={elapsed:.0f}s", flush=True)
                        else:
                            probe_misses += 1
                            print(f"  [cold start] waiting… elapsed={elapsed:.0f}s", flush=True)
                            if probe_misses >= 6:
                                try:
                                    h = httpx.get(f"{service_url}/health", timeout=3.0)
                                    if h.is_success:
                                        print(f"  [cold start] ready via health check in {elapsed:.0f}s", flush=True)
                                        return service_url, allocation_id
                                except Exception:
                                    pass
                    else:
                        print(f"  [cold start] status poll returned {status.status_code}, elapsed={elapsed:.0f}s", flush=True)
                except Exception as poll_exc:
                    print(f"  [cold start] poll error: {poll_exc}  elapsed={elapsed:.0f}s", flush=True)
                time.sleep(3.0)
            print(f"  [cold start] timed out after {time.monotonic()-t0:.0f}s", flush=True)
            return None
        return service_url, allocation_id
    except Exception as exc:
        print(f"[warn] cf-orch allocation failed for {model_id!r}: {exc}", file=sys.stderr)
        return None
 def _call_model_direct(service_url: str, model: str, prompt: str, timeout: int = 600) -> tuple[str, float]:
    """Call an OpenAI-compatible /v1/chat/completions on a direct service URL."""
    t0 = time.monotonic()
    resp = httpx.post(
        f"{service_url.rstrip('/')}/v1/chat/completions",
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 2048,
            "temperature": 0.2,
        },
        timeout=timeout,
    )
    resp.raise_for_status()
    latency = time.monotonic() - t0
    text = resp.json()["choices"][0]["message"]["content"]
    return text, latency
 def _call_model(api_base: str, model: str, prompt: str, timeout: int = 180) -> tuple[str, float]:
    """Call an OpenAI-compatible /chat/completions endpoint. Returns (text, latency_s)."""
    t0 = time.monotonic()
    resp = httpx.post(
        f"{api_base}/chat/completions",
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 2048,
            "temperature": 0.2,
        },
        timeout=timeout,
    )
    resp.raise_for_status()
    latency = time.monotonic() - t0
    text = resp.json()["choices"][0]["message"]["content"]
    return text, latency
 # ── Benchmark runner ───────────────────────────────────────────────────────────
@dataclass
 class PromptResult:
    prompt_id: str
    prompt_name: str
    model_key: str
    response: str
    latency_s: float
    word_count: int
    scores: dict[str, float]
    total_score: float
    error: str | None = None
 def run_benchmark(
    model_key: str,
    model_name: str,
    prompts: list[dict[str, Any]] | None = None,
    verbose: bool = False,
    # cf-orch path
    use_cforch: bool = False,
    cforch_url: str = CF_COORD_URL,
    # direct path (used when not cf-orch)
    api_base: str = CF_TEXT_BASE,
 ) -> list[PromptResult]:
    """Run all prompts through one model. Uses cf-orch allocation when use_cforch=True."""
    if prompts is None:
        prompts = HELD_OUT_PROMPTS
    # Allocate once per model when using cf-orch
    service_url: str | None = None
    if use_cforch:
        print(f"  Allocating {model_name!r} via cf-orch…", flush=True)
        alloc = _cforch_allocate(model_name, cforch_url)
        if alloc is None:
            # Return all prompts as errors
            return [
                PromptResult(
                    prompt_id=p["id"], prompt_name=p["name"], model_key=model_key,
                    response="", latency_s=0.0, word_count=0, scores={}, total_score=0.0,
                    error=f"cf-orch allocation failed for {model_name!r}",
                )
                for p in prompts
            ]
        service_url, _alloc_id = alloc
    results: list[PromptResult] = []
    for p in prompts:
        if verbose:
            print(f"  [{p['id']}] {p['name']} … ", end="", flush=True)
        try:
            if service_url:
                response, latency = _call_model_direct(service_url, model_name, p["prompt"])
            else:
                response, latency = _call_model(api_base, model_name, p["prompt"])
            rubric = score_response(response, p)
            result = PromptResult(
                prompt_id=p["id"],
                prompt_name=p["name"],
                model_key=model_key,
                response=response,
                latency_s=round(latency, 2),
                word_count=len(response.split()),
                scores=rubric.as_dict(),
                total_score=round(rubric.total(), 3),
            )
            if verbose:
                print(f"score={result.total_score:.3f}  ({result.word_count}w, {latency:.1f}s)")
        except Exception as exc:
            result = PromptResult(
                prompt_id=p["id"],
                prompt_name=p["name"],
                model_key=model_key,
                response="",
                latency_s=0.0,
                word_count=0,
                scores={},
                total_score=0.0,
                error=str(exc),
            )
            if verbose:
                print(f"ERROR: {exc}")
        results.append(result)
    return results
 # ── Reporting ──────────────────────────────────────────────────────────────────
 def _print_single_report(results: list[PromptResult], model_key: str) -> None:
    ok = [r for r in results if not r.error]
    err = [r for r in results if r.error]
    if not ok:
        print(f"\n[{model_key}] All {len(err)} prompts failed.\n")
        return
    avg_total = sum(r.total_score for r in ok) / len(ok)
    avg_latency = sum(r.latency_s for r in ok) / len(ok)
    # Aggregate per-rubric averages
    rubric_keys = list(ok[0].scores.keys())
    rubric_avgs = {k: sum(r.scores.get(k, 0) for r in ok) / len(ok) for k in rubric_keys}
    print(f"\n{'='*60}")
    print(f"  Model : {model_key}")
    print(f"  Prompts: {len(ok)}/{len(results)} passed  ({len(err)} errors)")
    print(f"  Overall score : {avg_total:.3f}  (avg latency {avg_latency:.1f}s)")
    print(f"\n  Rubric breakdown:")
    for k, v in sorted(rubric_avgs.items(), key=lambda x: -x[1]):
        bar = "█" * int(v * 20)
        print(f"    {k:<22} {v:.3f}  {bar}")
    print(f"\n  Per-prompt scores:")
    for r in sorted(ok, key=lambda x: -x.total_score):
        flag = "⚠" if r.total_score < 0.3 else " "
        print(f"    {flag} {r.prompt_id} {r.prompt_name:<35} {r.total_score:.3f}  ({r.word_count}w)")
    if err:
        print(f"\n  Errors:")
        for r in err:
            print(f"    {r.prompt_id} {r.prompt_name}: {r.error}")
    print(f"{'='*60}\n")
 def _print_comparison_table(all_results: dict[str, list[PromptResult]]) -> None:
    model_keys = list(all_results.keys())
    prompt_ids = [p["id"] for p in HELD_OUT_PROMPTS]
    # Scores by (model, prompt_id)
    score_map: dict[tuple[str, str], float] = {}
    for mk, results in all_results.items():
        for r in results:
            score_map[(mk, r.prompt_id)] = r.total_score if not r.error else 0.0
    col_w = 10
    header = f"{'Prompt':<35}" + "".join(f"{mk[:col_w-1]:<{col_w}}" for mk in model_keys)
    print(f"\n{'='*len(header)}")
    print("  COMPARISON TABLE")
    print(f"{'='*len(header)}")
    print(f"  {header}")
    print(f"  {'-'*len(header)}")
    for pid in prompt_ids:
        pname = next(p["name"] for p in HELD_OUT_PROMPTS if p["id"] == pid)
        row = f"  {pname:<35}"
        best = max(score_map.get((mk, pid), 0.0) for mk in model_keys)
        for mk in model_keys:
            v = score_map.get((mk, pid), 0.0)
            marker = "*" if v == best and len(model_keys) > 1 else " "
            row += f"{v:.3f}{marker}   "
        print(row)
    print(f"  {'-'*len(header)}")
    avgs_row = f"  {'AVERAGE':<35}"
    best_avg = -1.0
    avgs: dict[str, float] = {}
    for mk in model_keys:
        vals = [score_map.get((mk, pid), 0.0) for pid in prompt_ids]
        avgs[mk] = sum(vals) / len(vals)
        best_avg = max(best_avg, avgs[mk])
    for mk in model_keys:
        marker = "*" if avgs[mk] == best_avg and len(model_keys) > 1 else " "
        avgs_row += f"{avgs[mk]:.3f}{marker}   "
    print(avgs_row)
    print(f"{'='*len(header)}\n")
    if len(model_keys) > 1:
        winner = max(avgs, key=lambda k: avgs[k])
        print(f"  Winner: {winner}  (avg {avgs[winner]:.3f})\n")
 # ── CLI ────────────────────────────────────────────────────────────────────────
 def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--list-models", action="store_true",
                        help="Print registered model shortcuts and exit")
    parser.add_argument("--model", metavar="KEY",
                        help="Benchmark a single model (registry key or raw model name)")
    parser.add_argument("--compare", nargs="+", metavar="KEY",
                        help="Compare two or more models side-by-side")
    parser.add_argument("--cforch", action="store_true",
                        help="Route inference through cf-orch coordinator (allocate per model)")
    parser.add_argument("--cforch-url", default=CF_COORD_URL, metavar="URL",
                        help=f"cf-orch coordinator URL (default: {CF_COORD_URL})")
    parser.add_argument("--api-base", default=None,
                        help="Direct API base URL when not using cf-orch")
    parser.add_argument("--model-name", default=None,
                        help="Override model name sent to API (single-model runs only)")
    parser.add_argument("--prompts", nargs="+", metavar="ID",
                        help="Run only specific prompt IDs (e.g. ho_001 ho_003)")
    parser.add_argument("--output", type=Path, default=None,
                        help="Write detailed JSON results to this path")
    parser.add_argument("--workers", type=int, default=1, metavar="N",
                        help="Run N models concurrently (default 1). Set to number of available nodes.")
    parser.add_argument("--verbose", "-v", action="store_true",
                        help="Print per-prompt progress")
    args = parser.parse_args()
    if args.list_models:
        print("\nRegistered model shortcuts:")
        for key, info in MODEL_REGISTRY.items():
            print(f"  {key:<20} {info['description']}")
        print(f"\nDefault endpoints:")
        print(f"  direct    {CF_TEXT_BASE}")
        print(f"  cf-orch   {CF_COORD_URL}")
        return
    prompts = HELD_OUT_PROMPTS
    if args.prompts:
        ids = set(args.prompts)
        prompts = [p for p in HELD_OUT_PROMPTS if p["id"] in ids]
        if not prompts:
            print(f"No prompts matched IDs: {args.prompts}", file=sys.stderr)
            sys.exit(1)
    model_keys: list[str] = []
    if args.compare:
        model_keys = args.compare
    elif args.model:
        model_keys = [args.model]
    else:
        parser.print_help()
        sys.exit(0)
    all_results: dict[str, list[PromptResult]] = {}
    print_lock = threading.Lock()
    def _run_one(mk: str) -> tuple[str, list[PromptResult]]:
        if mk in MODEL_REGISTRY:
            reg = MODEL_REGISTRY[mk]
            model_name = args.model_name or reg["model"]
            direct_base = args.api_base or reg["api_base"]
        else:
            model_name = args.model_name or mk
            direct_base = args.api_base or CF_TEXT_BASE
        if args.cforch:
            with print_lock:
                print(f"\nRunning [{mk}] via cf-orch ({args.cforch_url})  model={model_name}")
            results = run_benchmark(
                mk, model_name, prompts=prompts, verbose=args.verbose,
                use_cforch=True, cforch_url=args.cforch_url,
            )
        else:
            with print_lock:
                print(f"\nRunning [{mk}] → {direct_base}  model={model_name}")
            results = run_benchmark(
                mk, model_name, prompts=prompts, verbose=args.verbose,
                api_base=direct_base,
            )
        with print_lock:
            _print_single_report(results, mk)
        return mk, results
    workers = max(1, args.workers)
    if workers == 1 or len(model_keys) == 1:
        for mk in model_keys:
            mk_out, results = _run_one(mk)
            all_results[mk_out] = results
    else:
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = {pool.submit(_run_one, mk): mk for mk in model_keys}
            for fut in as_completed(futures):
                mk_out, results = fut.result()
                all_results[mk_out] = results
    if len(model_keys) > 1:
        _print_comparison_table(all_results)
    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            mk: [asdict(r) for r in results]
            for mk, results in all_results.items()
        }
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        print(f"Wrote detailed results to {args.output}")
 if __name__ == "__main__":
    main()
--- a/scripts/export_plans.py
+++ b/scripts/export_plans.py
@ -0,0 +1,458 @@
 """Export circuitforge-plans/ documents as instruction-tuning JSONL pairs.
 Each record is a HuggingFace chat-format example:
    {
      "id": "<sha256>",
      "messages": [
        {"role": "user", "content": "<reconstructed planning prompt>"},
        {"role": "assistant", "content": "<cleaned document content>"}
      ],
      "meta": {
        "source": "peregrine/2026-03-03-feedback-button-design.md",
        "product": "peregrine",
        "doc_type": "design",     # design | plan | spec | implementation | other
        "date": "2026-03-03",
        "paired_with": "...",     # sibling path, or null
        "word_count": 1847,
        "pair_role": "context"    # "context" | "target" | "standalone"
      }
    }
 Pairing strategy
 ----------------
 When a design doc and a plan doc share the same date + feature-name prefix,
 they are treated as a pair:
  - design → plan: instruction = "Given this design doc, write the implementation plan."
    context appended = full design doc content.
  - Solo docs get a synthetic instruction from the title + first overview section.
 Usage
 -----
    # Preview stats and 5 sample records
    python scripts/export_plans.py --preview
    # Write full output
    python scripts/export_plans.py --output data/plan_pairs.jsonl
    # Restrict to specific products
    python scripts/export_plans.py --products peregrine,kiwi --output data/plan_pairs.jsonl
 """
 from __future__ import annotations
 import argparse
 import hashlib
 import json
 import re
 import sys
 from pathlib import Path
 from typing import Iterator
 # ── Paths ──────────────────────────────────────────────────────────────────────
 _SCRIPT_DIR = Path(__file__).parent
 _AVOCET_ROOT = _SCRIPT_DIR.parent
 _DEFAULT_PLANS_DIR = Path("/Library/Development/CircuitForge/circuitforge-plans")
 _DEFAULT_OUTPUT = _AVOCET_ROOT / "data" / "plan_pairs.jsonl"
 # ── Doc type detection ─────────────────────────────────────────────────────────
 _TYPE_RE = re.compile(
    r"-(design|plan|spec|implementation|specs|plans)s?$",
    re.IGNORECASE,
 )
 _SKIP_DIRS = {"__pycache__", ".git", "node_modules"}
 # Boilerplate lines to strip from document content before using as output.
 _BOILERPLATE_RE = re.compile(
    r"""
    ^\s*>\s*\*\*For\s+agentic\s+workers.*         # superpowers agent hints
    |^\s*>\s*REQUIRED\s+SUB-SKILL.*
    |^\s*\*\*Date:\*\*.*                           # metadata header lines
    |\*\*Status:\*\*\s*Complete.*                  # completed-feature noise
    |\*\*Status:\*\*\s*Done.*
    |\*\*Product:\*\*.*
    |\*\*Repo:\*\*.*
    |\*\*Tech\s+Stack:\*\*.*
    |\*\*Candidate:\*\*.*                          # old synthetic personas
    |^Candidate:.*
    |^Team:.*
    """,
    re.VERBOSE | re.MULTILINE,
 )
 # Old repo/path names to normalise to current equivalents.
 _PATH_NORMALIZATIONS: list[tuple[re.Pattern, str]] = [
    (re.compile(r"/devl/job-seeker", re.IGNORECASE),      "/Library/Development/CircuitForge/peregrine"),
    (re.compile(r"\bjob-seeker\b",   re.IGNORECASE),      "peregrine"),
    (re.compile(r"Alex Rivera",      re.IGNORECASE),      "[user]"),
 ]
 # Instruction paraphrase templates per doc type.
 # Each entry is (user_prefix, paired_prefix).
 # {title}, {product}, {type_phrase}, {overview}, {design_context} are substituted.
 _DESIGN_INSTRUCTIONS = [
    "Write a design document for {product}: {title}.\n\nContext: {overview}",
    "You are a software architect working on {product}. Draft a design spec for: {title}.\n\n{overview}",
    "Produce a CircuitForge-style design document for the following {product} feature — {title}.\n\nBackground: {overview}",
 ]
 _PLAN_INSTRUCTIONS = [
    "Write an implementation plan for {product}: {title}.\n\nContext: {overview}",
    "Break the following {product} feature into a detailed implementation plan with file structure and task checkboxes — {title}.\n\n{overview}",
    "You are a senior engineer on {product}. Produce a step-by-step engineering plan for: {title}.\n\n{overview}",
 ]
 _PAIRED_INSTRUCTIONS = [
    (
        "You are a software architect working on {product}, a CircuitForge product. "
        "Given the following design document, write a detailed implementation plan "
        "(file structure, task breakdown with checkboxes, migration steps if needed).\n\n"
        "---\n{design_context}\n---"
    ),
    (
        "The following is a design spec for a {product} feature. "
        "Produce a concrete implementation plan: file list, task checklist, any DB migrations needed.\n\n"
        "---\n{design_context}\n---"
    ),
    (
        "Convert this {product} design document into an actionable implementation plan. "
        "Include all files to create/modify, step-by-step tasks with checkboxes, and migration steps.\n\n"
        "---\n{design_context}\n---"
    ),
 ]
 def _doc_type(stem: str) -> str:
    m = _TYPE_RE.search(stem)
    if not m:
        return "other"
    raw = m.group(1).lower().rstrip("s")
    return {"implementation": "plan"}.get(raw, raw)
 def _date_feature(stem: str) -> tuple[str, str]:
    """Return (date, feature_slug) from '2026-03-03-feedback-button-design'."""
    m = re.match(r"^(\d{4}-\d{2}-\d{2})-(.+?)(?:-(design|plan|spec|implementation)s?)?$", stem, re.I)
    if m:
        return m.group(1), m.group(2)
    return "", stem
 # ── Content extraction ─────────────────────────────────────────────────────────
 def _extract_title(content: str) -> str:
    m = re.search(r"^#\s+(.+)", content, re.MULTILINE)
    return m.group(1).strip() if m else ""
 def _extract_overview(content: str) -> str:
    """Return first substantive paragraph or h2 section body (≤300 chars)."""
    # Superpowers plans have an explicit **Goal:** line — prefer that.
    goal_m = re.search(r"\*\*Goal:\*\*\s*(.+)", content)
    if goal_m:
        return goal_m.group(1).strip()[:300]
    # Otherwise use the body of the first h2 section.
    h2_m = re.search(
        r"^##\s+\d*\.?\s*.+\n([\s\S]+?)(?=^##|\Z)",
        content,
        re.MULTILINE,
    )
    if h2_m:
        body = h2_m.group(1).strip()
        # Strip markdown bullet/code noise for the instruction
        body = re.sub(r"```[\s\S]*?```", "", body)
        body = re.sub(r"`[^`]+`", lambda m: m.group().strip("`"), body)
        body = re.sub(r"\*\*([^*]+)\*\*", r"\1", body)
        body = re.sub(r"\s+", " ", body).strip()
        return body[:300]
    return ""
 def _clean_content(content: str) -> str:
    """Remove boilerplate, normalize old paths/names, collapse whitespace."""
    cleaned = _BOILERPLATE_RE.sub("", content)
    for pattern, replacement in _PATH_NORMALIZATIONS:
        cleaned = pattern.sub(replacement, cleaned)
    cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
    return cleaned.strip()
 def _quality_flags(content: str) -> list[str]:
    """Return a list of quality issue labels found in cleaned content."""
    flags = []
    if "Alex Rivera" in content or "[user]" in content:
        flags.append("persona-residue")
    if re.search(r"\bStatus:\s*(Complete|Done|Merged)\b", content):
        flags.append("completed-status")
    return flags
 def _make_instruction(
    title: str,
    product: str,
    doc_type: str,
    overview: str,
    design_context: str | None = None,
    variant: int = 0,
 ) -> str:
    """Synthesise a natural planning prompt for this document.
    variant: 0-2 selects which paraphrase template to use. Caller cycles
    through all three to produce multiple training examples per document.
    """
    product_label = product.replace("-", " ").title() if product else "CircuitForge"
    idx = variant % 3
    if design_context:
        tmpl = _PAIRED_INSTRUCTIONS[idx]
        return tmpl.format(
            product=product_label,
            design_context=design_context[:2500],
        )
    templates = _PLAN_INSTRUCTIONS if doc_type in ("plan",) else _DESIGN_INSTRUCTIONS
    tmpl = templates[idx]
    return tmpl.format(
        product=product_label,
        title=title,
        overview=overview or "",
        type_phrase="planning document",
    )
 def _record_id(content: str, source: str) -> str:
    return hashlib.sha256(f"{source}:{content}".encode()).hexdigest()[:16]
 # ── Pair discovery ─────────────────────────────────────────────────────────────
 def _find_pairs(plans_dir: Path) -> dict[str, list[tuple[str, Path]]]:
    """Return {prefix_key → [(doc_type, path), ...]} for docs sharing date+feature."""
    by_prefix: dict[str, list[tuple[str, Path]]] = {}
    for path in plans_dir.rglob("*.md"):
        if any(part in _SKIP_DIRS for part in path.parts):
            continue
        if path.name == "README.md":
            continue
        stem = path.stem
        date, feature = _date_feature(stem)
        if not date:
            continue
        key = str(path.parent / f"{date}-{feature}")
        by_prefix.setdefault(key, []).append((_doc_type(stem), path))
    return by_prefix
 # ── Record generation ──────────────────────────────────────────────────────────
 def _records_for_group(
    doc_type_paths: list[tuple[str, Path]],
    plans_dir: Path,
 ) -> Iterator[dict]:
    """Yield one or more training records for a group of related docs."""
    # Separate design vs plan docs within this group
    designs = [(t, p) for t, p in doc_type_paths if t in ("design", "spec")]
    plans_  = [(t, p) for t, p in doc_type_paths if t in ("plan",)]
    others  = [(t, p) for t, p in doc_type_paths if t not in ("design", "spec", "plan")]
    all_paths = doc_type_paths
    if designs and plans_:
        # Paired: yield a design→plan record (3 instruction variants)
        design_type, design_path = designs[0]
        plan_type,   plan_path   = plans_[0]
        design_content = design_path.read_text(encoding="utf-8")
        plan_content   = plan_path.read_text(encoding="utf-8")
        product        = _product_from_path(plan_path, plans_dir)
        title          = _extract_title(plan_content) or plan_path.stem
        cleaned        = _clean_content(plan_content)
        design_cleaned = _clean_content(design_content)
        flags          = _quality_flags(cleaned)
        if len(cleaned.split()) >= 80:
            rel_src    = str(plan_path.relative_to(plans_dir))
            rel_design = str(design_path.relative_to(plans_dir))
            for variant in range(3):
                instruction = _make_instruction(
                    title=title,
                    product=product,
                    doc_type="plan",
                    overview=_extract_overview(design_content),
                    design_context=design_cleaned,
                    variant=variant,
                )
                yield {
                    "id": _record_id(f"v{variant}:{cleaned}", rel_src),
                    "messages": [
                        {"role": "user", "content": instruction},
                        {"role": "assistant", "content": cleaned},
                    ],
                    "meta": {
                        "source": rel_src,
                        "product": product,
                        "doc_type": "plan",
                        "date": _date_feature(plan_path.stem)[0],
                        "paired_with": rel_design,
                        "word_count": len(cleaned.split()),
                        "pair_role": "target",
                        "variant": variant,
                        "quality_flags": flags,
                    },
                }
        # Also yield the design doc as standalone variants
        all_paths = [(t, p) for t, p in all_paths if p != plan_path]
    # Remaining docs as standalone records (3 instruction variants each)
    for doc_type, path in all_paths:
        content = path.read_text(encoding="utf-8")
        cleaned = _clean_content(content)
        if len(cleaned.split()) < 80:
            continue
        product  = _product_from_path(path, plans_dir)
        title    = _extract_title(content) or path.stem
        overview = _extract_overview(content)
        flags    = _quality_flags(cleaned)
        rel_src  = str(path.relative_to(plans_dir))
        for variant in range(3):
            instruction = _make_instruction(
                title=title,
                product=product,
                doc_type=doc_type,
                overview=overview,
                variant=variant,
            )
            yield {
                "id": _record_id(f"v{variant}:{cleaned}", rel_src),
                "messages": [
                    {"role": "user", "content": instruction},
                    {"role": "assistant", "content": cleaned},
                ],
                "meta": {
                    "source": rel_src,
                    "product": product,
                    "doc_type": doc_type,
                    "date": _date_feature(path.stem)[0],
                    "paired_with": None,
                    "word_count": len(cleaned.split()),
                    "pair_role": "standalone",
                    "variant": variant,
                    "quality_flags": flags,
                },
            }
 def _product_from_path(path: Path, plans_dir: Path) -> str:
    rel = path.relative_to(plans_dir)
    return rel.parts[0] if len(rel.parts) > 1 else "shared"
 # ── Main export ────────────────────────────────────────────────────────────────
 def export(
    plans_dir: Path,
    products: list[str] | None = None,
 ) -> list[dict]:
    groups = _find_pairs(plans_dir)
    records: list[dict] = []
    seen_ids: set[str] = set()
    for group_key, doc_type_paths in groups.items():
        # Filter by product if requested
        if products:
            paths = [p for _, p in doc_type_paths]
            prods = {_product_from_path(p, plans_dir) for p in paths}
            if not prods.intersection(products):
                continue
        for record in _records_for_group(doc_type_paths, plans_dir):
            if record["id"] not in seen_ids:
                seen_ids.add(record["id"])
                records.append(record)
    return records
 # ── CLI ────────────────────────────────────────────────────────────────────────
 def _print_stats(records: list[dict]) -> None:
    from collections import Counter
    products  = Counter(r["meta"]["product"]   for r in records)
    doc_types = Counter(r["meta"]["doc_type"]  for r in records)
    pair_roles = Counter(r["meta"]["pair_role"] for r in records)
    wc = [r["meta"]["word_count"] for r in records]
    wc.sort()
    print(f"\n{'='*55}")
    print(f"  Total records: {len(records)}")
    print(f"  Word counts  : min={wc[0]}, median={wc[len(wc)//2]}, max={wc[-1]}")
    print(f"\n  By product:")
    for p, n in products.most_common():
        print(f"    {p:<22} {n}")
    print(f"\n  By doc type:")
    for t, n in doc_types.most_common():
        print(f"    {t:<22} {n}")
    print(f"\n  Pair roles:")
    for r, n in pair_roles.most_common():
        print(f"    {r:<22} {n}")
    print(f"{'='*55}\n")
 def _print_sample(records: list[dict], n: int = 3) -> None:
    import random
    sample = random.sample(records, min(n, len(records)))
    for i, rec in enumerate(sample, 1):
        meta = rec["meta"]
        user_msg = rec["messages"][0]["content"]
        asst_msg = rec["messages"][1]["content"]
        print(f"\n{'─'*55}")
        print(f"SAMPLE {i}/{n}  [{meta['product']} / {meta['doc_type']} / {meta['pair_role']}]")
        print(f"source: {meta['source']}")
        print(f"\nUSER ({len(user_msg)} chars):\n{user_msg[:500]}{'...' if len(user_msg)>500 else ''}")
        print(f"\nASSISTANT ({meta['word_count']} words):\n{asst_msg[:400]}{'...' if len(asst_msg)>400 else ''}")
    print(f"\n{'─'*55}\n")
 def main() -> None:
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--plans-dir", type=Path, default=_DEFAULT_PLANS_DIR)
    parser.add_argument("--output", type=Path, default=None,
                        help="Write JSONL to this path (omit for preview-only)")
    parser.add_argument("--products", default=None,
                        help="Comma-separated product filter, e.g. peregrine,kiwi")
    parser.add_argument("--preview", action="store_true",
                        help="Print stats + sample records, don't write output")
    parser.add_argument("--samples", type=int, default=3,
                        help="Number of sample records to show in preview (default 3)")
    args = parser.parse_args()
    products = [p.strip() for p in args.products.split(",")] if args.products else None
    print(f"Scanning {args.plans_dir} …", file=sys.stderr)
    records = export(args.plans_dir, products=products)
    _print_stats(records)
    if args.preview or args.output is None:
        _print_sample(records, n=args.samples)
        if args.output is None:
            print("(Pass --output <path> to write JSONL)")
        return
    args.output.parent.mkdir(parents=True, exist_ok=True)
    with open(args.output, "w", encoding="utf-8") as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print(f"Wrote {len(records)} records to {args.output}")
 if __name__ == "__main__":
    main()
--- a/tests/test_cforch.py
+++ b/tests/test_cforch.py
@ -14,7 +14,9 @@ from fastapi.testclient import TestClient
@pytest.fixture(autouse=True)
 def reset_cforch_globals(tmp_path):
-    """Redirect _CONFIG_DIR to tmp_path and reset running-state globals."""
+    """Redirect _CONFIG_DIR to tmp_path, reset running-state globals, and stub
    list_installed to return [] so real disk model directories don't bleed into
    tests that don't exercise the installed-model merge path."""
    from app import cforch as cforch_module
    prev_config_dir = cforch_module._CONFIG_DIR
@ -25,7 +27,8 @@ def reset_cforch_globals(tmp_path):
    cforch_module._BENCH_RUNNING = False
    cforch_module._bench_proc = None
-    yield tmp_path
+    with patch("app.models.list_installed", return_value=[]):
        yield tmp_path
    cforch_module.set_config_dir(prev_config_dir)
    cforch_module._BENCH_RUNNING = prev_running
@ -141,6 +144,35 @@ def test_models_parses_bench_models_yaml(client, config_dir, tmp_path):
    assert m["vram_estimate_mb"] == 6000
 def test_models_merges_installed_generators(client, config_dir, tmp_path):
    """Installed cf-text/vllm generator models appear in the model list,
    deduplicated against bench_models.yaml entries."""
    models_file = tmp_path / "bench_models.yaml"
    _write_models_yaml(models_file, [
        {"name": "llama3", "id": "llama3:8b", "service": "ollama", "tags": [], "vram_estimate_mb": 6000},
        {"name": "already-there", "id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "tags": [], "vram_estimate_mb": 8000},
    ])
    _write_config(config_dir, {"bench_models": str(models_file)})
    fake_installed = [
        # should be included — cf-text generator not already in YAML
        {"model_id": "meta-llama/Llama-3.1-8B", "service": "cf-text", "role": "generator", "vram_mb": 16000},
        # should be deduped — repo_id matches a YAML entry
        {"model_id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "role": "generator", "vram_mb": 8000},
        # should be excluded — classifier, not a generator
        {"model_id": "cross-encoder/ms-marco-MiniLM-L6", "service": "avocet", "role": "reranker", "vram_mb": 500},
    ]
    with patch("app.models.list_installed", return_value=fake_installed):
        r = client.get("/api/cforch/models")
    assert r.status_code == 200
    ids = [m["id"] for m in r.json()["models"]]
    assert "llama3:8b" in ids                           # from YAML
    assert "ibm-granite/granite-4.1-8b" in ids          # from YAML (not duplicated)
    assert "meta-llama/Llama-3.1-8B" in ids             # merged from installed
    assert "cross-encoder/ms-marco-MiniLM-L6" not in ids  # filtered out (reranker)
    assert ids.count("ibm-granite/granite-4.1-8b") == 1  # no duplicate
 # ── GET /run ───────────────────────────────────────────────────────────────────
 def test_run_returns_409_when_already_running(client):
--- a/tests/test_models.py
+++ b/tests/test_models.py
@ -541,3 +541,84 @@ def test_delete_installed_name_with_slash_blocked(client):
        except _HTTPException as exc:
            assert exc.status_code in (400, 404)
            raise
 # ── Catalog registration ───────────────────────────────────────────────────────
 _MINIMAL_YAML = """\
 services:
  cf-text:
    max_mb: {max_mb}
    catalog:
      existing-model:
        path: /some/path
        vram_mb: 1000
        description: "placeholder"
 """
 def _make_node_yaml(tmp_path: Path, max_mb: int = 8192) -> Path:
    p = tmp_path / "testnode.yaml"
    p.write_text(_MINIMAL_YAML.format(max_mb=max_mb), encoding="utf-8")
    return p
 def test_catalog_registration_fp16_no_env_block(tmp_path):
    """When model fits at FP16, no env block should be written."""
    from app import models as models_module
    node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
        updated = models_module._register_in_node_catalogs(
            repo_id="org/SmallModel",
            local_path=tmp_path / "org--SmallModel",
            vram_mb_fp16=4000,
            role="generator",
        )
    assert "testnode" in updated
    content = node_yaml.read_text()
    # _catalog_key strips org prefix and lowercases: "org/SmallModel" → "smallmodel"
    assert "smallmodel:" in content
    assert "CF_TEXT_4BIT" not in content
    assert "env:" not in content
 def test_catalog_registration_needs_4bit_writes_env_block(tmp_path):
    """When model only fits at 4-bit, env: CF_TEXT_4BIT: '1' must be written."""
    from app import models as models_module
    node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
        updated = models_module._register_in_node_catalogs(
            repo_id="org/BigModel",
            local_path=tmp_path / "org--BigModel",
            vram_mb_fp16=20000,  # won't fit at FP16 on 8 GB
            role="generator",
        )
    assert "testnode" in updated
    content = node_yaml.read_text()
    # _catalog_key: "org/BigModel" → "bigmodel"
    assert "bigmodel:" in content
    assert "env:" in content
    assert 'CF_TEXT_4BIT: "1"' in content
    assert "CF_TEXT_4BIT=1 required" in content  # description note
 def test_catalog_registration_too_large_skipped(tmp_path):
    """Model too large even at 4-bit should not be registered."""
    from app import models as models_module
    node_yaml = _make_node_yaml(tmp_path, max_mb=8192)
    with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path):
        updated = models_module._register_in_node_catalogs(
            repo_id="org/HugeModel",
            local_path=tmp_path / "org--HugeModel",
            vram_mb_fp16=80000,  # 4-bit ~22 GB, still won't fit on 8 GB
            role="generator",
        )
    assert updated == []
    content = node_yaml.read_text()
    assert "hugemodel" not in content
--- a/web/src/views/BenchmarkView.vue
+++ b/web/src/views/BenchmarkView.vue
@ -21,21 +21,28 @@
        :class="{ active: benchMode === 'style' }"
        @click="benchMode = 'style'"
      >✍️ Writing Style</button>
      <button
        class="mode-btn"
        :class="{ active: benchMode === 'plans' }"
        @click="benchMode = 'plans'"
      >📐 Planning</button>
    </div>
-    <ClassifierTab v-if="benchMode === 'classifier'" />
+    <ClassifierTab  v-if="benchMode === 'classifier'" />
-    <LlmEvalTab   v-if="benchMode === 'llm'" />
+    <LlmEvalTab     v-if="benchMode === 'llm'" />
-    <StyleTab     v-if="benchMode === 'style'" />
+    <StyleTab       v-if="benchMode === 'style'" />
    <PlansBenchTab  v-if="benchMode === 'plans'" />
  </div>
 </template>
 <script setup lang="ts">
 import { ref } from 'vue'
-import ClassifierTab from './ClassifierTab.vue'
+import ClassifierTab  from './ClassifierTab.vue'
-import LlmEvalTab   from './LlmEvalTab.vue'
+import LlmEvalTab     from './LlmEvalTab.vue'
-import StyleTab     from './StyleTab.vue'
+import StyleTab       from './StyleTab.vue'
 import PlansBenchTab  from './PlansBenchTab.vue'
-type BenchMode = 'classifier' | 'llm' | 'style'
+type BenchMode = 'classifier' | 'llm' | 'style' | 'plans'
 const benchMode = ref<BenchMode>('classifier')
 </script>
--- a/web/src/views/LlmEvalTab.vue
+++ b/web/src/views/LlmEvalTab.vue
@ -6,6 +6,8 @@
      <summary class="picker-summary">
        <span class="picker-title">📋 Task Selection</span>
        <span class="picker-badge">{{ llmTaskBadge }}</span>
        <button class="picker-bulk-btn" @click.stop.prevent="selectAllTasks()">All</button>
        <button class="picker-bulk-btn" @click.stop.prevent="clearAllTasks()">None</button>
      </summary>
      <div class="picker-body">
        <div v-if="llmTasksLoading" class="picker-loading">Loading tasks…</div>
@ -44,6 +46,8 @@
      <summary class="picker-summary">
        <span class="picker-title">🎯 Model Selection</span>
        <span class="picker-badge">{{ llmModelBadge }}</span>
        <button class="picker-bulk-btn" @click.stop.prevent="selectAllModels()">All</button>
        <button class="picker-bulk-btn" @click.stop.prevent="clearAllModels()">None</button>
      </summary>
      <div class="picker-body">
        <div v-if="llmModelsLoading" class="picker-loading">Loading models…</div>
@ -78,6 +82,33 @@
      </div>
    </details>
    <!-- Node Selection -->
    <div class="node-picker" v-if="llmNodes.length > 0">
      <span class="node-picker-label">Nodes:</span>
      <label
        v-for="node in llmNodes"
        :key="node.node_id"
        class="node-chip"
        :class="{ 'node-chip--off': !enabledNodes.has(node.node_id), 'node-chip--offline': !node.online }"
        :title="node.online ? `${node.node_id} — ${node.gpus.length} GPU(s)` : `${node.node_id} — offline`"
      >
        <input
          type="checkbox"
          class="node-chip-check"
          :checked="enabledNodes.has(node.node_id)"
          :disabled="!node.online || llmRunning"
          @change="toggleNode(node.node_id, ($event.target as HTMLInputElement).checked)"
        />
        {{ node.node_id }}
        <span class="node-chip-status" v-if="!node.online">offline</span>
      </label>
      <span class="node-picker-hint">
        {{ enabledNodeIds.length === llmNodes.filter(n => n.online).length
            ? 'auto-routing (all nodes)'
            : `restricted to: ${enabledNodeIds.join(', ')}` }}
      </span>
    </div>
    <!-- Run Controls -->
    <div class="run-controls">
      <button
@ -88,6 +119,24 @@
        {{ llmRunning ? '⏳ Running…' : '▶ Run LLM Eval' }}
      </button>
      <button v-if="llmRunning" class="btn-cancel" @click="cancelLlmBenchmark">✕ Cancel</button>
      <input
        v-model="llmJudgeUrl"
        class="judge-url-input"
        placeholder="Judge URL — leave empty to skip LLM judge scoring"
        :disabled="llmRunning"
        title="Optional: URL of a running cf-text service (e.g. http://10.1.10.158:8008). When set, each LLM response gets a secondary score from the judge model — adds a 'judge' column to results. Empty = primary quality scoring only."
      />
      <label class="workers-label" title="Run this many models concurrently (requires multiple GPUs)">
        <span class="workers-prefix">workers</span>
        <input
          v-model.number="llmWorkers"
          type="number"
          min="1"
          max="8"
          class="workers-input"
          :disabled="llmRunning"
        />
      </label>
      <span v-if="selectedLlmTasks.size === 0 || selectedLlmModels.size === 0" class="run-hint">
        Select at least one task and one model to run.
      </span>
@ -119,6 +168,7 @@
            <tr>
              <th class="hm-label-col">Model</th>
              <th class="hm-model-col">overall</th>
              <th v-if="llmHasJudge" class="hm-model-col hm-judge-col">judge</th>
              <th v-for="col in llmTaskTypeCols" :key="col" class="hm-model-col">{{ col }}</th>
              <th class="hm-model-col">tok/s</th>
            </tr>
@ -130,6 +180,12 @@
                class="hm-value-cell"
                :class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }"
              >{{ pct(row.avg_quality_score) }}</td>
              <td
                v-if="llmHasJudge"
                class="hm-value-cell hm-judge-cell"
                :class="{ 'bt-best': llmBestByCol['judge'] === row.model_id }"
                title="LLM-as-judge secondary score"
              >{{ row.avg_judge_score != null ? pct(row.avg_judge_score) : '—' }}</td>
              <td
                v-for="col in llmTaskTypeCols"
                :key="col"
@ -168,6 +224,12 @@ interface CfOrchModel {
  vram_estimate_mb?: number
 }
 interface CfOrchNode {
  node_id: string
  online: boolean
  gpus: { gpu_id: number; name: string; vram_total_mb: number; vram_free_mb: number }[]
 }
 interface LlmModelResult {
  model_name: string
  model_id: string
@ -175,9 +237,11 @@ interface LlmModelResult {
  avg_tokens_per_sec: number
  avg_completion_ms: number
  avg_quality_score: number
  avg_judge_score: number | null
  finetune_candidates: number
  error_count: number
  quality_by_task_type: Record<string, number>
  judge_score_by_task_type?: Record<string, number>
 }
 // ── State ───────────────────────────────────────────────────────────────────
@ -195,6 +259,10 @@ const llmError       = ref('')
 const llmResults     = ref<LlmModelResult[]>([])
 const llmEventSource = ref<EventSource | null>(null)
 const llmLogEl       = ref<HTMLElement | null>(null)
 const llmJudgeUrl    = ref('')
 const llmWorkers     = ref(1)
 const llmNodes       = ref<CfOrchNode[]>([])
 const enabledNodes   = ref<Set<string>>(new Set())
 // ── Computed ────────────────────────────────────────────────────────────────
 const llmTasksByType = computed((): Record<string, CfOrchTask[]> => {
@ -239,6 +307,14 @@ const llmTaskTypeCols = computed(() => {
  return [...types].sort()
 })
 const llmHasJudge = computed(() =>
  llmResults.value.some(r => r.avg_judge_score != null)
 )
 const enabledNodeIds = computed(() =>
  llmNodes.value.filter(n => n.online && enabledNodes.value.has(n.node_id)).map(n => n.node_id)
 )
 const llmBestByCol = computed((): Record<string, string> => {
  const best: Record<string, string> = {}
  if (llmResults.value.length === 0) return best
@ -249,6 +325,16 @@ const llmBestByCol = computed((): Record<string, string> => {
  }
  best['overall'] = bestId
  if (llmHasJudge.value) {
    bestId = ''; bestVal = -Infinity
    for (const r of llmResults.value) {
      if (r.avg_judge_score != null && r.avg_judge_score > bestVal) {
        bestVal = r.avg_judge_score; bestId = r.model_id
      }
    }
    best['judge'] = bestId
  }
  for (const col of llmTaskTypeCols.value) {
    bestId = ''; bestVal = -Infinity
    for (const r of llmResults.value) {
@ -306,6 +392,15 @@ function toggleService(models: CfOrchModel[], checked: boolean) {
  }
  selectedLlmModels.value = next
 }
 function selectAllTasks()  { selectedLlmTasks.value  = new Set(llmTasks.value.map(t => t.id)) }
 function clearAllTasks()   { selectedLlmTasks.value  = new Set() }
 function selectAllModels() { selectedLlmModels.value = new Set(llmModels.value.map(m => m.id)) }
 function clearAllModels()  { selectedLlmModels.value = new Set() }
 function toggleNode(id: string, checked: boolean) {
  const next = new Set(enabledNodes.value)
  checked ? next.add(id) : next.delete(id)
  enabledNodes.value = next
 }
 // ── Data loaders ─────────────────────────────────────────────────────────────
 async function loadLlmTasks() {
@ -335,6 +430,21 @@ async function loadLlmResults() {
  }
 }
 async function loadLlmConfig() {
  const { data } = await useApiFetch<{ judge_url?: string }>('/api/cforch/config')
  if (data?.judge_url && !llmJudgeUrl.value) {
    llmJudgeUrl.value = data.judge_url
  }
 }
 async function loadLlmNodes() {
  const { data } = await useApiFetch<{ nodes: CfOrchNode[] }>('/api/cforch/nodes')
  if (data?.nodes) {
    llmNodes.value = data.nodes
    enabledNodes.value = new Set(data.nodes.filter(n => n.online).map(n => n.node_id))
  }
 }
 // ── Run / cancel ──────────────────────────────────────────────────────────────
 function startLlmBenchmark() {
  llmRunning.value = true
@ -344,6 +454,15 @@ function startLlmBenchmark() {
  const params = new URLSearchParams()
  const taskIds = [...selectedLlmTasks.value].join(',')
  if (taskIds) params.set('task_ids', taskIds)
  const modelIds = [...selectedLlmModels.value].join(',')
  if (modelIds) params.set('model_ids', modelIds)
  if (llmJudgeUrl.value.trim()) params.set('judge_url', llmJudgeUrl.value.trim())
  if (llmWorkers.value > 1) params.set('workers', String(llmWorkers.value))
  const onlineNodeIds = llmNodes.value.filter(n => n.online).map(n => n.node_id)
  const isRestricted = enabledNodeIds.value.length < onlineNodeIds.length
  if (isRestricted && enabledNodeIds.value.length > 0) {
    params.set('node_ids', enabledNodeIds.value.join(','))
  }
  const es = new EventSource(`/api/cforch/run?${params}`)
  llmEventSource.value = es
@ -387,6 +506,8 @@ onMounted(() => {
  loadLlmTasks()
  loadLlmModels()
  loadLlmResults()
  loadLlmConfig()
  loadLlmNodes()
 })
 </script>
@ -451,6 +572,43 @@ onMounted(() => {
  color: var(--color-text-secondary, #6b7a99);
 }
 .judge-url-input {
  flex: 1;
  min-width: 14rem;
  max-width: 24rem;
  padding: 0.35rem 0.6rem;
  border: 1px solid var(--color-border, #d0d7e8);
  border-radius: 0.375rem;
  background: var(--color-surface, #fff);
  color: var(--color-text, #1a2338);
  font-size: 0.8rem;
  font-family: var(--font-mono, monospace);
 }
 .judge-url-input:disabled { opacity: 0.5; }
 .judge-url-input::placeholder { color: var(--color-text-secondary, #6b7a99); }
 .workers-label {
  display: flex;
  align-items: center;
  gap: 0.3rem;
  font-size: 0.8rem;
  color: var(--color-text-secondary, #6b7a99);
  white-space: nowrap;
 }
 .workers-prefix { font-family: var(--font-mono, monospace); }
 .workers-input {
  width: 3.2rem;
  padding: 0.35rem 0.4rem;
  border: 1px solid var(--color-border, #d0d7e8);
  border-radius: 0.375rem;
  background: var(--color-surface, #fff);
  color: var(--color-text, #1a2338);
  font-size: 0.8rem;
  font-family: var(--font-mono, monospace);
  text-align: center;
 }
 .workers-input:disabled { opacity: 0.5; }
 /* ── Run log ────────────────────────────────────────────── */
 .run-log {
  border: 1px solid var(--color-border, #d0d7e8);
@ -592,6 +750,15 @@ onMounted(() => {
  white-space: nowrap;
 }
 .hm-judge-col {
  background: color-mix(in srgb, var(--color-surface-raised, #e4ebf5) 80%, #c6d5f5);
 }
 .hm-judge-cell {
  background: color-mix(in srgb, var(--color-surface, #fff) 85%, #c6d5f5);
  font-style: italic;
  opacity: 0.9;
 }
 /* ── Model Picker ───────────────────────────────────────── */
 .model-picker {
  border: 1px solid var(--color-border, #d0d7e8);
@ -630,6 +797,24 @@ details[open] .picker-summary::before { content: '▼  '; }
  margin-left: auto;
 }
 .picker-bulk-btn {
  padding: 0.1rem 0.45rem;
  font-size: 0.7rem;
  font-family: var(--font-mono, monospace);
  background: var(--color-surface, #fff);
  border: 1px solid var(--color-border, #d0d7e8);
  border-radius: 0.25rem;
  color: var(--color-text-secondary, #6b7a99);
  cursor: pointer;
  transition: background 0.12s, color 0.12s;
  flex-shrink: 0;
 }
 .picker-bulk-btn:hover {
  background: var(--app-primary, #2A6080);
  color: #fff;
  border-color: var(--app-primary, #2A6080);
 }
 .picker-body {
  padding: 0.75rem;
  border-top: 1px solid var(--color-border, #d0d7e8);
@ -712,4 +897,61 @@ details[open] .picker-summary::before { content: '▼  '; }
  .picker-model-list { padding-left: 0; }
  .picker-model-name { max-width: 14ch; }
 }
 /* ── Node picker ────────────────────────────────────── */
 .node-picker {
  display: flex;
  align-items: center;
  gap: 0.5rem;
  flex-wrap: wrap;
  padding: 0.5rem 0.75rem;
  border: 1px solid var(--color-border, #d0d7e8);
  border-radius: 0.5rem;
  background: var(--color-surface-raised, #e4ebf5);
 }
 .node-picker-label {
  font-size: 0.78rem;
  font-weight: 600;
  color: var(--color-text-secondary, #6b7a99);
  text-transform: uppercase;
  letter-spacing: 0.04em;
  white-space: nowrap;
 }
 .node-chip {
  display: inline-flex;
  align-items: center;
  gap: 0.3rem;
  padding: 0.2rem 0.55rem;
  border: 1px solid var(--color-border, #d0d7e8);
  border-radius: 1rem;
  background: var(--color-surface, #fff);
  font-size: 0.78rem;
  font-family: var(--font-mono, monospace);
  color: var(--color-text, #1a2338);
  cursor: pointer;
  transition: background 0.12s, opacity 0.12s;
 }
 .node-chip--off {
  opacity: 0.45;
  background: transparent;
 }
 .node-chip--offline {
  opacity: 0.35;
  cursor: not-allowed;
  font-style: italic;
 }
 .node-chip-check { accent-color: var(--app-primary, #2A6080); }
 .node-chip-status {
  font-size: 0.66rem;
  color: var(--color-text-secondary, #6b7a99);
 }
 .node-picker-hint {
  font-size: 0.72rem;
  color: var(--color-text-secondary, #6b7a99);
  font-family: var(--font-mono, monospace);
  margin-left: auto;
 }
 </style>
--- a/web/src/views/ModelsView.vue
+++ b/web/src/views/ModelsView.vue
@ -51,8 +51,31 @@
          <span v-if="lookupResult.adapter_recommendation" class="chip chip-adapter">
            {{ lookupResult.adapter_recommendation }}
          </span>
-          <span v-if="lookupResult.size != null" class="preview-size">
+          <span v-if="selectedQuantSize > 0" class="preview-size">
-            {{ humanBytes(lookupResult.size) }}
+            {{ humanBytes(selectedQuantSize) }}
          </span>
        </div>
        <!-- GGUF quantization picker — only shown for GGUF repos -->
        <div v-if="lookupResult.gguf_files?.length" class="quant-picker">
          <label class="quant-label" for="quant-select">Quantization</label>
          <select
            id="quant-select"
            v-model="selectedQuant"
            class="quant-select"
            aria-label="Select quantization variant"
          >
            <option :value="null" disabled>Select quantization…</option>
            <option
              v-for="f in lookupResult.gguf_files"
              :key="f.filename"
              :value="f.quant_name ?? f.filename"
            >
              {{ f.quant_name ?? f.filename }} — {{ humanBytes(f.size) }}
            </option>
          </select>
          <span class="quant-hint">
            Q5_K_M or Q6_K recommended for 8 GB GPUs. Q8_0 for max quality.
          </span>
        </div>
@ -67,7 +90,7 @@
        <button
          class="btn-primary btn-add-queue"
-          :disabled="lookupResult.already_installed || lookupResult.already_queued || addingToQueue"
+          :disabled="!canAddToQueue"
          @click="addToQueue"
        >
          {{ addingToQueue ? 'Adding…' : 'Add to queue' }}
@ -99,9 +122,39 @@
          <span v-if="model.role"                  class="chip chip-role">{{ model.role }}</span>
          <span v-if="model.service"               class="chip" :class="serviceChipClass(model.service)">{{ model.service }}</span>
          <span v-if="model.adapter_recommendation" class="chip chip-adapter">{{ model.adapter_recommendation }}</span>
          <span v-if="model.quant_pattern"          class="chip chip-quant">{{ model.quant_pattern }}</span>
        </div>
        <!-- Allow manual service/role assignment for unrecognized pipeline tags -->
        <div v-if="!model.service" class="classify-row queue-classify">
          <select
            class="classify-select"
            :value="classifyDraft[model.id]?.service ?? ''"
            @change="onServiceChange(model.id, ($event.target as HTMLSelectElement).value)"
            aria-label="Assign service"
          >
            <option value="" disabled>Service…</option>
            <option v-for="svc in CLASSIFIABLE_SERVICES" :key="svc.value" :value="svc.value">{{ svc.label }}</option>
          </select>
          <select
            class="classify-select"
            :value="classifyDraft[model.id]?.role ?? ''"
            :disabled="!classifyDraft[model.id]?.service"
            @change="(e) => setClassifyRole(model.id, (e.target as HTMLSelectElement).value)"
            aria-label="Assign role"
          >
            <option value="" disabled>Role…</option>
            <option
              v-for="role in rolesForService(classifyDraft[model.id]?.service ?? '')"
              :key="role"
              :value="role"
            >{{ role }}</option>
          </select>
        </div>
        <div class="model-card-actions">
-          <button class="btn-primary btn-sm" @click="approveModel(model.id)">
+          <button
            class="btn-primary btn-sm"
            @click="approveModel(model.id, classifyDraft[model.id])"
          >
            Approve download
          </button>
        </div>
@ -252,6 +305,12 @@ import { ref, computed, onMounted, onUnmounted } from 'vue'
 // ── Type definitions ──────────────────────────────────
 interface GgufFile {
  filename: string
  size: number
  quant_name: string | null
 }
 interface LookupResult {
  repo_id: string
  pipeline_tag: string | null
@ -260,7 +319,8 @@ interface LookupResult {
  service: string | null
  compatible: boolean
  warning: string | null
-  size: number | null
+  model_size_bytes: number
  gguf_files: GgufFile[] | null
  description: string | null
  already_installed: boolean
  already_queued: boolean
@ -274,6 +334,7 @@ interface QueuedModel {
  adapter_recommendation: string | null
  role: string | null
  service: string | null
  quant_pattern: string | null
 }
 interface InstalledModel {
@ -302,6 +363,26 @@ const lookupLoading = ref(false)
 const lookupError   = ref<string | null>(null)
 const lookupResult  = ref<LookupResult | null>(null)
 const addingToQueue = ref(false)
 const selectedQuant = ref<string | null>(null)
 // Size of the selected GGUF file, or total model size for non-GGUF repos.
 const selectedQuantSize = computed<number>(() => {
  const r = lookupResult.value
  if (!r) return 0
  if (r.gguf_files?.length && selectedQuant.value) {
    const f = r.gguf_files.find(f => (f.quant_name ?? f.filename) === selectedQuant.value)
    return f?.size ?? r.model_size_bytes
  }
  return r.model_size_bytes
 })
 // Disable "Add to queue" when a GGUF repo but no quant chosen yet.
 const canAddToQueue = computed(() => {
  const r = lookupResult.value
  if (!r || r.already_installed || r.already_queued || addingToQueue.value) return false
  if (r.gguf_files?.length && !selectedQuant.value) return false
  return true
 })
 const queuedModels    = ref<QueuedModel[]>([])
 const installedModels = ref<InstalledModel[]>([])
@ -411,6 +492,7 @@ async function doLookup() {
  lookupLoading.value = true
  lookupError.value   = null
  lookupResult.value  = null
  selectedQuant.value = null
  try {
    const res = await fetch(`/api/models/lookup?repo_id=${encodeURIComponent(repoId)}`)
@ -442,7 +524,15 @@ async function addToQueue() {
    const res = await fetch('/api/models/queue', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
-      body: JSON.stringify({ repo_id, pipeline_tag, adapter_recommendation, role, service }),
+      body: JSON.stringify({
        repo_id,
        pipeline_tag,
        adapter_recommendation,
        role,
        service,
        model_size_bytes: selectedQuantSize.value,
        quant_pattern: selectedQuant.value,
      }),
    })
    if (res.ok) {
      lookupResult.value = { ...lookupResult.value, already_queued: true }
@ -454,8 +544,16 @@ async function addToQueue() {
  }
 }
-async function approveModel(id: string) {
+async function approveModel(id: string, draft?: { service: string; role: string }) {
  try {
    // If the user picked a service/role for an unrecognized model, patch it first.
    if (draft?.service && draft?.role) {
      await fetch(`/api/models/queue/${encodeURIComponent(id)}`, {
        method: 'PATCH',
        headers: { 'Content-Type': 'application/json' },
        body: JSON.stringify({ service: draft.service, role: draft.role }),
      })
    }
    const res = await fetch(`/api/models/queue/${encodeURIComponent(id)}/approve`, { method: 'POST' })
    if (res.ok) {
      await loadQueue()
@ -774,6 +872,44 @@ onUnmounted(() => {
  align-self: flex-start;
 }
 /* ── Quant picker ── */
 .quant-picker {
  display: flex;
  flex-direction: column;
  gap: 0.35rem;
 }
 .quant-label {
  font-size: 0.8rem;
  font-weight: 600;
  color: var(--color-text-muted, #4a5c7a);
  text-transform: uppercase;
  letter-spacing: 0.04em;
 }
 .quant-select {
  padding: 0.4rem 0.6rem;
  border: 1px solid var(--color-border, #a8b8d0);
  border-radius: var(--radius-md, 0.5rem);
  background: var(--color-surface, #f0f4fb);
  color: var(--color-text, #1a2338);
  font-size: 0.9rem;
  font-family: var(--font-mono, monospace);
  cursor: pointer;
 }
 .quant-hint {
  font-size: 0.78rem;
  color: var(--color-text-muted, #4a5c7a);
 }
 .chip-quant {
  background: color-mix(in srgb, var(--color-primary, #2A6080) 15%, transparent);
  color: var(--color-primary, #2A6080);
  font-family: var(--font-mono, monospace);
  font-size: 0.75rem;
 }
 /* ── Model cards (queue + downloads) ── */
 .model-card {
  border: 1px solid var(--color-border, #a8b8d0);
--- a/web/src/views/PlansBenchTab.vue
+++ b/web/src/views/PlansBenchTab.vue