From bce932461a762f66bf4064243d98b1e3af2f1c48 Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Sat, 2 May 2026 23:36:04 -0700 Subject: [PATCH] =?UTF-8?q?feat:=20plans=20benchmark=20harness=20=E2=80=94?= =?UTF-8?q?=20model=20scoring=20for=20CF=20planning=20prompts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds benchmark_plans.py script, plans_bench API router, PlansBenchTab Vue component, and registers /api/plans-bench in api.py. Also extends models registry (cf-text catalog integration), cforch client, LlmEvalTab, and ModelsView with cf-orch fleet support. Wires Planning mode into BenchmarkView. --- .env.example | 4 + app/api.py | 6 + app/cforch.py | 244 +++++++- app/models.py | 151 ++++- config/label_tool.yaml.example | 6 +- manage.sh | 35 ++ scripts/benchmark_plans.py | 719 +++++++++++++++++++++ scripts/export_plans.py | 458 ++++++++++++++ tests/test_cforch.py | 36 +- tests/test_models.py | 81 +++ web/src/views/BenchmarkView.vue | 21 +- web/src/views/LlmEvalTab.vue | 242 +++++++ web/src/views/ModelsView.vue | 150 ++++- web/src/views/PlansBenchTab.vue | 1043 +++++++++++++++++++++++++++++++ 14 files changed, 3137 insertions(+), 59 deletions(-) create mode 100644 scripts/benchmark_plans.py create mode 100644 scripts/export_plans.py create mode 100644 web/src/views/PlansBenchTab.vue diff --git a/.env.example b/.env.example index 03c3c9f..cf441a2 100644 --- a/.env.example +++ b/.env.example @@ -17,3 +17,7 @@ CF_LICENSE_KEY=CFG-AVCT-xxxx-xxxx-xxxx # Set one of these to use a cloud LLM instead of a local model. # ANTHROPIC_API_KEY=sk-ant-... # OPENAI_API_KEY=sk-... + +# ── HuggingFace (required for gated/terms-restricted model downloads) ───────── +# Generate at https://huggingface.co/settings/tokens and accept model terms first. +# HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx diff --git a/app/api.py b/app/api.py index 9709115..eb2e151 100644 --- a/app/api.py +++ b/app/api.py @@ -34,6 +34,12 @@ app.include_router(eval_router, prefix="/api") from app.train.train import router as train_router app.include_router(train_router, prefix="/api/train") +from app.plans_bench import router as plans_bench_router +app.include_router(plans_bench_router, prefix="/api/plans-bench") + +# In-memory last-action store (single user, local tool — in-memory is fine) +_last_action: dict | None = None + from app.dashboard import router as dashboard_router app.include_router(dashboard_router, prefix="/api") diff --git a/app/cforch.py b/app/cforch.py index ccab705..60c8221 100644 --- a/app/cforch.py +++ b/app/cforch.py @@ -17,9 +17,12 @@ import logging import os import re import subprocess as _subprocess +import tempfile from pathlib import Path from typing import Any +import urllib.parse + import yaml from fastapi import APIRouter, HTTPException from fastapi.responses import StreamingResponse @@ -75,9 +78,31 @@ def _load_cforch_config() -> dict: "license_key": _coalesce(file_cfg.get("license_key", ""), "CF_LICENSE_KEY"), "ollama_url": _coalesce(file_cfg.get("ollama_url", ""), "OLLAMA_HOST"), "ollama_model": _coalesce(file_cfg.get("ollama_model", ""), "OLLAMA_MODEL"), + "judge_url": _coalesce(file_cfg.get("judge_url", ""), "CF_JUDGE_URL"), + "hf_token": _coalesce(file_cfg.get("hf_token", ""), "HF_TOKEN"), } +def _validate_service_url(url: str, param_name: str) -> str: + """Validate that a URL is a well-formed http/https URL with a hostname. + + Guards against SSRF: only http/https is allowed; the URL must have a + non-empty host. Does not enforce an allowlist — call sites are internal + tooling, not a public API. + """ + if not url: + return url + try: + parsed = urllib.parse.urlparse(url) + except Exception: + raise HTTPException(400, f"{param_name}: not a valid URL") + if parsed.scheme not in ("http", "https"): + raise HTTPException(400, f"{param_name}: URL must start with http:// or https://") + if not parsed.hostname: + raise HTTPException(400, f"{param_name}: URL has no hostname") + return url + + def _strip_ansi(text: str) -> str: """Remove ANSI escape codes from a string.""" return re.sub(r'\x1b\[[0-9;]*m', '', text) @@ -147,48 +172,141 @@ def get_tasks() -> dict: # ── GET /models ──────────────────────────────────────────────────────────────── +# Services and roles surfaced in the benchmark model picker. +# Covers all cf-orch service types that benchmark.py can route tasks to. +_BENCH_SERVICES = frozenset({ + "cf-text", "vllm", # LLM text generation + "cf-stt", # speech-to-text + "cf-tts", # text-to-speech + "cf-vision", # image classification / embedding + "cf-voice", # audio context classification +}) +_BENCH_ROLES = frozenset({ + "generator", "vlm", # LLM roles + "stt", "alm", # speech recognition + "tts", # speech synthesis + "vision", "embedding", # image understanding + "classifier", # audio classification (cf-voice) +}) + + @router.get("/models") def get_models() -> dict: - """Return model list from bench_models.yaml.""" + """Return model list from bench_models.yaml merged with locally installed models. + + bench_models.yaml entries are listed first and take precedence; any installed + model whose repo_id is already present in the YAML is skipped. Only models + whose service is in _BENCH_SERVICES (cf-text, vllm, cf-stt, cf-tts, cf-vision, + cf-voice) are surfaced from the installed registry. + """ cfg = _load_cforch_config() models_path = cfg.get("bench_models", "") - if not models_path: - return {"models": []} - p = Path(models_path) - if not p.exists(): - return {"models": []} - - try: - raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {} - except yaml.YAMLError as exc: - logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc) - return {"models": []} - - models_raw = raw.get("models", []) or [] models: list[dict] = [] - for m in models_raw: - if not isinstance(m, dict): - continue - models.append({ - "name": m.get("name", ""), - "id": m.get("id", ""), - "service": m.get("service", "ollama"), - "tags": m.get("tags", []) or [], - "vram_estimate_mb": m.get("vram_estimate_mb", 0), - }) + bench_ids: set[str] = set() + + if models_path: + p = Path(models_path) + if p.exists(): + try: + raw = yaml.safe_load(p.read_text(encoding="utf-8")) or {} + except yaml.YAMLError as exc: + logger.warning("Failed to parse bench_models.yaml %s: %s", p, exc) + raw = {} + for m in (raw.get("models", []) or []): + if not isinstance(m, dict): + continue + model_id = m.get("id", "") + models.append({ + "name": m.get("name", ""), + "id": model_id, + "service": m.get("service", "ollama"), + "tags": m.get("tags", []) or [], + "vram_estimate_mb": m.get("vram_estimate_mb", 0), + }) + if model_id: + bench_ids.add(model_id) + + # Merge installed generator models not already in bench_models.yaml. + try: + from app.models import list_installed # local import avoids circular dependency at module load + for installed in list_installed(): + model_id: str = installed.get("model_id") or "" + service: str = installed.get("service") or "" + role: str = installed.get("role") or "" + if not model_id: + continue + if service not in _BENCH_SERVICES or role not in _BENCH_ROLES: + continue + if model_id in bench_ids: + continue + display_name = model_id.split("/", 1)[-1] if "/" in model_id else model_id + models.append({ + "name": display_name, + "id": model_id, + "service": service, + "tags": [role], + "vram_estimate_mb": installed.get("vram_mb") or 0, + }) + bench_ids.add(model_id) + except Exception as exc: + logger.warning("Could not merge installed models into model list: %s", exc) return {"models": models} # ── GET /run ─────────────────────────────────────────────────────────────────── +@router.get("/nodes") +def get_nodes() -> dict: + """Proxy the coordinator's /api/nodes list, returning node_id + online status. + + Online is inferred from last_heartbeat: any node with a recent heartbeat is online. + Returns an empty list if the coordinator is unreachable. + """ + cfg = _load_cforch_config() + coordinator_url = cfg.get("coordinator_url", "").rstrip("/") + if not coordinator_url: + return {"nodes": []} + try: + import httpx as _httpx + resp = _httpx.get(f"{coordinator_url}/api/nodes", timeout=5.0) + resp.raise_for_status() + raw_nodes = resp.json().get("nodes", []) + return { + "nodes": [ + { + "node_id": n.get("node_id", ""), + "online": n.get("last_heartbeat") is not None, + "gpus": [ + { + "gpu_id": g.get("gpu_id"), + "name": g.get("name", ""), + "vram_total_mb": g.get("vram_total_mb", 0), + "vram_free_mb": g.get("vram_free_mb", 0), + } + for g in n.get("gpus", []) + ], + } + for n in raw_nodes + ] + } + except Exception as exc: + logger.warning("Could not fetch nodes from coordinator: %s", exc) + return {"nodes": []} + + @router.get("/run") def run_benchmark( task_ids: str = "", + model_ids: str = "", model_tags: str = "", coordinator_url: str = "", ollama_url: str = "", + judge_url: str = "", + judge_backend: str = "chat", + workers: int = 1, + node_ids: str = "", ) -> StreamingResponse: """Spawn cf-orch benchmark.py and stream stdout as SSE progress events.""" global _BENCH_RUNNING, _bench_proc @@ -205,6 +323,13 @@ def run_benchmark( cfg_coordinator = cfg.get("coordinator_url", "") cfg_ollama = cfg.get("ollama_url", "") cfg_license_key = cfg.get("license_key", "") + cfg_judge_url = cfg.get("judge_url", "") + + # Validate URL params before spawning the subprocess. + # _validate_service_url raises HTTPException on bad input (caught by FastAPI before streaming starts). + _validate_service_url(coordinator_url, "coordinator_url") + _validate_service_url(ollama_url, "ollama_url") + _validate_service_url(judge_url, "judge_url") def generate(): global _BENCH_RUNNING, _bench_proc @@ -213,16 +338,68 @@ def run_benchmark( yield f"data: {json.dumps({'type': 'error', 'message': 'bench_script not configured or not found'})}\n\n" return + # Build effective models file: bench_models.yaml + any installed models + # whose IDs were selected but are absent from the YAML (e.g. downloaded + # via the Models view). Written to a temp file so benchmark.py sees one + # unified list; cleaned up in the finally block. + effective_models_file = bench_models + _tmp_models_path: str | None = None + + if model_ids and bench_models and Path(bench_models).exists(): + requested_ids = set(model_ids.split(",")) + try: + raw_bench = yaml.safe_load(Path(bench_models).read_text(encoding="utf-8")) or {} + bench_entries: list[dict] = raw_bench.get("models", []) or [] + bench_id_set = {m.get("id", "") for m in bench_entries if isinstance(m, dict)} + missing_ids = requested_ids - bench_id_set + if missing_ids: + from app.models import list_installed + installed_map = { + m["model_id"]: m + for m in list_installed() + if m.get("model_id") and m.get("service") in _BENCH_SERVICES + } + extra: list[dict] = [] + for mid in missing_ids: + if mid in installed_map: + inst = installed_map[mid] + entry: dict[str, Any] = { + "id": mid, + "name": mid.split("/", 1)[-1] if "/" in mid else mid, + "service": inst.get("service", "cf-text"), + "vram_estimate_mb": inst.get("vram_mb") or 0, + "tags": [inst.get("role", "generator")], + "temperature": 0.0, + } + local_path = inst.get("path", "") or inst.get("local_path", "") + if local_path: + entry["model_path"] = local_path + extra.append(entry) + if extra: + merged = {"models": bench_entries + extra} + tf = tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", delete=False, + prefix="avocet_bench_models_", + ) + yaml.dump(merged, tf) + tf.close() + _tmp_models_path = tf.name + effective_models_file = _tmp_models_path + except Exception as exc: + logger.warning("Could not merge installed models into temp bench file: %s", exc) + cmd = [ python_bin, bench_script, "--tasks", bench_tasks, - "--models", bench_models, + "--models", effective_models_file, "--output", results_dir, ] if task_ids: cmd.extend(["--filter-tasks"] + task_ids.split(",")) + if model_ids: + cmd.extend(["--filter-models"] + model_ids.split(",")) if model_tags: cmd.extend(["--filter-tags"] + model_tags.split(",")) @@ -233,6 +410,15 @@ def run_benchmark( cmd.extend(["--coordinator", effective_coordinator]) if effective_ollama: cmd.extend(["--ollama-url", effective_ollama]) + effective_judge = judge_url if judge_url else cfg_judge_url + if effective_judge: + cmd.extend(["--judge-url", effective_judge]) + if judge_backend and judge_backend != "chat": + cmd.extend(["--judge-backend", judge_backend]) + if workers > 1: + cmd.extend(["--workers", str(workers)]) + if node_ids: + cmd.extend(["--nodes"] + node_ids.split(",")) # Pass license key as env var so subprocess can authenticate with cf-orch proc_env = {**os.environ} @@ -273,6 +459,11 @@ def run_benchmark( yield f"data: {json.dumps({'type': 'error', 'message': str(exc)})}\n\n" finally: _BENCH_RUNNING = False + if _tmp_models_path: + try: + os.unlink(_tmp_models_path) + except OSError: + pass return StreamingResponse( generate(), @@ -295,6 +486,7 @@ def get_cforch_config() -> dict: "coordinator_url": cfg.get("coordinator_url", ""), "ollama_url": cfg.get("ollama_url", ""), "ollama_model": cfg.get("ollama_model", ""), + "judge_url": cfg.get("judge_url", ""), "license_key_set": bool(cfg.get("license_key", "")), "source": "env" if not _config_file().exists() else "yaml+env", } @@ -303,7 +495,7 @@ def get_cforch_config() -> dict: # ── GET /results ─────────────────────────────────────────────────────────────── @router.get("/results") -def get_results() -> dict: +def get_results() -> list: """Return the latest benchmark summary.json from results_dir.""" cfg = _load_cforch_config() results_dir = cfg.get("results_dir", "") diff --git a/app/models.py b/app/models.py index c332a34..b4c05ae 100644 --- a/app/models.py +++ b/app/models.py @@ -15,6 +15,7 @@ from __future__ import annotations import json import logging import os +import re import shutil import threading from datetime import datetime, timezone @@ -60,6 +61,30 @@ _CF_ORCH_PROFILES_DIR: Path = Path( router = APIRouter() +# ── HuggingFace auth ───────────────────────────────────────────────────────── + +def _get_hf_token() -> str | None: + """Return HF token from label_tool.yaml, then HF_TOKEN / HUGGING_FACE_HUB_TOKEN env vars.""" + config_file = _ROOT / "config" / "label_tool.yaml" + if config_file.exists(): + try: + import yaml as _yaml + raw = _yaml.safe_load(config_file.read_text(encoding="utf-8")) or {} + token = (raw.get("hf_token") or raw.get("cforch", {}).get("hf_token") or "").strip() + if token: + return token + except Exception: + pass + return os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN") or None + + +# ── GGUF quantization detection ─────────────────────────────────────────────── +# Matches quant identifiers in GGUF filenames: Q4_K_M, Q8_0, F16, IQ3_M, etc. +_QUANT_RE = re.compile( + r'[._-]((?:IQ\d|Q\d)[A-Z0-9_]*|F16|BF16)\.gguf$', + re.IGNORECASE, +) + # ── Download progress shared state ──────────────────────────────────────────── # Updated by the background download thread; read by GET /download/stream. _download_progress: dict[str, Any] = {} @@ -91,12 +116,15 @@ _TAG_TO_INFO: dict[str, _TagInfo] = { "audio-classification": {"adapter": None, "role": "classifier", "service": "cf-voice"}, # TTS — cf-tts text-to-speech service "text-to-speech": {"adapter": None, "role": "tts", "service": "cf-tts"}, - # Vision — cf-vision image classification / embedding / VLM service + # Vision classifiers / embedders — cf-vision (SigLIP/CLIP-style models) "image-classification": {"adapter": None, "role": "vision", "service": "cf-vision"}, "zero-shot-image-classification": {"adapter": None, "role": "vision", "service": "cf-vision"}, "image-feature-extraction": {"adapter": None, "role": "embedding", "service": "cf-vision"}, - "image-text-to-text": {"adapter": None, "role": "vlm", "service": "cf-vision"}, - "visual-question-answering": {"adapter": None, "role": "vlm", "service": "cf-vision"}, + # Generative VLMs (image+text → text) — run under vllm, not cf-vision. + # cf-vision is a classifier/embedder service; generative VLMs like Qwen-VL, + # LLaVA, and InternVL are textgen models that happen to accept image inputs. + "image-text-to-text": {"adapter": None, "role": "vlm", "service": "vllm"}, + "visual-question-answering": {"adapter": None, "role": "vlm", "service": "vllm"}, # Image generation — cf-image (text → image; distinct from cf-vision image understanding) "text-to-image": {"adapter": None, "role": "image-gen", "service": "cf-image"}, # Embedding — cf-core shared embedding layer @@ -195,10 +223,17 @@ def _get_queue_entry(entry_id: str) -> dict | None: def _catalog_key(repo_id: str) -> str: """Derive a readable catalog key from repo_id. - ibm-granite/granite-4.1-8b → granite-4.1-8b - facebook/bart-large-cnn → bart-large-cnn + ibm-granite/granite-4.1-8b → granite-4.1-8b + facebook/bart-large-cnn → bart-large-cnn + WithinUsAI/Opus4.7-GODs.Ghost.Codex-4B.GGuF → opus4.7-gods.ghost.codex-4b + + The coordinator skips catalog lookup for keys ending in ".gguf" (treats them + as direct file paths). Strip the suffix so GGUF repo names produce valid keys. """ - return repo_id.split("/", 1)[-1].lower() + key = repo_id.split("/", 1)[-1].lower() + if key.endswith(".gguf"): + key = key[:-5] + return key def _insert_catalog_entry(content: str, entry_lines: str) -> str: @@ -290,6 +325,15 @@ def _register_in_node_catalogs( max_mb: int = cf_text.get("max_mb", 0) catalog: dict = cf_text.get("catalog") or {} + # If the node has a different local model dir, remap the NFS path. + model_base = cf_text.get("model_base_path", "").rstrip("/") + if model_base: + nfs_base = str(_CF_TEXT_MODELS_DIR).rstrip("/") + model_name = local_path.name + effective_path_str = f"{model_base}/{model_name}" + else: + effective_path_str = local_path_str + # Skip if key already exists if model_key in catalog: logger.debug("Key %r already in %s — skipping", model_key, yaml_file.name) @@ -301,10 +345,10 @@ def _register_in_node_catalogs( for entry in catalog.values() if isinstance(entry, dict) } - if local_path_str in registered_paths or any( - p.startswith(local_path_str + "/") for p in registered_paths + if effective_path_str in registered_paths or any( + p.startswith(effective_path_str + "/") for p in registered_paths ): - logger.debug("Path %s already registered in %s — skipping", local_path_str, yaml_file.name) + logger.debug("Path %s already registered in %s — skipping", effective_path_str, yaml_file.name) continue # Determine whether model fits at FP16 or needs 4-bit @@ -330,12 +374,18 @@ def _register_in_node_catalogs( if needs_4bit else f" # FP16 file-size estimate" ) + env_block = ( + f" env:\n" + f" CF_TEXT_4BIT: \"1\"\n" + if needs_4bit else "" + ) entry_block = ( f" # auto-registered by avocet on download\n" f" {model_key}:\n" - f" path: {local_path_str}\n" + f" path: {effective_path_str}\n" f" vram_mb: {vram_for_node}{vram_comment}\n" f" description: \"{desc}\"\n" + f"{env_block}" ) new_content = _insert_catalog_entry(content, entry_block) @@ -388,12 +438,17 @@ def _run_download( role: str | None = None, service: str | None = None, model_size_bytes: int = 0, + quant_pattern: str | None = None, ) -> None: """Background thread: download model via huggingface_hub.snapshot_download. model_size_bytes is the sum of file sizes reported by the HF API (siblings). It is used to estimate vram_mb and written to model_info.json so cf-orch can budget VRAM when allocating a cf-text instance for this model. + + quant_pattern: when set, restricts snapshot_download to only files matching + *{quant_pattern}*.gguf (plus metadata). Avoids downloading every quant variant + from GGUF-only repos like bartowski/*. """ global _download_progress local_dir = _model_dir_for(repo_id, service) @@ -422,10 +477,20 @@ def _run_download( local_dir.mkdir(parents=True, exist_ok=True) poll_thread.start() - snapshot_download( - repo_id=repo_id, - local_dir=str(local_dir), - ) + + dl_kwargs: dict[str, Any] = {"repo_id": repo_id, "local_dir": str(local_dir)} + hf_token = _get_hf_token() + if hf_token: + dl_kwargs["token"] = hf_token + if quant_pattern: + # Include both cases: repos use mixed conventions (Q6_K vs q6_k). + dl_kwargs["allow_patterns"] = [ + f"*{quant_pattern.upper()}*.gguf", + f"*{quant_pattern.lower()}*.gguf", + "*.json", + "README.md", + ] + snapshot_download(**dl_kwargs) # Estimate VRAM from reported file size. # HF siblings sizes are pre-quantisation file sizes; add 10% for KV cache @@ -531,9 +596,31 @@ def lookup_model(repo_id: str) -> dict: ) logger.warning("Unsupported pipeline_tag %r for %s", pipeline_tag, repo_id) - # Estimate model size from siblings list + # Detect GGUF files and parse quant names from siblings list. + # For GGUF-only repos (bartowski, TheBloke, etc.) this lets the UI show + # a per-quant size picker instead of downloading every variant. siblings = data.get("siblings") or [] - model_size_bytes: int = sum(s.get("size", 0) for s in siblings if isinstance(s, dict)) + gguf_files: list[dict] = [] + for s in siblings: + if not isinstance(s, dict): + continue + fname: str = s.get("rfilename", "") + if not fname.lower().endswith(".gguf"): + continue + m = _QUANT_RE.search(fname) + gguf_files.append({ + "filename": fname, + "size": s.get("size", 0) or 0, + "quant_name": m.group(1).upper() if m else None, + }) + gguf_files.sort(key=lambda f: f["size"]) + + # model_size_bytes: total of all siblings (for non-GGUF repos) or all GGUFs only. + # For GGUF repos the frontend will substitute the selected quant's size on submit. + if gguf_files: + model_size_bytes: int = sum(f["size"] for f in gguf_files) + else: + model_size_bytes = sum(s.get("size", 0) for s in siblings if isinstance(s, dict)) # Description: first 300 chars of card data (modelId field used as fallback) card_data = data.get("cardData") or {} @@ -549,6 +636,7 @@ def lookup_model(repo_id: str) -> dict: "compatible": compatible, "warning": warning, "model_size_bytes": model_size_bytes, + "gguf_files": gguf_files if gguf_files else None, "description": description, "tags": data.get("tags") or [], "downloads": data.get("downloads") or 0, @@ -579,6 +667,9 @@ class QueueAddRequest(BaseModel): # Stored in the queue entry so approve can pass it to _run_download # without a second HF API round-trip. model_size_bytes: int = 0 + # GGUF quantization pattern (e.g. "Q5_K_M"). When set, snapshot_download + # restricts to *{quant_pattern}*.gguf instead of fetching all variants. + quant_pattern: str | None = None @router.post("/queue", status_code=201) @@ -597,6 +688,7 @@ def add_to_queue(req: QueueAddRequest) -> dict: "role": req.role, "service": req.service, "model_size_bytes": req.model_size_bytes, + "quant_pattern": req.quant_pattern, "status": "pending", "queued_at": datetime.now(timezone.utc).isoformat(), } @@ -629,6 +721,7 @@ def approve_queue_entry(entry_id: str) -> dict: entry.get("role"), entry.get("service"), entry.get("model_size_bytes", 0), + entry.get("quant_pattern"), ), daemon=True, name=f"model-download-{entry_id}", @@ -638,6 +731,32 @@ def approve_queue_entry(entry_id: str) -> dict: return {"ok": True} +# ── PATCH /queue/{id} ───────────────────────────────────────────────────────── + +class QueuePatchRequest(BaseModel): + service: str | None = None + role: str | None = None + + +@router.patch("/queue/{entry_id}") +def patch_queue_entry(entry_id: str, body: QueuePatchRequest) -> dict: + """Update mutable fields (service, role) on a pending queue entry.""" + entry = _get_queue_entry(entry_id) + if entry is None: + raise HTTPException(404, f"Queue entry {entry_id!r} not found") + if entry.get("status") != "pending": + raise HTTPException(409, f"Only pending entries can be patched (current: {entry.get('status')!r})") + + updates: dict = {} + if body.service is not None: + updates["service"] = body.service + if body.role is not None: + updates["role"] = body.role + + updated = _update_queue_entry(entry_id, updates) + return updated or {} + + # ── DELETE /queue/{id} ───────────────────────────────────────────────────────── @router.delete("/queue/{entry_id}") diff --git a/config/label_tool.yaml.example b/config/label_tool.yaml.example index 3407a0f..3542e9d 100644 --- a/config/label_tool.yaml.example +++ b/config/label_tool.yaml.example @@ -41,11 +41,15 @@ cforch: # Python interpreter with cf-orch installed python_bin: /devl/miniconda3/envs/cf/bin/python - # Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST + # Connection config — override env vars CF_ORCH_URL / CF_LICENSE_KEY / OLLAMA_HOST / CF_JUDGE_URL / HF_TOKEN # coordinator_url: http://localhost:7700 # license_key: CFG-AVCT-xxxx-xxxx-xxxx # ollama_url: http://localhost:11434 # ollama_model: llama3.2:3b + # judge_url: http://10.1.10.158:8008 # Sif cf-text — LLM-as-judge secondary scorer + # judge_url: http://10.1.10.71:8008 # Heimdall cf-text (alternative) + # Or set CF_JUDGE_URL. Populates the Judge URL field in the LLM Eval UI automatically. + # hf_token: hf_xxxxxxxxxxxxxxxxxxxx # HuggingFace token — required for gated/terms-restricted models # Imitate tab — pull real samples from sibling CF product APIs and run them # through local LLMs to build a corrections dataset. diff --git a/manage.sh b/manage.sh index daf147d..0a02d1f 100755 --- a/manage.sh +++ b/manage.sh @@ -90,6 +90,12 @@ usage() { echo -e " ${GREEN}score [args]${NC} Shortcut: --score [args]" echo -e " ${GREEN}compare [args]${NC} Shortcut: --compare [args]" echo "" + echo " Planning Benchmark:" + echo -e " ${GREEN}plans-bench [args]${NC} Run benchmark_plans.py (args passed through)" + echo -e " ${GREEN}plans-list${NC} Shortcut: --list-models" + echo -e " ${GREEN}plans-run [args]${NC} Run a single model (--verbose auto-added)" + echo -e " ${GREEN}plans-compare [more]${NC} Compare models side-by-side" + echo "" echo " Writing Style Benchmark:" echo -e " ${GREEN}style-bench [args]${NC} Run benchmark_style.py (args passed through)" echo -e " ${GREEN}style-list${NC} List available ollama models for style bench" @@ -127,6 +133,8 @@ case "$CMD" in fi mkdir -p "$LOG_DIR" API_LOG="${LOG_DIR}/api.log" + # Load .env if present — sets HF_TOKEN and other optional overrides. + [[ -f .env ]] && set -a && source .env && set +a info "Building Vue SPA…" (cd web && npm run build) >> "$API_LOG" 2>&1 info "Starting FastAPI on port ${API_PORT}…" @@ -179,6 +187,9 @@ case "$CMD" in mkdir -p "$LOG_DIR" DEV_API_LOG="${LOG_DIR}/dev-api.log" + # Load .env if present — sets HF_TOKEN and other optional overrides. + [[ -f .env ]] && set -a && source .env && set +a + if [[ -f "$DEV_API_PID_FILE" ]] && kill -0 "$(<"$DEV_API_PID_FILE")" 2>/dev/null; then warn "Dev API already running (PID $(<"$DEV_API_PID_FILE"))" else @@ -255,6 +266,30 @@ case "$CMD" in exec "$0" benchmark --compare "$@" ;; + plans-bench) + info "Running planning benchmark (${ENV_UI})…" + "$PYTHON_UI" scripts/benchmark_plans.py "$@" + ;; + + plans-list) + exec "$0" plans-bench --list-models + ;; + + plans-run) + if [[ $# -lt 1 ]]; then + error "Usage: ./manage.sh plans-run [extra args]" + fi + MODEL="$1"; shift + exec "$0" plans-bench --model "$MODEL" --verbose "$@" + ;; + + plans-compare) + if [[ $# -lt 2 ]]; then + error "Usage: ./manage.sh plans-compare [more…]" + fi + exec "$0" plans-bench --compare "$@" --verbose + ;; + style-bench) info "Running writing style benchmark (${ENV_BM})…" if [[ ! -x "$PYTHON_BM" ]]; then diff --git a/scripts/benchmark_plans.py b/scripts/benchmark_plans.py new file mode 100644 index 0000000..0b02fa7 --- /dev/null +++ b/scripts/benchmark_plans.py @@ -0,0 +1,719 @@ +#!/usr/bin/env python +"""CF-specific planning benchmark — compare base models before fine-tuning. + +Sends held-out CircuitForge planning prompts to one or more models via the +cf-text (local) or cf-orch API, then scores responses against CF-specific +rubrics. Use this to select the best base model for SFT. + +Scoring rubrics (each 0-1, summed to total/N): + - task_structure : uses checkbox syntax (- [ ]), git commit steps + - tier_awareness : mentions Free/Paid/Premium/Ultra tiers + - privacy_pillar : mentions privacy/local-inference/no-logging + - safety_pillar : mentions safety, human approval, or reversibility + - accessibility : mentions ND/accessibility/adaptive needs + - license_split : mentions MIT vs BSL or open-core model + - file_paths : uses plausible file path references + - cf_conventions : uses conda run -n cf, /Library/Development/, or known CF dirs + - paired_coherence : (paired only) plan references the design doc's feature name + - length_ok : 300–2500 words (under-short = hallucination risk; over-long = padding) + +Usage +----- + # List available model targets + python scripts/benchmark_plans.py --list-models + + # Run all held-out prompts against a single model, print report + python scripts/benchmark_plans.py --model llama3.2-3b + + # Compare two models side-by-side + python scripts/benchmark_plans.py --compare llama3.2-3b mistral-7b + + # Run with a custom API base (cf-text default: http://localhost:8080/v1) + python scripts/benchmark_plans.py --model llama3.2-3b --api-base http://localhost:8080/v1 + + # Export detailed results JSON + python scripts/benchmark_plans.py --model llama3.2-3b --output data/bench_results.json +""" +from __future__ import annotations + +import argparse +import json +import re +import sys +import threading +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field, asdict +from pathlib import Path +from typing import Any + +import httpx + +# ── Paths ────────────────────────────────────────────────────────────────────── + +_ROOT = Path(__file__).parent.parent +_DATA_DIR = _ROOT / "data" + +CF_TEXT_BASE = "http://localhost:8080/v1" +CF_ORCH_BASE = "http://localhost:8090/v1" +CF_COORD_URL = "http://10.1.10.71:7700" # cf-orch coordinator (LAN) + +# ── Held-out prompts ─────────────────────────────────────────────────────────── +# These are NOT in the training export (no matching docs in circuitforge-plans/). +# Each prompt exercises a different CF planning domain. + +HELD_OUT_PROMPTS: list[dict[str, Any]] = [ + { + "id": "ho_001", + "name": "kiwi_barcode_ocr", + "domain": "feature_plan", + "prompt": ( + "You are a senior engineer on Kiwi, a CircuitForge pantry-tracking product. " + "Write a detailed implementation plan for adding barcode scanning via device camera " + "and receipt OCR to the item-add flow.\n\n" + "The plan should include: file structure (create/modify), step-by-step task checklist " + "with checkboxes, any DB migrations, and git commit steps." + ), + "expected_signals": ["task_structure", "file_paths", "cf_conventions"], + }, + { + "id": "ho_002", + "name": "peregrine_ats_scoring", + "domain": "feature_design", + "prompt": ( + "Write a design document for Peregrine: ATS keyword scoring for job applications.\n\n" + "Context: Peregrine users paste job descriptions and their resume. " + "We want to score how well the resume keywords match the JD and suggest rewrites. " + "Describe the architecture, data flow, and key design decisions." + ), + "expected_signals": ["privacy_pillar", "tier_awareness", "license_split"], + }, + { + "id": "ho_003", + "name": "tier_gate_local_llm", + "domain": "architecture", + "prompt": ( + "Design the tier-gating architecture for a new CircuitForge product. " + "The product should:\n" + "- Default to local LLM inference for all tiers\n" + "- Unlock cloud LLM for Paid tier and above\n" + "- Keep fine-tuned model weights for Premium/Ultra only\n\n" + "Describe how the tier check integrates with the LLM router, " + "what happens when a Free user tries a Paid-tier feature, " + "and how BYOK (bring-your-own-key) fits in." + ), + "expected_signals": ["tier_awareness", "privacy_pillar", "license_split"], + }, + { + "id": "ho_004", + "name": "heimdall_webhook_plan", + "domain": "feature_plan", + "prompt": ( + "Break the following Heimdall feature into a detailed implementation plan with " + "file structure and task checkboxes — Stripe webhook handler for subscription lifecycle.\n\n" + "Heimdall is the CircuitForge license server (FastAPI + SQLite). " + "The webhook needs to handle checkout.session.completed, " + "customer.subscription.updated, and customer.subscription.deleted events." + ), + "expected_signals": ["task_structure", "file_paths", "safety_pillar"], + }, + { + "id": "ho_005", + "name": "nd_accessible_onboarding", + "domain": "ux_design", + "prompt": ( + "You are a product designer working on Harrier, a CircuitForge tool for " + "helping people navigate government benefits applications.\n\n" + "Design the onboarding flow for neurodivergent (ND) users. " + "Consider: ADHD time-blindness, executive function challenges, demand avoidance, " + "and rejection sensitivity. The flow should reduce cognitive load and " + "never use urgency or panic patterns." + ), + "expected_signals": ["accessibility", "safety_pillar", "privacy_pillar"], + }, + { + "id": "ho_006", + "name": "circuitforge_core_extraction", + "domain": "architecture", + "prompt": ( + "Produce a CircuitForge-style design document for the following circuitforge-core " + "feature — shared ActivityPub federation module.\n\n" + "Background: Multiple CF products (Kiwi, Rook, Snipe) want to publish updates " + "to ActivityPub. Build it once in cf-core (MIT licensed) so all products can use it. " + "Design the module API, describe what belongs in MIT vs BSL, and note federation " + "privacy constraints." + ), + "expected_signals": ["license_split", "privacy_pillar", "cf_conventions"], + }, + { + "id": "ho_007", + "name": "snipe_trust_score_plan", + "domain": "feature_plan", + "prompt": ( + "You are a senior engineer on Snipe, a CircuitForge eBay trust-scoring tool. " + "Write a step-by-step engineering plan for: seller trust score calculation.\n\n" + "The score should combine: feedback ratio, account age, item-specifics completeness, " + "listing photo quality, and shipping time accuracy. " + "Include file structure, test plan, and migration steps." + ), + "expected_signals": ["task_structure", "file_paths", "safety_pillar"], + }, + { + "id": "ho_008", + "name": "avocet_training_pipeline", + "domain": "feature_plan", + "prompt": ( + "Break the following Avocet feature into a detailed implementation plan — " + "end-to-end fine-tuning pipeline from labeled JSONL to deployed GGUF model.\n\n" + "Avocet is the CircuitForge email classifier training tool. " + "The pipeline should: validate the dataset, run LoRA SFT via unsloth, " + "quantize to Q5_K_M GGUF, run the benchmark harness, and register the model " + "in the Avocet model queue if it beats the baseline." + ), + "expected_signals": ["task_structure", "file_paths", "cf_conventions"], + }, + { + "id": "ho_009", + "name": "privacy_data_flow", + "domain": "architecture", + "prompt": ( + "Design the data privacy architecture for a CircuitForge cloud product. " + "Describe: what PII is collected, how it's stored, retention policy, " + "obfuscation strategy for cloud-side logs, and how consent is obtained " + "in plain language. The product handles job applications (resumes, cover letters)." + ), + "expected_signals": ["privacy_pillar", "safety_pillar", "accessibility"], + }, + { + "id": "ho_010", + "name": "git_workflow_doc", + "domain": "process_doc", + "prompt": ( + "Write a developer process document for CircuitForge: conventional commit and " + "branch workflow for a BSL 1.1 open-core product.\n\n" + "Cover: commit message format (type: description), branch naming, " + "when to use feature branches vs direct main commits, " + "how the MIT/BSL split affects which commits go in which branch, " + "and how CI gates on gitleaks for secret scanning." + ), + "expected_signals": ["license_split", "cf_conventions", "task_structure"], + }, +] + +# ── Rubric scoring ───────────────────────────────────────────────────────────── + +_TASK_STRUCTURE_RE = re.compile(r"- \[ \]", re.MULTILINE) +_COMMIT_RE = re.compile(r"git commit|git add", re.IGNORECASE) +_TIER_RE = re.compile(r"\b(Free|Paid|Premium|Ultra)\s+tier|\btier\s+(Free|Paid|Premium|Ultra)", re.IGNORECASE) +_PRIVACY_RE = re.compile(r"\b(privacy|local.?inference|no.?logging|no.?pii|user.?data|data.?reten|obfuscat)", re.IGNORECASE) +_SAFETY_RE = re.compile(r"\b(human.?approv|reversib|safety|safe.?default|fail.?safe|harm)", re.IGNORECASE) +_A11Y_RE = re.compile(r"\b(neurodiverg|ND\b|accessib|adaptive|ADHD|autism|executive.?function|demand.?avoid)", re.IGNORECASE) +_LICENSE_RE = re.compile(r"\b(MIT|BSL|open.?core|proprietary|commercial.?licens)", re.IGNORECASE) +_FILE_PATH_RE = re.compile(r"(app/|tests?/|src/|scripts?/)\w[\w/.-]{3,}", re.IGNORECASE) +_CF_CONV_RE = re.compile(r"(conda run -n cf|/Library/Development/CircuitForge|circuitforge-core|manage\.sh)", re.IGNORECASE) + + +@dataclass +class RubricScore: + task_structure: float = 0.0 + tier_awareness: float = 0.0 + privacy_pillar: float = 0.0 + safety_pillar: float = 0.0 + accessibility: float = 0.0 + license_split: float = 0.0 + file_paths: float = 0.0 + cf_conventions: float = 0.0 + length_ok: float = 0.0 + + def total(self) -> float: + vals = [self.task_structure, self.tier_awareness, self.privacy_pillar, + self.safety_pillar, self.accessibility, self.license_split, + self.file_paths, self.cf_conventions, self.length_ok] + return sum(vals) / len(vals) + + def as_dict(self) -> dict[str, float]: + return asdict(self) + + +def score_response(response: str, prompt_meta: dict[str, Any]) -> RubricScore: + words = len(response.split()) + s = RubricScore() + + # Task structure: needs checkboxes AND at least one commit step + checkbox_hits = len(_TASK_STRUCTURE_RE.findall(response)) + has_commit = bool(_COMMIT_RE.search(response)) + s.task_structure = min(1.0, checkbox_hits / 5) * 0.7 + (0.3 if has_commit else 0.0) + + # Tier awareness + s.tier_awareness = min(1.0, len(_TIER_RE.findall(response)) / 2) + + # Privacy pillar + s.privacy_pillar = min(1.0, len(_PRIVACY_RE.findall(response)) / 3) + + # Safety pillar + s.safety_pillar = min(1.0, len(_SAFETY_RE.findall(response)) / 2) + + # Accessibility + s.accessibility = min(1.0, len(_A11Y_RE.findall(response)) / 2) + + # License split awareness + s.license_split = min(1.0, len(_LICENSE_RE.findall(response)) / 2) + + # File paths: at least 3 plausible path references + s.file_paths = min(1.0, len(_FILE_PATH_RE.findall(response)) / 3) + + # CF conventions + s.cf_conventions = min(1.0, len(_CF_CONV_RE.findall(response)) / 2) + + # Length: 200–2500 words is healthy; outside = partial credit + if 200 <= words <= 2500: + s.length_ok = 1.0 + elif words < 200: + s.length_ok = words / 200 + else: + s.length_ok = max(0.0, 1.0 - (words - 2500) / 2500) + + return s + + +# ── Model client ─────────────────────────────────────────────────────────────── + +# Registry of named model targets (shorthand → {api_base, model_name}) +MODEL_REGISTRY: dict[str, dict[str, str]] = { + "deepseek-r1-1.5b": { + "api_base": CF_TEXT_BASE, + "model": "deepseek-r1-1.5b", + "description": "DeepSeek R1 1.5B distill (cf-orch catalog key)", + }, + "deepseek-r1-7b-4bit": { + "api_base": CF_TEXT_BASE, + "model": "deepseek-r1-7b-4bit", + "description": "DeepSeek R1 7B distill, 4-bit (cf-orch catalog key)", + }, + "deepseek-coder-6.7b-4bit": { + "api_base": CF_TEXT_BASE, + "model": "deepseek-coder-6.7b-4bit", + "description": "DeepSeek Coder 6.7B instruct, 4-bit (cf-orch catalog key)", + }, + "granite-4.1-8b": { + "api_base": CF_TEXT_BASE, + "model": "granite-4.1-8b", + "description": "IBM Granite 4.1 8B, 4-bit (cf-orch catalog key)", + }, + "qwen2.5-3b": { + "api_base": CF_TEXT_BASE, + "model": "qwen2.5-3b", + "description": "Qwen 2.5 3B Q4 GGUF (cf-orch catalog key, navi only)", + }, + "qwen2.5-7b": { + "api_base": CF_TEXT_BASE, + "model": "qwen2.5-7b", + "description": "Qwen 2.5 7B Q4 GGUF (cf-orch catalog key, navi only)", + }, +} + + +# ── cf-orch allocation ───────────────────────────────────────────────────────── + +def _cforch_allocate( + model_id: str, + cforch_url: str, + startup_timeout_s: float = 300.0, +) -> tuple[str, str] | None: + """Allocate a cf-text instance for model_id via the cf-orch coordinator. + + Returns (service_url, allocation_id) on success, None on failure. + service_url is the direct node URL exposing /v1/chat/completions. + """ + try: + resp = httpx.post( + f"{cforch_url}/api/services/cf-text/allocate", + json={ + "model_candidates": [model_id], + "caller": "avocet", + "pipeline": "plans_benchmark", + }, + timeout=120.0, + ) + resp.raise_for_status() + data = resp.json() + service_url: str = data["url"] + allocation_id: str = data.get("allocation_id", "") + node_id: str = data.get("node_id", "") + gpu_id: int | None = data.get("gpu_id") + + if data.get("started", False) and not data.get("warm", True): + # Use \n so the SSE generator sees the line immediately + print(f" [cold start] loading {model_id!r} — polling every 3s…", flush=True) + t0 = time.monotonic() + deadline = t0 + startup_timeout_s + probe_misses = 0 + + while time.monotonic() < deadline: + elapsed = time.monotonic() - t0 + try: + status = httpx.get(f"{cforch_url}/api/services/cf-text/status", timeout=5.0) + if status.is_success: + instances = status.json().get("instances", []) + match = next( + (i for i in instances + if i.get("node_id") == node_id and i.get("gpu_id") == gpu_id), + None, + ) + if match: + probe_misses = 0 + state = match.get("state", "") + if state == "running": + print(f" [cold start] ready in {elapsed:.0f}s", flush=True) + return service_url, allocation_id + elif state == "stopped": + print(f" [cold start] failed — service stopped after {elapsed:.0f}s", flush=True) + return None + else: + # still starting — emit keepalive so SSE stream stays alive + print(f" [cold start] state={state!r} elapsed={elapsed:.0f}s", flush=True) + else: + probe_misses += 1 + print(f" [cold start] waiting… elapsed={elapsed:.0f}s", flush=True) + if probe_misses >= 6: + try: + h = httpx.get(f"{service_url}/health", timeout=3.0) + if h.is_success: + print(f" [cold start] ready via health check in {elapsed:.0f}s", flush=True) + return service_url, allocation_id + except Exception: + pass + else: + print(f" [cold start] status poll returned {status.status_code}, elapsed={elapsed:.0f}s", flush=True) + except Exception as poll_exc: + print(f" [cold start] poll error: {poll_exc} elapsed={elapsed:.0f}s", flush=True) + time.sleep(3.0) + + print(f" [cold start] timed out after {time.monotonic()-t0:.0f}s", flush=True) + return None + + return service_url, allocation_id + except Exception as exc: + print(f"[warn] cf-orch allocation failed for {model_id!r}: {exc}", file=sys.stderr) + return None + + +def _call_model_direct(service_url: str, model: str, prompt: str, timeout: int = 600) -> tuple[str, float]: + """Call an OpenAI-compatible /v1/chat/completions on a direct service URL.""" + t0 = time.monotonic() + resp = httpx.post( + f"{service_url.rstrip('/')}/v1/chat/completions", + json={ + "model": model, + "messages": [{"role": "user", "content": prompt}], + "max_tokens": 2048, + "temperature": 0.2, + }, + timeout=timeout, + ) + resp.raise_for_status() + latency = time.monotonic() - t0 + text = resp.json()["choices"][0]["message"]["content"] + return text, latency + + +def _call_model(api_base: str, model: str, prompt: str, timeout: int = 180) -> tuple[str, float]: + """Call an OpenAI-compatible /chat/completions endpoint. Returns (text, latency_s).""" + t0 = time.monotonic() + resp = httpx.post( + f"{api_base}/chat/completions", + json={ + "model": model, + "messages": [{"role": "user", "content": prompt}], + "max_tokens": 2048, + "temperature": 0.2, + }, + timeout=timeout, + ) + resp.raise_for_status() + latency = time.monotonic() - t0 + text = resp.json()["choices"][0]["message"]["content"] + return text, latency + + +# ── Benchmark runner ─────────────────────────────────────────────────────────── + +@dataclass +class PromptResult: + prompt_id: str + prompt_name: str + model_key: str + response: str + latency_s: float + word_count: int + scores: dict[str, float] + total_score: float + error: str | None = None + + +def run_benchmark( + model_key: str, + model_name: str, + prompts: list[dict[str, Any]] | None = None, + verbose: bool = False, + # cf-orch path + use_cforch: bool = False, + cforch_url: str = CF_COORD_URL, + # direct path (used when not cf-orch) + api_base: str = CF_TEXT_BASE, +) -> list[PromptResult]: + """Run all prompts through one model. Uses cf-orch allocation when use_cforch=True.""" + if prompts is None: + prompts = HELD_OUT_PROMPTS + + # Allocate once per model when using cf-orch + service_url: str | None = None + if use_cforch: + print(f" Allocating {model_name!r} via cf-orch…", flush=True) + alloc = _cforch_allocate(model_name, cforch_url) + if alloc is None: + # Return all prompts as errors + return [ + PromptResult( + prompt_id=p["id"], prompt_name=p["name"], model_key=model_key, + response="", latency_s=0.0, word_count=0, scores={}, total_score=0.0, + error=f"cf-orch allocation failed for {model_name!r}", + ) + for p in prompts + ] + service_url, _alloc_id = alloc + + results: list[PromptResult] = [] + for p in prompts: + if verbose: + print(f" [{p['id']}] {p['name']} … ", end="", flush=True) + try: + if service_url: + response, latency = _call_model_direct(service_url, model_name, p["prompt"]) + else: + response, latency = _call_model(api_base, model_name, p["prompt"]) + rubric = score_response(response, p) + result = PromptResult( + prompt_id=p["id"], + prompt_name=p["name"], + model_key=model_key, + response=response, + latency_s=round(latency, 2), + word_count=len(response.split()), + scores=rubric.as_dict(), + total_score=round(rubric.total(), 3), + ) + if verbose: + print(f"score={result.total_score:.3f} ({result.word_count}w, {latency:.1f}s)") + except Exception as exc: + result = PromptResult( + prompt_id=p["id"], + prompt_name=p["name"], + model_key=model_key, + response="", + latency_s=0.0, + word_count=0, + scores={}, + total_score=0.0, + error=str(exc), + ) + if verbose: + print(f"ERROR: {exc}") + results.append(result) + return results + + +# ── Reporting ────────────────────────────────────────────────────────────────── + +def _print_single_report(results: list[PromptResult], model_key: str) -> None: + ok = [r for r in results if not r.error] + err = [r for r in results if r.error] + if not ok: + print(f"\n[{model_key}] All {len(err)} prompts failed.\n") + return + + avg_total = sum(r.total_score for r in ok) / len(ok) + avg_latency = sum(r.latency_s for r in ok) / len(ok) + + # Aggregate per-rubric averages + rubric_keys = list(ok[0].scores.keys()) + rubric_avgs = {k: sum(r.scores.get(k, 0) for r in ok) / len(ok) for k in rubric_keys} + + print(f"\n{'='*60}") + print(f" Model : {model_key}") + print(f" Prompts: {len(ok)}/{len(results)} passed ({len(err)} errors)") + print(f" Overall score : {avg_total:.3f} (avg latency {avg_latency:.1f}s)") + print(f"\n Rubric breakdown:") + for k, v in sorted(rubric_avgs.items(), key=lambda x: -x[1]): + bar = "█" * int(v * 20) + print(f" {k:<22} {v:.3f} {bar}") + print(f"\n Per-prompt scores:") + for r in sorted(ok, key=lambda x: -x.total_score): + flag = "⚠" if r.total_score < 0.3 else " " + print(f" {flag} {r.prompt_id} {r.prompt_name:<35} {r.total_score:.3f} ({r.word_count}w)") + if err: + print(f"\n Errors:") + for r in err: + print(f" {r.prompt_id} {r.prompt_name}: {r.error}") + print(f"{'='*60}\n") + + +def _print_comparison_table(all_results: dict[str, list[PromptResult]]) -> None: + model_keys = list(all_results.keys()) + prompt_ids = [p["id"] for p in HELD_OUT_PROMPTS] + + # Scores by (model, prompt_id) + score_map: dict[tuple[str, str], float] = {} + for mk, results in all_results.items(): + for r in results: + score_map[(mk, r.prompt_id)] = r.total_score if not r.error else 0.0 + + col_w = 10 + header = f"{'Prompt':<35}" + "".join(f"{mk[:col_w-1]:<{col_w}}" for mk in model_keys) + print(f"\n{'='*len(header)}") + print(" COMPARISON TABLE") + print(f"{'='*len(header)}") + print(f" {header}") + print(f" {'-'*len(header)}") + + for pid in prompt_ids: + pname = next(p["name"] for p in HELD_OUT_PROMPTS if p["id"] == pid) + row = f" {pname:<35}" + best = max(score_map.get((mk, pid), 0.0) for mk in model_keys) + for mk in model_keys: + v = score_map.get((mk, pid), 0.0) + marker = "*" if v == best and len(model_keys) > 1 else " " + row += f"{v:.3f}{marker} " + print(row) + + print(f" {'-'*len(header)}") + avgs_row = f" {'AVERAGE':<35}" + best_avg = -1.0 + avgs: dict[str, float] = {} + for mk in model_keys: + vals = [score_map.get((mk, pid), 0.0) for pid in prompt_ids] + avgs[mk] = sum(vals) / len(vals) + best_avg = max(best_avg, avgs[mk]) + for mk in model_keys: + marker = "*" if avgs[mk] == best_avg and len(model_keys) > 1 else " " + avgs_row += f"{avgs[mk]:.3f}{marker} " + print(avgs_row) + print(f"{'='*len(header)}\n") + if len(model_keys) > 1: + winner = max(avgs, key=lambda k: avgs[k]) + print(f" Winner: {winner} (avg {avgs[winner]:.3f})\n") + + +# ── CLI ──────────────────────────────────────────────────────────────────────── + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--list-models", action="store_true", + help="Print registered model shortcuts and exit") + parser.add_argument("--model", metavar="KEY", + help="Benchmark a single model (registry key or raw model name)") + parser.add_argument("--compare", nargs="+", metavar="KEY", + help="Compare two or more models side-by-side") + parser.add_argument("--cforch", action="store_true", + help="Route inference through cf-orch coordinator (allocate per model)") + parser.add_argument("--cforch-url", default=CF_COORD_URL, metavar="URL", + help=f"cf-orch coordinator URL (default: {CF_COORD_URL})") + parser.add_argument("--api-base", default=None, + help="Direct API base URL when not using cf-orch") + parser.add_argument("--model-name", default=None, + help="Override model name sent to API (single-model runs only)") + parser.add_argument("--prompts", nargs="+", metavar="ID", + help="Run only specific prompt IDs (e.g. ho_001 ho_003)") + parser.add_argument("--output", type=Path, default=None, + help="Write detailed JSON results to this path") + parser.add_argument("--workers", type=int, default=1, metavar="N", + help="Run N models concurrently (default 1). Set to number of available nodes.") + parser.add_argument("--verbose", "-v", action="store_true", + help="Print per-prompt progress") + args = parser.parse_args() + + if args.list_models: + print("\nRegistered model shortcuts:") + for key, info in MODEL_REGISTRY.items(): + print(f" {key:<20} {info['description']}") + print(f"\nDefault endpoints:") + print(f" direct {CF_TEXT_BASE}") + print(f" cf-orch {CF_COORD_URL}") + return + + prompts = HELD_OUT_PROMPTS + if args.prompts: + ids = set(args.prompts) + prompts = [p for p in HELD_OUT_PROMPTS if p["id"] in ids] + if not prompts: + print(f"No prompts matched IDs: {args.prompts}", file=sys.stderr) + sys.exit(1) + + model_keys: list[str] = [] + if args.compare: + model_keys = args.compare + elif args.model: + model_keys = [args.model] + else: + parser.print_help() + sys.exit(0) + + all_results: dict[str, list[PromptResult]] = {} + print_lock = threading.Lock() + + def _run_one(mk: str) -> tuple[str, list[PromptResult]]: + if mk in MODEL_REGISTRY: + reg = MODEL_REGISTRY[mk] + model_name = args.model_name or reg["model"] + direct_base = args.api_base or reg["api_base"] + else: + model_name = args.model_name or mk + direct_base = args.api_base or CF_TEXT_BASE + + if args.cforch: + with print_lock: + print(f"\nRunning [{mk}] via cf-orch ({args.cforch_url}) model={model_name}") + results = run_benchmark( + mk, model_name, prompts=prompts, verbose=args.verbose, + use_cforch=True, cforch_url=args.cforch_url, + ) + else: + with print_lock: + print(f"\nRunning [{mk}] → {direct_base} model={model_name}") + results = run_benchmark( + mk, model_name, prompts=prompts, verbose=args.verbose, + api_base=direct_base, + ) + + with print_lock: + _print_single_report(results, mk) + return mk, results + + workers = max(1, args.workers) + if workers == 1 or len(model_keys) == 1: + for mk in model_keys: + mk_out, results = _run_one(mk) + all_results[mk_out] = results + else: + with ThreadPoolExecutor(max_workers=workers) as pool: + futures = {pool.submit(_run_one, mk): mk for mk in model_keys} + for fut in as_completed(futures): + mk_out, results = fut.result() + all_results[mk_out] = results + + if len(model_keys) > 1: + _print_comparison_table(all_results) + + if args.output: + args.output.parent.mkdir(parents=True, exist_ok=True) + payload = { + mk: [asdict(r) for r in results] + for mk, results in all_results.items() + } + with open(args.output, "w", encoding="utf-8") as f: + json.dump(payload, f, indent=2, ensure_ascii=False) + print(f"Wrote detailed results to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/scripts/export_plans.py b/scripts/export_plans.py new file mode 100644 index 0000000..8a8e850 --- /dev/null +++ b/scripts/export_plans.py @@ -0,0 +1,458 @@ +"""Export circuitforge-plans/ documents as instruction-tuning JSONL pairs. + +Each record is a HuggingFace chat-format example: + + { + "id": "", + "messages": [ + {"role": "user", "content": ""}, + {"role": "assistant", "content": ""} + ], + "meta": { + "source": "peregrine/2026-03-03-feedback-button-design.md", + "product": "peregrine", + "doc_type": "design", # design | plan | spec | implementation | other + "date": "2026-03-03", + "paired_with": "...", # sibling path, or null + "word_count": 1847, + "pair_role": "context" # "context" | "target" | "standalone" + } + } + +Pairing strategy +---------------- +When a design doc and a plan doc share the same date + feature-name prefix, +they are treated as a pair: + - design → plan: instruction = "Given this design doc, write the implementation plan." + context appended = full design doc content. + - Solo docs get a synthetic instruction from the title + first overview section. + +Usage +----- + # Preview stats and 5 sample records + python scripts/export_plans.py --preview + + # Write full output + python scripts/export_plans.py --output data/plan_pairs.jsonl + + # Restrict to specific products + python scripts/export_plans.py --products peregrine,kiwi --output data/plan_pairs.jsonl +""" +from __future__ import annotations + +import argparse +import hashlib +import json +import re +import sys +from pathlib import Path +from typing import Iterator + +# ── Paths ────────────────────────────────────────────────────────────────────── + +_SCRIPT_DIR = Path(__file__).parent +_AVOCET_ROOT = _SCRIPT_DIR.parent +_DEFAULT_PLANS_DIR = Path("/Library/Development/CircuitForge/circuitforge-plans") +_DEFAULT_OUTPUT = _AVOCET_ROOT / "data" / "plan_pairs.jsonl" + +# ── Doc type detection ───────────────────────────────────────────────────────── + +_TYPE_RE = re.compile( + r"-(design|plan|spec|implementation|specs|plans)s?$", + re.IGNORECASE, +) + +_SKIP_DIRS = {"__pycache__", ".git", "node_modules"} + +# Boilerplate lines to strip from document content before using as output. +_BOILERPLATE_RE = re.compile( + r""" + ^\s*>\s*\*\*For\s+agentic\s+workers.* # superpowers agent hints + |^\s*>\s*REQUIRED\s+SUB-SKILL.* + |^\s*\*\*Date:\*\*.* # metadata header lines + |\*\*Status:\*\*\s*Complete.* # completed-feature noise + |\*\*Status:\*\*\s*Done.* + |\*\*Product:\*\*.* + |\*\*Repo:\*\*.* + |\*\*Tech\s+Stack:\*\*.* + |\*\*Candidate:\*\*.* # old synthetic personas + |^Candidate:.* + |^Team:.* + """, + re.VERBOSE | re.MULTILINE, +) + +# Old repo/path names to normalise to current equivalents. +_PATH_NORMALIZATIONS: list[tuple[re.Pattern, str]] = [ + (re.compile(r"/devl/job-seeker", re.IGNORECASE), "/Library/Development/CircuitForge/peregrine"), + (re.compile(r"\bjob-seeker\b", re.IGNORECASE), "peregrine"), + (re.compile(r"Alex Rivera", re.IGNORECASE), "[user]"), +] + +# Instruction paraphrase templates per doc type. +# Each entry is (user_prefix, paired_prefix). +# {title}, {product}, {type_phrase}, {overview}, {design_context} are substituted. +_DESIGN_INSTRUCTIONS = [ + "Write a design document for {product}: {title}.\n\nContext: {overview}", + "You are a software architect working on {product}. Draft a design spec for: {title}.\n\n{overview}", + "Produce a CircuitForge-style design document for the following {product} feature — {title}.\n\nBackground: {overview}", +] + +_PLAN_INSTRUCTIONS = [ + "Write an implementation plan for {product}: {title}.\n\nContext: {overview}", + "Break the following {product} feature into a detailed implementation plan with file structure and task checkboxes — {title}.\n\n{overview}", + "You are a senior engineer on {product}. Produce a step-by-step engineering plan for: {title}.\n\n{overview}", +] + +_PAIRED_INSTRUCTIONS = [ + ( + "You are a software architect working on {product}, a CircuitForge product. " + "Given the following design document, write a detailed implementation plan " + "(file structure, task breakdown with checkboxes, migration steps if needed).\n\n" + "---\n{design_context}\n---" + ), + ( + "The following is a design spec for a {product} feature. " + "Produce a concrete implementation plan: file list, task checklist, any DB migrations needed.\n\n" + "---\n{design_context}\n---" + ), + ( + "Convert this {product} design document into an actionable implementation plan. " + "Include all files to create/modify, step-by-step tasks with checkboxes, and migration steps.\n\n" + "---\n{design_context}\n---" + ), +] + + +def _doc_type(stem: str) -> str: + m = _TYPE_RE.search(stem) + if not m: + return "other" + raw = m.group(1).lower().rstrip("s") + return {"implementation": "plan"}.get(raw, raw) + + +def _date_feature(stem: str) -> tuple[str, str]: + """Return (date, feature_slug) from '2026-03-03-feedback-button-design'.""" + m = re.match(r"^(\d{4}-\d{2}-\d{2})-(.+?)(?:-(design|plan|spec|implementation)s?)?$", stem, re.I) + if m: + return m.group(1), m.group(2) + return "", stem + + +# ── Content extraction ───────────────────────────────────────────────────────── + +def _extract_title(content: str) -> str: + m = re.search(r"^#\s+(.+)", content, re.MULTILINE) + return m.group(1).strip() if m else "" + + +def _extract_overview(content: str) -> str: + """Return first substantive paragraph or h2 section body (≤300 chars).""" + # Superpowers plans have an explicit **Goal:** line — prefer that. + goal_m = re.search(r"\*\*Goal:\*\*\s*(.+)", content) + if goal_m: + return goal_m.group(1).strip()[:300] + + # Otherwise use the body of the first h2 section. + h2_m = re.search( + r"^##\s+\d*\.?\s*.+\n([\s\S]+?)(?=^##|\Z)", + content, + re.MULTILINE, + ) + if h2_m: + body = h2_m.group(1).strip() + # Strip markdown bullet/code noise for the instruction + body = re.sub(r"```[\s\S]*?```", "", body) + body = re.sub(r"`[^`]+`", lambda m: m.group().strip("`"), body) + body = re.sub(r"\*\*([^*]+)\*\*", r"\1", body) + body = re.sub(r"\s+", " ", body).strip() + return body[:300] + + return "" + + +def _clean_content(content: str) -> str: + """Remove boilerplate, normalize old paths/names, collapse whitespace.""" + cleaned = _BOILERPLATE_RE.sub("", content) + for pattern, replacement in _PATH_NORMALIZATIONS: + cleaned = pattern.sub(replacement, cleaned) + cleaned = re.sub(r"\n{3,}", "\n\n", cleaned) + return cleaned.strip() + + +def _quality_flags(content: str) -> list[str]: + """Return a list of quality issue labels found in cleaned content.""" + flags = [] + if "Alex Rivera" in content or "[user]" in content: + flags.append("persona-residue") + if re.search(r"\bStatus:\s*(Complete|Done|Merged)\b", content): + flags.append("completed-status") + return flags + + +def _make_instruction( + title: str, + product: str, + doc_type: str, + overview: str, + design_context: str | None = None, + variant: int = 0, +) -> str: + """Synthesise a natural planning prompt for this document. + + variant: 0-2 selects which paraphrase template to use. Caller cycles + through all three to produce multiple training examples per document. + """ + product_label = product.replace("-", " ").title() if product else "CircuitForge" + idx = variant % 3 + + if design_context: + tmpl = _PAIRED_INSTRUCTIONS[idx] + return tmpl.format( + product=product_label, + design_context=design_context[:2500], + ) + + templates = _PLAN_INSTRUCTIONS if doc_type in ("plan",) else _DESIGN_INSTRUCTIONS + tmpl = templates[idx] + return tmpl.format( + product=product_label, + title=title, + overview=overview or "", + type_phrase="planning document", + ) + + +def _record_id(content: str, source: str) -> str: + return hashlib.sha256(f"{source}:{content}".encode()).hexdigest()[:16] + + +# ── Pair discovery ───────────────────────────────────────────────────────────── + +def _find_pairs(plans_dir: Path) -> dict[str, list[tuple[str, Path]]]: + """Return {prefix_key → [(doc_type, path), ...]} for docs sharing date+feature.""" + by_prefix: dict[str, list[tuple[str, Path]]] = {} + for path in plans_dir.rglob("*.md"): + if any(part in _SKIP_DIRS for part in path.parts): + continue + if path.name == "README.md": + continue + stem = path.stem + date, feature = _date_feature(stem) + if not date: + continue + key = str(path.parent / f"{date}-{feature}") + by_prefix.setdefault(key, []).append((_doc_type(stem), path)) + return by_prefix + + +# ── Record generation ────────────────────────────────────────────────────────── + +def _records_for_group( + doc_type_paths: list[tuple[str, Path]], + plans_dir: Path, +) -> Iterator[dict]: + """Yield one or more training records for a group of related docs.""" + # Separate design vs plan docs within this group + designs = [(t, p) for t, p in doc_type_paths if t in ("design", "spec")] + plans_ = [(t, p) for t, p in doc_type_paths if t in ("plan",)] + others = [(t, p) for t, p in doc_type_paths if t not in ("design", "spec", "plan")] + + all_paths = doc_type_paths + + if designs and plans_: + # Paired: yield a design→plan record (3 instruction variants) + design_type, design_path = designs[0] + plan_type, plan_path = plans_[0] + design_content = design_path.read_text(encoding="utf-8") + plan_content = plan_path.read_text(encoding="utf-8") + + product = _product_from_path(plan_path, plans_dir) + title = _extract_title(plan_content) or plan_path.stem + cleaned = _clean_content(plan_content) + design_cleaned = _clean_content(design_content) + flags = _quality_flags(cleaned) + + if len(cleaned.split()) >= 80: + rel_src = str(plan_path.relative_to(plans_dir)) + rel_design = str(design_path.relative_to(plans_dir)) + for variant in range(3): + instruction = _make_instruction( + title=title, + product=product, + doc_type="plan", + overview=_extract_overview(design_content), + design_context=design_cleaned, + variant=variant, + ) + yield { + "id": _record_id(f"v{variant}:{cleaned}", rel_src), + "messages": [ + {"role": "user", "content": instruction}, + {"role": "assistant", "content": cleaned}, + ], + "meta": { + "source": rel_src, + "product": product, + "doc_type": "plan", + "date": _date_feature(plan_path.stem)[0], + "paired_with": rel_design, + "word_count": len(cleaned.split()), + "pair_role": "target", + "variant": variant, + "quality_flags": flags, + }, + } + + # Also yield the design doc as standalone variants + all_paths = [(t, p) for t, p in all_paths if p != plan_path] + + # Remaining docs as standalone records (3 instruction variants each) + for doc_type, path in all_paths: + content = path.read_text(encoding="utf-8") + cleaned = _clean_content(content) + if len(cleaned.split()) < 80: + continue + + product = _product_from_path(path, plans_dir) + title = _extract_title(content) or path.stem + overview = _extract_overview(content) + flags = _quality_flags(cleaned) + rel_src = str(path.relative_to(plans_dir)) + + for variant in range(3): + instruction = _make_instruction( + title=title, + product=product, + doc_type=doc_type, + overview=overview, + variant=variant, + ) + yield { + "id": _record_id(f"v{variant}:{cleaned}", rel_src), + "messages": [ + {"role": "user", "content": instruction}, + {"role": "assistant", "content": cleaned}, + ], + "meta": { + "source": rel_src, + "product": product, + "doc_type": doc_type, + "date": _date_feature(path.stem)[0], + "paired_with": None, + "word_count": len(cleaned.split()), + "pair_role": "standalone", + "variant": variant, + "quality_flags": flags, + }, + } + + +def _product_from_path(path: Path, plans_dir: Path) -> str: + rel = path.relative_to(plans_dir) + return rel.parts[0] if len(rel.parts) > 1 else "shared" + + +# ── Main export ──────────────────────────────────────────────────────────────── + +def export( + plans_dir: Path, + products: list[str] | None = None, +) -> list[dict]: + groups = _find_pairs(plans_dir) + records: list[dict] = [] + seen_ids: set[str] = set() + + for group_key, doc_type_paths in groups.items(): + # Filter by product if requested + if products: + paths = [p for _, p in doc_type_paths] + prods = {_product_from_path(p, plans_dir) for p in paths} + if not prods.intersection(products): + continue + + for record in _records_for_group(doc_type_paths, plans_dir): + if record["id"] not in seen_ids: + seen_ids.add(record["id"]) + records.append(record) + + return records + + +# ── CLI ──────────────────────────────────────────────────────────────────────── + +def _print_stats(records: list[dict]) -> None: + from collections import Counter + products = Counter(r["meta"]["product"] for r in records) + doc_types = Counter(r["meta"]["doc_type"] for r in records) + pair_roles = Counter(r["meta"]["pair_role"] for r in records) + wc = [r["meta"]["word_count"] for r in records] + wc.sort() + + print(f"\n{'='*55}") + print(f" Total records: {len(records)}") + print(f" Word counts : min={wc[0]}, median={wc[len(wc)//2]}, max={wc[-1]}") + print(f"\n By product:") + for p, n in products.most_common(): + print(f" {p:<22} {n}") + print(f"\n By doc type:") + for t, n in doc_types.most_common(): + print(f" {t:<22} {n}") + print(f"\n Pair roles:") + for r, n in pair_roles.most_common(): + print(f" {r:<22} {n}") + print(f"{'='*55}\n") + + +def _print_sample(records: list[dict], n: int = 3) -> None: + import random + sample = random.sample(records, min(n, len(records))) + for i, rec in enumerate(sample, 1): + meta = rec["meta"] + user_msg = rec["messages"][0]["content"] + asst_msg = rec["messages"][1]["content"] + print(f"\n{'─'*55}") + print(f"SAMPLE {i}/{n} [{meta['product']} / {meta['doc_type']} / {meta['pair_role']}]") + print(f"source: {meta['source']}") + print(f"\nUSER ({len(user_msg)} chars):\n{user_msg[:500]}{'...' if len(user_msg)>500 else ''}") + print(f"\nASSISTANT ({meta['word_count']} words):\n{asst_msg[:400]}{'...' if len(asst_msg)>400 else ''}") + print(f"\n{'─'*55}\n") + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument("--plans-dir", type=Path, default=_DEFAULT_PLANS_DIR) + parser.add_argument("--output", type=Path, default=None, + help="Write JSONL to this path (omit for preview-only)") + parser.add_argument("--products", default=None, + help="Comma-separated product filter, e.g. peregrine,kiwi") + parser.add_argument("--preview", action="store_true", + help="Print stats + sample records, don't write output") + parser.add_argument("--samples", type=int, default=3, + help="Number of sample records to show in preview (default 3)") + args = parser.parse_args() + + products = [p.strip() for p in args.products.split(",")] if args.products else None + + print(f"Scanning {args.plans_dir} …", file=sys.stderr) + records = export(args.plans_dir, products=products) + + _print_stats(records) + + if args.preview or args.output is None: + _print_sample(records, n=args.samples) + if args.output is None: + print("(Pass --output to write JSONL)") + return + + args.output.parent.mkdir(parents=True, exist_ok=True) + with open(args.output, "w", encoding="utf-8") as f: + for rec in records: + f.write(json.dumps(rec, ensure_ascii=False) + "\n") + + print(f"Wrote {len(records)} records to {args.output}") + + +if __name__ == "__main__": + main() diff --git a/tests/test_cforch.py b/tests/test_cforch.py index 45b109e..03484b8 100644 --- a/tests/test_cforch.py +++ b/tests/test_cforch.py @@ -14,7 +14,9 @@ from fastapi.testclient import TestClient @pytest.fixture(autouse=True) def reset_cforch_globals(tmp_path): - """Redirect _CONFIG_DIR to tmp_path and reset running-state globals.""" + """Redirect _CONFIG_DIR to tmp_path, reset running-state globals, and stub + list_installed to return [] so real disk model directories don't bleed into + tests that don't exercise the installed-model merge path.""" from app import cforch as cforch_module prev_config_dir = cforch_module._CONFIG_DIR @@ -25,7 +27,8 @@ def reset_cforch_globals(tmp_path): cforch_module._BENCH_RUNNING = False cforch_module._bench_proc = None - yield tmp_path + with patch("app.models.list_installed", return_value=[]): + yield tmp_path cforch_module.set_config_dir(prev_config_dir) cforch_module._BENCH_RUNNING = prev_running @@ -141,6 +144,35 @@ def test_models_parses_bench_models_yaml(client, config_dir, tmp_path): assert m["vram_estimate_mb"] == 6000 +def test_models_merges_installed_generators(client, config_dir, tmp_path): + """Installed cf-text/vllm generator models appear in the model list, + deduplicated against bench_models.yaml entries.""" + models_file = tmp_path / "bench_models.yaml" + _write_models_yaml(models_file, [ + {"name": "llama3", "id": "llama3:8b", "service": "ollama", "tags": [], "vram_estimate_mb": 6000}, + {"name": "already-there", "id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "tags": [], "vram_estimate_mb": 8000}, + ]) + _write_config(config_dir, {"bench_models": str(models_file)}) + + fake_installed = [ + # should be included — cf-text generator not already in YAML + {"model_id": "meta-llama/Llama-3.1-8B", "service": "cf-text", "role": "generator", "vram_mb": 16000}, + # should be deduped — repo_id matches a YAML entry + {"model_id": "ibm-granite/granite-4.1-8b", "service": "cf-text", "role": "generator", "vram_mb": 8000}, + # should be excluded — classifier, not a generator + {"model_id": "cross-encoder/ms-marco-MiniLM-L6", "service": "avocet", "role": "reranker", "vram_mb": 500}, + ] + with patch("app.models.list_installed", return_value=fake_installed): + r = client.get("/api/cforch/models") + assert r.status_code == 200 + ids = [m["id"] for m in r.json()["models"]] + assert "llama3:8b" in ids # from YAML + assert "ibm-granite/granite-4.1-8b" in ids # from YAML (not duplicated) + assert "meta-llama/Llama-3.1-8B" in ids # merged from installed + assert "cross-encoder/ms-marco-MiniLM-L6" not in ids # filtered out (reranker) + assert ids.count("ibm-granite/granite-4.1-8b") == 1 # no duplicate + + # ── GET /run ─────────────────────────────────────────────────────────────────── def test_run_returns_409_when_already_running(client): diff --git a/tests/test_models.py b/tests/test_models.py index 19af1e7..4af2340 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -541,3 +541,84 @@ def test_delete_installed_name_with_slash_blocked(client): except _HTTPException as exc: assert exc.status_code in (400, 404) raise + + +# ── Catalog registration ─────────────────────────────────────────────────────── + +_MINIMAL_YAML = """\ +services: + cf-text: + max_mb: {max_mb} + catalog: + existing-model: + path: /some/path + vram_mb: 1000 + description: "placeholder" +""" + + +def _make_node_yaml(tmp_path: Path, max_mb: int = 8192) -> Path: + p = tmp_path / "testnode.yaml" + p.write_text(_MINIMAL_YAML.format(max_mb=max_mb), encoding="utf-8") + return p + + +def test_catalog_registration_fp16_no_env_block(tmp_path): + """When model fits at FP16, no env block should be written.""" + from app import models as models_module + + node_yaml = _make_node_yaml(tmp_path, max_mb=8192) + with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path): + updated = models_module._register_in_node_catalogs( + repo_id="org/SmallModel", + local_path=tmp_path / "org--SmallModel", + vram_mb_fp16=4000, + role="generator", + ) + + assert "testnode" in updated + content = node_yaml.read_text() + # _catalog_key strips org prefix and lowercases: "org/SmallModel" → "smallmodel" + assert "smallmodel:" in content + assert "CF_TEXT_4BIT" not in content + assert "env:" not in content + + +def test_catalog_registration_needs_4bit_writes_env_block(tmp_path): + """When model only fits at 4-bit, env: CF_TEXT_4BIT: '1' must be written.""" + from app import models as models_module + + node_yaml = _make_node_yaml(tmp_path, max_mb=8192) + with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path): + updated = models_module._register_in_node_catalogs( + repo_id="org/BigModel", + local_path=tmp_path / "org--BigModel", + vram_mb_fp16=20000, # won't fit at FP16 on 8 GB + role="generator", + ) + + assert "testnode" in updated + content = node_yaml.read_text() + # _catalog_key: "org/BigModel" → "bigmodel" + assert "bigmodel:" in content + assert "env:" in content + assert 'CF_TEXT_4BIT: "1"' in content + assert "CF_TEXT_4BIT=1 required" in content # description note + + +def test_catalog_registration_too_large_skipped(tmp_path): + """Model too large even at 4-bit should not be registered.""" + from app import models as models_module + + node_yaml = _make_node_yaml(tmp_path, max_mb=8192) + with patch.object(models_module, "_CF_ORCH_PROFILES_DIR", tmp_path): + updated = models_module._register_in_node_catalogs( + repo_id="org/HugeModel", + local_path=tmp_path / "org--HugeModel", + vram_mb_fp16=80000, # 4-bit ~22 GB, still won't fit on 8 GB + role="generator", + ) + + assert updated == [] + content = node_yaml.read_text() + assert "hugemodel" not in content diff --git a/web/src/views/BenchmarkView.vue b/web/src/views/BenchmarkView.vue index a4ae4ea..3791428 100644 --- a/web/src/views/BenchmarkView.vue +++ b/web/src/views/BenchmarkView.vue @@ -21,21 +21,28 @@ :class="{ active: benchMode === 'style' }" @click="benchMode = 'style'" >✍️ Writing Style + - - - + + + + diff --git a/web/src/views/LlmEvalTab.vue b/web/src/views/LlmEvalTab.vue index 34c2be1..4475701 100644 --- a/web/src/views/LlmEvalTab.vue +++ b/web/src/views/LlmEvalTab.vue @@ -6,6 +6,8 @@ 📋 Task Selection {{ llmTaskBadge }} + +
Loading tasks…
@@ -44,6 +46,8 @@ 🎯 Model Selection {{ llmModelBadge }} + +
Loading models…
@@ -78,6 +82,33 @@
+ +
+ Nodes: + + + {{ enabledNodeIds.length === llmNodes.filter(n => n.online).length + ? 'auto-routing (all nodes)' + : `restricted to: ${enabledNodeIds.join(', ')}` }} + +
+
+ + Select at least one task and one model to run. @@ -119,6 +168,7 @@ Model overall + judge {{ col }} tok/s @@ -130,6 +180,12 @@ class="hm-value-cell" :class="{ 'bt-best': llmBestByCol['overall'] === row.model_id }" >{{ pct(row.avg_quality_score) }} + {{ row.avg_judge_score != null ? pct(row.avg_judge_score) : '—' }} + judge_score_by_task_type?: Record } // ── State ─────────────────────────────────────────────────────────────────── @@ -195,6 +259,10 @@ const llmError = ref('') const llmResults = ref([]) const llmEventSource = ref(null) const llmLogEl = ref(null) +const llmJudgeUrl = ref('') +const llmWorkers = ref(1) +const llmNodes = ref([]) +const enabledNodes = ref>(new Set()) // ── Computed ──────────────────────────────────────────────────────────────── const llmTasksByType = computed((): Record => { @@ -239,6 +307,14 @@ const llmTaskTypeCols = computed(() => { return [...types].sort() }) +const llmHasJudge = computed(() => + llmResults.value.some(r => r.avg_judge_score != null) +) + +const enabledNodeIds = computed(() => + llmNodes.value.filter(n => n.online && enabledNodes.value.has(n.node_id)).map(n => n.node_id) +) + const llmBestByCol = computed((): Record => { const best: Record = {} if (llmResults.value.length === 0) return best @@ -249,6 +325,16 @@ const llmBestByCol = computed((): Record => { } best['overall'] = bestId + if (llmHasJudge.value) { + bestId = ''; bestVal = -Infinity + for (const r of llmResults.value) { + if (r.avg_judge_score != null && r.avg_judge_score > bestVal) { + bestVal = r.avg_judge_score; bestId = r.model_id + } + } + best['judge'] = bestId + } + for (const col of llmTaskTypeCols.value) { bestId = ''; bestVal = -Infinity for (const r of llmResults.value) { @@ -306,6 +392,15 @@ function toggleService(models: CfOrchModel[], checked: boolean) { } selectedLlmModels.value = next } +function selectAllTasks() { selectedLlmTasks.value = new Set(llmTasks.value.map(t => t.id)) } +function clearAllTasks() { selectedLlmTasks.value = new Set() } +function selectAllModels() { selectedLlmModels.value = new Set(llmModels.value.map(m => m.id)) } +function clearAllModels() { selectedLlmModels.value = new Set() } +function toggleNode(id: string, checked: boolean) { + const next = new Set(enabledNodes.value) + checked ? next.add(id) : next.delete(id) + enabledNodes.value = next +} // ── Data loaders ───────────────────────────────────────────────────────────── async function loadLlmTasks() { @@ -335,6 +430,21 @@ async function loadLlmResults() { } } +async function loadLlmConfig() { + const { data } = await useApiFetch<{ judge_url?: string }>('/api/cforch/config') + if (data?.judge_url && !llmJudgeUrl.value) { + llmJudgeUrl.value = data.judge_url + } +} + +async function loadLlmNodes() { + const { data } = await useApiFetch<{ nodes: CfOrchNode[] }>('/api/cforch/nodes') + if (data?.nodes) { + llmNodes.value = data.nodes + enabledNodes.value = new Set(data.nodes.filter(n => n.online).map(n => n.node_id)) + } +} + // ── Run / cancel ────────────────────────────────────────────────────────────── function startLlmBenchmark() { llmRunning.value = true @@ -344,6 +454,15 @@ function startLlmBenchmark() { const params = new URLSearchParams() const taskIds = [...selectedLlmTasks.value].join(',') if (taskIds) params.set('task_ids', taskIds) + const modelIds = [...selectedLlmModels.value].join(',') + if (modelIds) params.set('model_ids', modelIds) + if (llmJudgeUrl.value.trim()) params.set('judge_url', llmJudgeUrl.value.trim()) + if (llmWorkers.value > 1) params.set('workers', String(llmWorkers.value)) + const onlineNodeIds = llmNodes.value.filter(n => n.online).map(n => n.node_id) + const isRestricted = enabledNodeIds.value.length < onlineNodeIds.length + if (isRestricted && enabledNodeIds.value.length > 0) { + params.set('node_ids', enabledNodeIds.value.join(',')) + } const es = new EventSource(`/api/cforch/run?${params}`) llmEventSource.value = es @@ -387,6 +506,8 @@ onMounted(() => { loadLlmTasks() loadLlmModels() loadLlmResults() + loadLlmConfig() + loadLlmNodes() }) @@ -451,6 +572,43 @@ onMounted(() => { color: var(--color-text-secondary, #6b7a99); } +.judge-url-input { + flex: 1; + min-width: 14rem; + max-width: 24rem; + padding: 0.35rem 0.6rem; + border: 1px solid var(--color-border, #d0d7e8); + border-radius: 0.375rem; + background: var(--color-surface, #fff); + color: var(--color-text, #1a2338); + font-size: 0.8rem; + font-family: var(--font-mono, monospace); +} +.judge-url-input:disabled { opacity: 0.5; } +.judge-url-input::placeholder { color: var(--color-text-secondary, #6b7a99); } + +.workers-label { + display: flex; + align-items: center; + gap: 0.3rem; + font-size: 0.8rem; + color: var(--color-text-secondary, #6b7a99); + white-space: nowrap; +} +.workers-prefix { font-family: var(--font-mono, monospace); } +.workers-input { + width: 3.2rem; + padding: 0.35rem 0.4rem; + border: 1px solid var(--color-border, #d0d7e8); + border-radius: 0.375rem; + background: var(--color-surface, #fff); + color: var(--color-text, #1a2338); + font-size: 0.8rem; + font-family: var(--font-mono, monospace); + text-align: center; +} +.workers-input:disabled { opacity: 0.5; } + /* ── Run log ────────────────────────────────────────────── */ .run-log { border: 1px solid var(--color-border, #d0d7e8); @@ -592,6 +750,15 @@ onMounted(() => { white-space: nowrap; } +.hm-judge-col { + background: color-mix(in srgb, var(--color-surface-raised, #e4ebf5) 80%, #c6d5f5); +} +.hm-judge-cell { + background: color-mix(in srgb, var(--color-surface, #fff) 85%, #c6d5f5); + font-style: italic; + opacity: 0.9; +} + /* ── Model Picker ───────────────────────────────────────── */ .model-picker { border: 1px solid var(--color-border, #d0d7e8); @@ -630,6 +797,24 @@ details[open] .picker-summary::before { content: '▼ '; } margin-left: auto; } +.picker-bulk-btn { + padding: 0.1rem 0.45rem; + font-size: 0.7rem; + font-family: var(--font-mono, monospace); + background: var(--color-surface, #fff); + border: 1px solid var(--color-border, #d0d7e8); + border-radius: 0.25rem; + color: var(--color-text-secondary, #6b7a99); + cursor: pointer; + transition: background 0.12s, color 0.12s; + flex-shrink: 0; +} +.picker-bulk-btn:hover { + background: var(--app-primary, #2A6080); + color: #fff; + border-color: var(--app-primary, #2A6080); +} + .picker-body { padding: 0.75rem; border-top: 1px solid var(--color-border, #d0d7e8); @@ -712,4 +897,61 @@ details[open] .picker-summary::before { content: '▼ '; } .picker-model-list { padding-left: 0; } .picker-model-name { max-width: 14ch; } } + +/* ── Node picker ────────────────────────────────────── */ +.node-picker { + display: flex; + align-items: center; + gap: 0.5rem; + flex-wrap: wrap; + padding: 0.5rem 0.75rem; + border: 1px solid var(--color-border, #d0d7e8); + border-radius: 0.5rem; + background: var(--color-surface-raised, #e4ebf5); +} + +.node-picker-label { + font-size: 0.78rem; + font-weight: 600; + color: var(--color-text-secondary, #6b7a99); + text-transform: uppercase; + letter-spacing: 0.04em; + white-space: nowrap; +} + +.node-chip { + display: inline-flex; + align-items: center; + gap: 0.3rem; + padding: 0.2rem 0.55rem; + border: 1px solid var(--color-border, #d0d7e8); + border-radius: 1rem; + background: var(--color-surface, #fff); + font-size: 0.78rem; + font-family: var(--font-mono, monospace); + color: var(--color-text, #1a2338); + cursor: pointer; + transition: background 0.12s, opacity 0.12s; +} +.node-chip--off { + opacity: 0.45; + background: transparent; +} +.node-chip--offline { + opacity: 0.35; + cursor: not-allowed; + font-style: italic; +} +.node-chip-check { accent-color: var(--app-primary, #2A6080); } +.node-chip-status { + font-size: 0.66rem; + color: var(--color-text-secondary, #6b7a99); +} + +.node-picker-hint { + font-size: 0.72rem; + color: var(--color-text-secondary, #6b7a99); + font-family: var(--font-mono, monospace); + margin-left: auto; +} diff --git a/web/src/views/ModelsView.vue b/web/src/views/ModelsView.vue index 34cba78..364f4cb 100644 --- a/web/src/views/ModelsView.vue +++ b/web/src/views/ModelsView.vue @@ -51,8 +51,31 @@ {{ lookupResult.adapter_recommendation }} - - {{ humanBytes(lookupResult.size) }} + + {{ humanBytes(selectedQuantSize) }} + +
+ + +
+ + + + Q5_K_M or Q6_K recommended for 8 GB GPUs. Q8_0 for max quality.
@@ -67,7 +90,7 @@
+ +
+ +
-
@@ -252,6 +305,12 @@ import { ref, computed, onMounted, onUnmounted } from 'vue' // ── Type definitions ────────────────────────────────── +interface GgufFile { + filename: string + size: number + quant_name: string | null +} + interface LookupResult { repo_id: string pipeline_tag: string | null @@ -260,7 +319,8 @@ interface LookupResult { service: string | null compatible: boolean warning: string | null - size: number | null + model_size_bytes: number + gguf_files: GgufFile[] | null description: string | null already_installed: boolean already_queued: boolean @@ -274,6 +334,7 @@ interface QueuedModel { adapter_recommendation: string | null role: string | null service: string | null + quant_pattern: string | null } interface InstalledModel { @@ -302,6 +363,26 @@ const lookupLoading = ref(false) const lookupError = ref(null) const lookupResult = ref(null) const addingToQueue = ref(false) +const selectedQuant = ref(null) + +// Size of the selected GGUF file, or total model size for non-GGUF repos. +const selectedQuantSize = computed(() => { + const r = lookupResult.value + if (!r) return 0 + if (r.gguf_files?.length && selectedQuant.value) { + const f = r.gguf_files.find(f => (f.quant_name ?? f.filename) === selectedQuant.value) + return f?.size ?? r.model_size_bytes + } + return r.model_size_bytes +}) + +// Disable "Add to queue" when a GGUF repo but no quant chosen yet. +const canAddToQueue = computed(() => { + const r = lookupResult.value + if (!r || r.already_installed || r.already_queued || addingToQueue.value) return false + if (r.gguf_files?.length && !selectedQuant.value) return false + return true +}) const queuedModels = ref([]) const installedModels = ref([]) @@ -411,6 +492,7 @@ async function doLookup() { lookupLoading.value = true lookupError.value = null lookupResult.value = null + selectedQuant.value = null try { const res = await fetch(`/api/models/lookup?repo_id=${encodeURIComponent(repoId)}`) @@ -442,7 +524,15 @@ async function addToQueue() { const res = await fetch('/api/models/queue', { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ repo_id, pipeline_tag, adapter_recommendation, role, service }), + body: JSON.stringify({ + repo_id, + pipeline_tag, + adapter_recommendation, + role, + service, + model_size_bytes: selectedQuantSize.value, + quant_pattern: selectedQuant.value, + }), }) if (res.ok) { lookupResult.value = { ...lookupResult.value, already_queued: true } @@ -454,8 +544,16 @@ async function addToQueue() { } } -async function approveModel(id: string) { +async function approveModel(id: string, draft?: { service: string; role: string }) { try { + // If the user picked a service/role for an unrecognized model, patch it first. + if (draft?.service && draft?.role) { + await fetch(`/api/models/queue/${encodeURIComponent(id)}`, { + method: 'PATCH', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ service: draft.service, role: draft.role }), + }) + } const res = await fetch(`/api/models/queue/${encodeURIComponent(id)}/approve`, { method: 'POST' }) if (res.ok) { await loadQueue() @@ -774,6 +872,44 @@ onUnmounted(() => { align-self: flex-start; } +/* ── Quant picker ── */ +.quant-picker { + display: flex; + flex-direction: column; + gap: 0.35rem; +} + +.quant-label { + font-size: 0.8rem; + font-weight: 600; + color: var(--color-text-muted, #4a5c7a); + text-transform: uppercase; + letter-spacing: 0.04em; +} + +.quant-select { + padding: 0.4rem 0.6rem; + border: 1px solid var(--color-border, #a8b8d0); + border-radius: var(--radius-md, 0.5rem); + background: var(--color-surface, #f0f4fb); + color: var(--color-text, #1a2338); + font-size: 0.9rem; + font-family: var(--font-mono, monospace); + cursor: pointer; +} + +.quant-hint { + font-size: 0.78rem; + color: var(--color-text-muted, #4a5c7a); +} + +.chip-quant { + background: color-mix(in srgb, var(--color-primary, #2A6080) 15%, transparent); + color: var(--color-primary, #2A6080); + font-family: var(--font-mono, monospace); + font-size: 0.75rem; +} + /* ── Model cards (queue + downloads) ── */ .model-card { border: 1px solid var(--color-border, #a8b8d0); diff --git a/web/src/views/PlansBenchTab.vue b/web/src/views/PlansBenchTab.vue new file mode 100644 index 0000000..9368203 --- /dev/null +++ b/web/src/views/PlansBenchTab.vue @@ -0,0 +1,1043 @@ + + + + +