#!/usr/bin/env python
"""CF-specific planning benchmark — compare base models before fine-tuning.

Sends held-out CircuitForge planning prompts to one or more models via the
cf-text (local) or cf-orch API, then scores responses against CF-specific
rubrics. Use this to select the best base model for SFT.

Scoring rubrics (each 0-1, summed to total/N):
  - task_structure   : uses checkbox syntax (- [ ]), git commit steps
  - tier_awareness   : mentions Free/Paid/Premium/Ultra tiers
  - privacy_pillar   : mentions privacy/local-inference/no-logging
  - safety_pillar    : mentions safety, human approval, or reversibility
  - accessibility    : mentions ND/accessibility/adaptive needs
  - license_split    : mentions MIT vs BSL or open-core model
  - file_paths       : uses plausible file path references
  - cf_conventions   : uses conda run -n cf, /Library/Development/, or known CF dirs
  - paired_coherence : (paired only) plan references the design doc's feature name
                       (listed for completeness; no scorer for it exists in this script)
  - length_ok        : 200–2500 words (under-short = hallucination risk; over-long = padding)

Usage
-----
    # List available model targets
    python scripts/benchmark_plans.py --list-models

    # Run all held-out prompts against a single model, print report
    python scripts/benchmark_plans.py --model granite-4.1-8b

    # Compare two models side-by-side
    python scripts/benchmark_plans.py --compare granite-4.1-8b deepseek-r1-7b-4bit

    # Run with a custom API base (cf-text default: http://localhost:8080/v1)
    python scripts/benchmark_plans.py --model granite-4.1-8b --api-base http://localhost:8080/v1

    # Export detailed results JSON
    python scripts/benchmark_plans.py --model granite-4.1-8b --output data/bench_results.json
"""
from __future__ import annotations

import argparse
import json
import re
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Any

import httpx

# ── Paths 
# Repository-relative locations and default service endpoints.
# _ROOT is the parent of the directory that contains this file.
_ROOT = Path(__file__).parent.parent
_DATA_DIR = _ROOT / "data"

# CF_TEXT_BASE is the default --api-base for direct (non cf-orch) runs;
# CF_COORD_URL is the default --cforch-url.
CF_TEXT_BASE = "http://localhost:8080/v1"
CF_ORCH_BASE = "http://localhost:8090/v1"
CF_COORD_URL = "http://10.1.10.71:7700"  # cf-orch coordinator (LAN)

# ── Held-out prompts ───────────────────────────────────────────────────────────
# These are NOT in the training export (no matching docs in circuitforge-plans/).
# Each prompt exercises a different CF planning domain.
#
# Entry schema:
#   id               – stable identifier, selectable on the CLI via --prompts
#   name             – short label used in the printed reports
#   domain           – coarse category of the planning task
#   prompt           – text sent verbatim as the single user message
#   expected_signals – rubric field names associated with the prompt
#                      (NOTE(review): nothing in this file reads this key yet;
#                      score_response ignores its prompt_meta argument)
HELD_OUT_PROMPTS: list[dict[str, Any]] = [
    {
        "id": "ho_001",
        "name": "kiwi_barcode_ocr",
        "domain": "feature_plan",
        "prompt": (
            "You are a senior engineer on Kiwi, a CircuitForge pantry-tracking product. "
            "Write a detailed implementation plan for adding barcode scanning via device camera "
            "and receipt OCR to the item-add flow.\n\n"
            "The plan should include: file structure (create/modify), step-by-step task checklist "
            "with checkboxes, any DB migrations, and git commit steps."
        ),
        "expected_signals": ["task_structure", "file_paths", "cf_conventions"],
    },
    {
        "id": "ho_002",
        "name": "peregrine_ats_scoring",
        "domain": "feature_design",
        "prompt": (
            "Write a design document for Peregrine: ATS keyword scoring for job applications.\n\n"
            "Context: Peregrine users paste job descriptions and their resume. "
            "We want to score how well the resume keywords match the JD and suggest rewrites. "
            "Describe the architecture, data flow, and key design decisions."
        ),
        "expected_signals": ["privacy_pillar", "tier_awareness", "license_split"],
    },
    {
        "id": "ho_003",
        "name": "tier_gate_local_llm",
        "domain": "architecture",
        "prompt": (
            "Design the tier-gating architecture for a new CircuitForge product. "
            "The product should:\n"
            "- Default to local LLM inference for all tiers\n"
            "- Unlock cloud LLM for Paid tier and above\n"
            "- Keep fine-tuned model weights for Premium/Ultra only\n\n"
            "Describe how the tier check integrates with the LLM router, "
            "what happens when a Free user tries a Paid-tier feature, "
            "and how BYOK (bring-your-own-key) fits in."
        ),
        "expected_signals": ["tier_awareness", "privacy_pillar", "license_split"],
    },
    {
        "id": "ho_004",
        "name": "heimdall_webhook_plan",
        "domain": "feature_plan",
        "prompt": (
            "Break the following Heimdall feature into a detailed implementation plan with "
            "file structure and task checkboxes — Stripe webhook handler for subscription lifecycle.\n\n"
            "Heimdall is the CircuitForge license server (FastAPI + SQLite). "
            "The webhook needs to handle checkout.session.completed, "
            "customer.subscription.updated, and customer.subscription.deleted events."
        ),
        "expected_signals": ["task_structure", "file_paths", "safety_pillar"],
    },
    {
        "id": "ho_005",
        "name": "nd_accessible_onboarding",
        "domain": "ux_design",
        "prompt": (
            "You are a product designer working on Harrier, a CircuitForge tool for "
            "helping people navigate government benefits applications.\n\n"
            "Design the onboarding flow for neurodivergent (ND) users. "
            "Consider: ADHD time-blindness, executive function challenges, demand avoidance, "
            "and rejection sensitivity. The flow should reduce cognitive load and "
            "never use urgency or panic patterns."
        ),
        "expected_signals": ["accessibility", "safety_pillar", "privacy_pillar"],
    },
    {
        "id": "ho_006",
        "name": "circuitforge_core_extraction",
        "domain": "architecture",
        "prompt": (
            "Produce a CircuitForge-style design document for the following circuitforge-core "
            "feature — shared ActivityPub federation module.\n\n"
            "Background: Multiple CF products (Kiwi, Rook, Snipe) want to publish updates "
            "to ActivityPub. Build it once in cf-core (MIT licensed) so all products can use it. "
            "Design the module API, describe what belongs in MIT vs BSL, and note federation "
            "privacy constraints."
        ),
        "expected_signals": ["license_split", "privacy_pillar", "cf_conventions"],
    },
    {
        "id": "ho_007",
        "name": "snipe_trust_score_plan",
        "domain": "feature_plan",
        "prompt": (
            "You are a senior engineer on Snipe, a CircuitForge eBay trust-scoring tool. "
            "Write a step-by-step engineering plan for: seller trust score calculation.\n\n"
            "The score should combine: feedback ratio, account age, item-specifics completeness, "
            "listing photo quality, and shipping time accuracy. "
            "Include file structure, test plan, and migration steps."
        ),
        "expected_signals": ["task_structure", "file_paths", "safety_pillar"],
    },
    {
        "id": "ho_008",
        "name": "avocet_training_pipeline",
        "domain": "feature_plan",
        "prompt": (
            "Break the following Avocet feature into a detailed implementation plan — "
            "end-to-end fine-tuning pipeline from labeled JSONL to deployed GGUF model.\n\n"
            "Avocet is the CircuitForge email classifier training tool. "
            "The pipeline should: validate the dataset, run LoRA SFT via unsloth, "
            "quantize to Q5_K_M GGUF, run the benchmark harness, and register the model "
            "in the Avocet model queue if it beats the baseline."
        ),
        "expected_signals": ["task_structure", "file_paths", "cf_conventions"],
    },
    {
        "id": "ho_009",
        "name": "privacy_data_flow",
        "domain": "architecture",
        "prompt": (
            "Design the data privacy architecture for a CircuitForge cloud product. "
            "Describe: what PII is collected, how it's stored, retention policy, "
            "obfuscation strategy for cloud-side logs, and how consent is obtained "
            "in plain language. The product handles job applications (resumes, cover letters)."
        ),
        "expected_signals": ["privacy_pillar", "safety_pillar", "accessibility"],
    },
    {
        "id": "ho_010",
        "name": "git_workflow_doc",
        "domain": "process_doc",
        "prompt": (
            "Write a developer process document for CircuitForge: conventional commit and "
            "branch workflow for a BSL 1.1 open-core product.\n\n"
            "Cover: commit message format (type: description), branch naming, "
            "when to use feature branches vs direct main commits, "
            "how the MIT/BSL split affects which commits go in which branch, "
            "and how CI gates on gitleaks for secret scanning."
        ),
        "expected_signals": ["license_split", "cf_conventions", "task_structure"],
    },
]
# ── Rubric scoring ─────────────────────────────────────────────────────────────
# Every rubric is a keyword/shape heuristic: a compiled pattern whose match
# count is scaled against a small "enough evidence" threshold.

_TASK_STRUCTURE_RE = re.compile(r"- \[ \]", re.MULTILINE)
_COMMIT_RE = re.compile(r"git commit|git add", re.IGNORECASE)
_TIER_RE = re.compile(r"\b(Free|Paid|Premium|Ultra)\s+tier|\btier\s+(Free|Paid|Premium|Ultra)", re.IGNORECASE)
_PRIVACY_RE = re.compile(r"\b(privacy|local.?inference|no.?logging|no.?pii|user.?data|data.?reten|obfuscat)", re.IGNORECASE)
_SAFETY_RE = re.compile(r"\b(human.?approv|reversib|safety|safe.?default|fail.?safe|harm)", re.IGNORECASE)
_A11Y_RE = re.compile(r"\b(neurodiverg|ND\b|accessib|adaptive|ADHD|autism|executive.?function|demand.?avoid)", re.IGNORECASE)
# The acronyms carry a trailing \b (same treatment as ``ND\b`` above): without
# it, the case-insensitive pattern matched the "mit" in "mitigate" /
# "mitigation" and awarded licence credit to unrelated text.
_LICENSE_RE = re.compile(r"\b(MIT\b|BSL\b|open.?core|proprietary|commercial.?licens)", re.IGNORECASE)
_FILE_PATH_RE = re.compile(r"(app/|tests?/|src/|scripts?/)\w[\w/.-]{3,}", re.IGNORECASE)
_CF_CONV_RE = re.compile(r"(conda run -n cf|/Library/Development/CircuitForge|circuitforge-core|manage\.sh)", re.IGNORECASE)

# (rubric field, pattern, number of matches that earns full credit)
_COUNTED_RUBRICS: tuple[tuple[str, "re.Pattern[str]", int], ...] = (
    ("tier_awareness", _TIER_RE, 2),
    ("privacy_pillar", _PRIVACY_RE, 3),
    ("safety_pillar", _SAFETY_RE, 2),
    ("accessibility", _A11Y_RE, 2),
    ("license_split", _LICENSE_RE, 2),
    ("file_paths", _FILE_PATH_RE, 3),
    ("cf_conventions", _CF_CONV_RE, 2),
)

# Word-count window that earns full length credit.
_MIN_WORDS = 200
_MAX_WORDS = 2500


@dataclass
class RubricScore:
    """Per-rubric scores for one response; every field is in the range 0.0–1.0."""

    task_structure: float = 0.0
    tier_awareness: float = 0.0
    privacy_pillar: float = 0.0
    safety_pillar: float = 0.0
    accessibility: float = 0.0
    license_split: float = 0.0
    file_paths: float = 0.0
    cf_conventions: float = 0.0
    length_ok: float = 0.0

    def total(self) -> float:
        """Return the unweighted mean of all rubric fields."""
        vals = [self.task_structure, self.tier_awareness, self.privacy_pillar,
                self.safety_pillar, self.accessibility, self.license_split,
                self.file_paths, self.cf_conventions, self.length_ok]
        return sum(vals) / len(vals)

    def as_dict(self) -> dict[str, float]:
        """Return the scores as a plain ``{field_name: score}`` dict."""
        return asdict(self)


def score_response(response: str, prompt_meta: dict[str, Any]) -> RubricScore:
    """Score a model response against the CF planning rubrics.

    Args:
        response: Raw text returned by the model. ``None`` or an empty string
            is treated as an empty response and scores 0 on every rubric.
        prompt_meta: The prompt entry the response answers. Accepted for
            interface stability; the current heuristics do not read it.

    Returns:
        A ``RubricScore`` whose fields are each clamped to 0.0–1.0.
    """
    text = response or ""
    words = len(text.split())
    s = RubricScore()

    # Task structure: checkboxes earn up to 0.7 (5 or more = full), and at
    # least one git add/commit step earns the remaining 0.3.
    checkbox_hits = len(_TASK_STRUCTURE_RE.findall(text))
    has_commit = _COMMIT_RE.search(text) is not None
    s.task_structure = min(1.0, checkbox_hits / 5) * 0.7 + (0.3 if has_commit else 0.0)

    # Keyword rubrics: linear credit up to the per-rubric match threshold.
    for field_name, pattern, needed in _COUNTED_RUBRICS:
        setattr(s, field_name, min(1.0, len(pattern.findall(text)) / needed))

    # Length: 200–2500 words is healthy; outside the window credit falls off
    # linearly (to 0 at 0 words and at twice the upper bound).
    if _MIN_WORDS <= words <= _MAX_WORDS:
        s.length_ok = 1.0
    elif words < _MIN_WORDS:
        s.length_ok = words / _MIN_WORDS
    else:
        s.length_ok = max(0.0, 1.0 - (words - _MAX_WORDS) / _MAX_WORDS)

    return s
# ── Model client ───────────────────────────────────────────────────────────────
# Registry of named model targets (shorthand → {api_base, model_name})
MODEL_REGISTRY: dict[str, dict[str, str]] = {
    "deepseek-r1-1.5b": {
        "api_base": CF_TEXT_BASE,
        "model": "deepseek-r1-1.5b",
        "description": "DeepSeek R1 1.5B distill (cf-orch catalog key)",
    },
    "deepseek-r1-7b-4bit": {
        "api_base": CF_TEXT_BASE,
        "model": "deepseek-r1-7b-4bit",
        "description": "DeepSeek R1 7B distill, 4-bit (cf-orch catalog key)",
    },
    "deepseek-r1-0528-qwen3-8b-gguf": {
        "api_base": CF_TEXT_BASE,
        "model": "deepseek-r1-0528-qwen3-8b-gguf",
        "description": "DeepSeek R1 0528 Qwen3 8B GGUF -- current reasoning model (4 nodes)",
    },
    "deepseek-coder-6.7b-4bit": {
        "api_base": CF_TEXT_BASE,
        "model": "deepseek-coder-6.7b-4bit",
        "description": "DeepSeek Coder 6.7B instruct, 4-bit (cf-orch catalog key)",
    },
    "granite-4.1-8b": {
        "api_base": CF_TEXT_BASE,
        "model": "granite-4.1-8b",
        "description": "IBM Granite 4.1 8B, 4-bit -- safety-trained (cf-orch catalog key)",
    },
    "capybarahermes-2.5-mistral-7b-gguf": {
        "api_base": CF_TEXT_BASE,
        "model": "capybarahermes-2.5-mistral-7b-gguf",
        "description": "CapybaraHermes 2.5 Mistral 7B GGUF -- conversational/creative (4 nodes)",
    },
    "darwin-9b-opus-gguf": {
        "api_base": CF_TEXT_BASE,
        "model": "darwin-9b-opus-gguf",
        "description": "Darwin 9B Opus GGUF -- high-quality long-form writing (3 nodes)",
    },
    "qwen2.5-3b": {
        "api_base": CF_TEXT_BASE,
        "model": "qwen2.5-3b",
        "description": "Qwen 2.5 3B Q4 GGUF (cf-orch catalog key)",
    },
    "qwen2.5-7b": {
        "api_base": CF_TEXT_BASE,
        "model": "qwen2.5-7b",
        "description": "Qwen 2.5 7B Q4 GGUF (cf-orch catalog key)",
    },
}


# ── cf-orch allocation ─────────────────────────────────────────────────────────

_COLD_START_POLL_S = 3.0       # pause between coordinator status polls
_HEALTH_FALLBACK_MISSES = 6    # consecutive "instance not listed" polls before probing /health


def _wait_for_cold_start(
    cforch_url: str,
    service_url: str,
    node_id: str,
    gpu_id: int | None,
    startup_timeout_s: float,
) -> bool:
    """Poll the coordinator until the allocated instance is usable.

    Returns True once the instance reports state ``running`` (or, after
    repeated polls in which the instance is missing from the status listing,
    once the service's own ``/health`` endpoint answers successfully).
    Returns False if the instance reports ``stopped`` or the timeout elapses.
    Progress lines are printed with ``flush=True`` so they appear promptly
    when stdout is piped.
    """
    t0 = time.monotonic()
    deadline = t0 + startup_timeout_s
    probe_misses = 0

    while time.monotonic() < deadline:
        elapsed = time.monotonic() - t0
        try:
            status = httpx.get(f"{cforch_url}/api/services/cf-text/status", timeout=5.0)
            if status.is_success:
                instances = status.json().get("instances", [])
                match = next(
                    (i for i in instances
                     if i.get("node_id") == node_id and i.get("gpu_id") == gpu_id),
                    None,
                )
                if match:
                    probe_misses = 0
                    state = match.get("state", "")
                    if state == "running":
                        print(f" [cold start] ready in {elapsed:.0f}s", flush=True)
                        return True
                    if state == "stopped":
                        print(f" [cold start] failed — service stopped after {elapsed:.0f}s", flush=True)
                        return False
                    # Still starting: report the state as a keepalive line.
                    print(f" [cold start] state={state!r} elapsed={elapsed:.0f}s", flush=True)
                else:
                    probe_misses += 1
                    print(f" [cold start] waiting… elapsed={elapsed:.0f}s", flush=True)
                    if probe_misses >= _HEALTH_FALLBACK_MISSES:
                        # The coordinator never listed the instance; ask the
                        # service directly. A failed probe is best-effort, but
                        # it is reported instead of being silently dropped.
                        try:
                            h = httpx.get(f"{service_url}/health", timeout=3.0)
                        except httpx.HTTPError as health_exc:
                            print(f" [cold start] health probe failed: {health_exc}", flush=True)
                        else:
                            if h.is_success:
                                print(f" [cold start] ready via health check in {elapsed:.0f}s", flush=True)
                                return True
            else:
                print(f" [cold start] status poll returned {status.status_code}, elapsed={elapsed:.0f}s", flush=True)
        except Exception as poll_exc:
            # Any poll failure is transient from the caller's point of view:
            # report it and try again until the deadline.
            print(f" [cold start] poll error: {poll_exc} elapsed={elapsed:.0f}s", flush=True)

        # Never sleep past the deadline, so the timeout is honoured.
        time.sleep(min(_COLD_START_POLL_S, max(0.0, deadline - time.monotonic())))

    print(f" [cold start] timed out after {time.monotonic()-t0:.0f}s", flush=True)
    return False


def _cforch_allocate(
    model_id: str,
    cforch_url: str,
    startup_timeout_s: float = 300.0,
) -> tuple[str, str] | None:
    """Allocate a cf-text instance for model_id via the cf-orch coordinator.

    Args:
        model_id: Model identifier sent as the only entry of ``model_candidates``.
        cforch_url: Base URL of the cf-orch coordinator.
        startup_timeout_s: Maximum time to wait for a cold-started instance.

    Returns:
        ``(service_url, allocation_id)`` on success, ``None`` on failure.
        service_url is the direct node URL exposing /v1/chat/completions.
        All failures (HTTP errors, malformed replies, startup timeout) are
        reported on stderr/stdout and converted to ``None``; nothing is raised.
    """
    try:
        resp = httpx.post(
            f"{cforch_url}/api/services/cf-text/allocate",
            json={
                "model_candidates": [model_id],
                "caller": "avocet",
                "pipeline": "plans_benchmark",
            },
            timeout=120.0,
        )
        resp.raise_for_status()
        data = resp.json()
        service_url: str = data["url"]
        allocation_id: str = data.get("allocation_id", "")
        node_id: str = data.get("node_id", "")
        gpu_id: int | None = data.get("gpu_id")

        # Only a freshly started, not-yet-warm instance needs the wait loop.
        if data.get("started", False) and not data.get("warm", True):
            print(f" [cold start] loading {model_id!r} — polling every 3s…", flush=True)
            if not _wait_for_cold_start(cforch_url, service_url, node_id, gpu_id, startup_timeout_s):
                return None

        return service_url, allocation_id
    except Exception as exc:
        print(f"[warn] cf-orch allocation failed for {model_id!r}: {exc}", file=sys.stderr)
        return None
def _post_chat_completion(url: str, model: str, prompt: str, timeout: int) -> tuple[str, float]:
    """POST one single-turn chat completion request and return (text, latency_s).

    Raises whatever httpx raises for transport failures or non-2xx replies,
    and KeyError/IndexError if the reply lacks ``choices[0].message.content``.
    """
    t0 = time.monotonic()
    resp = httpx.post(
        url,
        json={
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 2048,
            "temperature": 0.2,
        },
        timeout=timeout,
    )
    resp.raise_for_status()
    latency = time.monotonic() - t0
    text = resp.json()["choices"][0]["message"]["content"]
    return text, latency


def _call_model_direct(service_url: str, model: str, prompt: str, timeout: int = 600) -> tuple[str, float]:
    """Call an OpenAI-compatible /v1/chat/completions on a direct service URL."""
    return _post_chat_completion(
        f"{service_url.rstrip('/')}/v1/chat/completions", model, prompt, timeout
    )


def _call_model(api_base: str, model: str, prompt: str, timeout: int = 180) -> tuple[str, float]:
    """Call an OpenAI-compatible /chat/completions endpoint. Returns (text, latency_s)."""
    # Strip a trailing slash (as the direct variant does) so an --api-base
    # such as ".../v1/" does not produce a "//chat/completions" path.
    return _post_chat_completion(
        f"{api_base.rstrip('/')}/chat/completions", model, prompt, timeout
    )


# ── Benchmark runner ───────────────────────────────────────────────────────────

@dataclass
class PromptResult:
    """Outcome of running one prompt against one model."""

    prompt_id: str
    prompt_name: str
    model_key: str
    response: str
    latency_s: float
    word_count: int
    scores: dict[str, float]
    total_score: float
    error: str | None = None  # set (and all scores zeroed) when the call failed


def _failed_result(prompt: dict[str, Any], model_key: str, error: str) -> PromptResult:
    """Build the zero-score record used for a prompt that could not be run."""
    return PromptResult(
        prompt_id=prompt["id"],
        prompt_name=prompt["name"],
        model_key=model_key,
        response="",
        latency_s=0.0,
        word_count=0,
        scores={},
        total_score=0.0,
        error=error,
    )


def run_benchmark(
    model_key: str,
    model_name: str,
    prompts: list[dict[str, Any]] | None = None,
    verbose: bool = False,
    # cf-orch path
    use_cforch: bool = False,
    cforch_url: str = CF_COORD_URL,
    # direct path (used when not cf-orch)
    api_base: str = CF_TEXT_BASE,
) -> list[PromptResult]:
    """Run all prompts through one model. Uses cf-orch allocation when use_cforch=True.

    Args:
        model_key: Label stored on every result (registry key or raw name).
        model_name: Model identifier sent to the API.
        prompts: Prompt entries to run; defaults to ``HELD_OUT_PROMPTS``.
        verbose: Print one progress line per prompt.
        use_cforch: Allocate an instance through the coordinator first.
        cforch_url: Coordinator base URL (cf-orch path only).
        api_base: OpenAI-compatible base URL (direct path only).

    Returns:
        One ``PromptResult`` per prompt, in prompt order. Failures never
        raise: they are recorded with ``error`` set and a total score of 0.
    """
    if prompts is None:
        prompts = HELD_OUT_PROMPTS

    # Allocate once per model when using cf-orch
    service_url: str | None = None
    if use_cforch:
        print(f" Allocating {model_name!r} via cf-orch…", flush=True)
        alloc = _cforch_allocate(model_name, cforch_url)
        if alloc is None:
            # Without an instance nothing can run: report every prompt as failed.
            message = f"cf-orch allocation failed for {model_name!r}"
            return [_failed_result(p, model_key, message) for p in prompts]
        # NOTE(review): the allocation id is not used and no release call is
        # made in this file — confirm the coordinator expires allocations.
        service_url, _alloc_id = alloc

    results: list[PromptResult] = []
    for p in prompts:
        if verbose:
            print(f" [{p['id']}] {p['name']} … ", end="", flush=True)
        try:
            if service_url:
                response, latency = _call_model_direct(service_url, model_name, p["prompt"])
            else:
                response, latency = _call_model(api_base, model_name, p["prompt"])
            rubric = score_response(response, p)
            result = PromptResult(
                prompt_id=p["id"],
                prompt_name=p["name"],
                model_key=model_key,
                response=response,
                latency_s=round(latency, 2),
                word_count=len(response.split()),
                scores=rubric.as_dict(),
                total_score=round(rubric.total(), 3),
            )
            if verbose:
                print(f"score={result.total_score:.3f} ({result.word_count}w, {latency:.1f}s)")
        except Exception as exc:
            # Per-prompt boundary: one bad call must not abort the whole run.
            result = _failed_result(p, model_key, str(exc))
            if verbose:
                print(f"ERROR: {exc}")
        results.append(result)

    return results
# ── Reporting ──────────────────────────────────────────────────────────────────

def _print_single_report(results: list[PromptResult], model_key: str) -> None:
    """Print a human-readable summary of one model's benchmark run.

    Shows the overall mean score and latency over the prompts that completed,
    the mean of each rubric (highest first), the per-prompt totals (highest
    first, totals below 0.3 flagged) and finally any prompts that errored.
    When every prompt errored only a one-line failure notice is printed.
    """
    succeeded: list[PromptResult] = []
    failed: list[PromptResult] = []
    for item in results:
        (failed if item.error else succeeded).append(item)

    if not succeeded:
        print(f"\n[{model_key}] All {len(failed)} prompts failed.\n")
        return

    n_ok = len(succeeded)
    mean_score = sum(item.total_score for item in succeeded) / n_ok
    mean_latency = sum(item.latency_s for item in succeeded) / n_ok

    # Rubric names are taken from the first successful result.
    per_rubric = {
        name: sum(item.scores.get(name, 0) for item in succeeded) / n_ok
        for name in list(succeeded[0].scores.keys())
    }

    rule = "=" * 60
    print(f"\n{rule}")
    print(f" Model : {model_key}")
    print(f" Prompts: {n_ok}/{len(results)} passed ({len(failed)} errors)")
    print(f" Overall score : {mean_score:.3f} (avg latency {mean_latency:.1f}s)")

    print(f"\n Rubric breakdown:")
    for name, mean in sorted(per_rubric.items(), key=lambda pair: -pair[1]):
        blocks = "█" * int(mean * 20)
        print(f" {name:<22} {mean:.3f} {blocks}")

    print(f"\n Per-prompt scores:")
    for item in sorted(succeeded, key=lambda res: -res.total_score):
        if item.total_score < 0.3:
            flag = "⚠"
        else:
            flag = " "
        print(f" {flag} {item.prompt_id} {item.prompt_name:<35} {item.total_score:.3f} ({item.word_count}w)")

    if failed:
        print(f"\n Errors:")
        for item in failed:
            print(f" {item.prompt_id} {item.prompt_name}: {item.error}")

    print(f"{rule}\n")
def _print_comparison_table(all_results: dict[str, list[PromptResult]]) -> None:
    """Print a prompt-by-model score table, per-model averages and the winner.

    Rows are the prompts that actually appear in ``all_results`` (first-seen
    order), so a run restricted to a subset of prompts is neither padded with
    all-zero rows nor has its averages divided by prompts that never ran.
    A prompt that errored for a model counts as 0.0 for that model. With more
    than one model, the best score in each row and the best average are
    marked with ``*``.

    Args:
        all_results: Mapping of model key to that model's results; column
            order follows the mapping's iteration order.
    """
    model_keys = list(all_results)

    # Collect row labels and scores in one pass over the results.
    prompt_names: dict[str, str] = {}
    score_map: dict[tuple[str, str], float] = {}
    for mk, results in all_results.items():
        for r in results:
            prompt_names.setdefault(r.prompt_id, r.prompt_name)
            score_map[(mk, r.prompt_id)] = r.total_score if not r.error else 0.0

    if not prompt_names:
        # Nothing to tabulate; max()/mean below would fail on empty input.
        print("\n No results to compare.\n")
        return

    mark_best = len(model_keys) > 1
    col_w = 10
    header = f"{'Prompt':<35}" + "".join(f"{mk[:col_w-1]:<{col_w}}" for mk in model_keys)
    rule = "=" * len(header)
    dashes = "-" * len(header)

    def _cell(value: float, best: float) -> str:
        # Pad to the header's column width so values line up under model names.
        marker = "*" if value == best and mark_best else " "
        return f"{value:.3f}{marker}".ljust(col_w)

    print(f"\n{rule}")
    print(" COMPARISON TABLE")
    print(rule)
    print(f" {header}")
    print(f" {dashes}")

    for pid, pname in prompt_names.items():
        row_scores = [score_map.get((mk, pid), 0.0) for mk in model_keys]
        best = max(row_scores)
        print(f" {pname:<35}" + "".join(_cell(v, best) for v in row_scores))

    print(f" {dashes}")

    avgs: dict[str, float] = {
        mk: sum(score_map.get((mk, pid), 0.0) for pid in prompt_names) / len(prompt_names)
        for mk in model_keys
    }
    best_avg = max(avgs.values())
    print(f" {'AVERAGE':<35}" + "".join(_cell(avgs[mk], best_avg) for mk in model_keys))
    print(f"{rule}\n")

    if mark_best:
        winner = max(avgs, key=lambda k: avgs[k])
        print(f" Winner: {winner} (avg {avgs[winner]:.3f})\n")
# ── CLI ────────────────────────────────────────────────────────────────────────

def main() -> None:
    """Command-line entry point: parse arguments, run the benchmark(s), report.

    Exits with status 0 after printing help when neither --model nor --compare
    is given, status 1 when --prompts matches no known prompt ID, and (via
    argparse) status 2 when --model-name is combined with several models.
    """
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--list-models", action="store_true", help="Print registered model shortcuts and exit")
    parser.add_argument("--model", metavar="KEY", help="Benchmark a single model (registry key or raw model name)")
    parser.add_argument("--compare", nargs="+", metavar="KEY", help="Compare two or more models side-by-side")
    parser.add_argument("--cforch", action="store_true", help="Route inference through cf-orch coordinator (allocate per model)")
    parser.add_argument("--cforch-url", default=CF_COORD_URL, metavar="URL", help=f"cf-orch coordinator URL (default: {CF_COORD_URL})")
    parser.add_argument("--api-base", default=None, help="Direct API base URL when not using cf-orch")
    parser.add_argument("--model-name", default=None, help="Override model name sent to API (single-model runs only)")
    parser.add_argument("--prompts", nargs="+", metavar="ID", help="Run only specific prompt IDs (e.g. ho_001 ho_003)")
    parser.add_argument("--output", type=Path, default=None, help="Write detailed JSON results to this path")
    parser.add_argument("--workers", type=int, default=1, metavar="N", help="Run N models concurrently (default 1). Set to number of available nodes.")
    parser.add_argument("--verbose", "-v", action="store_true", help="Print per-prompt progress")
    args = parser.parse_args()

    if args.list_models:
        # Size the key column to the longest key so descriptions stay aligned.
        key_w = max([20, *(len(key) for key in MODEL_REGISTRY)])
        print("\nRegistered model shortcuts:")
        for key, info in MODEL_REGISTRY.items():
            print(f" {key:<{key_w}} {info['description']}")
        print("\nDefault endpoints:")
        print(f" direct {CF_TEXT_BASE}")
        print(f" cf-orch {CF_COORD_URL}")
        return

    prompts = HELD_OUT_PROMPTS
    if args.prompts:
        ids = set(args.prompts)
        prompts = [p for p in HELD_OUT_PROMPTS if p["id"] in ids]
        if not prompts:
            print(f"No prompts matched IDs: {args.prompts}", file=sys.stderr)
            sys.exit(1)
        # A typo in one ID should not silently shrink the run.
        unknown = sorted(ids - {p["id"] for p in prompts})
        if unknown:
            print(f"[warn] ignoring unknown prompt IDs: {unknown}", file=sys.stderr)

    if args.compare:
        # De-duplicate while keeping the order given on the command line.
        model_keys: list[str] = list(dict.fromkeys(args.compare))
    elif args.model:
        model_keys = [args.model]
    else:
        parser.print_help()
        sys.exit(0)

    if args.model_name and len(model_keys) > 1:
        # The override would make every compared "model" hit the same backend
        # model, producing a meaningless comparison.
        parser.error("--model-name can only be used when benchmarking a single model")

    print_lock = threading.Lock()

    def _run_one(mk: str) -> tuple[str, list[PromptResult]]:
        """Resolve one model key, benchmark it and print its report."""
        if mk in MODEL_REGISTRY:
            reg = MODEL_REGISTRY[mk]
            model_name = args.model_name or reg["model"]
            direct_base = args.api_base or reg["api_base"]
        else:
            # Not a registry shortcut: treat the key as the raw model name.
            model_name = args.model_name or mk
            direct_base = args.api_base or CF_TEXT_BASE

        if args.cforch:
            with print_lock:
                print(f"\nRunning [{mk}] via cf-orch ({args.cforch_url}) model={model_name}")
            results = run_benchmark(
                mk, model_name,
                prompts=prompts,
                verbose=args.verbose,
                use_cforch=True,
                cforch_url=args.cforch_url,
            )
        else:
            with print_lock:
                print(f"\nRunning [{mk}] → {direct_base} model={model_name}")
            results = run_benchmark(
                mk, model_name,
                prompts=prompts,
                verbose=args.verbose,
                api_base=direct_base,
            )

        with print_lock:
            _print_single_report(results, mk)
        return mk, results

    collected: dict[str, list[PromptResult]] = {}
    workers = max(1, args.workers)
    if workers == 1 or len(model_keys) == 1:
        for mk in model_keys:
            mk_out, results = _run_one(mk)
            collected[mk_out] = results
    else:
        with ThreadPoolExecutor(max_workers=workers) as pool:
            futures = [pool.submit(_run_one, mk) for mk in model_keys]
            for fut in as_completed(futures):
                mk_out, results = fut.result()
                collected[mk_out] = results

    # Threads finish in arbitrary order; restore the requested order so the
    # comparison table and the JSON export are stable between runs.
    all_results: dict[str, list[PromptResult]] = {mk: collected[mk] for mk in model_keys}

    if len(model_keys) > 1:
        _print_comparison_table(all_results)

    if args.output:
        args.output.parent.mkdir(parents=True, exist_ok=True)
        payload = {
            mk: [asdict(r) for r in results]
            for mk, results in all_results.items()
        }
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)
        print(f"Wrote detailed results to {args.output}")


if __name__ == "__main__":
    main()