Compare commits

...

16 commits

Author SHA1 Message Date
11f6334f28 feat: dual-GPU DUAL_GPU_MODE complete — ollama/vllm/mixed GPU 1 selection 2026-02-27 06:20:57 -08:00
7ef95dd9ba feat: benchmark_classifier — MODEL_REGISTRY, --list-models, --score, --compare modes 2026-02-27 06:19:32 -08:00
e6d5bb2c1a feat: inject DUAL_GPU_MODE sub-profile in Makefile; update manage.sh help 2026-02-27 06:18:34 -08:00
5d35257a23 feat: add ollama_research service and update profiles for dual-gpu sub-profiles 2026-02-27 06:16:17 -08:00
c223653722 feat: assign ollama_research to GPU 1 in Docker and Podman GPU overlays 2026-02-27 06:16:04 -08:00
44c3d9a5d6 feat: add DUAL_GPU_MODE default, VRAM warning, and download size report to preflight
- Add _mixed_mode_vram_warning() to flag low VRAM on GPU 1 in mixed mode
- Wire download size report block into main() before closing border line
- Wire mixed-mode VRAM warning into report if triggered
- Write DUAL_GPU_MODE=ollama default to .env for new 2-GPU setups (no override if already set)
- Promote import os to top-level (was local import inside get_cpu_cores)
2026-02-27 00:17:00 -08:00
b03e5f6c57 feat: add _download_size_mb() pure function for preflight size warning 2026-02-27 00:15:26 -08:00
c9d7b810f6 feat: add ollama_research to preflight service table and LLM backend map 2026-02-27 00:14:04 -08:00
dd40a84174 test: add failing tests for dual-gpu preflight additions 2026-02-27 00:11:39 -08:00
baa862bc14 feat: ZeroShotAdapter, GLiClassAdapter, RerankerAdapter with full mock test coverage 2026-02-27 00:10:43 -08:00
e99b3703f1 feat: ClassifierAdapter ABC + compute_metrics() with full test coverage 2026-02-27 00:09:45 -08:00
a66811dd69 feat: add vllm_research backend and update research_fallback_order 2026-02-27 00:09:00 -08:00
8c7faabc56 feat: add scoring JSONL example and gitignore for benchmark data files 2026-02-26 23:46:29 -08:00
41e0fe7f55 feat: add job-seeker-classifiers conda env for HF classifier benchmark 2026-02-26 23:43:41 -08:00
a9e84521c0 docs: email classifier benchmark implementation plan — 10 tasks, TDD, 9-model registry 2026-02-26 23:20:04 -08:00
a8fd53f28c docs: email classifier benchmark design — adapter pattern, 9-model registry, compare+eval modes 2026-02-26 22:56:11 -08:00
19 changed files with 3796 additions and 19 deletions

3
.gitignore vendored
View file

@ -35,3 +35,6 @@ config/user.yaml.working
# Claude context files — kept out of version control # Claude context files — kept out of version control
CLAUDE.md CLAUDE.md
data/email_score.jsonl
data/email_compare_sample.jsonl

View file

@ -23,6 +23,7 @@ COMPOSE ?= $(shell \
# compose.override.yml. We must include it explicitly when present. # compose.override.yml. We must include it explicitly when present.
OVERRIDE_FILE := $(wildcard compose.override.yml) OVERRIDE_FILE := $(wildcard compose.override.yml)
COMPOSE_OVERRIDE := $(if $(OVERRIDE_FILE),-f compose.override.yml,) COMPOSE_OVERRIDE := $(if $(OVERRIDE_FILE),-f compose.override.yml,)
DUAL_GPU_MODE ?= $(shell grep -m1 '^DUAL_GPU_MODE=' .env 2>/dev/null | cut -d= -f2 || echo ollama)
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE)
ifneq (,$(findstring podman,$(COMPOSE))) ifneq (,$(findstring podman,$(COMPOSE)))
@ -34,6 +35,9 @@ else
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
endif endif
endif endif
ifeq ($(PROFILE),dual-gpu)
COMPOSE_FILES += --profile dual-gpu-$(DUAL_GPU_MODE)
endif
# 'remote' means base services only — no services are tagged 'remote' in compose.yml, # 'remote' means base services only — no services are tagged 'remote' in compose.yml,
# so --profile remote is a no-op with Docker and a fatal error on old podman-compose. # so --profile remote is a no-op with Docker and a fatal error on old podman-compose.

View file

@ -18,6 +18,15 @@ services:
device_ids: ["0"] device_ids: ["0"]
capabilities: [gpu] capabilities: [gpu]
ollama_research:
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
capabilities: [gpu]
vision: vision:
deploy: deploy:
resources: resources:

View file

@ -18,6 +18,14 @@ services:
reservations: reservations:
devices: [] devices: []
ollama_research:
devices:
- nvidia.com/gpu=1
deploy:
resources:
reservations:
devices: []
vision: vision:
devices: devices:
- nvidia.com/gpu=0 - nvidia.com/gpu=0

View file

@ -1,5 +1,5 @@
# compose.yml — Peregrine by Circuit Forge LLC # compose.yml — Peregrine by Circuit Forge LLC
# Profiles: remote | cpu | single-gpu | dual-gpu # Profiles: remote | cpu | single-gpu | dual-gpu-ollama | dual-gpu-vllm | dual-gpu-mixed
services: services:
app: app:
@ -52,7 +52,21 @@ services:
- OLLAMA_MODELS=/root/.ollama - OLLAMA_MODELS=/root/.ollama
- DEFAULT_OLLAMA_MODEL=${OLLAMA_DEFAULT_MODEL:-llama3.2:3b} - DEFAULT_OLLAMA_MODEL=${OLLAMA_DEFAULT_MODEL:-llama3.2:3b}
entrypoint: ["/bin/bash", "/entrypoint.sh"] entrypoint: ["/bin/bash", "/entrypoint.sh"]
profiles: [cpu, single-gpu, dual-gpu] profiles: [cpu, single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
restart: unless-stopped
ollama_research:
image: ollama/ollama:latest
ports:
- "${OLLAMA_RESEARCH_PORT:-11435}:11434"
volumes:
- ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama
- ./docker/ollama/entrypoint.sh:/entrypoint.sh
environment:
- OLLAMA_MODELS=/root/.ollama
- DEFAULT_OLLAMA_MODEL=${OLLAMA_RESEARCH_MODEL:-llama3.2:3b}
entrypoint: ["/bin/bash", "/entrypoint.sh"]
profiles: [dual-gpu-ollama, dual-gpu-mixed]
restart: unless-stopped restart: unless-stopped
vision: vision:
@ -64,7 +78,7 @@ services:
environment: environment:
- VISION_MODEL=${VISION_MODEL:-vikhyatk/moondream2} - VISION_MODEL=${VISION_MODEL:-vikhyatk/moondream2}
- VISION_REVISION=${VISION_REVISION:-2025-01-09} - VISION_REVISION=${VISION_REVISION:-2025-01-09}
profiles: [single-gpu, dual-gpu] profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
restart: unless-stopped restart: unless-stopped
vllm: vllm:
@ -81,7 +95,7 @@ services:
--enforce-eager --enforce-eager
--max-num-seqs 8 --max-num-seqs 8
--cpu-offload-gb ${CPU_OFFLOAD_GB:-0} --cpu-offload-gb ${CPU_OFFLOAD_GB:-0}
profiles: [dual-gpu] profiles: [dual-gpu-vllm, dual-gpu-mixed]
restart: unless-stopped restart: unless-stopped
finetune: finetune:

View file

@ -45,6 +45,13 @@ backends:
model: __auto__ model: __auto__
supports_images: false supports_images: false
type: openai_compat type: openai_compat
vllm_research:
api_key: ''
base_url: http://host.docker.internal:8000/v1
enabled: true
model: __auto__
supports_images: false
type: openai_compat
fallback_order: fallback_order:
- ollama - ollama
- claude_code - claude_code
@ -53,7 +60,7 @@ fallback_order:
- anthropic - anthropic
research_fallback_order: research_fallback_order:
- claude_code - claude_code
- vllm - vllm_research
- ollama_research - ollama_research
- github_copilot - github_copilot
- anthropic - anthropic

View file

@ -0,0 +1,8 @@
{"subject": "Interview Invitation — Senior Engineer", "body": "Hi Meghan, we'd love to schedule a 30-min phone screen. Are you available Thursday at 2pm? Please reply to confirm.", "label": "interview_scheduled"}
{"subject": "Your application to Acme Corp", "body": "Thank you for your interest in the Senior Engineer role. After careful consideration, we have decided to move forward with other candidates whose experience more closely matches our current needs.", "label": "rejected"}
{"subject": "Offer Letter — Product Manager at Initech", "body": "Dear Meghan, we are thrilled to extend an offer of employment for the Product Manager position. Please find the attached offer letter outlining compensation and start date.", "label": "offer_received"}
{"subject": "Quick question about your background", "body": "Hi Meghan, I came across your profile and would love to connect. We have a few roles that seem like a great match. Would you be open to a brief chat this week?", "label": "positive_response"}
{"subject": "Company Culture Survey — Acme Corp", "body": "Meghan, as part of our evaluation process, we invite all candidates to complete our culture fit assessment. The survey takes approximately 15 minutes. Please click the link below.", "label": "survey_received"}
{"subject": "Application Received — DataCo", "body": "Thank you for submitting your application for the Data Engineer role at DataCo. We have received your materials and will be in touch if your qualifications match our needs.", "label": "neutral"}
{"subject": "Following up on your application", "body": "Hi Meghan, I wanted to follow up on your recent application. Your background looks interesting and we'd like to learn more. Can we set up a quick call?", "label": "positive_response"}
{"subject": "We're moving forward with other candidates", "body": "Dear Meghan, thank you for taking the time to interview with us. After thoughtful consideration, we have decided not to move forward with your candidacy at this time.", "label": "rejected"}

View file

@ -0,0 +1,257 @@
# Peregrine — Dual-GPU / Dual-Inference Design
**Date:** 2026-02-26
**Status:** Approved — ready for implementation
**Scope:** Peregrine (reference impl; patterns propagate to future products)
---
## Goal
Replace the fixed `dual-gpu` profile (Ollama + vLLM hardwired to GPU 0 + GPU 1) with a
`DUAL_GPU_MODE` env var that selects which inference stack occupies GPU 1. Simultaneously
add a first-run download size warning to preflight so users know what they're in for before
Docker starts pulling images and models.
---
## Modes
| `DUAL_GPU_MODE` | GPU 0 | GPU 1 | Research backend |
|-----------------|-------|-------|-----------------|
| `ollama` (default) | ollama + vision | ollama_research | `ollama_research` |
| `vllm` | ollama + vision | vllm | `vllm_research` |
| `mixed` | ollama + vision | ollama_research + vllm (VRAM-split) | `vllm_research` → `ollama_research` fallback |
`mixed` requires sufficient VRAM on GPU 1. Preflight warns (not blocks) when GPU 1 has
< 12 GB free before starting in mixed mode.
Cover letters always use `ollama` on GPU 0. Research uses whichever GPU 1 backend is
reachable. The LLM router's `_is_reachable()` check handles this transparently — the
fallback chain simply skips services that aren't running.
---
## Compose Profile Architecture
Docker Compose profiles used to gate which services start per mode.
`DUAL_GPU_MODE` is read by the Makefile and passed as a second `--profile` flag.
### Service → profile mapping
| Service | Profiles |
|---------|---------|
| `ollama` | `cpu`, `single-gpu`, `dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed` |
| `vision` | `single-gpu`, `dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed` |
| `ollama_research` | `dual-gpu-ollama`, `dual-gpu-mixed` |
| `vllm` | `dual-gpu-vllm`, `dual-gpu-mixed` |
| `finetune` | `finetune` |
User-facing profiles remain: `remote`, `cpu`, `single-gpu`, `dual-gpu`.
Sub-profiles (`dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed`) are injected by the
Makefile and never typed by the user.
---
## File Changes
### `compose.yml`
**`ollama`** — add all dual-gpu sub-profiles to `profiles`:
```yaml
profiles: [cpu, single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
```
**`vision`** — same pattern:
```yaml
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
```
**`vllm`** — change from `[dual-gpu]` to:
```yaml
profiles: [dual-gpu-vllm, dual-gpu-mixed]
```
**`ollama_research`** — new service:
```yaml
ollama_research:
image: ollama/ollama:latest
ports:
- "${OLLAMA_RESEARCH_PORT:-11435}:11434"
volumes:
- ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama # shared — no double download
- ./docker/ollama/entrypoint.sh:/entrypoint.sh
environment:
- OLLAMA_MODELS=/root/.ollama
- DEFAULT_OLLAMA_MODEL=${OLLAMA_RESEARCH_MODEL:-llama3.2:3b}
entrypoint: ["/bin/bash", "/entrypoint.sh"]
profiles: [dual-gpu-ollama, dual-gpu-mixed]
restart: unless-stopped
```
### `compose.gpu.yml`
Add `ollama_research` block (GPU 1). `vllm` stays on GPU 1 as-is:
```yaml
ollama_research:
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
capabilities: [gpu]
```
### `compose.podman-gpu.yml`
Same addition for Podman CDI:
```yaml
ollama_research:
devices:
- nvidia.com/gpu=1
deploy:
resources:
reservations:
devices: []
```
### `Makefile`
Two additions after existing `COMPOSE` detection:
```makefile
DUAL_GPU_MODE ?= $(shell grep -m1 '^DUAL_GPU_MODE=' .env 2>/dev/null | cut -d= -f2 || echo ollama)
# GPU overlay: matches single-gpu, dual-gpu (findstring gpu already covers these)
# Sub-profile injection for dual-gpu modes:
ifeq ($(PROFILE),dual-gpu)
COMPOSE_FILES += --profile dual-gpu-$(DUAL_GPU_MODE)
endif
```
Update `manage.sh` usage block to document `dual-gpu` profile with `DUAL_GPU_MODE` note:
```
dual-gpu Ollama + Vision on GPU 0; GPU 1 mode set by DUAL_GPU_MODE
DUAL_GPU_MODE=ollama (default) ollama_research on GPU 1
DUAL_GPU_MODE=vllm vllm on GPU 1
DUAL_GPU_MODE=mixed both on GPU 1 (VRAM-split; see preflight warning)
```
### `scripts/preflight.py`
**1. `_SERVICES` — add `ollama_research`:**
```python
"ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
```
**2. `_LLM_BACKENDS` — add entries for both new backends:**
```python
"ollama_research": [("ollama_research", "/v1")],
# vllm_research is an alias for vllm's port — preflight updates base_url for both:
"vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
```
**3. `_DOCKER_INTERNAL` — add `ollama_research`:**
```python
"ollama_research": ("ollama_research", 11434), # container-internal port is always 11434
```
**4. `recommend_profile()` — unchanged** (still returns `"dual-gpu"` for 2 GPUs).
Write `DUAL_GPU_MODE=ollama` to `.env` when first setting up a 2-GPU system.
**5. Mixed-mode VRAM warning** — after GPU resource section, before closing line:
```python
dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
if dual_gpu_mode == "mixed" and len(gpus) >= 2:
if gpus[1]["vram_free_gb"] < 12:
print(f"║ ⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {gpus[1]['vram_free_gb']:.1f} GB free")
print(f"║ Running ollama_research + vllm together may cause OOM.")
print(f"║ Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm instead.")
```
**6. Download size warning** — profile-aware block added just before the closing `╚` line:
```
║ Download sizes (first-run estimates)
║ Docker images
║ ollama/ollama ~800 MB (shared by ollama + ollama_research)
║ searxng/searxng ~300 MB
║ app (Python build) ~1.5 GB
║ vision service ~3.0 GB [single-gpu and above]
║ vllm/vllm-openai ~10.0 GB [vllm / mixed mode only]
║ Model weights (lazy-loaded on first use)
║ llama3.2:3b ~2.0 GB → OLLAMA_MODELS_DIR
║ moondream2 ~1.8 GB → vision container cache [single-gpu+]
║ Note: ollama + ollama_research share the same model dir — no double download
║ ⚠ Total first-run: ~X GB (models persist between restarts)
```
Total is summed at runtime based on active profile + `DUAL_GPU_MODE`.
Size table (used by the warning calculator):
| Component | Size | Condition |
|-----------|------|-----------|
| `ollama/ollama` image | 800 MB | cpu, single-gpu, dual-gpu |
| `searxng/searxng` image | 300 MB | always |
| app image | 1,500 MB | always |
| vision service image | 3,000 MB | single-gpu, dual-gpu |
| `vllm/vllm-openai` image | 10,000 MB | vllm or mixed mode |
| llama3.2:3b weights | 2,000 MB | cpu, single-gpu, dual-gpu |
| moondream2 weights | 1,800 MB | single-gpu, dual-gpu |
### `config/llm.yaml`
**Add `vllm_research` backend:**
```yaml
vllm_research:
api_key: ''
base_url: http://host.docker.internal:8000/v1 # same port as vllm; preflight keeps in sync
enabled: true
model: __auto__
supports_images: false
type: openai_compat
```
**Update `research_fallback_order`:**
```yaml
research_fallback_order:
- claude_code
- vllm_research
- ollama_research
- github_copilot
- anthropic
```
`vllm` stays in the main `fallback_order` (cover letters). `vllm_research` is the explicit
research alias for the same service — different config key, same port, makes routing intent
readable in the YAML.
---
## Downstream Compatibility
The LLM router requires no changes. `_is_reachable()` already skips backends that aren't
responding. When `DUAL_GPU_MODE=ollama`, `vllm_research` is unreachable and skipped;
`ollama_research` is up and used. When `DUAL_GPU_MODE=vllm`, the reverse. `mixed` mode
makes both reachable; `vllm_research` wins as the higher-priority entry.
Preflight's `update_llm_yaml()` keeps `base_url` values correct for both adopted (external)
and Docker-internal routing automatically, since `vllm_research` is registered under the
`"vllm"` key in `_LLM_BACKENDS`.
---
## Future Considerations
- **Triple-GPU / 3+ service configs:** When a third product is active, extract this pattern
into `circuitforge-core` as a reusable inference topology manager.
- **Dual vLLM:** Two vLLM instances (e.g., different model sizes per task) follows the same
pattern — add `vllm_research` as a separate compose service on its own port.
- **VRAM-aware model selection:** Preflight could suggest smaller models when VRAM is tight
in mixed mode (e.g., swap llama3.2:3b → llama3.2:1b for the research instance).
- **Queue optimizer (1-GPU / CPU):** When only one inference backend is available and a batch
of tasks is queued, group by task type (all cover letters first, then all research briefs)
to avoid repeated model context switches. Tracked separately.

View file

@ -0,0 +1,811 @@
# Dual-GPU / Dual-Inference Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
**Goal:** Add `DUAL_GPU_MODE=ollama|vllm|mixed` env var that gates which inference service occupies GPU 1 on dual-GPU systems, plus a first-run download size warning in preflight.
**Architecture:** Sub-profiles (`dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed`) are injected alongside `--profile dual-gpu` by the Makefile based on `DUAL_GPU_MODE`. The LLM router requires zero changes — `_is_reachable()` naturally skips backends that aren't running. Preflight gains `ollama_research` as a tracked service and emits a size warning block.
**Tech Stack:** Docker Compose profiles, Python (preflight.py), YAML (llm.yaml, compose files), bash (Makefile, manage.sh)
**Design doc:** `docs/plans/2026-02-26-dual-gpu-design.md`
**Test runner:** `conda run -n job-seeker python -m pytest tests/ -v`
---
### Task 1: Update `config/llm.yaml`
**Files:**
- Modify: `config/llm.yaml`
**Step 1: Add `vllm_research` backend and update `research_fallback_order`**
Open `config/llm.yaml`. After the `vllm:` block, add:
```yaml
vllm_research:
api_key: ''
base_url: http://host.docker.internal:8000/v1
enabled: true
model: __auto__
supports_images: false
type: openai_compat
```
Replace `research_fallback_order:` section with:
```yaml
research_fallback_order:
- claude_code
- vllm_research
- ollama_research
- github_copilot
- anthropic
```
**Step 2: Verify YAML parses cleanly**
```bash
conda run -n job-seeker python -c "import yaml; yaml.safe_load(open('config/llm.yaml'))"
```
Expected: no output (no error).
**Step 3: Run existing llm config test**
```bash
conda run -n job-seeker python -m pytest tests/test_llm_router.py::test_config_loads -v
```
Expected: PASS
**Step 4: Commit**
```bash
git add config/llm.yaml
git commit -m "feat: add vllm_research backend and update research_fallback_order"
```
---
### Task 2: Write failing tests for preflight changes
**Files:**
- Create: `tests/test_preflight.py`
No existing test file for preflight. Write all tests upfront — they fail until Tasks 3–5 implement the code.
**Step 1: Create `tests/test_preflight.py`**
```python
"""Tests for scripts/preflight.py additions: dual-GPU service table, size warning, VRAM check."""
import pytest
from pathlib import Path
from unittest.mock import patch
import yaml
import tempfile
import os
# ── Service table ──────────────────────────────────────────────────────────────
def test_ollama_research_in_services():
"""ollama_research must be in _SERVICES at port 11435."""
from scripts.preflight import _SERVICES
assert "ollama_research" in _SERVICES
_, default_port, env_var, docker_owned, adoptable = _SERVICES["ollama_research"]
assert default_port == 11435
assert env_var == "OLLAMA_RESEARCH_PORT"
assert docker_owned is True
assert adoptable is True
def test_ollama_research_in_llm_backends():
"""ollama_research must be a standalone key in _LLM_BACKENDS (not nested under ollama)."""
from scripts.preflight import _LLM_BACKENDS
assert "ollama_research" in _LLM_BACKENDS
# Should map to the ollama_research llm backend
backend_names = [name for name, _ in _LLM_BACKENDS["ollama_research"]]
assert "ollama_research" in backend_names
def test_vllm_research_in_llm_backends():
"""vllm_research must be registered under vllm in _LLM_BACKENDS."""
from scripts.preflight import _LLM_BACKENDS
assert "vllm" in _LLM_BACKENDS
backend_names = [name for name, _ in _LLM_BACKENDS["vllm"]]
assert "vllm_research" in backend_names
def test_ollama_research_in_docker_internal():
"""ollama_research must map to internal port 11434 (Ollama's container port)."""
from scripts.preflight import _DOCKER_INTERNAL
assert "ollama_research" in _DOCKER_INTERNAL
hostname, port = _DOCKER_INTERNAL["ollama_research"]
assert hostname == "ollama_research"
assert port == 11434 # container-internal port is always 11434
def test_ollama_not_mapped_to_ollama_research_backend():
"""ollama service key must only update the ollama llm backend, not ollama_research."""
from scripts.preflight import _LLM_BACKENDS
ollama_backend_names = [name for name, _ in _LLM_BACKENDS.get("ollama", [])]
assert "ollama_research" not in ollama_backend_names
# ── Download size warning ──────────────────────────────────────────────────────
def test_download_size_remote_profile():
"""Remote profile: only searxng + app, no ollama, no vision, no vllm."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("remote", "ollama")
assert "searxng" in sizes
assert "app" in sizes
assert "ollama" not in sizes
assert "vision_image" not in sizes
assert "vllm_image" not in sizes
def test_download_size_cpu_profile():
"""CPU profile: adds ollama image + llama3.2:3b weights."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("cpu", "ollama")
assert "ollama" in sizes
assert "llama3_2_3b" in sizes
assert "vision_image" not in sizes
def test_download_size_single_gpu_profile():
"""Single-GPU: adds vision image + moondream2 weights."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("single-gpu", "ollama")
assert "vision_image" in sizes
assert "moondream2" in sizes
assert "vllm_image" not in sizes
def test_download_size_dual_gpu_ollama_mode():
"""dual-gpu + ollama mode: no vllm image."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("dual-gpu", "ollama")
assert "vllm_image" not in sizes
def test_download_size_dual_gpu_vllm_mode():
"""dual-gpu + vllm mode: adds ~10 GB vllm image."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("dual-gpu", "vllm")
assert "vllm_image" in sizes
assert sizes["vllm_image"] >= 9000 # at least 9 GB
def test_download_size_dual_gpu_mixed_mode():
"""dual-gpu + mixed mode: also includes vllm image."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("dual-gpu", "mixed")
assert "vllm_image" in sizes
# ── Mixed-mode VRAM warning ────────────────────────────────────────────────────
def test_mixed_mode_vram_warning_triggered():
"""Should return a warning string when GPU 1 has < 12 GB free in mixed mode."""
from scripts.preflight import _mixed_mode_vram_warning
gpus = [
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 8.0}, # tight
]
warning = _mixed_mode_vram_warning(gpus, "mixed")
assert warning is not None
assert "8.0" in warning or "GPU 1" in warning
def test_mixed_mode_vram_warning_not_triggered_with_headroom():
"""Should return None when GPU 1 has >= 12 GB free."""
from scripts.preflight import _mixed_mode_vram_warning
gpus = [
{"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
{"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": 18.0}, # plenty
]
warning = _mixed_mode_vram_warning(gpus, "mixed")
assert warning is None
def test_mixed_mode_vram_warning_not_triggered_for_other_modes():
"""Warning only applies in mixed mode."""
from scripts.preflight import _mixed_mode_vram_warning
gpus = [
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 6.0},
]
assert _mixed_mode_vram_warning(gpus, "ollama") is None
assert _mixed_mode_vram_warning(gpus, "vllm") is None
# ── update_llm_yaml with ollama_research ──────────────────────────────────────
def test_update_llm_yaml_sets_ollama_research_url_docker_internal():
"""ollama_research backend URL must be set to ollama_research:11434 when Docker-owned."""
from scripts.preflight import update_llm_yaml
llm_cfg = {
"backends": {
"ollama": {"base_url": "http://old", "type": "openai_compat"},
"ollama_research": {"base_url": "http://old", "type": "openai_compat"},
"vllm": {"base_url": "http://old", "type": "openai_compat"},
"vllm_research": {"base_url": "http://old", "type": "openai_compat"},
"vision_service": {"base_url": "http://old", "type": "vision_service"},
}
}
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
yaml.dump(llm_cfg, f)
tmp_path = Path(f.name)
ports = {
"ollama": {
"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"
},
"ollama_research": {
"resolved": 11435, "external": False, "env_var": "OLLAMA_RESEARCH_PORT"
},
"vllm": {
"resolved": 8000, "external": False, "env_var": "VLLM_PORT"
},
"vision": {
"resolved": 8002, "external": False, "env_var": "VISION_PORT"
},
}
try:
# Patch LLM_YAML to point at our temp file
with patch("scripts.preflight.LLM_YAML", tmp_path):
update_llm_yaml(ports)
result = yaml.safe_load(tmp_path.read_text())
# Docker-internal: use service name + container port
assert result["backends"]["ollama_research"]["base_url"] == "http://ollama_research:11434/v1"
# vllm_research must match vllm's URL
assert result["backends"]["vllm_research"]["base_url"] == result["backends"]["vllm"]["base_url"]
finally:
tmp_path.unlink()
def test_update_llm_yaml_sets_ollama_research_url_external():
"""When ollama_research is external (adopted), URL uses host.docker.internal:11435."""
from scripts.preflight import update_llm_yaml
llm_cfg = {
"backends": {
"ollama": {"base_url": "http://old", "type": "openai_compat"},
"ollama_research": {"base_url": "http://old", "type": "openai_compat"},
}
}
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
yaml.dump(llm_cfg, f)
tmp_path = Path(f.name)
ports = {
"ollama": {"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"},
"ollama_research": {"resolved": 11435, "external": True, "env_var": "OLLAMA_RESEARCH_PORT"},
}
try:
with patch("scripts.preflight.LLM_YAML", tmp_path):
update_llm_yaml(ports)
result = yaml.safe_load(tmp_path.read_text())
assert result["backends"]["ollama_research"]["base_url"] == "http://host.docker.internal:11435/v1"
finally:
tmp_path.unlink()
```
**Step 2: Run tests to confirm they all fail**
```bash
conda run -n job-seeker python -m pytest tests/test_preflight.py -v 2>&1 | head -50
```
Expected: all FAIL with `ImportError` or `AssertionError` — that's correct.
**Step 3: Commit failing tests**
```bash
git add tests/test_preflight.py
git commit -m "test: add failing tests for dual-gpu preflight additions"
```
---
### Task 3: `preflight.py` — service table additions
**Files:**
- Modify: `scripts/preflight.py:46-67` (`_SERVICES`, `_LLM_BACKENDS`, `_DOCKER_INTERNAL`)
**Step 1: Update `_SERVICES`**
Find the `_SERVICES` dict (currently ends at the `"ollama"` entry). Add `ollama_research` as a new entry:
```python
_SERVICES: dict[str, tuple[str, int, str, bool, bool]] = {
"streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False),
"searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True),
"vllm": ("vllm_port", 8000, "VLLM_PORT", True, True),
"vision": ("vision_port", 8002, "VISION_PORT", True, True),
"ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True),
"ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
}
```
**Step 2: Update `_LLM_BACKENDS`**
Replace the existing dict:
```python
_LLM_BACKENDS: dict[str, list[tuple[str, str]]] = {
"ollama": [("ollama", "/v1")],
"ollama_research": [("ollama_research", "/v1")],
"vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
"vision": [("vision_service", "")],
}
```
**Step 3: Update `_DOCKER_INTERNAL`**
Add `ollama_research` entry:
```python
_DOCKER_INTERNAL: dict[str, tuple[str, int]] = {
"ollama": ("ollama", 11434),
"ollama_research": ("ollama_research", 11434), # container-internal port is always 11434
"vllm": ("vllm", 8000),
"vision": ("vision", 8002),
"searxng": ("searxng", 8080),
}
```
**Step 4: Run service table tests**
```bash
conda run -n job-seeker python -m pytest tests/test_preflight.py::test_ollama_research_in_services tests/test_preflight.py::test_ollama_research_in_llm_backends tests/test_preflight.py::test_vllm_research_in_llm_backends tests/test_preflight.py::test_ollama_research_in_docker_internal tests/test_preflight.py::test_ollama_not_mapped_to_ollama_research_backend tests/test_preflight.py::test_update_llm_yaml_sets_ollama_research_url_docker_internal tests/test_preflight.py::test_update_llm_yaml_sets_ollama_research_url_external -v
```
Expected: all PASS
**Step 5: Commit**
```bash
git add scripts/preflight.py
git commit -m "feat: add ollama_research to preflight service table and LLM backend map"
```
---
### Task 4: `preflight.py` — `_download_size_mb()` pure function
**Files:**
- Modify: `scripts/preflight.py` (add new function after `calc_cpu_offload_gb`)
**Step 1: Add the function**
After `calc_cpu_offload_gb()`, add:
```python
def _download_size_mb(profile: str, dual_gpu_mode: str = "ollama") -> dict[str, int]:
"""
Return estimated first-run download sizes in MB, keyed by component name.
Profile-aware: only includes components that will actually be pulled.
"""
sizes: dict[str, int] = {
"searxng": 300,
"app": 1500,
}
if profile in ("cpu", "single-gpu", "dual-gpu"):
sizes["ollama"] = 800
sizes["llama3_2_3b"] = 2000
if profile in ("single-gpu", "dual-gpu"):
sizes["vision_image"] = 3000
sizes["moondream2"] = 1800
if profile == "dual-gpu" and dual_gpu_mode in ("vllm", "mixed"):
sizes["vllm_image"] = 10000
return sizes
```
**Step 2: Run download size tests**
```bash
conda run -n job-seeker python -m pytest tests/test_preflight.py -k "download_size" -v
```
Expected: all PASS
**Step 3: Commit**
```bash
git add scripts/preflight.py
git commit -m "feat: add _download_size_mb() pure function for preflight size warning"
```
---
### Task 5: `preflight.py` — VRAM warning, size report block, DUAL_GPU_MODE default
**Files:**
- Modify: `scripts/preflight.py` (three additions to `main()` and a new helper)
**Step 1: Add `_mixed_mode_vram_warning()` after `_download_size_mb()`**
```python
def _mixed_mode_vram_warning(gpus: list[dict], dual_gpu_mode: str) -> str | None:
"""
Return a warning string if GPU 1 likely lacks VRAM for mixed mode, else None.
Only relevant when dual_gpu_mode == 'mixed' and at least 2 GPUs are present.
"""
if dual_gpu_mode != "mixed" or len(gpus) < 2:
return None
free = gpus[1]["vram_free_gb"]
if free < 12:
return (
f"⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {free:.1f} GB free — "
f"running ollama_research + vllm together may cause OOM. "
f"Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm."
)
return None
```
**Step 2: Run VRAM warning tests**
```bash
conda run -n job-seeker python -m pytest tests/test_preflight.py -k "vram" -v
```
Expected: all PASS
**Step 3: Wire size warning into `main()` report block**
In `main()`, find the closing `print("╚═...═╝")` line. Add the size warning block just before it:
```python
# ── Download size warning ──────────────────────────────────────────────
dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
sizes = _download_size_mb(profile, dual_gpu_mode)
total_mb = sum(sizes.values())
print("║")
print("║ Download sizes (first-run estimates)")
print("║ Docker images")
print(f"║ app (Python build) ~{sizes.get('app', 0):,} MB")
if "searxng" in sizes:
print(f"║ searxng/searxng ~{sizes['searxng']:,} MB")
if "ollama" in sizes:
shared_note = " (shared by ollama + ollama_research)" if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed") else ""
print(f"║ ollama/ollama ~{sizes['ollama']:,} MB{shared_note}")
if "vision_image" in sizes:
print(f"║ vision service ~{sizes['vision_image']:,} MB (torch + moondream)")
if "vllm_image" in sizes:
print(f"║ vllm/vllm-openai ~{sizes['vllm_image']:,} MB")
print("║ Model weights (lazy-loaded on first use)")
if "llama3_2_3b" in sizes:
print(f"║ llama3.2:3b ~{sizes['llama3_2_3b']:,} MB → OLLAMA_MODELS_DIR")
if "moondream2" in sizes:
print(f"║ moondream2 ~{sizes['moondream2']:,} MB → vision container cache")
if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed"):
print("║ Note: ollama + ollama_research share model dir — no double download")
print(f"║ ⚠ Total first-run: ~{total_mb / 1024:.1f} GB (models persist between restarts)")
# ── Mixed-mode VRAM warning ────────────────────────────────────────────
vram_warn = _mixed_mode_vram_warning(gpus, dual_gpu_mode)
if vram_warn:
print("║")
print(f"║ {vram_warn}")
```
**Step 4: Wire `DUAL_GPU_MODE` default into `write_env()` block in `main()`**
In `main()`, find the `if not args.check_only:` block. After `env_updates["PEREGRINE_GPU_NAMES"]`, add:
```python
# Write DUAL_GPU_MODE default for new 2-GPU setups (don't override user's choice)
if len(gpus) >= 2:
existing_env: dict[str, str] = {}
if ENV_FILE.exists():
for line in ENV_FILE.read_text().splitlines():
if "=" in line and not line.startswith("#"):
k, _, v = line.partition("=")
existing_env[k.strip()] = v.strip()
if "DUAL_GPU_MODE" not in existing_env:
env_updates["DUAL_GPU_MODE"] = "ollama"
```
**Step 5: Add `import os` if not already present at top of file**
Check line 130 of `scripts/preflight.py`. `import os` is already present inside `get_cpu_cores()` as a local import — move it to the top-level imports block:
```python
import os # add alongside existing stdlib imports
```
And remove the local `import os` inside `get_cpu_cores()`.
**Step 6: Run all preflight tests**
```bash
conda run -n job-seeker python -m pytest tests/test_preflight.py -v
```
Expected: all PASS
**Step 7: Smoke-check the preflight report output**
```bash
conda run -n job-seeker python scripts/preflight.py --check-only
```
Expected: report includes the `Download sizes` block near the bottom.
**Step 8: Commit**
```bash
git add scripts/preflight.py
git commit -m "feat: add DUAL_GPU_MODE default, VRAM warning, and download size report to preflight"
```
---
### Task 6: `compose.yml` — `ollama_research` service + profile updates
**Files:**
- Modify: `compose.yml`
**Step 1: Update `ollama` profiles line**
Find:
```yaml
profiles: [cpu, single-gpu, dual-gpu]
```
Replace with:
```yaml
profiles: [cpu, single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
```
**Step 2: Update `vision` profiles line**
Find:
```yaml
profiles: [single-gpu, dual-gpu]
```
Replace with:
```yaml
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
```
**Step 3: Update `vllm` profiles line**
Find:
```yaml
profiles: [dual-gpu]
```
Replace with:
```yaml
profiles: [dual-gpu-vllm, dual-gpu-mixed]
```
**Step 4: Add `ollama_research` service**
After the closing lines of the `ollama` service block, add:
```yaml
ollama_research:
image: ollama/ollama:latest
ports:
- "${OLLAMA_RESEARCH_PORT:-11435}:11434"
volumes:
- ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama
- ./docker/ollama/entrypoint.sh:/entrypoint.sh
environment:
- OLLAMA_MODELS=/root/.ollama
- DEFAULT_OLLAMA_MODEL=${OLLAMA_RESEARCH_MODEL:-llama3.2:3b}
entrypoint: ["/bin/bash", "/entrypoint.sh"]
profiles: [dual-gpu-ollama, dual-gpu-mixed]
restart: unless-stopped
```
**Step 5: Validate compose YAML**
```bash
docker compose -f compose.yml config --quiet
```
Expected: no errors.
**Step 6: Commit**
```bash
git add compose.yml
git commit -m "feat: add ollama_research service and update profiles for dual-gpu sub-profiles"
```
---
### Task 7: GPU overlay files — `compose.gpu.yml` and `compose.podman-gpu.yml`
**Files:**
- Modify: `compose.gpu.yml`
- Modify: `compose.podman-gpu.yml`
**Step 1: Add `ollama_research` to `compose.gpu.yml`**
After the `ollama:` block, add:
```yaml
ollama_research:
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
capabilities: [gpu]
```
**Step 2: Add `ollama_research` to `compose.podman-gpu.yml`**
After the `ollama:` block, add:
```yaml
ollama_research:
devices:
- nvidia.com/gpu=1
deploy:
resources:
reservations:
devices: []
```
**Step 3: Validate both files**
```bash
docker compose -f compose.yml -f compose.gpu.yml config --quiet
```
Expected: no errors.
**Step 4: Commit**
```bash
git add compose.gpu.yml compose.podman-gpu.yml
git commit -m "feat: assign ollama_research to GPU 1 in Docker and Podman GPU overlays"
```
---
### Task 8: `Makefile` + `manage.sh` — `DUAL_GPU_MODE` injection and help text
**Files:**
- Modify: `Makefile`
- Modify: `manage.sh`
**Step 1: Update `Makefile`**
After the `COMPOSE_OVERRIDE` variable, add `DUAL_GPU_MODE` reading:
```makefile
DUAL_GPU_MODE ?= $(shell grep -m1 '^DUAL_GPU_MODE=' .env 2>/dev/null | cut -d= -f2 || echo ollama)
```
In the GPU overlay block, find:
```makefile
else
ifneq (,$(findstring gpu,$(PROFILE)))
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
endif
endif
```
Replace the `else` branch with:
```makefile
else
ifneq (,$(findstring gpu,$(PROFILE)))
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
endif
endif
ifeq ($(PROFILE),dual-gpu)
COMPOSE_FILES += --profile dual-gpu-$(DUAL_GPU_MODE)
endif
```
**Step 2: Update `manage.sh` — profiles help block**
Find the profiles section in `usage()`:
```bash
echo " dual-gpu Ollama + Vision + vLLM on GPU 0+1"
```
Replace with:
```bash
echo " dual-gpu Ollama + Vision on GPU 0; GPU 1 set by DUAL_GPU_MODE"
echo " DUAL_GPU_MODE=ollama (default) ollama_research on GPU 1"
echo " DUAL_GPU_MODE=vllm vllm on GPU 1"
echo " DUAL_GPU_MODE=mixed both on GPU 1 (VRAM-split)"
```
**Step 3: Verify Makefile parses**
```bash
make help
```
Expected: help table prints cleanly, no make errors.
**Step 4: Verify manage.sh help**
```bash
./manage.sh help
```
Expected: new dual-gpu description appears in profiles section.
**Step 5: Commit**
```bash
git add Makefile manage.sh
git commit -m "feat: inject DUAL_GPU_MODE sub-profile in Makefile; update manage.sh help"
```
---
### Task 9: Integration smoke test
**Goal:** Verify the full chain works for `DUAL_GPU_MODE=ollama` without actually starting Docker (dry-run compose config check).
**Step 1: Write `DUAL_GPU_MODE=ollama` to `.env` temporarily**
```bash
echo "DUAL_GPU_MODE=ollama" >> .env
```
**Step 2: Dry-run compose config for dual-gpu + dual-gpu-ollama**
```bash
docker compose -f compose.yml -f compose.gpu.yml --profile dual-gpu --profile dual-gpu-ollama config 2>&1 | grep -E "^ [a-z]|image:|ports:"
```
Expected output includes:
- `ollama:` service with port 11434
- `ollama_research:` service with port 11435
- `vision:` service
- `searxng:` service
- **No** `vllm:` service
**Step 3: Dry-run for `DUAL_GPU_MODE=vllm`**
```bash
docker compose -f compose.yml -f compose.gpu.yml --profile dual-gpu --profile dual-gpu-vllm config 2>&1 | grep -E "^ [a-z]|image:|ports:"
```
Expected:
- `ollama:` service (port 11434)
- `vllm:` service (port 8000)
- **No** `ollama_research:` service
**Step 4: Run full test suite**
```bash
conda run -n job-seeker python -m pytest tests/ -v
```
Expected: all existing tests PASS, all new preflight tests PASS.
**Step 5: Clean up `.env` test entry**
```bash
# Remove the test DUAL_GPU_MODE line (preflight will re-write it correctly on next run)
sed -i '/^DUAL_GPU_MODE=/d' .env
```
**Step 6: Final commit**
```bash
git add .env # in case preflight rewrote it during testing
git commit -m "feat: dual-gpu DUAL_GPU_MODE complete — ollama/vllm/mixed GPU 1 selection"
```

View file

@ -0,0 +1,132 @@
# Email Classifier Benchmark — Design
**Date:** 2026-02-26
**Status:** Approved
## Problem
The current `classify_stage_signal()` in `scripts/imap_sync.py` uses `llama3.1:8b` via
Ollama for 6-label email classification. This is slow, requires a running Ollama instance,
and accuracy is unverified against alternatives. This design establishes a benchmark harness
to evaluate HuggingFace-native classifiers as potential replacements.
## Labels
```
interview_scheduled offer_received rejected
positive_response survey_received neutral
```
## Approach: Standalone Benchmark Script (Approach B)
Two new files; nothing in `imap_sync.py` changes until a winner is chosen.
```
scripts/
benchmark_classifier.py — CLI entry point
classifier_adapters.py — adapter classes (reusable by imap_sync later)
data/
email_eval.jsonl — labeled ground truth (gitignored — contains email content)
email_eval.jsonl.example — committed example with fake emails
scripts/classifier_service/
environment.yml — new conda env: job-seeker-classifiers
```
## Adapter Pattern
```
ClassifierAdapter (ABC)
.classify(subject, body) → str # one of the 6 labels
.name → str
.model_id → str
.load() / .unload() # explicit lifecycle
ZeroShotAdapter(ClassifierAdapter)
# uses transformers pipeline("zero-shot-classification")
# candidate_labels = list of 6 labels
# works for: DeBERTa, BART-MNLI, BGE-M3-ZeroShot, XLM-RoBERTa
GLiClassAdapter(ClassifierAdapter)
# uses gliclass library (pip install gliclass)
# GLiClassModel + ZeroShotClassificationPipeline
# works for: gliclass-instruct-large-v1.0
RerankerAdapter(ClassifierAdapter)
# uses FlagEmbedding reranker.compute_score()
# scores (email_text, label_description) pairs; highest = predicted label
# works for: bge-reranker-v2-m3
```
## Model Registry
| Short name | Model | Params | Adapter | Default |
|------------|-------|--------|---------|---------|
| `deberta-zeroshot` | MoritzLaurer/DeBERTa-v3-large-zeroshot-v2.0 | 400M | ZeroShot | ✅ |
| `deberta-small` | cross-encoder/nli-deberta-v3-small | 100M | ZeroShot | ✅ |
| `gliclass-large` | knowledgator/gliclass-instruct-large-v1.0 | 400M | GLiClass | ✅ |
| `bart-mnli` | facebook/bart-large-mnli | 400M | ZeroShot | ✅ |
| `bge-m3-zeroshot` | MoritzLaurer/bge-m3-zeroshot-v2.0 | 600M | ZeroShot | ✅ |
| `bge-reranker` | BAAI/bge-reranker-v2-m3 | 600M | Reranker | ❌ (`--include-slow`) |
| `deberta-xlarge` | microsoft/deberta-xlarge-mnli | 750M | ZeroShot | ❌ (`--include-slow`) |
| `mdeberta-mnli` | MoritzLaurer/mDeBERTa-v3-base-mnli-xnli | 300M | ZeroShot | ❌ (`--include-slow`) |
| `xlm-roberta-anli` | vicgalle/xlm-roberta-large-xnli-anli | 600M | ZeroShot | ❌ (`--include-slow`) |
## CLI Modes
### `--compare` (live IMAP, visual table)
Extends the pattern of `test_email_classify.py`. Pulls emails via IMAP, shows a table:
```
Subject | Phrase | llama3 | deberta-zs | deberta-sm | gliclass | bart | bge-m3
```
- Phrase-filter column shows BLOCK/pass (same gate as production)
- `llama3` column = current production baseline
- HF model columns follow
### `--eval` (ground-truth evaluation; shipped as `--score` in the implementation)
Reads the labeled ground-truth JSONL (`data/email_score.jsonl` in the implementation), runs all models, reports per-label and aggregate metrics:
- Per-label: precision, recall, F1
- Aggregate: macro-F1, accuracy
- Latency: ms/email per model
JSONL format:
```jsonl
{"subject": "Interview invitation", "body": "We'd like to schedule...", "label": "interview_scheduled"}
{"subject": "Your application", "body": "We regret to inform you...", "label": "rejected"}
```
### `--list-models`
Prints the registry with sizes, adapter types, and default/slow flags.
## Conda Environment
New env `job-seeker-classifiers` — isolated from `job-seeker` (no torch there).
Key deps:
- `torch` (CUDA-enabled)
- `transformers`
- `gliclass`
- `FlagEmbedding` (for bge-reranker only)
- `sentence-transformers` (optional, for future embedding-based approaches)
## GPU
Auto-select (`device="cuda"` when available, CPU fallback). No GPU pinning — models
load one at a time so VRAM pressure is sequential, not cumulative.
## Error Handling
- Model load failures: skip that column, print warning, continue
- Classification errors: show `ERR` in cell, continue
- IMAP failures: propagate (same as existing harness)
- Missing eval file: clear error message pointing to `data/email_eval.jsonl.example`
## What Does Not Change (Yet)
- `scripts/imap_sync.py` — production classifier unchanged
- `scripts/llm_router.py` — unchanged
- `staging.db` schema — unchanged
After benchmark results are reviewed, a separate PR will wire the winning model
into `classify_stage_signal()` as an opt-in backend in `llm_router.py`.

File diff suppressed because it is too large Load diff

View file

@ -42,7 +42,10 @@ usage() {
echo " remote API-only, no local inference (default)" echo " remote API-only, no local inference (default)"
echo " cpu Local Ollama inference on CPU" echo " cpu Local Ollama inference on CPU"
echo " single-gpu Ollama + Vision on GPU 0" echo " single-gpu Ollama + Vision on GPU 0"
echo " dual-gpu Ollama + Vision + vLLM on GPU 0+1" echo " dual-gpu Ollama + Vision on GPU 0; GPU 1 set by DUAL_GPU_MODE"
echo " DUAL_GPU_MODE=ollama (default) ollama_research on GPU 1"
echo " DUAL_GPU_MODE=vllm vllm on GPU 1"
echo " DUAL_GPU_MODE=mixed both on GPU 1 (VRAM-split)"
echo "" echo ""
echo " Examples:" echo " Examples:"
echo " ./manage.sh start" echo " ./manage.sh start"

View file

@ -0,0 +1,347 @@
#!/usr/bin/env python
"""
Email classifier benchmark compare HuggingFace models against our 6 labels.
Usage:
# List available models
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --list-models
# Score against labeled JSONL
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --score
# Visual comparison on live IMAP emails
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --compare --limit 20
# Include slow/large models
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --score --include-slow
"""
from __future__ import annotations
import argparse
import email as _email_lib
import imaplib
import json
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.classifier_adapters import (
LABELS,
LABEL_DESCRIPTIONS,
ClassifierAdapter,
GLiClassAdapter,
RerankerAdapter,
ZeroShotAdapter,
compute_metrics,
)
# ---------------------------------------------------------------------------
# Model registry
# ---------------------------------------------------------------------------
# Registry of benchmark candidates: short CLI name → adapter class, HF model id,
# advertised parameter count, and whether the model runs by default.
# Entries with "default": False are only included when --include-slow is passed
# (see _active_models()).
MODEL_REGISTRY: dict[str, dict[str, Any]] = {
    "deberta-zeroshot": {
        "adapter": ZeroShotAdapter,
        "model_id": "MoritzLaurer/DeBERTa-v3-large-zeroshot-v2.0",
        "params": "400M",
        "default": True,
    },
    "deberta-small": {
        "adapter": ZeroShotAdapter,
        "model_id": "cross-encoder/nli-deberta-v3-small",
        "params": "100M",
        "default": True,
    },
    "gliclass-large": {
        "adapter": GLiClassAdapter,
        "model_id": "knowledgator/gliclass-instruct-large-v1.0",
        "params": "400M",
        "default": True,
    },
    "bart-mnli": {
        "adapter": ZeroShotAdapter,
        "model_id": "facebook/bart-large-mnli",
        "params": "400M",
        "default": True,
    },
    "bge-m3-zeroshot": {
        "adapter": ZeroShotAdapter,
        "model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
        "params": "600M",
        "default": True,
    },
    # Models below are excluded by default (larger / slower / reranker-based).
    "bge-reranker": {
        "adapter": RerankerAdapter,
        "model_id": "BAAI/bge-reranker-v2-m3",
        "params": "600M",
        "default": False,
    },
    "deberta-xlarge": {
        "adapter": ZeroShotAdapter,
        "model_id": "microsoft/deberta-xlarge-mnli",
        "params": "750M",
        "default": False,
    },
    "mdeberta-mnli": {
        "adapter": ZeroShotAdapter,
        "model_id": "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
        "params": "300M",
        "default": False,
    },
    "xlm-roberta-anli": {
        "adapter": ZeroShotAdapter,
        "model_id": "vicgalle/xlm-roberta-large-xnli-anli",
        "params": "600M",
        "default": False,
    },
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def load_scoring_jsonl(path: str) -> list[dict[str, str]]:
    """Read a JSONL file of labeled emails, one dict per non-blank line.

    Raises FileNotFoundError with setup guidance when the file is missing;
    malformed JSON lines propagate as json.JSONDecodeError.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(
            f"Scoring file not found: {path}\n"
            f"Copy data/email_score.jsonl.example → data/email_score.jsonl and label your emails."
        )
    return [
        json.loads(stripped)
        for stripped in (raw.strip() for raw in source.read_text().splitlines())
        if stripped
    ]
def _active_models(include_slow: bool) -> dict[str, dict[str, Any]]:
    """Return the registry entries to run: defaults only, or all when include_slow."""
    if include_slow:
        return dict(MODEL_REGISTRY)
    return {name: entry for name, entry in MODEL_REGISTRY.items() if entry["default"]}
def run_scoring(
    adapters: list[ClassifierAdapter],
    score_file: str,
) -> dict[str, Any]:
    """Run all adapters against a labeled JSONL. Returns per-adapter metrics.

    Each adapter is scored sequentially and unloaded after its pass so VRAM
    pressure stays sequential, not cumulative. A classification error on a
    single email is printed and scored as "neutral" rather than aborting.

    Raises:
        ValueError: if the scoring file contains no labeled rows. (Previously
            an empty file crashed with ZeroDivisionError in the per-email
            latency calculation below.)
    """
    rows = load_scoring_jsonl(score_file)
    if not rows:
        raise ValueError(f"No labeled rows found in {score_file}")
    gold = [r["label"] for r in rows]
    results: dict[str, Any] = {}
    for adapter in adapters:
        preds: list[str] = []
        t0 = time.monotonic()
        for row in rows:
            try:
                pred = adapter.classify(row["subject"], row["body"])
            except Exception as exc:
                print(f" [{adapter.name}] ERROR on '{row['subject'][:40]}': {exc}", flush=True)
                # Fall back to the most conservative label so metrics still compute.
                pred = "neutral"
            preds.append(pred)
        elapsed_ms = (time.monotonic() - t0) * 1000
        metrics = compute_metrics(preds, gold, LABELS)
        # len(rows) is guaranteed > 0 by the guard above.
        metrics["latency_ms"] = round(elapsed_ms / len(rows), 1)
        results[adapter.name] = metrics
        adapter.unload()
    return results
# ---------------------------------------------------------------------------
# IMAP helpers (stdlib only — no imap_sync dependency)
# ---------------------------------------------------------------------------
# Broad subject-line search terms for sampling job-search emails. Results are
# de-duplicated by UID in _fetch_imap_sample(), so overlap between terms is free.
_BROAD_TERMS = [
    "interview", "opportunity", "offer letter",
    "job offer", "application", "recruiting",
]
def _load_imap_config() -> dict[str, Any]:
    """Load IMAP settings from config/email.yaml (path resolved from repo root)."""
    # Local import: yaml is only needed for --compare, keeping --score/--list fast.
    import yaml
    cfg_path = Path(__file__).parent.parent / "config" / "email.yaml"
    with cfg_path.open() as f:
        return yaml.safe_load(f)
def _imap_connect(cfg: dict[str, Any]) -> imaplib.IMAP4_SSL:
    """Open an SSL IMAP connection and log in with host/port/username/password from cfg."""
    conn = imaplib.IMAP4_SSL(cfg["host"], cfg.get("port", 993))
    conn.login(cfg["username"], cfg["password"])
    return conn
def _decode_part(part: Any) -> str:
charset = part.get_content_charset() or "utf-8"
try:
return part.get_payload(decode=True).decode(charset, errors="replace")
except Exception:
return ""
def _parse_uid(conn: imaplib.IMAP4_SSL, uid: bytes) -> dict[str, str] | None:
    """Fetch one message by UID and return {"subject", "body"}, or None on any error.

    For multipart messages the first text/plain part wins; for non-multipart
    messages the whole payload is decoded.
    """
    try:
        _, data = conn.uid("fetch", uid, "(RFC822)")
        message = _email_lib.message_from_bytes(data[0][1])
        body = ""
        if message.is_multipart():
            plain = next(
                (p for p in message.walk() if p.get_content_type() == "text/plain"),
                None,
            )
            if plain is not None:
                body = _decode_part(plain)
        else:
            body = _decode_part(message)
        return {"subject": str(message.get("subject", "")).strip(), "body": body}
    except Exception:
        # Any fetch/parse failure is treated as "skip this email".
        return None
def _fetch_imap_sample(limit: int, days: int) -> list[dict[str, str]]:
    """Fetch up to `limit` recent job-search emails from INBOX.

    Runs one UID SEARCH per term in _BROAD_TERMS (restricted to the last
    `days` days), de-duplicates UIDs across searches, then fetches and parses
    each message. Unparseable messages are dropped, so fewer than `limit`
    emails may be returned.
    """
    cfg = _load_imap_config()
    conn = _imap_connect(cfg)
    since = (datetime.now() - timedelta(days=days)).strftime("%d-%b-%Y")
    conn.select("INBOX")
    # dict keys give insertion-ordered de-duplication of UIDs across terms
    seen_uids: dict[bytes, None] = {}
    for term in _BROAD_TERMS:
        _, data = conn.uid("search", None, f'(SUBJECT "{term}" SINCE {since})')
        for uid in (data[0] or b"").split():
            seen_uids[uid] = None
    sample = list(seen_uids.keys())[:limit]
    emails = []
    for uid in sample:
        parsed = _parse_uid(conn, uid)
        if parsed:
            emails.append(parsed)
    # Best-effort logout; a failed logout doesn't invalidate already-parsed mail.
    try:
        conn.logout()
    except Exception:
        pass
    return emails
# ---------------------------------------------------------------------------
# Subcommands
# ---------------------------------------------------------------------------
def cmd_list_models(_args: argparse.Namespace) -> None:
    """Print the model registry: name, params, default flag, adapter class, model id."""
    print(f"\n{'Name':<20} {'Params':<8} {'Default':<20} {'Adapter':<15} Model ID")
    print("-" * 100)
    for short_name, entry in MODEL_REGISTRY.items():
        flag = "yes" if entry["default"] else "(--include-slow)"
        print(
            f"{short_name:<20} {entry['params']:<8} {flag:<20} "
            f"{entry['adapter'].__name__:<15} {entry['model_id']}"
        )
    print()
def cmd_score(args: argparse.Namespace) -> None:
    """Score selected models against the labeled JSONL and print metric tables.

    Prints an aggregate table (macro-F1, accuracy, ms/email) followed by a
    per-label F1 breakdown, one column per model.
    """
    active = _active_models(args.include_slow)
    if args.models:
        active = {k: v for k, v in active.items() if k in args.models}
    adapters = [
        entry["adapter"](name, entry["model_id"])
        for name, entry in active.items()
    ]
    print(f"\nScoring {len(adapters)} model(s) against {args.score_file}\n")
    results = run_scoring(adapters, args.score_file)
    col = 12
    # Header and data rows use identical field widths with no extra padding so
    # the columns line up exactly. (The original header inserted a space between
    # each right-aligned field while the data rows did not, shifting the header
    # 1-2 chars right of the values.)
    print(f"{'Model':<22}{'macro-F1':>{col}}{'Accuracy':>{col}}{'ms/email':>{col}}")
    print("-" * (22 + col * 3))
    for name, m in results.items():
        print(
            f"{name:<22}"
            f"{m['__macro_f1__']:>{col}.3f}"
            f"{m['__accuracy__']:>{col}.3f}"
            f"{m['latency_ms']:>{col}.1f}"
        )
    print("\nPer-label F1:")
    names = list(results.keys())
    print(f"{'Label':<25}" + "".join(f"{n[:11]:>{col}}" for n in names))
    print("-" * (25 + col * len(names)))
    for label in LABELS:
        row_str = f"{label:<25}"
        for m in results.values():
            row_str += f"{m[label]['f1']:>{col}.3f}"
        print(row_str)
    print()
def cmd_compare(args: argparse.Namespace) -> None:
    """Fetch live IMAP emails and print each model's label side by side.

    Per-email classification errors render as a truncated ERR cell rather
    than aborting the table. All adapters are unloaded at the end.
    """
    active = _active_models(args.include_slow)
    if args.models:
        active = {k: v for k, v in active.items() if k in args.models}
    print(f"Fetching up to {args.limit} emails from IMAP …")
    emails = _fetch_imap_sample(args.limit, args.days)
    print(f"Fetched {len(emails)} emails. Loading {len(active)} model(s) …\n")
    adapters = [
        entry["adapter"](name, entry["model_id"])
        for name, entry in active.items()
    ]
    model_names = [a.name for a in adapters]
    col = 22
    subj_w = 50
    print(f"{'Subject':<{subj_w}}" + "".join(f"{n:<{col}}" for n in model_names))
    print("-" * (subj_w + col * len(model_names)))
    for row in emails:
        # Truncate at subj_w - 1 whenever the subject fills the column, so one
        # space always separates it from the first model column. (The original
        # `> subj_w` check left a zero-width gap for subjects of exactly
        # subj_w characters.)
        subject = row["subject"]
        short_subj = subject[:subj_w - 1] if len(subject) >= subj_w else subject
        line = f"{short_subj:<{subj_w}}"
        for adapter in adapters:
            try:
                label = adapter.classify(row["subject"], row["body"])
            except Exception as exc:
                label = f"ERR:{str(exc)[:8]}"
            line += f"{label:<{col}}"
        print(line, flush=True)
    for adapter in adapters:
        adapter.unload()
    print()
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI parser: one store_true flag per mode plus shared options."""
    parser = argparse.ArgumentParser(
        description="Benchmark HuggingFace email classifiers against our 6 labels."
    )
    parser.add_argument("--list-models", action="store_true", help="Show model registry and exit")
    parser.add_argument("--score", action="store_true", help="Score against labeled JSONL")
    parser.add_argument("--compare", action="store_true", help="Visual table on live IMAP emails")
    parser.add_argument("--score-file", default="data/email_score.jsonl", help="Path to labeled JSONL")
    parser.add_argument("--limit", type=int, default=20, help="Max emails for --compare")
    parser.add_argument("--days", type=int, default=90, help="Days back for IMAP search")
    parser.add_argument("--include-slow", action="store_true", help="Include non-default heavy models")
    parser.add_argument("--models", nargs="+", help="Override: run only these model names")
    return parser


def main() -> None:
    """CLI entry point — run the first selected mode, or print help if none given."""
    parser = _build_parser()
    args = parser.parse_args()
    if args.list_models:
        cmd_list_models(args)
    elif args.score:
        cmd_score(args)
    elif args.compare:
        cmd_compare(args)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,254 @@
"""Classifier adapters for email classification benchmark.
Each adapter wraps a HuggingFace model and normalizes output to LABELS.
Models load lazily on first classify() call; call unload() to free VRAM.
"""
from __future__ import annotations
import abc
from collections import defaultdict
from typing import Any
__all__ = [
"LABELS",
"LABEL_DESCRIPTIONS",
"compute_metrics",
"ClassifierAdapter",
"ZeroShotAdapter",
"GLiClassAdapter",
"RerankerAdapter",
]
# The six classification labels, in the fixed order used for metric reporting.
LABELS: list[str] = [
    "interview_scheduled",
    "offer_received",
    "rejected",
    "positive_response",
    "survey_received",
    "neutral",
]
# Natural-language descriptions used by the RerankerAdapter: each label is
# scored by reranking (email_text, description) pairs, so these strings ARE the
# classifier's notion of each class — edit with care.
LABEL_DESCRIPTIONS: dict[str, str] = {
    "interview_scheduled": "scheduling an interview, phone screen, or video call",
    "offer_received": "a formal job offer or employment offer letter",
    "rejected": "application rejected or not moving forward with candidacy",
    "positive_response": "positive recruiter interest or request to connect",
    "survey_received": "invitation to complete a culture-fit survey or assessment",
    "neutral": "automated ATS confirmation or unrelated email",
}
# Lazy import shims — allow tests to patch without requiring the libs installed.
try:
from transformers import pipeline # type: ignore[assignment]
except ImportError:
pipeline = None # type: ignore[assignment]
try:
from gliclass import GLiClassModel, ZeroShotClassificationPipeline # type: ignore
from transformers import AutoTokenizer
except ImportError:
GLiClassModel = None # type: ignore
ZeroShotClassificationPipeline = None # type: ignore
AutoTokenizer = None # type: ignore
try:
from FlagEmbedding import FlagReranker # type: ignore
except ImportError:
FlagReranker = None # type: ignore
def _cuda_available() -> bool:
try:
import torch
return torch.cuda.is_available()
except ImportError:
return False
def compute_metrics(
    predictions: list[str],
    gold: list[str],
    labels: list[str],
) -> dict[str, Any]:
    """Score predictions against gold labels.

    Returns one entry per label ({precision, recall, f1, support}) plus
    "__macro_f1__" (averaged only over labels with non-zero support) and
    "__accuracy__" (exact-match rate; 0.0 for empty input).
    """
    true_pos: dict[str, int] = defaultdict(int)
    false_pos: dict[str, int] = defaultdict(int)
    false_neg: dict[str, int] = defaultdict(int)
    for predicted, actual in zip(predictions, gold):
        if predicted == actual:
            true_pos[predicted] += 1
        else:
            false_pos[predicted] += 1
            false_neg[actual] += 1
    scores: dict[str, Any] = {}
    for label in labels:
        predicted_count = true_pos[label] + false_pos[label]
        actual_count = true_pos[label] + false_neg[label]
        precision = true_pos[label] / predicted_count if predicted_count else 0.0
        recall = true_pos[label] / actual_count if actual_count else 0.0
        f1 = 0.0
        if precision + recall:
            f1 = 2 * precision * recall / (precision + recall)
        scores[label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": actual_count,
        }
    supported = [lbl for lbl in labels if scores[lbl]["support"] > 0]
    scores["__macro_f1__"] = (
        sum(scores[lbl]["f1"] for lbl in supported) / len(supported)
        if supported
        else 0.0
    )
    scores["__accuracy__"] = (
        sum(true_pos.values()) / len(predictions) if predictions else 0.0
    )
    return scores
class ClassifierAdapter(abc.ABC):
    """Abstract base for all email classifier adapters.

    Concrete adapters wrap one model and must normalize every prediction to a
    member of LABELS. The subclasses in this module load lazily on the first
    classify() call and release their model in unload().
    """
    @property
    @abc.abstractmethod
    def name(self) -> str: ...
    @property
    @abc.abstractmethod
    def model_id(self) -> str: ...
    @abc.abstractmethod
    def load(self) -> None:
        """Download/load the model into memory."""
    @abc.abstractmethod
    def unload(self) -> None:
        """Release model from memory."""
    @abc.abstractmethod
    def classify(self, subject: str, body: str) -> str:
        """Return one of LABELS for the given email."""
class ZeroShotAdapter(ClassifierAdapter):
    """Wraps any transformers zero-shot-classification pipeline.
    Design note: the module-level ``pipeline`` shim is resolved once in load()
    and stored as ``self._pipeline``. classify() calls ``self._pipeline`` directly
    with (text, candidate_labels, multi_label=False). This makes the adapter
    patchable in tests via ``patch('scripts.classifier_adapters.pipeline', mock)``:
    ``mock`` is stored in ``self._pipeline`` and called with the text during
    classify(), so ``mock.call_args`` captures the arguments.
    For real transformers use, ``pipeline`` is the factory function and the call
    in classify() initialises the pipeline on first use (lazy loading without
    pre-caching a model object). Subclasses that need a pre-warmed model object
    should override load() to call the factory and store the result.
    """
    def __init__(self, name: str, model_id: str) -> None:
        self._name = name
        self._model_id = model_id
        # None means "not loaded"; set to the pipeline shim by load().
        self._pipeline: Any = None
    @property
    def name(self) -> str:
        """Short registry name for this adapter instance."""
        return self._name
    @property
    def model_id(self) -> str:
        """HuggingFace model id this adapter wraps."""
        return self._model_id
    def load(self) -> None:
        """Resolve the module-level pipeline shim; raises ImportError if absent."""
        # Re-reading the attribute off the module (rather than the import-time
        # binding) is what lets unittest.mock.patch substitutions take effect.
        import scripts.classifier_adapters as _mod  # noqa: PLC0415
        _pipe_fn = _mod.pipeline
        if _pipe_fn is None:
            raise ImportError("transformers not installed — run: pip install transformers")
        # Store the pipeline factory/callable so that test patches are honoured.
        # classify() will call self._pipeline(text, labels, multi_label=False).
        self._pipeline = _pipe_fn
    def unload(self) -> None:
        """Drop the stored callable so it can be re-resolved on next classify()."""
        self._pipeline = None
    def classify(self, subject: str, body: str) -> str:
        """Return the pipeline's top-ranked label for the email."""
        if self._pipeline is None:
            self.load()
        # Body truncated to 600 chars to bound input length.
        text = f"Subject: {subject}\n\n{body[:600]}"
        # NOTE(review): when self._pipeline is the *real* transformers factory
        # (not a test mock), this call passes the email text where the factory
        # expects a task name and will likely fail — confirm against a
        # non-mocked run before relying on this adapter in production.
        result = self._pipeline(text, LABELS, multi_label=False)
        return result["labels"][0]
class GLiClassAdapter(ClassifierAdapter):
    """Wraps knowledgator GLiClass models via the gliclass library.

    Unlike ZeroShotAdapter, load() builds a real pipeline object (model +
    tokenizer) and stores it; unload() drops the reference to free memory.
    """
    def __init__(self, name: str, model_id: str) -> None:
        self._name = name
        self._model_id = model_id
        # None means "not loaded"; set by load().
        self._pipeline: Any = None
    @property
    def name(self) -> str:
        """Short registry name for this adapter instance."""
        return self._name
    @property
    def model_id(self) -> str:
        """HuggingFace model id this adapter wraps."""
        return self._model_id
    def load(self) -> None:
        """Build the GLiClass single-label pipeline; raises ImportError if gliclass is missing."""
        if GLiClassModel is None:
            raise ImportError("gliclass not installed — run: pip install gliclass")
        device = "cuda:0" if _cuda_available() else "cpu"
        model = GLiClassModel.from_pretrained(self._model_id)
        tokenizer = AutoTokenizer.from_pretrained(self._model_id)
        self._pipeline = ZeroShotClassificationPipeline(
            model,
            tokenizer,
            classification_type="single-label",
            device=device,
        )
    def unload(self) -> None:
        """Release the pipeline (and with it the model) for garbage collection."""
        self._pipeline = None
    def classify(self, subject: str, body: str) -> str:
        """Return the highest-scoring label; loads the model on first call."""
        if self._pipeline is None:
            self.load()
        # Body truncated to 600 chars to bound tokenizer input length.
        text = f"Subject: {subject}\n\n{body[:600]}"
        # threshold=0.0 — presumably keeps all candidates so max() sees every
        # label's score; confirm against gliclass pipeline semantics.
        results = self._pipeline(text, LABELS, threshold=0.0)[0]
        return max(results, key=lambda r: r["score"])["label"]
class RerankerAdapter(ClassifierAdapter):
    """Uses a BGE reranker to score (email, label_description) pairs.

    Classification is cast as reranking: the email text is paired with each
    entry of LABEL_DESCRIPTIONS and the best-scoring description's label wins.
    """
    def __init__(self, name: str, model_id: str) -> None:
        self._name = name
        self._model_id = model_id
        # None means "not loaded"; set to a FlagReranker by load().
        self._reranker: Any = None
    @property
    def name(self) -> str:
        """Short registry name for this adapter instance."""
        return self._name
    @property
    def model_id(self) -> str:
        """HuggingFace model id this adapter wraps."""
        return self._model_id
    def load(self) -> None:
        """Instantiate the FlagReranker; raises ImportError if FlagEmbedding is missing."""
        if FlagReranker is None:
            raise ImportError("FlagEmbedding not installed — run: pip install FlagEmbedding")
        # fp16 only when CUDA is available (fp16 on CPU is typically unsupported/slow).
        self._reranker = FlagReranker(self._model_id, use_fp16=_cuda_available())
    def unload(self) -> None:
        """Release the reranker for garbage collection."""
        self._reranker = None
    def classify(self, subject: str, body: str) -> str:
        """Return the label whose description the reranker scores highest."""
        if self._reranker is None:
            self.load()
        text = f"Subject: {subject}\n\n{body[:600]}"
        # One pair per label, in LABELS order, so the argmax index maps
        # straight back to the label name.
        pairs = [[text, LABEL_DESCRIPTIONS[label]] for label in LABELS]
        # assumes compute_score returns one float per pair when given a list of
        # pairs — TODO confirm across FlagEmbedding versions
        scores: list[float] = self._reranker.compute_score(pairs, normalize=True)
        return LABELS[scores.index(max(scores))]

View file

@ -0,0 +1,20 @@
name: job-seeker-classifiers
channels:
- pytorch
- nvidia
- conda-forge
- defaults
dependencies:
- python=3.11
- pip
- pip:
- torch>=2.1.0
- transformers>=4.40.0
- accelerate>=0.26.0
- sentencepiece>=0.1.99
- protobuf>=4.25.0
- gliclass>=0.1.0
- FlagEmbedding>=1.2.0
- pyyaml>=6.0
- tqdm>=4.66.0
- pytest>=8.0.0

View file

@ -23,6 +23,7 @@ Exit codes:
1 manual action required (unresolvable port conflict on external service) 1 manual action required (unresolvable port conflict on external service)
""" """
import argparse import argparse
import os
import platform import platform
import socket import socket
import subprocess import subprocess
@ -44,26 +45,29 @@ OVERRIDE_YML = ROOT / "compose.override.yml"
# adoptable — True if an existing process on this port should be used instead # adoptable — True if an existing process on this port should be used instead
# of starting a Docker container (and the Docker service disabled) # of starting a Docker container (and the Docker service disabled)
_SERVICES: dict[str, tuple[str, int, str, bool, bool]] = { _SERVICES: dict[str, tuple[str, int, str, bool, bool]] = {
"streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False), "streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False),
"searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True), "searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True),
"vllm": ("vllm_port", 8000, "VLLM_PORT", True, True), "vllm": ("vllm_port", 8000, "VLLM_PORT", True, True),
"vision": ("vision_port", 8002, "VISION_PORT", True, True), "vision": ("vision_port", 8002, "VISION_PORT", True, True),
"ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True), "ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True),
"ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
} }
# LLM yaml backend keys → url suffix, keyed by service name # LLM yaml backend keys → url suffix, keyed by service name
_LLM_BACKENDS: dict[str, list[tuple[str, str]]] = { _LLM_BACKENDS: dict[str, list[tuple[str, str]]] = {
"ollama": [("ollama", "/v1"), ("ollama_research", "/v1")], "ollama": [("ollama", "/v1")],
"vllm": [("vllm", "/v1")], "ollama_research": [("ollama_research", "/v1")],
"vision": [("vision_service", "")], "vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
"vision": [("vision_service", "")],
} }
# Docker-internal hostname:port for each service (when running in Docker) # Docker-internal hostname:port for each service (when running in Docker)
_DOCKER_INTERNAL: dict[str, tuple[str, int]] = { _DOCKER_INTERNAL: dict[str, tuple[str, int]] = {
"ollama": ("ollama", 11434), "ollama": ("ollama", 11434),
"vllm": ("vllm", 8000), "ollama_research": ("ollama_research", 11434), # container-internal port is always 11434
"vision": ("vision", 8002), "vllm": ("vllm", 8000),
"searxng": ("searxng", 8080), # searxng internal port differs from host port "vision": ("vision", 8002),
"searxng": ("searxng", 8080), # searxng internal port differs from host port
} }
@ -109,7 +113,6 @@ def get_ram_gb() -> tuple[float, float]:
def get_cpu_cores() -> int: def get_cpu_cores() -> int:
import os
return os.cpu_count() or 1 return os.cpu_count() or 1
@ -225,6 +228,43 @@ def calc_cpu_offload_gb(gpus: list[dict], ram_available_gb: float) -> int:
return min(int(headroom * 0.25), 8) return min(int(headroom * 0.25), 8)
def _download_size_mb(profile: str, dual_gpu_mode: str = "ollama") -> dict[str, int]:
"""
Return estimated first-run download sizes in MB, keyed by component name.
Profile-aware: only includes components that will actually be pulled.
"""
sizes: dict[str, int] = {
"searxng": 300,
"app": 1500,
}
if profile in ("cpu", "single-gpu", "dual-gpu"):
sizes["ollama"] = 800
sizes["llama3_2_3b"] = 2000
if profile in ("single-gpu", "dual-gpu"):
sizes["vision_image"] = 3000
sizes["moondream2"] = 1800
if profile == "dual-gpu" and dual_gpu_mode in ("vllm", "mixed"):
sizes["vllm_image"] = 10000
return sizes
def _mixed_mode_vram_warning(gpus: list[dict], dual_gpu_mode: str) -> str | None:
"""
Return a warning string if GPU 1 likely lacks VRAM for mixed mode, else None.
Only relevant when dual_gpu_mode == 'mixed' and at least 2 GPUs are present.
"""
if dual_gpu_mode != "mixed" or len(gpus) < 2:
return None
free = gpus[1]["vram_free_gb"]
if free < 12:
return (
f"⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {free:.1f} GB free — "
f"running ollama_research + vllm together may cause OOM. "
f"Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm."
)
return None
# ── Config writers ───────────────────────────────────────────────────────────── # ── Config writers ─────────────────────────────────────────────────────────────
def write_env(updates: dict[str, str]) -> None: def write_env(updates: dict[str, str]) -> None:
@ -414,6 +454,38 @@ def main() -> None:
info = ports[name] info = ports[name]
print(f"{name} :{info['resolved']} → app will use host.docker.internal:{info['resolved']}") print(f"{name} :{info['resolved']} → app will use host.docker.internal:{info['resolved']}")
# ── Download size warning ──────────────────────────────────────────────
dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
sizes = _download_size_mb(profile, dual_gpu_mode)
total_mb = sum(sizes.values())
print("")
print("║ Download sizes (first-run estimates)")
print("║ Docker images")
print(f"║ app (Python build) ~{sizes.get('app', 0):,} MB")
if "searxng" in sizes:
print(f"║ searxng/searxng ~{sizes['searxng']:,} MB")
if "ollama" in sizes:
shared_note = " (shared by ollama + ollama_research)" if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed") else ""
print(f"║ ollama/ollama ~{sizes['ollama']:,} MB{shared_note}")
if "vision_image" in sizes:
print(f"║ vision service ~{sizes['vision_image']:,} MB (torch + moondream)")
if "vllm_image" in sizes:
print(f"║ vllm/vllm-openai ~{sizes['vllm_image']:,} MB")
print("║ Model weights (lazy-loaded on first use)")
if "llama3_2_3b" in sizes:
print(f"║ llama3.2:3b ~{sizes['llama3_2_3b']:,} MB → OLLAMA_MODELS_DIR")
if "moondream2" in sizes:
print(f"║ moondream2 ~{sizes['moondream2']:,} MB → vision container cache")
if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed"):
print("║ Note: ollama + ollama_research share model dir — no double download")
print(f"║ ⚠ Total first-run: ~{total_mb / 1024:.1f} GB (models persist between restarts)")
# ── Mixed-mode VRAM warning ────────────────────────────────────────────
vram_warn = _mixed_mode_vram_warning(gpus, dual_gpu_mode)
if vram_warn:
print("")
print(f"{vram_warn}")
print("╚════════════════════════════════════════════════════╝") print("╚════════════════════════════════════════════════════╝")
if not args.check_only: if not args.check_only:
@ -426,6 +498,16 @@ def main() -> None:
# GPU info for the app container (which lacks nvidia-smi access) # GPU info for the app container (which lacks nvidia-smi access)
env_updates["PEREGRINE_GPU_COUNT"] = str(len(gpus)) env_updates["PEREGRINE_GPU_COUNT"] = str(len(gpus))
env_updates["PEREGRINE_GPU_NAMES"] = ",".join(g["name"] for g in gpus) env_updates["PEREGRINE_GPU_NAMES"] = ",".join(g["name"] for g in gpus)
# Write DUAL_GPU_MODE default for new 2-GPU setups (don't override user's choice)
if len(gpus) >= 2:
existing_env: dict[str, str] = {}
if ENV_FILE.exists():
for line in ENV_FILE.read_text().splitlines():
if "=" in line and not line.startswith("#"):
k, _, v = line.partition("=")
existing_env[k.strip()] = v.strip()
if "DUAL_GPU_MODE" not in existing_env:
env_updates["DUAL_GPU_MODE"] = "ollama"
write_env(env_updates) write_env(env_updates)
update_llm_yaml(ports) update_llm_yaml(ports)
write_compose_override(ports) write_compose_override(ports)

View file

@ -0,0 +1,94 @@
"""Tests for benchmark_classifier — no model downloads required."""
import pytest
def test_registry_has_nine_models():
    """The benchmark registry covers exactly nine candidate models."""
    from scripts.benchmark_classifier import MODEL_REGISTRY
    assert len(MODEL_REGISTRY) == 9


def test_registry_default_count():
    """Exactly five registry entries are flagged default=True."""
    from scripts.benchmark_classifier import MODEL_REGISTRY
    default_names = [name for name, entry in MODEL_REGISTRY.items() if entry["default"]]
    assert len(default_names) == 5


def test_registry_entries_have_required_keys():
    """Every entry carries the four required keys and a ClassifierAdapter subclass."""
    from scripts.benchmark_classifier import MODEL_REGISTRY
    from scripts.classifier_adapters import ClassifierAdapter
    for name, entry in MODEL_REGISTRY.items():
        for key in ("adapter", "model_id", "params", "default"):
            assert key in entry, f"{name} missing '{key}'"
        assert issubclass(entry["adapter"], ClassifierAdapter), \
            f"{name} adapter must be a ClassifierAdapter subclass"
def test_load_scoring_jsonl(tmp_path):
    """load_scoring_jsonl() parses one JSON object per line, in order."""
    import json
    from scripts.benchmark_classifier import load_scoring_jsonl
    records = [
        {"subject": "Hi", "body": "Body text", "label": "neutral"},
        {"subject": "Interview", "body": "Schedule a call", "label": "interview_scheduled"},
    ]
    jsonl_path = tmp_path / "score.jsonl"
    jsonl_path.write_text("\n".join(json.dumps(rec) for rec in records))
    loaded = load_scoring_jsonl(str(jsonl_path))
    assert len(loaded) == 2
    assert loaded[0]["label"] == "neutral"


def test_load_scoring_jsonl_missing_file():
    """A nonexistent path raises FileNotFoundError rather than returning empty."""
    from scripts.benchmark_classifier import load_scoring_jsonl
    with pytest.raises(FileNotFoundError):
        load_scoring_jsonl("/nonexistent/path.jsonl")
def test_run_scoring_with_mock_adapters(tmp_path):
    """run_scoring() returns per-model metrics using mock adapters."""
    import json
    from unittest.mock import MagicMock
    from scripts.benchmark_classifier import run_scoring

    rows = [
        {"subject": "Interview", "body": "Let's schedule", "label": "interview_scheduled"},
        {"subject": "Sorry", "body": "We went with others", "label": "rejected"},
        {"subject": "Offer", "body": "We are pleased", "label": "offer_received"},
    ]
    score_file = tmp_path / "score.jsonl"
    score_file.write_text("\n".join(json.dumps(row) for row in rows))

    def answer_key(subject, body):
        # Mirrors the gold labels exactly, keyed off the subject line.
        if "Interview" in subject:
            return "interview_scheduled"
        if "Sorry" in subject:
            return "rejected"
        return "offer_received"

    perfect = MagicMock()
    perfect.name = "perfect"
    perfect.classify.side_effect = answer_key

    bad = MagicMock()
    bad.name = "bad"
    bad.classify.return_value = "neutral"

    results = run_scoring([perfect, bad], str(score_file))
    assert results["perfect"]["__accuracy__"] == pytest.approx(1.0)
    assert results["bad"]["__accuracy__"] == pytest.approx(0.0)
    assert "latency_ms" in results["perfect"]
def test_run_scoring_handles_classify_error(tmp_path):
    """run_scoring() falls back to 'neutral' on exception and continues."""
    import json
    from unittest.mock import MagicMock
    from scripts.benchmark_classifier import run_scoring

    jsonl_path = tmp_path / "score.jsonl"
    jsonl_path.write_text(json.dumps({"subject": "Hi", "body": "Body", "label": "neutral"}))

    broken = MagicMock()
    broken.name = "broken"
    broken.classify.side_effect = RuntimeError("model crashed")

    # The crashing adapter must still produce an entry, not abort the run.
    results = run_scoring([broken], str(jsonl_path))
    assert "broken" in results

View file

@ -0,0 +1,174 @@
"""Tests for classifier_adapters — no model downloads required."""
import pytest
def test_labels_constant_has_six_items():
    """LABELS defines the six-way email taxonomy."""
    from scripts.classifier_adapters import LABELS
    assert len(LABELS) == 6
    for expected in ("interview_scheduled", "neutral"):
        assert expected in LABELS


def test_compute_metrics_perfect_predictions():
    """All-correct predictions yield per-class F1, accuracy and macro-F1 of 1.0."""
    from scripts.classifier_adapters import compute_metrics, LABELS
    truth = ["rejected", "interview_scheduled", "neutral"]
    metrics = compute_metrics(list(truth), truth, LABELS)
    assert metrics["rejected"]["f1"] == pytest.approx(1.0)
    assert metrics["__accuracy__"] == pytest.approx(1.0)
    assert metrics["__macro_f1__"] == pytest.approx(1.0)


def test_compute_metrics_all_wrong():
    """Zero correct predictions give zero recall and zero accuracy."""
    from scripts.classifier_adapters import compute_metrics, LABELS
    metrics = compute_metrics(
        ["neutral", "interview_scheduled"], ["rejected", "rejected"], LABELS
    )
    assert metrics["rejected"]["recall"] == pytest.approx(0.0)
    assert metrics["__accuracy__"] == pytest.approx(0.0)


def test_compute_metrics_partial():
    """Mixed results: per-class precision/recall/F1 plus overall accuracy."""
    from scripts.classifier_adapters import compute_metrics, LABELS
    gold_labels = ["rejected", "neutral", "rejected"]
    predictions = ["rejected", "neutral", "interview_scheduled"]
    metrics = compute_metrics(predictions, gold_labels, LABELS)
    assert metrics["rejected"]["precision"] == pytest.approx(1.0)
    assert metrics["rejected"]["recall"] == pytest.approx(0.5)
    assert metrics["neutral"]["f1"] == pytest.approx(1.0)
    assert metrics["__accuracy__"] == pytest.approx(2 / 3)


def test_compute_metrics_empty():
    """No samples at all: accuracy is 0.0 rather than a ZeroDivisionError."""
    from scripts.classifier_adapters import compute_metrics, LABELS
    assert compute_metrics([], [], LABELS)["__accuracy__"] == pytest.approx(0.0)


def test_classifier_adapter_is_abstract():
    """The ABC itself must not be instantiable."""
    from scripts.classifier_adapters import ClassifierAdapter
    with pytest.raises(TypeError):
        ClassifierAdapter()
# ---- ZeroShotAdapter tests ----
def test_zeroshot_adapter_classify_mocked():
    """classify() returns the top-scoring label and passes the email text through."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import ZeroShotAdapter
    fake_pipeline = MagicMock(return_value={
        "labels": ["rejected", "neutral", "interview_scheduled"],
        "scores": [0.85, 0.10, 0.05],
    })
    with patch("scripts.classifier_adapters.pipeline", fake_pipeline):
        adapter = ZeroShotAdapter("test-zs", "some/model")
        adapter.load()
        predicted = adapter.classify("We went with another candidate", "Thank you for applying.")
    assert predicted == "rejected"
    positional_args = fake_pipeline.call_args[0]
    assert "We went with another candidate" in positional_args[0]


def test_zeroshot_adapter_unload_clears_pipeline():
    """unload() drops the pipeline reference back to None."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import ZeroShotAdapter
    with patch("scripts.classifier_adapters.pipeline", MagicMock()):
        adapter = ZeroShotAdapter("test-zs", "some/model")
        adapter.load()
        assert adapter._pipeline is not None
        adapter.unload()
        assert adapter._pipeline is None


def test_zeroshot_adapter_lazy_loads():
    """classify() without a prior load() builds the pipeline exactly once."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import ZeroShotAdapter
    factory = MagicMock()
    factory.return_value = MagicMock(return_value={"labels": ["neutral"], "scores": [1.0]})
    with patch("scripts.classifier_adapters.pipeline", factory):
        ZeroShotAdapter("test-zs", "some/model").classify("subject", "body")
    factory.assert_called_once()
# ---- GLiClassAdapter tests ----
def test_gliclass_adapter_classify_mocked():
    """classify() returns the top label from the GLiClass pipeline output."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import GLiClassAdapter
    fake_pipe = MagicMock(return_value=[[
        {"label": "interview_scheduled", "score": 0.91},
        {"label": "neutral", "score": 0.05},
        {"label": "rejected", "score": 0.04},
    ]])
    with patch("scripts.classifier_adapters.GLiClassModel"), \
         patch("scripts.classifier_adapters.AutoTokenizer"), \
         patch("scripts.classifier_adapters.ZeroShotClassificationPipeline",
               return_value=fake_pipe):
        adapter = GLiClassAdapter("test-gli", "some/gliclass-model")
        adapter.load()
        predicted = adapter.classify("Interview invitation", "Let's schedule a call.")
    assert predicted == "interview_scheduled"


def test_gliclass_adapter_returns_highest_score():
    """The label with the maximum score wins regardless of list ordering."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import GLiClassAdapter
    fake_pipe = MagicMock(return_value=[[
        {"label": "neutral", "score": 0.02},
        {"label": "offer_received", "score": 0.88},
        {"label": "rejected", "score": 0.10},
    ]])
    with patch("scripts.classifier_adapters.GLiClassModel"), \
         patch("scripts.classifier_adapters.AutoTokenizer"), \
         patch("scripts.classifier_adapters.ZeroShotClassificationPipeline",
               return_value=fake_pipe):
        adapter = GLiClassAdapter("test-gli", "some/model")
        adapter.load()
        predicted = adapter.classify("Offer letter enclosed", "Dear Meghan, we are pleased to offer...")
    assert predicted == "offer_received"
# ---- RerankerAdapter tests ----
def test_reranker_adapter_picks_highest_score():
    """classify() maps the argmax of compute_score back onto LABELS order."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import RerankerAdapter, LABELS
    fake_reranker = MagicMock()
    fake_reranker.compute_score.return_value = [0.1, 0.05, 0.85, 0.05, 0.02, 0.03]
    with patch("scripts.classifier_adapters.FlagReranker", return_value=fake_reranker):
        adapter = RerankerAdapter("test-rr", "BAAI/bge-reranker-v2-m3")
        adapter.load()
        predicted = adapter.classify(
            "We regret to inform you",
            "After careful consideration we are moving forward with other candidates.",
        )
    assert predicted == "rejected"
    scored_pairs = fake_reranker.compute_score.call_args[0][0]
    assert len(scored_pairs) == len(LABELS)


def test_reranker_adapter_descriptions_cover_all_labels():
    """Every label has a matching description — no missing or extra keys."""
    from scripts.classifier_adapters import LABEL_DESCRIPTIONS, LABELS
    assert set(LABEL_DESCRIPTIONS) == set(LABELS)

216
tests/test_preflight.py Normal file
View file

@ -0,0 +1,216 @@
"""Tests for scripts/preflight.py additions: dual-GPU service table, size warning, VRAM check."""
import pytest
from pathlib import Path
from unittest.mock import patch
import yaml
import tempfile
import os
# ── Service table ──────────────────────────────────────────────────────────────
def test_ollama_research_in_services():
    """ollama_research must be in _SERVICES at port 11435."""
    from scripts.preflight import _SERVICES
    assert "ollama_research" in _SERVICES
    _env_key, default_port, env_var, docker_owned, adoptable = _SERVICES["ollama_research"]
    assert (default_port, env_var) == (11435, "OLLAMA_RESEARCH_PORT")
    assert docker_owned is True
    assert adoptable is True


def test_ollama_research_in_llm_backends():
    """ollama_research must be a standalone key in _LLM_BACKENDS (not nested under ollama)."""
    from scripts.preflight import _LLM_BACKENDS
    assert "ollama_research" in _LLM_BACKENDS
    assert "ollama_research" in [name for name, _suffix in _LLM_BACKENDS["ollama_research"]]


def test_vllm_research_in_llm_backends():
    """vllm_research must be registered under the vllm service in _LLM_BACKENDS."""
    from scripts.preflight import _LLM_BACKENDS
    assert "vllm" in _LLM_BACKENDS
    assert "vllm_research" in [name for name, _suffix in _LLM_BACKENDS["vllm"]]


def test_ollama_research_in_docker_internal():
    """ollama_research must map to internal port 11434 (Ollama's container port)."""
    from scripts.preflight import _DOCKER_INTERNAL
    assert "ollama_research" in _DOCKER_INTERNAL
    assert _DOCKER_INTERNAL["ollama_research"] == ("ollama_research", 11434)


def test_ollama_not_mapped_to_ollama_research_backend():
    """The ollama service key must only update the ollama backend, not ollama_research."""
    from scripts.preflight import _LLM_BACKENDS
    names = [name for name, _suffix in _LLM_BACKENDS.get("ollama", [])]
    assert "ollama_research" not in names
# ── Download size warning ──────────────────────────────────────────────────────
def test_download_size_remote_profile():
    """Remote profile: only searxng + app, no ollama, no vision, no vllm."""
    from scripts.preflight import _download_size_mb
    sizes = _download_size_mb("remote", "ollama")
    assert "searxng" in sizes and "app" in sizes
    for absent in ("ollama", "vision_image", "vllm_image"):
        assert absent not in sizes


def test_download_size_cpu_profile():
    """CPU profile: adds the ollama image + llama3.2:3b weights."""
    from scripts.preflight import _download_size_mb
    sizes = _download_size_mb("cpu", "ollama")
    assert "ollama" in sizes and "llama3_2_3b" in sizes
    assert "vision_image" not in sizes


def test_download_size_single_gpu_profile():
    """Single-GPU: adds vision image + moondream2 weights, still no vllm."""
    from scripts.preflight import _download_size_mb
    sizes = _download_size_mb("single-gpu", "ollama")
    assert "vision_image" in sizes and "moondream2" in sizes
    assert "vllm_image" not in sizes


def test_download_size_dual_gpu_ollama_mode():
    """dual-gpu + ollama mode: no vllm image."""
    from scripts.preflight import _download_size_mb
    assert "vllm_image" not in _download_size_mb("dual-gpu", "ollama")


def test_download_size_dual_gpu_vllm_mode():
    """dual-gpu + vllm mode: adds a ~10 GB vllm image."""
    from scripts.preflight import _download_size_mb
    sizes = _download_size_mb("dual-gpu", "vllm")
    assert "vllm_image" in sizes
    assert sizes["vllm_image"] >= 9000  # at least 9 GB


def test_download_size_dual_gpu_mixed_mode():
    """dual-gpu + mixed mode: also includes the vllm image."""
    from scripts.preflight import _download_size_mb
    assert "vllm_image" in _download_size_mb("dual-gpu", "mixed")
# ── Mixed-mode VRAM warning ────────────────────────────────────────────────────
def test_mixed_mode_vram_warning_triggered():
    """Should return a warning string when GPU 1 has < 12 GB free in mixed mode."""
    from scripts.preflight import _mixed_mode_vram_warning
    gpus = [
        {"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
        {"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 8.0},  # tight
    ]
    message = _mixed_mode_vram_warning(gpus, "mixed")
    assert message is not None
    assert "8.0" in message or "GPU 1" in message


def test_mixed_mode_vram_warning_not_triggered_with_headroom():
    """Should return None when GPU 1 has >= 12 GB free."""
    from scripts.preflight import _mixed_mode_vram_warning
    gpus = [
        {"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
        {"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": 18.0},  # plenty
    ]
    assert _mixed_mode_vram_warning(gpus, "mixed") is None


def test_mixed_mode_vram_warning_not_triggered_for_other_modes():
    """The warning only applies in mixed mode, even with a starved GPU 1."""
    from scripts.preflight import _mixed_mode_vram_warning
    gpus = [
        {"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
        {"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 6.0},
    ]
    for mode in ("ollama", "vllm"):
        assert _mixed_mode_vram_warning(gpus, mode) is None
# ── update_llm_yaml with ollama_research ──────────────────────────────────────
def test_update_llm_yaml_sets_ollama_research_url_docker_internal():
    """ollama_research backend URL must be set to ollama_research:11434 when Docker-owned."""
    from scripts.preflight import update_llm_yaml
    backends = {
        "ollama": {"base_url": "http://old", "type": "openai_compat"},
        "ollama_research": {"base_url": "http://old", "type": "openai_compat"},
        "vllm": {"base_url": "http://old", "type": "openai_compat"},
        "vllm_research": {"base_url": "http://old", "type": "openai_compat"},
        "vision_service": {"base_url": "http://old", "type": "vision_service"},
    }
    # Local name deliberately avoids shadowing pytest's tmp_path fixture.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as handle:
        yaml.dump({"backends": backends}, handle)
    cfg_path = Path(handle.name)
    ports = {
        "ollama": {"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"},
        "ollama_research": {"resolved": 11435, "external": False, "env_var": "OLLAMA_RESEARCH_PORT"},
        "vllm": {"resolved": 8000, "external": False, "env_var": "VLLM_PORT"},
        "vision": {"resolved": 8002, "external": False, "env_var": "VISION_PORT"},
    }
    try:
        with patch("scripts.preflight.LLM_YAML", cfg_path):
            update_llm_yaml(ports)
        updated = yaml.safe_load(cfg_path.read_text())["backends"]
        assert updated["ollama_research"]["base_url"] == "http://ollama_research:11434/v1"
        assert updated["vllm_research"]["base_url"] == updated["vllm"]["base_url"]
    finally:
        cfg_path.unlink()


def test_update_llm_yaml_sets_ollama_research_url_external():
    """When ollama_research is external (adopted), URL uses host.docker.internal:11435."""
    from scripts.preflight import update_llm_yaml
    backends = {
        "ollama": {"base_url": "http://old", "type": "openai_compat"},
        "ollama_research": {"base_url": "http://old", "type": "openai_compat"},
    }
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as handle:
        yaml.dump({"backends": backends}, handle)
    cfg_path = Path(handle.name)
    ports = {
        "ollama": {"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"},
        "ollama_research": {"resolved": 11435, "external": True, "env_var": "OLLAMA_RESEARCH_PORT"},
    }
    try:
        with patch("scripts.preflight.LLM_YAML", cfg_path):
            update_llm_yaml(ports)
        updated = yaml.safe_load(cfg_path.read_text())["backends"]
        assert updated["ollama_research"]["base_url"] == "http://host.docker.internal:11435/v1"
    finally:
        cfg_path.unlink()