Compare commits

...

16 commits

Author SHA1 Message Date
11f6334f28 feat: dual-GPU DUAL_GPU_MODE complete — ollama/vllm/mixed GPU 1 selection 2026-02-27 06:20:57 -08:00
7ef95dd9ba feat: benchmark_classifier — MODEL_REGISTRY, --list-models, --score, --compare modes 2026-02-27 06:19:32 -08:00
e6d5bb2c1a feat: inject DUAL_GPU_MODE sub-profile in Makefile; update manage.sh help 2026-02-27 06:18:34 -08:00
5d35257a23 feat: add ollama_research service and update profiles for dual-gpu sub-profiles 2026-02-27 06:16:17 -08:00
c223653722 feat: assign ollama_research to GPU 1 in Docker and Podman GPU overlays 2026-02-27 06:16:04 -08:00
44c3d9a5d6 feat: add DUAL_GPU_MODE default, VRAM warning, and download size report to preflight
- Add _mixed_mode_vram_warning() to flag low VRAM on GPU 1 in mixed mode
- Wire download size report block into main() before closing border line
- Wire mixed-mode VRAM warning into report if triggered
- Write DUAL_GPU_MODE=ollama default to .env for new 2-GPU setups (no override if already set)
- Promote import os to top-level (was local import inside get_cpu_cores)
2026-02-27 00:17:00 -08:00
b03e5f6c57 feat: add _download_size_mb() pure function for preflight size warning 2026-02-27 00:15:26 -08:00
c9d7b810f6 feat: add ollama_research to preflight service table and LLM backend map 2026-02-27 00:14:04 -08:00
dd40a84174 test: add failing tests for dual-gpu preflight additions 2026-02-27 00:11:39 -08:00
baa862bc14 feat: ZeroShotAdapter, GLiClassAdapter, RerankerAdapter with full mock test coverage 2026-02-27 00:10:43 -08:00
e99b3703f1 feat: ClassifierAdapter ABC + compute_metrics() with full test coverage 2026-02-27 00:09:45 -08:00
a66811dd69 feat: add vllm_research backend and update research_fallback_order 2026-02-27 00:09:00 -08:00
8c7faabc56 feat: add scoring JSONL example and gitignore for benchmark data files 2026-02-26 23:46:29 -08:00
41e0fe7f55 feat: add job-seeker-classifiers conda env for HF classifier benchmark 2026-02-26 23:43:41 -08:00
a9e84521c0 docs: email classifier benchmark implementation plan — 10 tasks, TDD, 9-model registry 2026-02-26 23:20:04 -08:00
a8fd53f28c docs: email classifier benchmark design — adapter pattern, 9-model registry, compare+eval modes 2026-02-26 22:56:11 -08:00
19 changed files with 3796 additions and 19 deletions

3
.gitignore vendored
View file

@ -35,3 +35,6 @@ config/user.yaml.working
# Claude context files — kept out of version control # Claude context files — kept out of version control
CLAUDE.md CLAUDE.md
data/email_score.jsonl
data/email_compare_sample.jsonl

View file

@ -23,6 +23,7 @@ COMPOSE ?= $(shell \
# compose.override.yml. We must include it explicitly when present. # compose.override.yml. We must include it explicitly when present.
OVERRIDE_FILE := $(wildcard compose.override.yml) OVERRIDE_FILE := $(wildcard compose.override.yml)
COMPOSE_OVERRIDE := $(if $(OVERRIDE_FILE),-f compose.override.yml,) COMPOSE_OVERRIDE := $(if $(OVERRIDE_FILE),-f compose.override.yml,)
DUAL_GPU_MODE ?= $(shell grep -m1 '^DUAL_GPU_MODE=' .env 2>/dev/null | cut -d= -f2 || echo ollama)
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE)
ifneq (,$(findstring podman,$(COMPOSE))) ifneq (,$(findstring podman,$(COMPOSE)))
@ -34,6 +35,9 @@ else
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
endif endif
endif endif
ifeq ($(PROFILE),dual-gpu)
COMPOSE_FILES += --profile dual-gpu-$(DUAL_GPU_MODE)
endif
# 'remote' means base services only — no services are tagged 'remote' in compose.yml, # 'remote' means base services only — no services are tagged 'remote' in compose.yml,
# so --profile remote is a no-op with Docker and a fatal error on old podman-compose. # so --profile remote is a no-op with Docker and a fatal error on old podman-compose.

View file

@ -18,6 +18,15 @@ services:
device_ids: ["0"] device_ids: ["0"]
capabilities: [gpu] capabilities: [gpu]
ollama_research:
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
capabilities: [gpu]
vision: vision:
deploy: deploy:
resources: resources:

View file

@ -18,6 +18,14 @@ services:
reservations: reservations:
devices: [] devices: []
ollama_research:
devices:
- nvidia.com/gpu=1
deploy:
resources:
reservations:
devices: []
vision: vision:
devices: devices:
- nvidia.com/gpu=0 - nvidia.com/gpu=0

View file

@ -1,5 +1,5 @@
# compose.yml — Peregrine by Circuit Forge LLC # compose.yml — Peregrine by Circuit Forge LLC
# Profiles: remote | cpu | single-gpu | dual-gpu # Profiles: remote | cpu | single-gpu | dual-gpu-ollama | dual-gpu-vllm | dual-gpu-mixed
services: services:
app: app:
@ -52,7 +52,21 @@ services:
- OLLAMA_MODELS=/root/.ollama - OLLAMA_MODELS=/root/.ollama
- DEFAULT_OLLAMA_MODEL=${OLLAMA_DEFAULT_MODEL:-llama3.2:3b} - DEFAULT_OLLAMA_MODEL=${OLLAMA_DEFAULT_MODEL:-llama3.2:3b}
entrypoint: ["/bin/bash", "/entrypoint.sh"] entrypoint: ["/bin/bash", "/entrypoint.sh"]
profiles: [cpu, single-gpu, dual-gpu] profiles: [cpu, single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
restart: unless-stopped
ollama_research:
image: ollama/ollama:latest
ports:
- "${OLLAMA_RESEARCH_PORT:-11435}:11434"
volumes:
- ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama
- ./docker/ollama/entrypoint.sh:/entrypoint.sh
environment:
- OLLAMA_MODELS=/root/.ollama
- DEFAULT_OLLAMA_MODEL=${OLLAMA_RESEARCH_MODEL:-llama3.2:3b}
entrypoint: ["/bin/bash", "/entrypoint.sh"]
profiles: [dual-gpu-ollama, dual-gpu-mixed]
restart: unless-stopped restart: unless-stopped
vision: vision:
@ -64,7 +78,7 @@ services:
environment: environment:
- VISION_MODEL=${VISION_MODEL:-vikhyatk/moondream2} - VISION_MODEL=${VISION_MODEL:-vikhyatk/moondream2}
- VISION_REVISION=${VISION_REVISION:-2025-01-09} - VISION_REVISION=${VISION_REVISION:-2025-01-09}
profiles: [single-gpu, dual-gpu] profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
restart: unless-stopped restart: unless-stopped
vllm: vllm:
@ -81,7 +95,7 @@ services:
--enforce-eager --enforce-eager
--max-num-seqs 8 --max-num-seqs 8
--cpu-offload-gb ${CPU_OFFLOAD_GB:-0} --cpu-offload-gb ${CPU_OFFLOAD_GB:-0}
profiles: [dual-gpu] profiles: [dual-gpu-vllm, dual-gpu-mixed]
restart: unless-stopped restart: unless-stopped
finetune: finetune:

View file

@ -45,6 +45,13 @@ backends:
model: __auto__ model: __auto__
supports_images: false supports_images: false
type: openai_compat type: openai_compat
vllm_research:
api_key: ''
base_url: http://host.docker.internal:8000/v1
enabled: true
model: __auto__
supports_images: false
type: openai_compat
fallback_order: fallback_order:
- ollama - ollama
- claude_code - claude_code
@ -53,7 +60,7 @@ fallback_order:
- anthropic - anthropic
research_fallback_order: research_fallback_order:
- claude_code - claude_code
- vllm - vllm_research
- ollama_research - ollama_research
- github_copilot - github_copilot
- anthropic - anthropic

View file

@ -0,0 +1,8 @@
{"subject": "Interview Invitation — Senior Engineer", "body": "Hi Meghan, we'd love to schedule a 30-min phone screen. Are you available Thursday at 2pm? Please reply to confirm.", "label": "interview_scheduled"}
{"subject": "Your application to Acme Corp", "body": "Thank you for your interest in the Senior Engineer role. After careful consideration, we have decided to move forward with other candidates whose experience more closely matches our current needs.", "label": "rejected"}
{"subject": "Offer Letter — Product Manager at Initech", "body": "Dear Meghan, we are thrilled to extend an offer of employment for the Product Manager position. Please find the attached offer letter outlining compensation and start date.", "label": "offer_received"}
{"subject": "Quick question about your background", "body": "Hi Meghan, I came across your profile and would love to connect. We have a few roles that seem like a great match. Would you be open to a brief chat this week?", "label": "positive_response"}
{"subject": "Company Culture Survey — Acme Corp", "body": "Meghan, as part of our evaluation process, we invite all candidates to complete our culture fit assessment. The survey takes approximately 15 minutes. Please click the link below.", "label": "survey_received"}
{"subject": "Application Received — DataCo", "body": "Thank you for submitting your application for the Data Engineer role at DataCo. We have received your materials and will be in touch if your qualifications match our needs.", "label": "neutral"}
{"subject": "Following up on your application", "body": "Hi Meghan, I wanted to follow up on your recent application. Your background looks interesting and we'd like to learn more. Can we set up a quick call?", "label": "positive_response"}
{"subject": "We're moving forward with other candidates", "body": "Dear Meghan, thank you for taking the time to interview with us. After thoughtful consideration, we have decided not to move forward with your candidacy at this time.", "label": "rejected"}

View file

@ -0,0 +1,257 @@
# Peregrine — Dual-GPU / Dual-Inference Design
**Date:** 2026-02-26
**Status:** Approved — ready for implementation
**Scope:** Peregrine (reference impl; patterns propagate to future products)
---
## Goal
Replace the fixed `dual-gpu` profile (Ollama + vLLM hardwired to GPU 0 + GPU 1) with a
`DUAL_GPU_MODE` env var that selects which inference stack occupies GPU 1. Simultaneously
add a first-run download size warning to preflight so users know what they're in for before
Docker starts pulling images and models.
---
## Modes
| `DUAL_GPU_MODE` | GPU 0 | GPU 1 | Research backend |
|-----------------|-------|-------|-----------------|
| `ollama` (default) | ollama + vision | ollama_research | `ollama_research` |
| `vllm` | ollama + vision | vllm | `vllm_research` |
| `mixed` | ollama + vision | ollama_research + vllm (VRAM-split) | `vllm_research` → `ollama_research` fallback |
`mixed` requires sufficient VRAM on GPU 1. Preflight warns (not blocks) when GPU 1 has
< 12 GB free before starting in mixed mode.
Cover letters always use `ollama` on GPU 0. Research uses whichever GPU 1 backend is
reachable. The LLM router's `_is_reachable()` check handles this transparently — the
fallback chain simply skips services that aren't running.
---
## Compose Profile Architecture
Docker Compose profiles used to gate which services start per mode.
`DUAL_GPU_MODE` is read by the Makefile and passed as a second `--profile` flag.
### Service → profile mapping
| Service | Profiles |
|---------|---------|
| `ollama` | `cpu`, `single-gpu`, `dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed` |
| `vision` | `single-gpu`, `dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed` |
| `ollama_research` | `dual-gpu-ollama`, `dual-gpu-mixed` |
| `vllm` | `dual-gpu-vllm`, `dual-gpu-mixed` |
| `finetune` | `finetune` |
User-facing profiles remain: `remote`, `cpu`, `single-gpu`, `dual-gpu`.
Sub-profiles (`dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed`) are injected by the
Makefile and never typed by the user.
---
## File Changes
### `compose.yml`
**`ollama`** — add all dual-gpu sub-profiles to `profiles`:
```yaml
profiles: [cpu, single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
```
**`vision`** — same pattern:
```yaml
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
```
**`vllm`** — change from `[dual-gpu]` to:
```yaml
profiles: [dual-gpu-vllm, dual-gpu-mixed]
```
**`ollama_research`** — new service:
```yaml
ollama_research:
image: ollama/ollama:latest
ports:
- "${OLLAMA_RESEARCH_PORT:-11435}:11434"
volumes:
- ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama # shared — no double download
- ./docker/ollama/entrypoint.sh:/entrypoint.sh
environment:
- OLLAMA_MODELS=/root/.ollama
- DEFAULT_OLLAMA_MODEL=${OLLAMA_RESEARCH_MODEL:-llama3.2:3b}
entrypoint: ["/bin/bash", "/entrypoint.sh"]
profiles: [dual-gpu-ollama, dual-gpu-mixed]
restart: unless-stopped
```
### `compose.gpu.yml`
Add `ollama_research` block (GPU 1). `vllm` stays on GPU 1 as-is:
```yaml
ollama_research:
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
capabilities: [gpu]
```
### `compose.podman-gpu.yml`
Same addition for Podman CDI:
```yaml
ollama_research:
devices:
- nvidia.com/gpu=1
deploy:
resources:
reservations:
devices: []
```
### `Makefile`
Two additions after existing `COMPOSE` detection:
```makefile
DUAL_GPU_MODE ?= $(shell grep -m1 '^DUAL_GPU_MODE=' .env 2>/dev/null | cut -d= -f2 || echo ollama)
# GPU overlay: matches single-gpu, dual-gpu (findstring gpu already covers these)
# Sub-profile injection for dual-gpu modes:
ifeq ($(PROFILE),dual-gpu)
COMPOSE_FILES += --profile dual-gpu-$(DUAL_GPU_MODE)
endif
```
Update `manage.sh` usage block to document `dual-gpu` profile with `DUAL_GPU_MODE` note:
```
dual-gpu Ollama + Vision on GPU 0; GPU 1 mode set by DUAL_GPU_MODE
DUAL_GPU_MODE=ollama (default) ollama_research on GPU 1
DUAL_GPU_MODE=vllm vllm on GPU 1
DUAL_GPU_MODE=mixed both on GPU 1 (VRAM-split; see preflight warning)
```
### `scripts/preflight.py`
**1. `_SERVICES` — add `ollama_research`:**
```python
"ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
```
**2. `_LLM_BACKENDS` — add entries for both new backends:**
```python
"ollama_research": [("ollama_research", "/v1")],
# vllm_research is an alias for vllm's port — preflight updates base_url for both:
"vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
```
**3. `_DOCKER_INTERNAL` — add `ollama_research`:**
```python
"ollama_research": ("ollama_research", 11434), # container-internal port is always 11434
```
**4. `recommend_profile()` — unchanged** (still returns `"dual-gpu"` for 2 GPUs).
Write `DUAL_GPU_MODE=ollama` to `.env` when first setting up a 2-GPU system.
**5. Mixed-mode VRAM warning** — after GPU resource section, before closing line:
```python
dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
if dual_gpu_mode == "mixed" and len(gpus) >= 2:
if gpus[1]["vram_free_gb"] < 12:
print(f"║ ⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {gpus[1]['vram_free_gb']:.1f} GB free")
print(f"║ Running ollama_research + vllm together may cause OOM.")
print(f"║ Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm instead.")
```
**6. Download size warning** — profile-aware block added just before the closing `╚` line:
```
║ Download sizes (first-run estimates)
║ Docker images
║ ollama/ollama ~800 MB (shared by ollama + ollama_research)
║ searxng/searxng ~300 MB
║ app (Python build) ~1.5 GB
║ vision service ~3.0 GB [single-gpu and above]
║ vllm/vllm-openai ~10.0 GB [vllm / mixed mode only]
║ Model weights (lazy-loaded on first use)
║ llama3.2:3b ~2.0 GB → OLLAMA_MODELS_DIR
║ moondream2 ~1.8 GB → vision container cache [single-gpu+]
║ Note: ollama + ollama_research share the same model dir — no double download
║ ⚠ Total first-run: ~X GB (models persist between restarts)
```
Total is summed at runtime based on active profile + `DUAL_GPU_MODE`.
Size table (used by the warning calculator):
| Component | Size | Condition |
|-----------|------|-----------|
| `ollama/ollama` image | 800 MB | cpu, single-gpu, dual-gpu |
| `searxng/searxng` image | 300 MB | always |
| app image | 1,500 MB | always |
| vision service image | 3,000 MB | single-gpu, dual-gpu |
| `vllm/vllm-openai` image | 10,000 MB | vllm or mixed mode |
| llama3.2:3b weights | 2,000 MB | cpu, single-gpu, dual-gpu |
| moondream2 weights | 1,800 MB | single-gpu, dual-gpu |
### `config/llm.yaml`
**Add `vllm_research` backend:**
```yaml
vllm_research:
api_key: ''
base_url: http://host.docker.internal:8000/v1 # same port as vllm; preflight keeps in sync
enabled: true
model: __auto__
supports_images: false
type: openai_compat
```
**Update `research_fallback_order`:**
```yaml
research_fallback_order:
- claude_code
- vllm_research
- ollama_research
- github_copilot
- anthropic
```
`vllm` stays in the main `fallback_order` (cover letters). `vllm_research` is the explicit
research alias for the same service — different config key, same port, makes routing intent
readable in the YAML.
---
## Downstream Compatibility
The LLM router requires no changes. `_is_reachable()` already skips backends that aren't
responding. When `DUAL_GPU_MODE=ollama`, `vllm_research` is unreachable and skipped;
`ollama_research` is up and used. When `DUAL_GPU_MODE=vllm`, the reverse. `mixed` mode
makes both reachable; `vllm_research` wins as the higher-priority entry.
Preflight's `update_llm_yaml()` keeps `base_url` values correct for both adopted (external)
and Docker-internal routing automatically, since `vllm_research` is registered under the
`"vllm"` key in `_LLM_BACKENDS`.
---
## Future Considerations
- **Triple-GPU / 3+ service configs:** When a third product is active, extract this pattern
into `circuitforge-core` as a reusable inference topology manager.
- **Dual vLLM:** Two vLLM instances (e.g., different model sizes per task) follows the same
pattern — add `vllm_research` as a separate compose service on its own port.
- **VRAM-aware model selection:** Preflight could suggest smaller models when VRAM is tight
in mixed mode (e.g., swap llama3.2:3b → llama3.2:1b for the research instance).
- **Queue optimizer (1-GPU / CPU):** When only one inference backend is available and a batch
of tasks is queued, group by task type (all cover letters first, then all research briefs)
to avoid repeated model context switches. Tracked separately.

View file

@ -0,0 +1,811 @@
# Dual-GPU / Dual-Inference Implementation Plan
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
**Goal:** Add `DUAL_GPU_MODE=ollama|vllm|mixed` env var that gates which inference service occupies GPU 1 on dual-GPU systems, plus a first-run download size warning in preflight.
**Architecture:** Sub-profiles (`dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed`) are injected alongside `--profile dual-gpu` by the Makefile based on `DUAL_GPU_MODE`. The LLM router requires zero changes — `_is_reachable()` naturally skips backends that aren't running. Preflight gains `ollama_research` as a tracked service and emits a size warning block.
**Tech Stack:** Docker Compose profiles, Python (preflight.py), YAML (llm.yaml, compose files), bash (Makefile, manage.sh)
**Design doc:** `docs/plans/2026-02-26-dual-gpu-design.md`
**Test runner:** `conda run -n job-seeker python -m pytest tests/ -v`
---
### Task 1: Update `config/llm.yaml`
**Files:**
- Modify: `config/llm.yaml`
**Step 1: Add `vllm_research` backend and update `research_fallback_order`**
Open `config/llm.yaml`. After the `vllm:` block, add:
```yaml
vllm_research:
api_key: ''
base_url: http://host.docker.internal:8000/v1
enabled: true
model: __auto__
supports_images: false
type: openai_compat
```
Replace `research_fallback_order:` section with:
```yaml
research_fallback_order:
- claude_code
- vllm_research
- ollama_research
- github_copilot
- anthropic
```
**Step 2: Verify YAML parses cleanly**
```bash
conda run -n job-seeker python -c "import yaml; yaml.safe_load(open('config/llm.yaml'))"
```
Expected: no output (no error).
**Step 3: Run existing llm config test**
```bash
conda run -n job-seeker python -m pytest tests/test_llm_router.py::test_config_loads -v
```
Expected: PASS
**Step 4: Commit**
```bash
git add config/llm.yaml
git commit -m "feat: add vllm_research backend and update research_fallback_order"
```
---
### Task 2: Write failing tests for preflight changes
**Files:**
- Create: `tests/test_preflight.py`
No existing test file for preflight. Write all tests upfront — they fail until Tasks 3–5 implement the code.
**Step 1: Create `tests/test_preflight.py`**
```python
"""Tests for scripts/preflight.py additions: dual-GPU service table, size warning, VRAM check."""
import pytest
from pathlib import Path
from unittest.mock import patch
import yaml
import tempfile
import os
# ── Service table ──────────────────────────────────────────────────────────────
def test_ollama_research_in_services():
"""ollama_research must be in _SERVICES at port 11435."""
from scripts.preflight import _SERVICES
assert "ollama_research" in _SERVICES
_, default_port, env_var, docker_owned, adoptable = _SERVICES["ollama_research"]
assert default_port == 11435
assert env_var == "OLLAMA_RESEARCH_PORT"
assert docker_owned is True
assert adoptable is True
def test_ollama_research_in_llm_backends():
"""ollama_research must be a standalone key in _LLM_BACKENDS (not nested under ollama)."""
from scripts.preflight import _LLM_BACKENDS
assert "ollama_research" in _LLM_BACKENDS
# Should map to the ollama_research llm backend
backend_names = [name for name, _ in _LLM_BACKENDS["ollama_research"]]
assert "ollama_research" in backend_names
def test_vllm_research_in_llm_backends():
"""vllm_research must be registered under vllm in _LLM_BACKENDS."""
from scripts.preflight import _LLM_BACKENDS
assert "vllm" in _LLM_BACKENDS
backend_names = [name for name, _ in _LLM_BACKENDS["vllm"]]
assert "vllm_research" in backend_names
def test_ollama_research_in_docker_internal():
"""ollama_research must map to internal port 11434 (Ollama's container port)."""
from scripts.preflight import _DOCKER_INTERNAL
assert "ollama_research" in _DOCKER_INTERNAL
hostname, port = _DOCKER_INTERNAL["ollama_research"]
assert hostname == "ollama_research"
assert port == 11434 # container-internal port is always 11434
def test_ollama_not_mapped_to_ollama_research_backend():
"""ollama service key must only update the ollama llm backend, not ollama_research."""
from scripts.preflight import _LLM_BACKENDS
ollama_backend_names = [name for name, _ in _LLM_BACKENDS.get("ollama", [])]
assert "ollama_research" not in ollama_backend_names
# ── Download size warning ──────────────────────────────────────────────────────
def test_download_size_remote_profile():
"""Remote profile: only searxng + app, no ollama, no vision, no vllm."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("remote", "ollama")
assert "searxng" in sizes
assert "app" in sizes
assert "ollama" not in sizes
assert "vision_image" not in sizes
assert "vllm_image" not in sizes
def test_download_size_cpu_profile():
"""CPU profile: adds ollama image + llama3.2:3b weights."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("cpu", "ollama")
assert "ollama" in sizes
assert "llama3_2_3b" in sizes
assert "vision_image" not in sizes
def test_download_size_single_gpu_profile():
"""Single-GPU: adds vision image + moondream2 weights."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("single-gpu", "ollama")
assert "vision_image" in sizes
assert "moondream2" in sizes
assert "vllm_image" not in sizes
def test_download_size_dual_gpu_ollama_mode():
"""dual-gpu + ollama mode: no vllm image."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("dual-gpu", "ollama")
assert "vllm_image" not in sizes
def test_download_size_dual_gpu_vllm_mode():
"""dual-gpu + vllm mode: adds ~10 GB vllm image."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("dual-gpu", "vllm")
assert "vllm_image" in sizes
assert sizes["vllm_image"] >= 9000 # at least 9 GB
def test_download_size_dual_gpu_mixed_mode():
"""dual-gpu + mixed mode: also includes vllm image."""
from scripts.preflight import _download_size_mb
sizes = _download_size_mb("dual-gpu", "mixed")
assert "vllm_image" in sizes
# ── Mixed-mode VRAM warning ────────────────────────────────────────────────────
def test_mixed_mode_vram_warning_triggered():
"""Should return a warning string when GPU 1 has < 12 GB free in mixed mode."""
from scripts.preflight import _mixed_mode_vram_warning
gpus = [
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 8.0}, # tight
]
warning = _mixed_mode_vram_warning(gpus, "mixed")
assert warning is not None
assert "8.0" in warning or "GPU 1" in warning
def test_mixed_mode_vram_warning_not_triggered_with_headroom():
"""Should return None when GPU 1 has >= 12 GB free."""
from scripts.preflight import _mixed_mode_vram_warning
gpus = [
{"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
{"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": 18.0}, # plenty
]
warning = _mixed_mode_vram_warning(gpus, "mixed")
assert warning is None
def test_mixed_mode_vram_warning_not_triggered_for_other_modes():
"""Warning only applies in mixed mode."""
from scripts.preflight import _mixed_mode_vram_warning
gpus = [
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 6.0},
]
assert _mixed_mode_vram_warning(gpus, "ollama") is None
assert _mixed_mode_vram_warning(gpus, "vllm") is None
# ── update_llm_yaml with ollama_research ──────────────────────────────────────
def test_update_llm_yaml_sets_ollama_research_url_docker_internal():
"""ollama_research backend URL must be set to ollama_research:11434 when Docker-owned."""
from scripts.preflight import update_llm_yaml
llm_cfg = {
"backends": {
"ollama": {"base_url": "http://old", "type": "openai_compat"},
"ollama_research": {"base_url": "http://old", "type": "openai_compat"},
"vllm": {"base_url": "http://old", "type": "openai_compat"},
"vllm_research": {"base_url": "http://old", "type": "openai_compat"},
"vision_service": {"base_url": "http://old", "type": "vision_service"},
}
}
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
yaml.dump(llm_cfg, f)
tmp_path = Path(f.name)
ports = {
"ollama": {
"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"
},
"ollama_research": {
"resolved": 11435, "external": False, "env_var": "OLLAMA_RESEARCH_PORT"
},
"vllm": {
"resolved": 8000, "external": False, "env_var": "VLLM_PORT"
},
"vision": {
"resolved": 8002, "external": False, "env_var": "VISION_PORT"
},
}
try:
# Patch LLM_YAML to point at our temp file
with patch("scripts.preflight.LLM_YAML", tmp_path):
update_llm_yaml(ports)
result = yaml.safe_load(tmp_path.read_text())
# Docker-internal: use service name + container port
assert result["backends"]["ollama_research"]["base_url"] == "http://ollama_research:11434/v1"
# vllm_research must match vllm's URL
assert result["backends"]["vllm_research"]["base_url"] == result["backends"]["vllm"]["base_url"]
finally:
tmp_path.unlink()
def test_update_llm_yaml_sets_ollama_research_url_external():
"""When ollama_research is external (adopted), URL uses host.docker.internal:11435."""
from scripts.preflight import update_llm_yaml
llm_cfg = {
"backends": {
"ollama": {"base_url": "http://old", "type": "openai_compat"},
"ollama_research": {"base_url": "http://old", "type": "openai_compat"},
}
}
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
yaml.dump(llm_cfg, f)
tmp_path = Path(f.name)
ports = {
"ollama": {"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"},
"ollama_research": {"resolved": 11435, "external": True, "env_var": "OLLAMA_RESEARCH_PORT"},
}
try:
with patch("scripts.preflight.LLM_YAML", tmp_path):
update_llm_yaml(ports)
result = yaml.safe_load(tmp_path.read_text())
assert result["backends"]["ollama_research"]["base_url"] == "http://host.docker.internal:11435/v1"
finally:
tmp_path.unlink()
```
**Step 2: Run tests to confirm they all fail**
```bash
conda run -n job-seeker python -m pytest tests/test_preflight.py -v 2>&1 | head -50
```
Expected: all FAIL with `ImportError` or `AssertionError` — that's correct.
**Step 3: Commit failing tests**
```bash
git add tests/test_preflight.py
git commit -m "test: add failing tests for dual-gpu preflight additions"
```
---
### Task 3: `preflight.py` — service table additions
**Files:**
- Modify: `scripts/preflight.py:46-67` (`_SERVICES`, `_LLM_BACKENDS`, `_DOCKER_INTERNAL`)
**Step 1: Update `_SERVICES`**
Find the `_SERVICES` dict (currently ends at the `"ollama"` entry). Add `ollama_research` as a new entry:
```python
_SERVICES: dict[str, tuple[str, int, str, bool, bool]] = {
"streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False),
"searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True),
"vllm": ("vllm_port", 8000, "VLLM_PORT", True, True),
"vision": ("vision_port", 8002, "VISION_PORT", True, True),
"ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True),
"ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
}
```
**Step 2: Update `_LLM_BACKENDS`**
Replace the existing dict:
```python
_LLM_BACKENDS: dict[str, list[tuple[str, str]]] = {
"ollama": [("ollama", "/v1")],
"ollama_research": [("ollama_research", "/v1")],
"vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
"vision": [("vision_service", "")],
}
```
**Step 3: Update `_DOCKER_INTERNAL`**
Add `ollama_research` entry:
```python
_DOCKER_INTERNAL: dict[str, tuple[str, int]] = {
"ollama": ("ollama", 11434),
"ollama_research": ("ollama_research", 11434), # container-internal port is always 11434
"vllm": ("vllm", 8000),
"vision": ("vision", 8002),
"searxng": ("searxng", 8080),
}
```
**Step 4: Run service table tests**
```bash
conda run -n job-seeker python -m pytest tests/test_preflight.py::test_ollama_research_in_services tests/test_preflight.py::test_ollama_research_in_llm_backends tests/test_preflight.py::test_vllm_research_in_llm_backends tests/test_preflight.py::test_ollama_research_in_docker_internal tests/test_preflight.py::test_ollama_not_mapped_to_ollama_research_backend tests/test_preflight.py::test_update_llm_yaml_sets_ollama_research_url_docker_internal tests/test_preflight.py::test_update_llm_yaml_sets_ollama_research_url_external -v
```
Expected: all PASS
**Step 5: Commit**
```bash
git add scripts/preflight.py
git commit -m "feat: add ollama_research to preflight service table and LLM backend map"
```
---
### Task 4: `preflight.py` — `_download_size_mb()` pure function
**Files:**
- Modify: `scripts/preflight.py` (add new function after `calc_cpu_offload_gb`)
**Step 1: Add the function**
After `calc_cpu_offload_gb()`, add:
```python
def _download_size_mb(profile: str, dual_gpu_mode: str = "ollama") -> dict[str, int]:
"""
Return estimated first-run download sizes in MB, keyed by component name.
Profile-aware: only includes components that will actually be pulled.
"""
sizes: dict[str, int] = {
"searxng": 300,
"app": 1500,
}
if profile in ("cpu", "single-gpu", "dual-gpu"):
sizes["ollama"] = 800
sizes["llama3_2_3b"] = 2000
if profile in ("single-gpu", "dual-gpu"):
sizes["vision_image"] = 3000
sizes["moondream2"] = 1800
if profile == "dual-gpu" and dual_gpu_mode in ("vllm", "mixed"):
sizes["vllm_image"] = 10000
return sizes
```
**Step 2: Run download size tests**
```bash
conda run -n job-seeker python -m pytest tests/test_preflight.py -k "download_size" -v
```
Expected: all PASS
**Step 3: Commit**
```bash
git add scripts/preflight.py
git commit -m "feat: add _download_size_mb() pure function for preflight size warning"
```
---
### Task 5: `preflight.py` — VRAM warning, size report block, DUAL_GPU_MODE default
**Files:**
- Modify: `scripts/preflight.py` (three additions to `main()` and a new helper)
**Step 1: Add `_mixed_mode_vram_warning()` after `_download_size_mb()`**
```python
def _mixed_mode_vram_warning(gpus: list[dict], dual_gpu_mode: str) -> str | None:
"""
Return a warning string if GPU 1 likely lacks VRAM for mixed mode, else None.
Only relevant when dual_gpu_mode == 'mixed' and at least 2 GPUs are present.
"""
if dual_gpu_mode != "mixed" or len(gpus) < 2:
return None
free = gpus[1]["vram_free_gb"]
if free < 12:
return (
f"⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {free:.1f} GB free — "
f"running ollama_research + vllm together may cause OOM. "
f"Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm."
)
return None
```
**Step 2: Run VRAM warning tests**
```bash
conda run -n job-seeker python -m pytest tests/test_preflight.py -k "vram" -v
```
Expected: all PASS
**Step 3: Wire size warning into `main()` report block**
In `main()`, find the closing `print("╚═...═╝")` line. Add the size warning block just before it:
```python
# ── Download size warning ──────────────────────────────────────────────
dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
sizes = _download_size_mb(profile, dual_gpu_mode)
total_mb = sum(sizes.values())
print("║")
print("║ Download sizes (first-run estimates)")
print("║ Docker images")
print(f"║ app (Python build) ~{sizes.get('app', 0):,} MB")
if "searxng" in sizes:
print(f"║ searxng/searxng ~{sizes['searxng']:,} MB")
if "ollama" in sizes:
shared_note = " (shared by ollama + ollama_research)" if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed") else ""
print(f"║ ollama/ollama ~{sizes['ollama']:,} MB{shared_note}")
if "vision_image" in sizes:
print(f"║ vision service ~{sizes['vision_image']:,} MB (torch + moondream)")
if "vllm_image" in sizes:
print(f"║ vllm/vllm-openai ~{sizes['vllm_image']:,} MB")
print("║ Model weights (lazy-loaded on first use)")
if "llama3_2_3b" in sizes:
print(f"║ llama3.2:3b ~{sizes['llama3_2_3b']:,} MB → OLLAMA_MODELS_DIR")
if "moondream2" in sizes:
print(f"║ moondream2 ~{sizes['moondream2']:,} MB → vision container cache")
if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed"):
print("║ Note: ollama + ollama_research share model dir — no double download")
print(f"║ ⚠ Total first-run: ~{total_mb / 1024:.1f} GB (models persist between restarts)")
# ── Mixed-mode VRAM warning ────────────────────────────────────────────
vram_warn = _mixed_mode_vram_warning(gpus, dual_gpu_mode)
if vram_warn:
print("║")
print(f"║ {vram_warn}")
```
**Step 4: Wire `DUAL_GPU_MODE` default into `write_env()` block in `main()`**
In `main()`, find the `if not args.check_only:` block. After `env_updates["PEREGRINE_GPU_NAMES"]`, add:
```python
# Write DUAL_GPU_MODE default for new 2-GPU setups (don't override user's choice)
if len(gpus) >= 2:
existing_env: dict[str, str] = {}
if ENV_FILE.exists():
for line in ENV_FILE.read_text().splitlines():
if "=" in line and not line.startswith("#"):
k, _, v = line.partition("=")
existing_env[k.strip()] = v.strip()
if "DUAL_GPU_MODE" not in existing_env:
env_updates["DUAL_GPU_MODE"] = "ollama"
```
**Step 5: Add `import os` if not already present at top of file**
Check line 130 of `scripts/preflight.py`. `import os` is already present inside `get_cpu_cores()` as a local import — move it to the top-level imports block:
```python
import os # add alongside existing stdlib imports
```
And remove the local `import os` inside `get_cpu_cores()`.
**Step 6: Run all preflight tests**
```bash
conda run -n job-seeker python -m pytest tests/test_preflight.py -v
```
Expected: all PASS
**Step 7: Smoke-check the preflight report output**
```bash
conda run -n job-seeker python scripts/preflight.py --check-only
```
Expected: report includes the `Download sizes` block near the bottom.
**Step 8: Commit**
```bash
git add scripts/preflight.py
git commit -m "feat: add DUAL_GPU_MODE default, VRAM warning, and download size report to preflight"
```
---
### Task 6: `compose.yml` — `ollama_research` service + profile updates
**Files:**
- Modify: `compose.yml`
**Step 1: Update `ollama` profiles line**
Find:
```yaml
profiles: [cpu, single-gpu, dual-gpu]
```
Replace with:
```yaml
profiles: [cpu, single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
```
**Step 2: Update `vision` profiles line**
Find:
```yaml
profiles: [single-gpu, dual-gpu]
```
Replace with:
```yaml
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
```
**Step 3: Update `vllm` profiles line**
Find:
```yaml
profiles: [dual-gpu]
```
Replace with:
```yaml
profiles: [dual-gpu-vllm, dual-gpu-mixed]
```
**Step 4: Add `ollama_research` service**
After the closing lines of the `ollama` service block, add:
```yaml
ollama_research:
image: ollama/ollama:latest
ports:
- "${OLLAMA_RESEARCH_PORT:-11435}:11434"
volumes:
- ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama
- ./docker/ollama/entrypoint.sh:/entrypoint.sh
environment:
- OLLAMA_MODELS=/root/.ollama
- DEFAULT_OLLAMA_MODEL=${OLLAMA_RESEARCH_MODEL:-llama3.2:3b}
entrypoint: ["/bin/bash", "/entrypoint.sh"]
profiles: [dual-gpu-ollama, dual-gpu-mixed]
restart: unless-stopped
```
**Step 5: Validate compose YAML**
```bash
docker compose -f compose.yml config --quiet
```
Expected: no errors.
**Step 6: Commit**
```bash
git add compose.yml
git commit -m "feat: add ollama_research service and update profiles for dual-gpu sub-profiles"
```
---
### Task 7: GPU overlay files — `compose.gpu.yml` and `compose.podman-gpu.yml`
**Files:**
- Modify: `compose.gpu.yml`
- Modify: `compose.podman-gpu.yml`
**Step 1: Add `ollama_research` to `compose.gpu.yml`**
After the `ollama:` block, add:
```yaml
ollama_research:
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ["1"]
capabilities: [gpu]
```
**Step 2: Add `ollama_research` to `compose.podman-gpu.yml`**
After the `ollama:` block, add:
```yaml
ollama_research:
devices:
- nvidia.com/gpu=1
deploy:
resources:
reservations:
devices: []
```
**Step 3: Validate both files**
```bash
docker compose -f compose.yml -f compose.gpu.yml config --quiet
```
Expected: no errors.
**Step 4: Commit**
```bash
git add compose.gpu.yml compose.podman-gpu.yml
git commit -m "feat: assign ollama_research to GPU 1 in Docker and Podman GPU overlays"
```
---
### Task 8: `Makefile` + `manage.sh` — `DUAL_GPU_MODE` injection and help text
**Files:**
- Modify: `Makefile`
- Modify: `manage.sh`
**Step 1: Update `Makefile`**
After the `COMPOSE_OVERRIDE` variable, add `DUAL_GPU_MODE` reading:
```makefile
DUAL_GPU_MODE ?= $(shell grep -m1 '^DUAL_GPU_MODE=' .env 2>/dev/null | cut -d= -f2 || echo ollama)
```
In the GPU overlay block, find:
```makefile
else
ifneq (,$(findstring gpu,$(PROFILE)))
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
endif
endif
```
Replace the `else` branch with:
```makefile
else
ifneq (,$(findstring gpu,$(PROFILE)))
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
endif
endif
ifeq ($(PROFILE),dual-gpu)
COMPOSE_FILES += --profile dual-gpu-$(DUAL_GPU_MODE)
endif
```
**Step 2: Update `manage.sh` — profiles help block**
Find the profiles section in `usage()`:
```bash
echo " dual-gpu Ollama + Vision + vLLM on GPU 0+1"
```
Replace with:
```bash
echo " dual-gpu Ollama + Vision on GPU 0; GPU 1 set by DUAL_GPU_MODE"
echo " DUAL_GPU_MODE=ollama (default) ollama_research on GPU 1"
echo " DUAL_GPU_MODE=vllm vllm on GPU 1"
echo " DUAL_GPU_MODE=mixed both on GPU 1 (VRAM-split)"
```
**Step 3: Verify Makefile parses**
```bash
make help
```
Expected: help table prints cleanly, no make errors.
**Step 4: Verify manage.sh help**
```bash
./manage.sh help
```
Expected: new dual-gpu description appears in profiles section.
**Step 5: Commit**
```bash
git add Makefile manage.sh
git commit -m "feat: inject DUAL_GPU_MODE sub-profile in Makefile; update manage.sh help"
```
---
### Task 9: Integration smoke test
**Goal:** Verify the full chain works for `DUAL_GPU_MODE=ollama` without actually starting Docker (dry-run compose config check).
**Step 1: Write `DUAL_GPU_MODE=ollama` to `.env` temporarily**
```bash
echo "DUAL_GPU_MODE=ollama" >> .env
```
**Step 2: Dry-run compose config for dual-gpu + dual-gpu-ollama**
```bash
docker compose -f compose.yml -f compose.gpu.yml --profile dual-gpu --profile dual-gpu-ollama config 2>&1 | grep -E "^ [a-z]|image:|ports:"
```
Expected output includes:
- `ollama:` service with port 11434
- `ollama_research:` service with port 11435
- `vision:` service
- `searxng:` service
- **No** `vllm:` service
**Step 3: Dry-run for `DUAL_GPU_MODE=vllm`**
```bash
docker compose -f compose.yml -f compose.gpu.yml --profile dual-gpu --profile dual-gpu-vllm config 2>&1 | grep -E "^ [a-z]|image:|ports:"
```
Expected:
- `ollama:` service (port 11434)
- `vllm:` service (port 8000)
- **No** `ollama_research:` service
**Step 4: Run full test suite**
```bash
conda run -n job-seeker python -m pytest tests/ -v
```
Expected: all existing tests PASS, all new preflight tests PASS.
**Step 5: Clean up `.env` test entry**
```bash
# Remove the test DUAL_GPU_MODE line (preflight will re-write it correctly on next run)
sed -i '/^DUAL_GPU_MODE=/d' .env
```
**Step 6: Final commit**
```bash
git add .env # in case preflight rewrote it during testing
git commit -m "feat: dual-gpu DUAL_GPU_MODE complete — ollama/vllm/mixed GPU 1 selection"
```

View file

@ -0,0 +1,132 @@
# Email Classifier Benchmark — Design
**Date:** 2026-02-26
**Status:** Approved
## Problem
The current `classify_stage_signal()` in `scripts/imap_sync.py` uses `llama3.1:8b` via
Ollama for 6-label email classification. This is slow, requires a running Ollama instance,
and accuracy is unverified against alternatives. This design establishes a benchmark harness
to evaluate HuggingFace-native classifiers as potential replacements.
## Labels
```
interview_scheduled offer_received rejected
positive_response survey_received neutral
```
## Approach: Standalone Benchmark Script (Approach B)
Two new files; nothing in `imap_sync.py` changes until a winner is chosen.
```
scripts/
benchmark_classifier.py — CLI entry point
classifier_adapters.py — adapter classes (reusable by imap_sync later)
data/
email_eval.jsonl — labeled ground truth (gitignored — contains email content)
email_eval.jsonl.example — committed example with fake emails
scripts/classifier_service/
environment.yml — new conda env: job-seeker-classifiers
```
## Adapter Pattern
```
ClassifierAdapter (ABC)
.classify(subject, body) → str # one of the 6 labels
.name → str
.model_id → str
.load() / .unload() # explicit lifecycle
ZeroShotAdapter(ClassifierAdapter)
# uses transformers pipeline("zero-shot-classification")
# candidate_labels = list of 6 labels
# works for: DeBERTa, BART-MNLI, BGE-M3-ZeroShot, XLM-RoBERTa
GLiClassAdapter(ClassifierAdapter)
# uses gliclass library (pip install gliclass)
# GLiClassModel + ZeroShotClassificationPipeline
# works for: gliclass-instruct-large-v1.0
RerankerAdapter(ClassifierAdapter)
# uses FlagEmbedding reranker.compute_score()
# scores (email_text, label_description) pairs; highest = predicted label
# works for: bge-reranker-v2-m3
```
## Model Registry
| Short name | Model | Params | Adapter | Default |
|------------|-------|--------|---------|---------|
| `deberta-zeroshot` | MoritzLaurer/DeBERTa-v3-large-zeroshot-v2.0 | 400M | ZeroShot | ✅ |
| `deberta-small` | cross-encoder/nli-deberta-v3-small | 100M | ZeroShot | ✅ |
| `gliclass-large` | knowledgator/gliclass-instruct-large-v1.0 | 400M | GLiClass | ✅ |
| `bart-mnli` | facebook/bart-large-mnli | 400M | ZeroShot | ✅ |
| `bge-m3-zeroshot` | MoritzLaurer/bge-m3-zeroshot-v2.0 | 600M | ZeroShot | ✅ |
| `bge-reranker` | BAAI/bge-reranker-v2-m3 | 600M | Reranker | ❌ (`--include-slow`) |
| `deberta-xlarge` | microsoft/deberta-xlarge-mnli | 750M | ZeroShot | ❌ (`--include-slow`) |
| `mdeberta-mnli` | MoritzLaurer/mDeBERTa-v3-base-mnli-xnli | 300M | ZeroShot | ❌ (`--include-slow`) |
| `xlm-roberta-anli` | vicgalle/xlm-roberta-large-xnli-anli | 600M | ZeroShot | ❌ (`--include-slow`) |
## CLI Modes
### `--compare` (live IMAP, visual table)
Extends the pattern of `test_email_classify.py`. Pulls emails via IMAP, shows a table:
```
Subject | Phrase | llama3 | deberta-zs | deberta-sm | gliclass | bart | bge-m3
```
- Phrase-filter column shows BLOCK/pass (same gate as production)
- `llama3` column = current production baseline
- HF model columns follow
### `--eval` (ground-truth evaluation; shipped as `--score` in the implementation)
Reads the labeled ground-truth JSONL (`data/email_score.jsonl` in the implementation), runs all models, reports per-label and aggregate metrics:
- Per-label: precision, recall, F1
- Aggregate: macro-F1, accuracy
- Latency: ms/email per model
JSONL format:
```jsonl
{"subject": "Interview invitation", "body": "We'd like to schedule...", "label": "interview_scheduled"}
{"subject": "Your application", "body": "We regret to inform you...", "label": "rejected"}
```
### `--list-models`
Prints the registry with sizes, adapter types, and default/slow flags.
## Conda Environment
New env `job-seeker-classifiers` — isolated from `job-seeker` (no torch there).
Key deps:
- `torch` (CUDA-enabled)
- `transformers`
- `gliclass`
- `FlagEmbedding` (for bge-reranker only)
- `sentence-transformers` (optional, for future embedding-based approaches)
## GPU
Auto-select (`device="cuda"` when available, CPU fallback). No GPU pinning — models
load one at a time so VRAM pressure is sequential, not cumulative.
## Error Handling
- Model load failures: skip that column, print warning, continue
- Classification errors: show `ERR` in cell, continue
- IMAP failures: propagate (same as existing harness)
- Missing eval file: clear error message pointing to `data/email_eval.jsonl.example`
## What Does Not Change (Yet)
- `scripts/imap_sync.py` — production classifier unchanged
- `scripts/llm_router.py` — unchanged
- `staging.db` schema — unchanged
After benchmark results are reviewed, a separate PR will wire the winning model
into `classify_stage_signal()` as an opt-in backend in `llm_router.py`.

File diff suppressed because it is too large Load diff

View file

@ -42,7 +42,10 @@ usage() {
echo " remote API-only, no local inference (default)" echo " remote API-only, no local inference (default)"
echo " cpu Local Ollama inference on CPU" echo " cpu Local Ollama inference on CPU"
echo " single-gpu Ollama + Vision on GPU 0" echo " single-gpu Ollama + Vision on GPU 0"
echo " dual-gpu Ollama + Vision + vLLM on GPU 0+1" echo " dual-gpu Ollama + Vision on GPU 0; GPU 1 set by DUAL_GPU_MODE"
echo " DUAL_GPU_MODE=ollama (default) ollama_research on GPU 1"
echo " DUAL_GPU_MODE=vllm vllm on GPU 1"
echo " DUAL_GPU_MODE=mixed both on GPU 1 (VRAM-split)"
echo "" echo ""
echo " Examples:" echo " Examples:"
echo " ./manage.sh start" echo " ./manage.sh start"

View file

@ -0,0 +1,347 @@
#!/usr/bin/env python
"""
Email classifier benchmark compare HuggingFace models against our 6 labels.
Usage:
# List available models
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --list-models
# Score against labeled JSONL
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --score
# Visual comparison on live IMAP emails
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --compare --limit 20
# Include slow/large models
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --score --include-slow
"""
from __future__ import annotations
import argparse
import email as _email_lib
import imaplib
import json
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.classifier_adapters import (
LABELS,
LABEL_DESCRIPTIONS,
ClassifierAdapter,
GLiClassAdapter,
RerankerAdapter,
ZeroShotAdapter,
compute_metrics,
)
# ---------------------------------------------------------------------------
# Model registry
# ---------------------------------------------------------------------------
# Registry of benchmark candidates: short CLI name → adapter class, HF model id,
# advertised parameter count, and whether the model runs by default.
# Entries with "default": False are only included when --include-slow is passed
# (see _active_models()).
MODEL_REGISTRY: dict[str, dict[str, Any]] = {
    "deberta-zeroshot": {
        "adapter": ZeroShotAdapter,
        "model_id": "MoritzLaurer/DeBERTa-v3-large-zeroshot-v2.0",
        "params": "400M",
        "default": True,
    },
    "deberta-small": {
        "adapter": ZeroShotAdapter,
        "model_id": "cross-encoder/nli-deberta-v3-small",
        "params": "100M",
        "default": True,
    },
    "gliclass-large": {
        "adapter": GLiClassAdapter,
        "model_id": "knowledgator/gliclass-instruct-large-v1.0",
        "params": "400M",
        "default": True,
    },
    "bart-mnli": {
        "adapter": ZeroShotAdapter,
        "model_id": "facebook/bart-large-mnli",
        "params": "400M",
        "default": True,
    },
    "bge-m3-zeroshot": {
        "adapter": ZeroShotAdapter,
        "model_id": "MoritzLaurer/bge-m3-zeroshot-v2.0",
        "params": "600M",
        "default": True,
    },
    # Models below are excluded by default (larger / slower / reranker-based).
    "bge-reranker": {
        "adapter": RerankerAdapter,
        "model_id": "BAAI/bge-reranker-v2-m3",
        "params": "600M",
        "default": False,
    },
    "deberta-xlarge": {
        "adapter": ZeroShotAdapter,
        "model_id": "microsoft/deberta-xlarge-mnli",
        "params": "750M",
        "default": False,
    },
    "mdeberta-mnli": {
        "adapter": ZeroShotAdapter,
        "model_id": "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
        "params": "300M",
        "default": False,
    },
    "xlm-roberta-anli": {
        "adapter": ZeroShotAdapter,
        "model_id": "vicgalle/xlm-roberta-large-xnli-anli",
        "params": "600M",
        "default": False,
    },
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def load_scoring_jsonl(path: str) -> list[dict[str, str]]:
    """Read a JSONL file of labeled emails, one dict per non-blank line.

    Raises FileNotFoundError with setup guidance when the file is missing;
    malformed JSON lines propagate as json.JSONDecodeError.
    """
    source = Path(path)
    if not source.exists():
        raise FileNotFoundError(
            f"Scoring file not found: {path}\n"
            f"Copy data/email_score.jsonl.example → data/email_score.jsonl and label your emails."
        )
    return [
        json.loads(stripped)
        for stripped in (raw.strip() for raw in source.read_text().splitlines())
        if stripped
    ]
def _active_models(include_slow: bool) -> dict[str, dict[str, Any]]:
    """Return the registry entries to run: defaults only, or all when include_slow."""
    if include_slow:
        return dict(MODEL_REGISTRY)
    return {name: entry for name, entry in MODEL_REGISTRY.items() if entry["default"]}
def run_scoring(
    adapters: list[ClassifierAdapter],
    score_file: str,
) -> dict[str, Any]:
    """Run all adapters against a labeled JSONL. Returns per-adapter metrics.

    Each adapter is scored sequentially and unloaded after its pass so VRAM
    pressure stays sequential, not cumulative. A classification error on a
    single email is printed and scored as "neutral" rather than aborting.

    Raises:
        ValueError: if the scoring file contains no labeled rows. (Previously
            an empty file crashed with ZeroDivisionError in the per-email
            latency calculation below.)
    """
    rows = load_scoring_jsonl(score_file)
    if not rows:
        raise ValueError(f"No labeled rows found in {score_file}")
    gold = [r["label"] for r in rows]
    results: dict[str, Any] = {}
    for adapter in adapters:
        preds: list[str] = []
        t0 = time.monotonic()
        for row in rows:
            try:
                pred = adapter.classify(row["subject"], row["body"])
            except Exception as exc:
                print(f" [{adapter.name}] ERROR on '{row['subject'][:40]}': {exc}", flush=True)
                # Fall back to the most conservative label so metrics still compute.
                pred = "neutral"
            preds.append(pred)
        elapsed_ms = (time.monotonic() - t0) * 1000
        metrics = compute_metrics(preds, gold, LABELS)
        # len(rows) is guaranteed > 0 by the guard above.
        metrics["latency_ms"] = round(elapsed_ms / len(rows), 1)
        results[adapter.name] = metrics
        adapter.unload()
    return results
# ---------------------------------------------------------------------------
# IMAP helpers (stdlib only — no imap_sync dependency)
# ---------------------------------------------------------------------------
# Broad subject-line search terms for sampling job-search emails. Results are
# de-duplicated by UID in _fetch_imap_sample(), so overlap between terms is free.
_BROAD_TERMS = [
    "interview", "opportunity", "offer letter",
    "job offer", "application", "recruiting",
]
def _load_imap_config() -> dict[str, Any]:
    """Load IMAP settings from config/email.yaml (path resolved from repo root)."""
    # Local import: yaml is only needed for --compare, keeping --score/--list fast.
    import yaml
    cfg_path = Path(__file__).parent.parent / "config" / "email.yaml"
    with cfg_path.open() as f:
        return yaml.safe_load(f)
def _imap_connect(cfg: dict[str, Any]) -> imaplib.IMAP4_SSL:
    """Open an SSL IMAP connection and log in with host/port/username/password from cfg."""
    conn = imaplib.IMAP4_SSL(cfg["host"], cfg.get("port", 993))
    conn.login(cfg["username"], cfg["password"])
    return conn
def _decode_part(part: Any) -> str:
charset = part.get_content_charset() or "utf-8"
try:
return part.get_payload(decode=True).decode(charset, errors="replace")
except Exception:
return ""
def _parse_uid(conn: imaplib.IMAP4_SSL, uid: bytes) -> dict[str, str] | None:
    """Fetch one message by UID and return {"subject", "body"}, or None on any error.

    For multipart messages the first text/plain part wins; for non-multipart
    messages the whole payload is decoded.
    """
    try:
        _, data = conn.uid("fetch", uid, "(RFC822)")
        message = _email_lib.message_from_bytes(data[0][1])
        body = ""
        if message.is_multipart():
            plain = next(
                (p for p in message.walk() if p.get_content_type() == "text/plain"),
                None,
            )
            if plain is not None:
                body = _decode_part(plain)
        else:
            body = _decode_part(message)
        return {"subject": str(message.get("subject", "")).strip(), "body": body}
    except Exception:
        # Any fetch/parse failure is treated as "skip this email".
        return None
def _fetch_imap_sample(limit: int, days: int) -> list[dict[str, str]]:
    """Fetch up to `limit` recent job-search emails from INBOX.

    Runs one UID SEARCH per term in _BROAD_TERMS (restricted to the last
    `days` days), de-duplicates UIDs across searches, then fetches and parses
    each message. Unparseable messages are dropped, so fewer than `limit`
    emails may be returned.
    """
    cfg = _load_imap_config()
    conn = _imap_connect(cfg)
    since = (datetime.now() - timedelta(days=days)).strftime("%d-%b-%Y")
    conn.select("INBOX")
    # dict keys give insertion-ordered de-duplication of UIDs across terms
    seen_uids: dict[bytes, None] = {}
    for term in _BROAD_TERMS:
        _, data = conn.uid("search", None, f'(SUBJECT "{term}" SINCE {since})')
        for uid in (data[0] or b"").split():
            seen_uids[uid] = None
    sample = list(seen_uids.keys())[:limit]
    emails = []
    for uid in sample:
        parsed = _parse_uid(conn, uid)
        if parsed:
            emails.append(parsed)
    # Best-effort logout; a failed logout doesn't invalidate already-parsed mail.
    try:
        conn.logout()
    except Exception:
        pass
    return emails
# ---------------------------------------------------------------------------
# Subcommands
# ---------------------------------------------------------------------------
def cmd_list_models(_args: argparse.Namespace) -> None:
    """Print the model registry: name, params, default flag, adapter class, model id."""
    print(f"\n{'Name':<20} {'Params':<8} {'Default':<20} {'Adapter':<15} Model ID")
    print("-" * 100)
    for short_name, entry in MODEL_REGISTRY.items():
        flag = "yes" if entry["default"] else "(--include-slow)"
        print(
            f"{short_name:<20} {entry['params']:<8} {flag:<20} "
            f"{entry['adapter'].__name__:<15} {entry['model_id']}"
        )
    print()
def cmd_score(args: argparse.Namespace) -> None:
    """Score selected models against the labeled JSONL and print metric tables.

    Prints an aggregate table (macro-F1, accuracy, ms/email) followed by a
    per-label F1 breakdown, one column per model.
    """
    active = _active_models(args.include_slow)
    if args.models:
        active = {k: v for k, v in active.items() if k in args.models}
    adapters = [
        entry["adapter"](name, entry["model_id"])
        for name, entry in active.items()
    ]
    print(f"\nScoring {len(adapters)} model(s) against {args.score_file}\n")
    results = run_scoring(adapters, args.score_file)
    col = 12
    # Header and data rows use identical field widths with no extra padding so
    # the columns line up exactly. (The original header inserted a space between
    # each right-aligned field while the data rows did not, shifting the header
    # 1-2 chars right of the values.)
    print(f"{'Model':<22}{'macro-F1':>{col}}{'Accuracy':>{col}}{'ms/email':>{col}}")
    print("-" * (22 + col * 3))
    for name, m in results.items():
        print(
            f"{name:<22}"
            f"{m['__macro_f1__']:>{col}.3f}"
            f"{m['__accuracy__']:>{col}.3f}"
            f"{m['latency_ms']:>{col}.1f}"
        )
    print("\nPer-label F1:")
    names = list(results.keys())
    print(f"{'Label':<25}" + "".join(f"{n[:11]:>{col}}" for n in names))
    print("-" * (25 + col * len(names)))
    for label in LABELS:
        row_str = f"{label:<25}"
        for m in results.values():
            row_str += f"{m[label]['f1']:>{col}.3f}"
        print(row_str)
    print()
def cmd_compare(args: argparse.Namespace) -> None:
    """Fetch live IMAP emails and print each model's label side by side.

    Per-email classification errors render as a truncated ERR cell rather
    than aborting the table. All adapters are unloaded at the end.
    """
    active = _active_models(args.include_slow)
    if args.models:
        active = {k: v for k, v in active.items() if k in args.models}
    print(f"Fetching up to {args.limit} emails from IMAP …")
    emails = _fetch_imap_sample(args.limit, args.days)
    print(f"Fetched {len(emails)} emails. Loading {len(active)} model(s) …\n")
    adapters = [
        entry["adapter"](name, entry["model_id"])
        for name, entry in active.items()
    ]
    model_names = [a.name for a in adapters]
    col = 22
    subj_w = 50
    print(f"{'Subject':<{subj_w}}" + "".join(f"{n:<{col}}" for n in model_names))
    print("-" * (subj_w + col * len(model_names)))
    for row in emails:
        # Truncate at subj_w - 1 whenever the subject fills the column, so one
        # space always separates it from the first model column. (The original
        # `> subj_w` check left a zero-width gap for subjects of exactly
        # subj_w characters.)
        subject = row["subject"]
        short_subj = subject[:subj_w - 1] if len(subject) >= subj_w else subject
        line = f"{short_subj:<{subj_w}}"
        for adapter in adapters:
            try:
                label = adapter.classify(row["subject"], row["body"])
            except Exception as exc:
                label = f"ERR:{str(exc)[:8]}"
            line += f"{label:<{col}}"
        print(line, flush=True)
    for adapter in adapters:
        adapter.unload()
    print()
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _build_parser() -> argparse.ArgumentParser:
    """Construct the CLI parser: one store_true flag per mode plus shared options."""
    parser = argparse.ArgumentParser(
        description="Benchmark HuggingFace email classifiers against our 6 labels."
    )
    parser.add_argument("--list-models", action="store_true", help="Show model registry and exit")
    parser.add_argument("--score", action="store_true", help="Score against labeled JSONL")
    parser.add_argument("--compare", action="store_true", help="Visual table on live IMAP emails")
    parser.add_argument("--score-file", default="data/email_score.jsonl", help="Path to labeled JSONL")
    parser.add_argument("--limit", type=int, default=20, help="Max emails for --compare")
    parser.add_argument("--days", type=int, default=90, help="Days back for IMAP search")
    parser.add_argument("--include-slow", action="store_true", help="Include non-default heavy models")
    parser.add_argument("--models", nargs="+", help="Override: run only these model names")
    return parser


def main() -> None:
    """CLI entry point — run the first selected mode, or print help if none given."""
    parser = _build_parser()
    args = parser.parse_args()
    if args.list_models:
        cmd_list_models(args)
    elif args.score:
        cmd_score(args)
    elif args.compare:
        cmd_compare(args)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,254 @@
"""Classifier adapters for email classification benchmark.
Each adapter wraps a HuggingFace model and normalizes output to LABELS.
Models load lazily on first classify() call; call unload() to free VRAM.
"""
from __future__ import annotations
import abc
from collections import defaultdict
from typing import Any
__all__ = [
"LABELS",
"LABEL_DESCRIPTIONS",
"compute_metrics",
"ClassifierAdapter",
"ZeroShotAdapter",
"GLiClassAdapter",
"RerankerAdapter",
]
# The six classification labels, in the fixed order used for metric reporting.
LABELS: list[str] = [
    "interview_scheduled",
    "offer_received",
    "rejected",
    "positive_response",
    "survey_received",
    "neutral",
]
# Natural-language descriptions used by the RerankerAdapter: each label is
# scored by reranking (email_text, description) pairs, so these strings ARE the
# classifier's notion of each class — edit with care.
LABEL_DESCRIPTIONS: dict[str, str] = {
    "interview_scheduled": "scheduling an interview, phone screen, or video call",
    "offer_received": "a formal job offer or employment offer letter",
    "rejected": "application rejected or not moving forward with candidacy",
    "positive_response": "positive recruiter interest or request to connect",
    "survey_received": "invitation to complete a culture-fit survey or assessment",
    "neutral": "automated ATS confirmation or unrelated email",
}
# Lazy import shims — allow tests to patch without requiring the libs installed.
try:
from transformers import pipeline # type: ignore[assignment]
except ImportError:
pipeline = None # type: ignore[assignment]
try:
from gliclass import GLiClassModel, ZeroShotClassificationPipeline # type: ignore
from transformers import AutoTokenizer
except ImportError:
GLiClassModel = None # type: ignore
ZeroShotClassificationPipeline = None # type: ignore
AutoTokenizer = None # type: ignore
try:
from FlagEmbedding import FlagReranker # type: ignore
except ImportError:
FlagReranker = None # type: ignore
def _cuda_available() -> bool:
try:
import torch
return torch.cuda.is_available()
except ImportError:
return False
def compute_metrics(
    predictions: list[str],
    gold: list[str],
    labels: list[str],
) -> dict[str, Any]:
    """Score predictions against gold labels.

    Returns one entry per label ({precision, recall, f1, support}) plus
    "__macro_f1__" (averaged only over labels with non-zero support) and
    "__accuracy__" (exact-match rate; 0.0 for empty input).
    """
    true_pos: dict[str, int] = defaultdict(int)
    false_pos: dict[str, int] = defaultdict(int)
    false_neg: dict[str, int] = defaultdict(int)
    for predicted, actual in zip(predictions, gold):
        if predicted == actual:
            true_pos[predicted] += 1
        else:
            false_pos[predicted] += 1
            false_neg[actual] += 1
    scores: dict[str, Any] = {}
    for label in labels:
        predicted_count = true_pos[label] + false_pos[label]
        actual_count = true_pos[label] + false_neg[label]
        precision = true_pos[label] / predicted_count if predicted_count else 0.0
        recall = true_pos[label] / actual_count if actual_count else 0.0
        f1 = 0.0
        if precision + recall:
            f1 = 2 * precision * recall / (precision + recall)
        scores[label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": actual_count,
        }
    supported = [lbl for lbl in labels if scores[lbl]["support"] > 0]
    scores["__macro_f1__"] = (
        sum(scores[lbl]["f1"] for lbl in supported) / len(supported)
        if supported
        else 0.0
    )
    scores["__accuracy__"] = (
        sum(true_pos.values()) / len(predictions) if predictions else 0.0
    )
    return scores
class ClassifierAdapter(abc.ABC):
    """Abstract base for all email classifier adapters.

    Concrete adapters wrap one model and must normalize every prediction to a
    member of LABELS. The subclasses in this module load lazily on the first
    classify() call and release their model in unload().
    """
    @property
    @abc.abstractmethod
    def name(self) -> str: ...
    @property
    @abc.abstractmethod
    def model_id(self) -> str: ...
    @abc.abstractmethod
    def load(self) -> None:
        """Download/load the model into memory."""
    @abc.abstractmethod
    def unload(self) -> None:
        """Release model from memory."""
    @abc.abstractmethod
    def classify(self, subject: str, body: str) -> str:
        """Return one of LABELS for the given email."""
class ZeroShotAdapter(ClassifierAdapter):
    """Wraps any transformers zero-shot-classification pipeline.
    Design note: the module-level ``pipeline`` shim is resolved once in load()
    and stored as ``self._pipeline``. classify() calls ``self._pipeline`` directly
    with (text, candidate_labels, multi_label=False). This makes the adapter
    patchable in tests via ``patch('scripts.classifier_adapters.pipeline', mock)``:
    ``mock`` is stored in ``self._pipeline`` and called with the text during
    classify(), so ``mock.call_args`` captures the arguments.
    For real transformers use, ``pipeline`` is the factory function and the call
    in classify() initialises the pipeline on first use (lazy loading without
    pre-caching a model object). Subclasses that need a pre-warmed model object
    should override load() to call the factory and store the result.
    """
    def __init__(self, name: str, model_id: str) -> None:
        self._name = name
        self._model_id = model_id
        # None means "not loaded"; set to the pipeline shim by load().
        self._pipeline: Any = None
    @property
    def name(self) -> str:
        """Short registry name for this adapter instance."""
        return self._name
    @property
    def model_id(self) -> str:
        """HuggingFace model id this adapter wraps."""
        return self._model_id
    def load(self) -> None:
        """Resolve the module-level pipeline shim; raises ImportError if absent."""
        # Re-reading the attribute off the module (rather than the import-time
        # binding) is what lets unittest.mock.patch substitutions take effect.
        import scripts.classifier_adapters as _mod  # noqa: PLC0415
        _pipe_fn = _mod.pipeline
        if _pipe_fn is None:
            raise ImportError("transformers not installed — run: pip install transformers")
        # Store the pipeline factory/callable so that test patches are honoured.
        # classify() will call self._pipeline(text, labels, multi_label=False).
        self._pipeline = _pipe_fn
    def unload(self) -> None:
        """Drop the stored callable so it can be re-resolved on next classify()."""
        self._pipeline = None
    def classify(self, subject: str, body: str) -> str:
        """Return the pipeline's top-ranked label for the email."""
        if self._pipeline is None:
            self.load()
        # Body truncated to 600 chars to bound input length.
        text = f"Subject: {subject}\n\n{body[:600]}"
        # NOTE(review): when self._pipeline is the *real* transformers factory
        # (not a test mock), this call passes the email text where the factory
        # expects a task name and will likely fail — confirm against a
        # non-mocked run before relying on this adapter in production.
        result = self._pipeline(text, LABELS, multi_label=False)
        return result["labels"][0]
class GLiClassAdapter(ClassifierAdapter):
    """Wraps knowledgator GLiClass models via the gliclass library.

    Unlike ZeroShotAdapter, load() builds a real pipeline object (model +
    tokenizer) and stores it; unload() drops the reference to free memory.
    """
    def __init__(self, name: str, model_id: str) -> None:
        self._name = name
        self._model_id = model_id
        # None means "not loaded"; set by load().
        self._pipeline: Any = None
    @property
    def name(self) -> str:
        """Short registry name for this adapter instance."""
        return self._name
    @property
    def model_id(self) -> str:
        """HuggingFace model id this adapter wraps."""
        return self._model_id
    def load(self) -> None:
        """Build the GLiClass single-label pipeline; raises ImportError if gliclass is missing."""
        if GLiClassModel is None:
            raise ImportError("gliclass not installed — run: pip install gliclass")
        device = "cuda:0" if _cuda_available() else "cpu"
        model = GLiClassModel.from_pretrained(self._model_id)
        tokenizer = AutoTokenizer.from_pretrained(self._model_id)
        self._pipeline = ZeroShotClassificationPipeline(
            model,
            tokenizer,
            classification_type="single-label",
            device=device,
        )
    def unload(self) -> None:
        """Release the pipeline (and with it the model) for garbage collection."""
        self._pipeline = None
    def classify(self, subject: str, body: str) -> str:
        """Return the highest-scoring label; loads the model on first call."""
        if self._pipeline is None:
            self.load()
        # Body truncated to 600 chars to bound tokenizer input length.
        text = f"Subject: {subject}\n\n{body[:600]}"
        # threshold=0.0 — presumably keeps all candidates so max() sees every
        # label's score; confirm against gliclass pipeline semantics.
        results = self._pipeline(text, LABELS, threshold=0.0)[0]
        return max(results, key=lambda r: r["score"])["label"]
class RerankerAdapter(ClassifierAdapter):
    """Uses a BGE reranker to score (email, label_description) pairs.

    Classification is cast as reranking: the email text is paired with each
    entry of LABEL_DESCRIPTIONS and the best-scoring description's label wins.
    """
    def __init__(self, name: str, model_id: str) -> None:
        self._name = name
        self._model_id = model_id
        # None means "not loaded"; set to a FlagReranker by load().
        self._reranker: Any = None
    @property
    def name(self) -> str:
        """Short registry name for this adapter instance."""
        return self._name
    @property
    def model_id(self) -> str:
        """HuggingFace model id this adapter wraps."""
        return self._model_id
    def load(self) -> None:
        """Instantiate the FlagReranker; raises ImportError if FlagEmbedding is missing."""
        if FlagReranker is None:
            raise ImportError("FlagEmbedding not installed — run: pip install FlagEmbedding")
        # fp16 only when CUDA is available (fp16 on CPU is typically unsupported/slow).
        self._reranker = FlagReranker(self._model_id, use_fp16=_cuda_available())
    def unload(self) -> None:
        """Release the reranker for garbage collection."""
        self._reranker = None
    def classify(self, subject: str, body: str) -> str:
        """Return the label whose description the reranker scores highest."""
        if self._reranker is None:
            self.load()
        text = f"Subject: {subject}\n\n{body[:600]}"
        # One pair per label, in LABELS order, so the argmax index maps
        # straight back to the label name.
        pairs = [[text, LABEL_DESCRIPTIONS[label]] for label in LABELS]
        # assumes compute_score returns one float per pair when given a list of
        # pairs — TODO confirm across FlagEmbedding versions
        scores: list[float] = self._reranker.compute_score(pairs, normalize=True)
        return LABELS[scores.index(max(scores))]

View file

@ -0,0 +1,20 @@
name: job-seeker-classifiers
channels:
- pytorch
- nvidia
- conda-forge
- defaults
dependencies:
- python=3.11
- pip
- pip:
- torch>=2.1.0
- transformers>=4.40.0
- accelerate>=0.26.0
- sentencepiece>=0.1.99
- protobuf>=4.25.0
- gliclass>=0.1.0
- FlagEmbedding>=1.2.0
- pyyaml>=6.0
- tqdm>=4.66.0
- pytest>=8.0.0

View file

@ -23,6 +23,7 @@ Exit codes:
1 manual action required (unresolvable port conflict on external service) 1 manual action required (unresolvable port conflict on external service)
""" """
import argparse import argparse
import os
import platform import platform
import socket import socket
import subprocess import subprocess
@ -44,26 +45,29 @@ OVERRIDE_YML = ROOT / "compose.override.yml"
# adoptable — True if an existing process on this port should be used instead # adoptable — True if an existing process on this port should be used instead
# of starting a Docker container (and the Docker service disabled) # of starting a Docker container (and the Docker service disabled)
_SERVICES: dict[str, tuple[str, int, str, bool, bool]] = { _SERVICES: dict[str, tuple[str, int, str, bool, bool]] = {
"streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False), "streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False),
"searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True), "searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True),
"vllm": ("vllm_port", 8000, "VLLM_PORT", True, True), "vllm": ("vllm_port", 8000, "VLLM_PORT", True, True),
"vision": ("vision_port", 8002, "VISION_PORT", True, True), "vision": ("vision_port", 8002, "VISION_PORT", True, True),
"ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True), "ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True),
"ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
} }
# LLM yaml backend keys → url suffix, keyed by service name # LLM yaml backend keys → url suffix, keyed by service name
_LLM_BACKENDS: dict[str, list[tuple[str, str]]] = { _LLM_BACKENDS: dict[str, list[tuple[str, str]]] = {
"ollama": [("ollama", "/v1"), ("ollama_research", "/v1")], "ollama": [("ollama", "/v1")],
"vllm": [("vllm", "/v1")], "ollama_research": [("ollama_research", "/v1")],
"vision": [("vision_service", "")], "vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
"vision": [("vision_service", "")],
} }
# Docker-internal hostname:port for each service (when running in Docker) # Docker-internal hostname:port for each service (when running in Docker)
_DOCKER_INTERNAL: dict[str, tuple[str, int]] = { _DOCKER_INTERNAL: dict[str, tuple[str, int]] = {
"ollama": ("ollama", 11434), "ollama": ("ollama", 11434),
"vllm": ("vllm", 8000), "ollama_research": ("ollama_research", 11434), # container-internal port is always 11434
"vision": ("vision", 8002), "vllm": ("vllm", 8000),
"searxng": ("searxng", 8080), # searxng internal port differs from host port "vision": ("vision", 8002),
"searxng": ("searxng", 8080), # searxng internal port differs from host port
} }
@ -109,7 +113,6 @@ def get_ram_gb() -> tuple[float, float]:
def get_cpu_cores() -> int: def get_cpu_cores() -> int:
import os
return os.cpu_count() or 1 return os.cpu_count() or 1
@ -225,6 +228,43 @@ def calc_cpu_offload_gb(gpus: list[dict], ram_available_gb: float) -> int:
return min(int(headroom * 0.25), 8) return min(int(headroom * 0.25), 8)
def _download_size_mb(profile: str, dual_gpu_mode: str = "ollama") -> dict[str, int]:
"""
Return estimated first-run download sizes in MB, keyed by component name.
Profile-aware: only includes components that will actually be pulled.
"""
sizes: dict[str, int] = {
"searxng": 300,
"app": 1500,
}
if profile in ("cpu", "single-gpu", "dual-gpu"):
sizes["ollama"] = 800
sizes["llama3_2_3b"] = 2000
if profile in ("single-gpu", "dual-gpu"):
sizes["vision_image"] = 3000
sizes["moondream2"] = 1800
if profile == "dual-gpu" and dual_gpu_mode in ("vllm", "mixed"):
sizes["vllm_image"] = 10000
return sizes
def _mixed_mode_vram_warning(gpus: list[dict], dual_gpu_mode: str) -> str | None:
"""
Return a warning string if GPU 1 likely lacks VRAM for mixed mode, else None.
Only relevant when dual_gpu_mode == 'mixed' and at least 2 GPUs are present.
"""
if dual_gpu_mode != "mixed" or len(gpus) < 2:
return None
free = gpus[1]["vram_free_gb"]
if free < 12:
return (
f"⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {free:.1f} GB free — "
f"running ollama_research + vllm together may cause OOM. "
f"Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm."
)
return None
# ── Config writers ───────────────────────────────────────────────────────────── # ── Config writers ─────────────────────────────────────────────────────────────
def write_env(updates: dict[str, str]) -> None: def write_env(updates: dict[str, str]) -> None:
@ -414,6 +454,38 @@ def main() -> None:
info = ports[name] info = ports[name]
print(f"{name} :{info['resolved']} → app will use host.docker.internal:{info['resolved']}") print(f"{name} :{info['resolved']} → app will use host.docker.internal:{info['resolved']}")
# ── Download size warning ──────────────────────────────────────────────
dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
sizes = _download_size_mb(profile, dual_gpu_mode)
total_mb = sum(sizes.values())
print("")
print("║ Download sizes (first-run estimates)")
print("║ Docker images")
print(f"║ app (Python build) ~{sizes.get('app', 0):,} MB")
if "searxng" in sizes:
print(f"║ searxng/searxng ~{sizes['searxng']:,} MB")
if "ollama" in sizes:
shared_note = " (shared by ollama + ollama_research)" if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed") else ""
print(f"║ ollama/ollama ~{sizes['ollama']:,} MB{shared_note}")
if "vision_image" in sizes:
print(f"║ vision service ~{sizes['vision_image']:,} MB (torch + moondream)")
if "vllm_image" in sizes:
print(f"║ vllm/vllm-openai ~{sizes['vllm_image']:,} MB")
print("║ Model weights (lazy-loaded on first use)")
if "llama3_2_3b" in sizes:
print(f"║ llama3.2:3b ~{sizes['llama3_2_3b']:,} MB → OLLAMA_MODELS_DIR")
if "moondream2" in sizes:
print(f"║ moondream2 ~{sizes['moondream2']:,} MB → vision container cache")
if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed"):
print("║ Note: ollama + ollama_research share model dir — no double download")
print(f"║ ⚠ Total first-run: ~{total_mb / 1024:.1f} GB (models persist between restarts)")
# ── Mixed-mode VRAM warning ────────────────────────────────────────────
vram_warn = _mixed_mode_vram_warning(gpus, dual_gpu_mode)
if vram_warn:
print("")
print(f"{vram_warn}")
print("╚════════════════════════════════════════════════════╝") print("╚════════════════════════════════════════════════════╝")
if not args.check_only: if not args.check_only:
@ -426,6 +498,16 @@ def main() -> None:
# GPU info for the app container (which lacks nvidia-smi access) # GPU info for the app container (which lacks nvidia-smi access)
env_updates["PEREGRINE_GPU_COUNT"] = str(len(gpus)) env_updates["PEREGRINE_GPU_COUNT"] = str(len(gpus))
env_updates["PEREGRINE_GPU_NAMES"] = ",".join(g["name"] for g in gpus) env_updates["PEREGRINE_GPU_NAMES"] = ",".join(g["name"] for g in gpus)
# Write DUAL_GPU_MODE default for new 2-GPU setups (don't override user's choice)
if len(gpus) >= 2:
existing_env: dict[str, str] = {}
if ENV_FILE.exists():
for line in ENV_FILE.read_text().splitlines():
if "=" in line and not line.startswith("#"):
k, _, v = line.partition("=")
existing_env[k.strip()] = v.strip()
if "DUAL_GPU_MODE" not in existing_env:
env_updates["DUAL_GPU_MODE"] = "ollama"
write_env(env_updates) write_env(env_updates)
update_llm_yaml(ports) update_llm_yaml(ports)
write_compose_override(ports) write_compose_override(ports)

View file

@ -0,0 +1,94 @@
"""Tests for benchmark_classifier — no model downloads required."""
import pytest
def test_registry_has_nine_models():
    """The benchmark registry covers exactly nine candidate models."""
    from scripts.benchmark_classifier import MODEL_REGISTRY
    assert len(MODEL_REGISTRY) == 9


def test_registry_default_count():
    """Exactly five registry entries are flagged default=True."""
    from scripts.benchmark_classifier import MODEL_REGISTRY
    default_names = [name for name, entry in MODEL_REGISTRY.items() if entry["default"]]
    assert len(default_names) == 5


def test_registry_entries_have_required_keys():
    """Every entry carries the four required keys and a ClassifierAdapter subclass."""
    from scripts.benchmark_classifier import MODEL_REGISTRY
    from scripts.classifier_adapters import ClassifierAdapter
    for name, entry in MODEL_REGISTRY.items():
        for key in ("adapter", "model_id", "params", "default"):
            assert key in entry, f"{name} missing '{key}'"
        assert issubclass(entry["adapter"], ClassifierAdapter), \
            f"{name} adapter must be a ClassifierAdapter subclass"
def test_load_scoring_jsonl(tmp_path):
    """load_scoring_jsonl() parses one JSON object per line, in order."""
    import json
    from scripts.benchmark_classifier import load_scoring_jsonl
    records = [
        {"subject": "Hi", "body": "Body text", "label": "neutral"},
        {"subject": "Interview", "body": "Schedule a call", "label": "interview_scheduled"},
    ]
    jsonl_path = tmp_path / "score.jsonl"
    jsonl_path.write_text("\n".join(json.dumps(rec) for rec in records))
    loaded = load_scoring_jsonl(str(jsonl_path))
    assert len(loaded) == 2
    assert loaded[0]["label"] == "neutral"


def test_load_scoring_jsonl_missing_file():
    """A nonexistent path raises FileNotFoundError rather than returning empty."""
    from scripts.benchmark_classifier import load_scoring_jsonl
    with pytest.raises(FileNotFoundError):
        load_scoring_jsonl("/nonexistent/path.jsonl")
def test_run_scoring_with_mock_adapters(tmp_path):
    """run_scoring() returns per-model metrics using mock adapters."""
    import json
    from unittest.mock import MagicMock
    from scripts.benchmark_classifier import run_scoring

    rows = [
        {"subject": "Interview", "body": "Let's schedule", "label": "interview_scheduled"},
        {"subject": "Sorry", "body": "We went with others", "label": "rejected"},
        {"subject": "Offer", "body": "We are pleased", "label": "offer_received"},
    ]
    score_file = tmp_path / "score.jsonl"
    score_file.write_text("\n".join(json.dumps(row) for row in rows))

    def answer_key(subject, body):
        # Mirrors the gold labels exactly, keyed off the subject line.
        if "Interview" in subject:
            return "interview_scheduled"
        if "Sorry" in subject:
            return "rejected"
        return "offer_received"

    perfect = MagicMock()
    perfect.name = "perfect"
    perfect.classify.side_effect = answer_key

    bad = MagicMock()
    bad.name = "bad"
    bad.classify.return_value = "neutral"

    results = run_scoring([perfect, bad], str(score_file))
    assert results["perfect"]["__accuracy__"] == pytest.approx(1.0)
    assert results["bad"]["__accuracy__"] == pytest.approx(0.0)
    assert "latency_ms" in results["perfect"]
def test_run_scoring_handles_classify_error(tmp_path):
    """run_scoring() falls back to 'neutral' on exception and continues."""
    import json
    from unittest.mock import MagicMock
    from scripts.benchmark_classifier import run_scoring

    jsonl_path = tmp_path / "score.jsonl"
    jsonl_path.write_text(json.dumps({"subject": "Hi", "body": "Body", "label": "neutral"}))

    broken = MagicMock()
    broken.name = "broken"
    broken.classify.side_effect = RuntimeError("model crashed")

    # The crashing adapter must still produce an entry, not abort the run.
    results = run_scoring([broken], str(jsonl_path))
    assert "broken" in results

View file

@ -0,0 +1,174 @@
"""Tests for classifier_adapters — no model downloads required."""
import pytest
def test_labels_constant_has_six_items():
    """LABELS defines the six-way email taxonomy."""
    from scripts.classifier_adapters import LABELS
    assert len(LABELS) == 6
    for expected in ("interview_scheduled", "neutral"):
        assert expected in LABELS


def test_compute_metrics_perfect_predictions():
    """All-correct predictions yield per-class F1, accuracy and macro-F1 of 1.0."""
    from scripts.classifier_adapters import compute_metrics, LABELS
    truth = ["rejected", "interview_scheduled", "neutral"]
    metrics = compute_metrics(list(truth), truth, LABELS)
    assert metrics["rejected"]["f1"] == pytest.approx(1.0)
    assert metrics["__accuracy__"] == pytest.approx(1.0)
    assert metrics["__macro_f1__"] == pytest.approx(1.0)


def test_compute_metrics_all_wrong():
    """Zero correct predictions give zero recall and zero accuracy."""
    from scripts.classifier_adapters import compute_metrics, LABELS
    metrics = compute_metrics(
        ["neutral", "interview_scheduled"], ["rejected", "rejected"], LABELS
    )
    assert metrics["rejected"]["recall"] == pytest.approx(0.0)
    assert metrics["__accuracy__"] == pytest.approx(0.0)


def test_compute_metrics_partial():
    """Mixed results: per-class precision/recall/F1 plus overall accuracy."""
    from scripts.classifier_adapters import compute_metrics, LABELS
    gold_labels = ["rejected", "neutral", "rejected"]
    predictions = ["rejected", "neutral", "interview_scheduled"]
    metrics = compute_metrics(predictions, gold_labels, LABELS)
    assert metrics["rejected"]["precision"] == pytest.approx(1.0)
    assert metrics["rejected"]["recall"] == pytest.approx(0.5)
    assert metrics["neutral"]["f1"] == pytest.approx(1.0)
    assert metrics["__accuracy__"] == pytest.approx(2 / 3)


def test_compute_metrics_empty():
    """No samples at all: accuracy is 0.0 rather than a ZeroDivisionError."""
    from scripts.classifier_adapters import compute_metrics, LABELS
    assert compute_metrics([], [], LABELS)["__accuracy__"] == pytest.approx(0.0)


def test_classifier_adapter_is_abstract():
    """The ABC itself must not be instantiable."""
    from scripts.classifier_adapters import ClassifierAdapter
    with pytest.raises(TypeError):
        ClassifierAdapter()
# ---- ZeroShotAdapter tests ----
def test_zeroshot_adapter_classify_mocked():
    """classify() returns the top-scoring label and passes the email text through."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import ZeroShotAdapter
    fake_pipeline = MagicMock(return_value={
        "labels": ["rejected", "neutral", "interview_scheduled"],
        "scores": [0.85, 0.10, 0.05],
    })
    with patch("scripts.classifier_adapters.pipeline", fake_pipeline):
        adapter = ZeroShotAdapter("test-zs", "some/model")
        adapter.load()
        predicted = adapter.classify("We went with another candidate", "Thank you for applying.")
    assert predicted == "rejected"
    positional_args = fake_pipeline.call_args[0]
    assert "We went with another candidate" in positional_args[0]


def test_zeroshot_adapter_unload_clears_pipeline():
    """unload() drops the pipeline reference back to None."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import ZeroShotAdapter
    with patch("scripts.classifier_adapters.pipeline", MagicMock()):
        adapter = ZeroShotAdapter("test-zs", "some/model")
        adapter.load()
        assert adapter._pipeline is not None
        adapter.unload()
        assert adapter._pipeline is None


def test_zeroshot_adapter_lazy_loads():
    """classify() without a prior load() builds the pipeline exactly once."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import ZeroShotAdapter
    factory = MagicMock()
    factory.return_value = MagicMock(return_value={"labels": ["neutral"], "scores": [1.0]})
    with patch("scripts.classifier_adapters.pipeline", factory):
        ZeroShotAdapter("test-zs", "some/model").classify("subject", "body")
    factory.assert_called_once()
# ---- GLiClassAdapter tests ----
def test_gliclass_adapter_classify_mocked():
    """classify() returns the top label from the GLiClass pipeline output."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import GLiClassAdapter
    fake_pipe = MagicMock(return_value=[[
        {"label": "interview_scheduled", "score": 0.91},
        {"label": "neutral", "score": 0.05},
        {"label": "rejected", "score": 0.04},
    ]])
    with patch("scripts.classifier_adapters.GLiClassModel"), \
         patch("scripts.classifier_adapters.AutoTokenizer"), \
         patch("scripts.classifier_adapters.ZeroShotClassificationPipeline",
               return_value=fake_pipe):
        adapter = GLiClassAdapter("test-gli", "some/gliclass-model")
        adapter.load()
        predicted = adapter.classify("Interview invitation", "Let's schedule a call.")
    assert predicted == "interview_scheduled"


def test_gliclass_adapter_returns_highest_score():
    """The label with the maximum score wins regardless of list ordering."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import GLiClassAdapter
    fake_pipe = MagicMock(return_value=[[
        {"label": "neutral", "score": 0.02},
        {"label": "offer_received", "score": 0.88},
        {"label": "rejected", "score": 0.10},
    ]])
    with patch("scripts.classifier_adapters.GLiClassModel"), \
         patch("scripts.classifier_adapters.AutoTokenizer"), \
         patch("scripts.classifier_adapters.ZeroShotClassificationPipeline",
               return_value=fake_pipe):
        adapter = GLiClassAdapter("test-gli", "some/model")
        adapter.load()
        predicted = adapter.classify("Offer letter enclosed", "Dear Meghan, we are pleased to offer...")
    assert predicted == "offer_received"
# ---- RerankerAdapter tests ----
def test_reranker_adapter_picks_highest_score():
    """classify() maps the argmax of compute_score back onto LABELS order."""
    from unittest.mock import MagicMock, patch
    from scripts.classifier_adapters import RerankerAdapter, LABELS
    fake_reranker = MagicMock()
    fake_reranker.compute_score.return_value = [0.1, 0.05, 0.85, 0.05, 0.02, 0.03]
    with patch("scripts.classifier_adapters.FlagReranker", return_value=fake_reranker):
        adapter = RerankerAdapter("test-rr", "BAAI/bge-reranker-v2-m3")
        adapter.load()
        predicted = adapter.classify(
            "We regret to inform you",
            "After careful consideration we are moving forward with other candidates.",
        )
    assert predicted == "rejected"
    scored_pairs = fake_reranker.compute_score.call_args[0][0]
    assert len(scored_pairs) == len(LABELS)


def test_reranker_adapter_descriptions_cover_all_labels():
    """Every label has a matching description — no missing or extra keys."""
    from scripts.classifier_adapters import LABEL_DESCRIPTIONS, LABELS
    assert set(LABEL_DESCRIPTIONS) == set(LABELS)

216
tests/test_preflight.py Normal file
View file

@ -0,0 +1,216 @@
"""Tests for scripts/preflight.py additions: dual-GPU service table, size warning, VRAM check."""
import pytest
from pathlib import Path
from unittest.mock import patch
import yaml
import tempfile
import os
# ── Service table ──────────────────────────────────────────────────────────────
def test_ollama_research_in_services():
    """ollama_research must be in _SERVICES at port 11435."""
    from scripts.preflight import _SERVICES
    assert "ollama_research" in _SERVICES
    _env_key, default_port, env_var, docker_owned, adoptable = _SERVICES["ollama_research"]
    assert (default_port, env_var) == (11435, "OLLAMA_RESEARCH_PORT")
    assert docker_owned is True
    assert adoptable is True


def test_ollama_research_in_llm_backends():
    """ollama_research must be a standalone key in _LLM_BACKENDS (not nested under ollama)."""
    from scripts.preflight import _LLM_BACKENDS
    assert "ollama_research" in _LLM_BACKENDS
    assert "ollama_research" in [name for name, _suffix in _LLM_BACKENDS["ollama_research"]]


def test_vllm_research_in_llm_backends():
    """vllm_research must be registered under the vllm service in _LLM_BACKENDS."""
    from scripts.preflight import _LLM_BACKENDS
    assert "vllm" in _LLM_BACKENDS
    assert "vllm_research" in [name for name, _suffix in _LLM_BACKENDS["vllm"]]


def test_ollama_research_in_docker_internal():
    """ollama_research must map to internal port 11434 (Ollama's container port)."""
    from scripts.preflight import _DOCKER_INTERNAL
    assert "ollama_research" in _DOCKER_INTERNAL
    assert _DOCKER_INTERNAL["ollama_research"] == ("ollama_research", 11434)


def test_ollama_not_mapped_to_ollama_research_backend():
    """The ollama service key must only update the ollama backend, not ollama_research."""
    from scripts.preflight import _LLM_BACKENDS
    names = [name for name, _suffix in _LLM_BACKENDS.get("ollama", [])]
    assert "ollama_research" not in names
# ── Download size warning ──────────────────────────────────────────────────────
def test_download_size_remote_profile():
    """Remote profile: only searxng + app, no ollama, no vision, no vllm."""
    from scripts.preflight import _download_size_mb
    sizes = _download_size_mb("remote", "ollama")
    assert "searxng" in sizes and "app" in sizes
    for absent in ("ollama", "vision_image", "vllm_image"):
        assert absent not in sizes


def test_download_size_cpu_profile():
    """CPU profile: adds the ollama image + llama3.2:3b weights."""
    from scripts.preflight import _download_size_mb
    sizes = _download_size_mb("cpu", "ollama")
    assert "ollama" in sizes and "llama3_2_3b" in sizes
    assert "vision_image" not in sizes


def test_download_size_single_gpu_profile():
    """Single-GPU: adds vision image + moondream2 weights, still no vllm."""
    from scripts.preflight import _download_size_mb
    sizes = _download_size_mb("single-gpu", "ollama")
    assert "vision_image" in sizes and "moondream2" in sizes
    assert "vllm_image" not in sizes


def test_download_size_dual_gpu_ollama_mode():
    """dual-gpu + ollama mode: no vllm image."""
    from scripts.preflight import _download_size_mb
    assert "vllm_image" not in _download_size_mb("dual-gpu", "ollama")


def test_download_size_dual_gpu_vllm_mode():
    """dual-gpu + vllm mode: adds a ~10 GB vllm image."""
    from scripts.preflight import _download_size_mb
    sizes = _download_size_mb("dual-gpu", "vllm")
    assert "vllm_image" in sizes
    assert sizes["vllm_image"] >= 9000  # at least 9 GB


def test_download_size_dual_gpu_mixed_mode():
    """dual-gpu + mixed mode: also includes the vllm image."""
    from scripts.preflight import _download_size_mb
    assert "vllm_image" in _download_size_mb("dual-gpu", "mixed")
# ── Mixed-mode VRAM warning ────────────────────────────────────────────────────
def test_mixed_mode_vram_warning_triggered():
    """Should return a warning string when GPU 1 has < 12 GB free in mixed mode."""
    from scripts.preflight import _mixed_mode_vram_warning
    gpus = [
        {"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
        {"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 8.0},  # tight
    ]
    message = _mixed_mode_vram_warning(gpus, "mixed")
    assert message is not None
    assert "8.0" in message or "GPU 1" in message


def test_mixed_mode_vram_warning_not_triggered_with_headroom():
    """Should return None when GPU 1 has >= 12 GB free."""
    from scripts.preflight import _mixed_mode_vram_warning
    gpus = [
        {"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
        {"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": 18.0},  # plenty
    ]
    assert _mixed_mode_vram_warning(gpus, "mixed") is None


def test_mixed_mode_vram_warning_not_triggered_for_other_modes():
    """The warning only applies in mixed mode, even with a starved GPU 1."""
    from scripts.preflight import _mixed_mode_vram_warning
    gpus = [
        {"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
        {"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 6.0},
    ]
    for mode in ("ollama", "vllm"):
        assert _mixed_mode_vram_warning(gpus, mode) is None
# ── update_llm_yaml with ollama_research ──────────────────────────────────────
def test_update_llm_yaml_sets_ollama_research_url_docker_internal():
    """ollama_research backend URL must be set to ollama_research:11434 when Docker-owned."""
    from scripts.preflight import update_llm_yaml
    backends = {
        "ollama": {"base_url": "http://old", "type": "openai_compat"},
        "ollama_research": {"base_url": "http://old", "type": "openai_compat"},
        "vllm": {"base_url": "http://old", "type": "openai_compat"},
        "vllm_research": {"base_url": "http://old", "type": "openai_compat"},
        "vision_service": {"base_url": "http://old", "type": "vision_service"},
    }
    # Local name deliberately avoids shadowing pytest's tmp_path fixture.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as handle:
        yaml.dump({"backends": backends}, handle)
    cfg_path = Path(handle.name)
    ports = {
        "ollama": {"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"},
        "ollama_research": {"resolved": 11435, "external": False, "env_var": "OLLAMA_RESEARCH_PORT"},
        "vllm": {"resolved": 8000, "external": False, "env_var": "VLLM_PORT"},
        "vision": {"resolved": 8002, "external": False, "env_var": "VISION_PORT"},
    }
    try:
        with patch("scripts.preflight.LLM_YAML", cfg_path):
            update_llm_yaml(ports)
        updated = yaml.safe_load(cfg_path.read_text())["backends"]
        assert updated["ollama_research"]["base_url"] == "http://ollama_research:11434/v1"
        assert updated["vllm_research"]["base_url"] == updated["vllm"]["base_url"]
    finally:
        cfg_path.unlink()


def test_update_llm_yaml_sets_ollama_research_url_external():
    """When ollama_research is external (adopted), URL uses host.docker.internal:11435."""
    from scripts.preflight import update_llm_yaml
    backends = {
        "ollama": {"base_url": "http://old", "type": "openai_compat"},
        "ollama_research": {"base_url": "http://old", "type": "openai_compat"},
    }
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as handle:
        yaml.dump({"backends": backends}, handle)
    cfg_path = Path(handle.name)
    ports = {
        "ollama": {"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"},
        "ollama_research": {"resolved": 11435, "external": True, "env_var": "OLLAMA_RESEARCH_PORT"},
    }
    try:
        with patch("scripts.preflight.LLM_YAML", cfg_path):
            update_llm_yaml(ports)
        updated = yaml.safe_load(cfg_path.read_text())["backends"]
        assert updated["ollama_research"]["base_url"] == "http://host.docker.internal:11435/v1"
    finally:
        cfg_path.unlink()