Compare commits
16 commits
535190f8f7
...
11f6334f28
| Author | SHA1 | Date | |
|---|---|---|---|
| 11f6334f28 | |||
| 7ef95dd9ba | |||
| e6d5bb2c1a | |||
| 5d35257a23 | |||
| c223653722 | |||
| 44c3d9a5d6 | |||
| b03e5f6c57 | |||
| c9d7b810f6 | |||
| dd40a84174 | |||
| baa862bc14 | |||
| e99b3703f1 | |||
| a66811dd69 | |||
| 8c7faabc56 | |||
| 41e0fe7f55 | |||
| a9e84521c0 | |||
| a8fd53f28c |
19 changed files with 3796 additions and 19 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -35,3 +35,6 @@ config/user.yaml.working
|
||||||
|
|
||||||
# Claude context files — kept out of version control
|
# Claude context files — kept out of version control
|
||||||
CLAUDE.md
|
CLAUDE.md
|
||||||
|
|
||||||
|
data/email_score.jsonl
|
||||||
|
data/email_compare_sample.jsonl
|
||||||
|
|
|
||||||
4
Makefile
4
Makefile
|
|
@ -23,6 +23,7 @@ COMPOSE ?= $(shell \
|
||||||
# compose.override.yml. We must include it explicitly when present.
|
# compose.override.yml. We must include it explicitly when present.
|
||||||
OVERRIDE_FILE := $(wildcard compose.override.yml)
|
OVERRIDE_FILE := $(wildcard compose.override.yml)
|
||||||
COMPOSE_OVERRIDE := $(if $(OVERRIDE_FILE),-f compose.override.yml,)
|
COMPOSE_OVERRIDE := $(if $(OVERRIDE_FILE),-f compose.override.yml,)
|
||||||
|
DUAL_GPU_MODE ?= $(shell grep -m1 '^DUAL_GPU_MODE=' .env 2>/dev/null | cut -d= -f2 || echo ollama)
|
||||||
|
|
||||||
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE)
|
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE)
|
||||||
ifneq (,$(findstring podman,$(COMPOSE)))
|
ifneq (,$(findstring podman,$(COMPOSE)))
|
||||||
|
|
@ -34,6 +35,9 @@ else
|
||||||
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
|
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
ifeq ($(PROFILE),dual-gpu)
|
||||||
|
COMPOSE_FILES += --profile dual-gpu-$(DUAL_GPU_MODE)
|
||||||
|
endif
|
||||||
|
|
||||||
# 'remote' means base services only — no services are tagged 'remote' in compose.yml,
|
# 'remote' means base services only — no services are tagged 'remote' in compose.yml,
|
||||||
# so --profile remote is a no-op with Docker and a fatal error on old podman-compose.
|
# so --profile remote is a no-op with Docker and a fatal error on old podman-compose.
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,15 @@ services:
|
||||||
device_ids: ["0"]
|
device_ids: ["0"]
|
||||||
capabilities: [gpu]
|
capabilities: [gpu]
|
||||||
|
|
||||||
|
ollama_research:
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
device_ids: ["1"]
|
||||||
|
capabilities: [gpu]
|
||||||
|
|
||||||
vision:
|
vision:
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
|
|
|
||||||
|
|
@ -18,6 +18,14 @@ services:
|
||||||
reservations:
|
reservations:
|
||||||
devices: []
|
devices: []
|
||||||
|
|
||||||
|
ollama_research:
|
||||||
|
devices:
|
||||||
|
- nvidia.com/gpu=1
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices: []
|
||||||
|
|
||||||
vision:
|
vision:
|
||||||
devices:
|
devices:
|
||||||
- nvidia.com/gpu=0
|
- nvidia.com/gpu=0
|
||||||
|
|
|
||||||
22
compose.yml
22
compose.yml
|
|
@ -1,5 +1,5 @@
|
||||||
# compose.yml — Peregrine by Circuit Forge LLC
|
# compose.yml — Peregrine by Circuit Forge LLC
|
||||||
# Profiles: remote | cpu | single-gpu | dual-gpu
|
# Profiles: remote | cpu | single-gpu | dual-gpu-ollama | dual-gpu-vllm | dual-gpu-mixed
|
||||||
services:
|
services:
|
||||||
|
|
||||||
app:
|
app:
|
||||||
|
|
@ -52,7 +52,21 @@ services:
|
||||||
- OLLAMA_MODELS=/root/.ollama
|
- OLLAMA_MODELS=/root/.ollama
|
||||||
- DEFAULT_OLLAMA_MODEL=${OLLAMA_DEFAULT_MODEL:-llama3.2:3b}
|
- DEFAULT_OLLAMA_MODEL=${OLLAMA_DEFAULT_MODEL:-llama3.2:3b}
|
||||||
entrypoint: ["/bin/bash", "/entrypoint.sh"]
|
entrypoint: ["/bin/bash", "/entrypoint.sh"]
|
||||||
profiles: [cpu, single-gpu, dual-gpu]
|
profiles: [cpu, single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
|
ollama_research:
|
||||||
|
image: ollama/ollama:latest
|
||||||
|
ports:
|
||||||
|
- "${OLLAMA_RESEARCH_PORT:-11435}:11434"
|
||||||
|
volumes:
|
||||||
|
- ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama
|
||||||
|
- ./docker/ollama/entrypoint.sh:/entrypoint.sh
|
||||||
|
environment:
|
||||||
|
- OLLAMA_MODELS=/root/.ollama
|
||||||
|
- DEFAULT_OLLAMA_MODEL=${OLLAMA_RESEARCH_MODEL:-llama3.2:3b}
|
||||||
|
entrypoint: ["/bin/bash", "/entrypoint.sh"]
|
||||||
|
profiles: [dual-gpu-ollama, dual-gpu-mixed]
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
vision:
|
vision:
|
||||||
|
|
@ -64,7 +78,7 @@ services:
|
||||||
environment:
|
environment:
|
||||||
- VISION_MODEL=${VISION_MODEL:-vikhyatk/moondream2}
|
- VISION_MODEL=${VISION_MODEL:-vikhyatk/moondream2}
|
||||||
- VISION_REVISION=${VISION_REVISION:-2025-01-09}
|
- VISION_REVISION=${VISION_REVISION:-2025-01-09}
|
||||||
profiles: [single-gpu, dual-gpu]
|
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
vllm:
|
vllm:
|
||||||
|
|
@ -81,7 +95,7 @@ services:
|
||||||
--enforce-eager
|
--enforce-eager
|
||||||
--max-num-seqs 8
|
--max-num-seqs 8
|
||||||
--cpu-offload-gb ${CPU_OFFLOAD_GB:-0}
|
--cpu-offload-gb ${CPU_OFFLOAD_GB:-0}
|
||||||
profiles: [dual-gpu]
|
profiles: [dual-gpu-vllm, dual-gpu-mixed]
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
finetune:
|
finetune:
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,13 @@ backends:
|
||||||
model: __auto__
|
model: __auto__
|
||||||
supports_images: false
|
supports_images: false
|
||||||
type: openai_compat
|
type: openai_compat
|
||||||
|
vllm_research:
|
||||||
|
api_key: ''
|
||||||
|
base_url: http://host.docker.internal:8000/v1
|
||||||
|
enabled: true
|
||||||
|
model: __auto__
|
||||||
|
supports_images: false
|
||||||
|
type: openai_compat
|
||||||
fallback_order:
|
fallback_order:
|
||||||
- ollama
|
- ollama
|
||||||
- claude_code
|
- claude_code
|
||||||
|
|
@ -53,7 +60,7 @@ fallback_order:
|
||||||
- anthropic
|
- anthropic
|
||||||
research_fallback_order:
|
research_fallback_order:
|
||||||
- claude_code
|
- claude_code
|
||||||
- vllm
|
- vllm_research
|
||||||
- ollama_research
|
- ollama_research
|
||||||
- github_copilot
|
- github_copilot
|
||||||
- anthropic
|
- anthropic
|
||||||
|
|
|
||||||
8
data/email_score.jsonl.example
Normal file
8
data/email_score.jsonl.example
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
{"subject": "Interview Invitation — Senior Engineer", "body": "Hi Meghan, we'd love to schedule a 30-min phone screen. Are you available Thursday at 2pm? Please reply to confirm.", "label": "interview_scheduled"}
|
||||||
|
{"subject": "Your application to Acme Corp", "body": "Thank you for your interest in the Senior Engineer role. After careful consideration, we have decided to move forward with other candidates whose experience more closely matches our current needs.", "label": "rejected"}
|
||||||
|
{"subject": "Offer Letter — Product Manager at Initech", "body": "Dear Meghan, we are thrilled to extend an offer of employment for the Product Manager position. Please find the attached offer letter outlining compensation and start date.", "label": "offer_received"}
|
||||||
|
{"subject": "Quick question about your background", "body": "Hi Meghan, I came across your profile and would love to connect. We have a few roles that seem like a great match. Would you be open to a brief chat this week?", "label": "positive_response"}
|
||||||
|
{"subject": "Company Culture Survey — Acme Corp", "body": "Meghan, as part of our evaluation process, we invite all candidates to complete our culture fit assessment. The survey takes approximately 15 minutes. Please click the link below.", "label": "survey_received"}
|
||||||
|
{"subject": "Application Received — DataCo", "body": "Thank you for submitting your application for the Data Engineer role at DataCo. We have received your materials and will be in touch if your qualifications match our needs.", "label": "neutral"}
|
||||||
|
{"subject": "Following up on your application", "body": "Hi Meghan, I wanted to follow up on your recent application. Your background looks interesting and we'd like to learn more. Can we set up a quick call?", "label": "positive_response"}
|
||||||
|
{"subject": "We're moving forward with other candidates", "body": "Dear Meghan, thank you for taking the time to interview with us. After thoughtful consideration, we have decided not to move forward with your candidacy at this time.", "label": "rejected"}
|
||||||
257
docs/plans/2026-02-26-dual-gpu-design.md
Normal file
257
docs/plans/2026-02-26-dual-gpu-design.md
Normal file
|
|
@ -0,0 +1,257 @@
|
||||||
|
# Peregrine — Dual-GPU / Dual-Inference Design
|
||||||
|
|
||||||
|
**Date:** 2026-02-26
|
||||||
|
**Status:** Approved — ready for implementation
|
||||||
|
**Scope:** Peregrine (reference impl; patterns propagate to future products)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Goal
|
||||||
|
|
||||||
|
Replace the fixed `dual-gpu` profile (Ollama + vLLM hardwired to GPU 0 + GPU 1) with a
|
||||||
|
`DUAL_GPU_MODE` env var that selects which inference stack occupies GPU 1. Simultaneously
|
||||||
|
add a first-run download size warning to preflight so users know what they're in for before
|
||||||
|
Docker starts pulling images and models.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Modes
|
||||||
|
|
||||||
|
| `DUAL_GPU_MODE` | GPU 0 | GPU 1 | Research backend |
|
||||||
|
|-----------------|-------|-------|-----------------|
|
||||||
|
| `ollama` (default) | ollama + vision | ollama_research | `ollama_research` |
|
||||||
|
| `vllm` | ollama + vision | vllm | `vllm_research` |
|
||||||
|
| `mixed` | ollama + vision | ollama_research + vllm (VRAM-split) | `vllm_research` → `ollama_research` fallback |
|
||||||
|
|
||||||
|
`mixed` requires sufficient VRAM on GPU 1. Preflight warns (not blocks) when GPU 1 has
|
||||||
|
< 12 GB free before starting in mixed mode.
|
||||||
|
|
||||||
|
Cover letters always use `ollama` on GPU 0. Research uses whichever GPU 1 backend is
|
||||||
|
reachable. The LLM router's `_is_reachable()` check handles this transparently — the
|
||||||
|
fallback chain simply skips services that aren't running.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Compose Profile Architecture
|
||||||
|
|
||||||
|
Docker Compose profiles used to gate which services start per mode.
|
||||||
|
`DUAL_GPU_MODE` is read by the Makefile and passed as a second `--profile` flag.
|
||||||
|
|
||||||
|
### Service → profile mapping
|
||||||
|
|
||||||
|
| Service | Profiles |
|
||||||
|
|---------|---------|
|
||||||
|
| `ollama` | `cpu`, `single-gpu`, `dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed` |
|
||||||
|
| `vision` | `single-gpu`, `dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed` |
|
||||||
|
| `ollama_research` | `dual-gpu-ollama`, `dual-gpu-mixed` |
|
||||||
|
| `vllm` | `dual-gpu-vllm`, `dual-gpu-mixed` |
|
||||||
|
| `finetune` | `finetune` |
|
||||||
|
|
||||||
|
User-facing profiles remain: `remote`, `cpu`, `single-gpu`, `dual-gpu`.
|
||||||
|
Sub-profiles (`dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed`) are injected by the
|
||||||
|
Makefile and never typed by the user.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## File Changes
|
||||||
|
|
||||||
|
### `compose.yml`
|
||||||
|
|
||||||
|
**`ollama`** — add all dual-gpu sub-profiles to `profiles`:
|
||||||
|
```yaml
|
||||||
|
profiles: [cpu, single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
||||||
|
```
|
||||||
|
|
||||||
|
**`vision`** — same pattern:
|
||||||
|
```yaml
|
||||||
|
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
||||||
|
```
|
||||||
|
|
||||||
|
**`vllm`** — change from `[dual-gpu]` to:
|
||||||
|
```yaml
|
||||||
|
profiles: [dual-gpu-vllm, dual-gpu-mixed]
|
||||||
|
```
|
||||||
|
|
||||||
|
**`ollama_research`** — new service:
|
||||||
|
```yaml
|
||||||
|
ollama_research:
|
||||||
|
image: ollama/ollama:latest
|
||||||
|
ports:
|
||||||
|
- "${OLLAMA_RESEARCH_PORT:-11435}:11434"
|
||||||
|
volumes:
|
||||||
|
- ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama # shared — no double download
|
||||||
|
- ./docker/ollama/entrypoint.sh:/entrypoint.sh
|
||||||
|
environment:
|
||||||
|
- OLLAMA_MODELS=/root/.ollama
|
||||||
|
- DEFAULT_OLLAMA_MODEL=${OLLAMA_RESEARCH_MODEL:-llama3.2:3b}
|
||||||
|
entrypoint: ["/bin/bash", "/entrypoint.sh"]
|
||||||
|
profiles: [dual-gpu-ollama, dual-gpu-mixed]
|
||||||
|
restart: unless-stopped
|
||||||
|
```
|
||||||
|
|
||||||
|
### `compose.gpu.yml`
|
||||||
|
|
||||||
|
Add `ollama_research` block (GPU 1). `vllm` stays on GPU 1 as-is:
|
||||||
|
```yaml
|
||||||
|
ollama_research:
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
device_ids: ["1"]
|
||||||
|
capabilities: [gpu]
|
||||||
|
```
|
||||||
|
|
||||||
|
### `compose.podman-gpu.yml`
|
||||||
|
|
||||||
|
Same addition for Podman CDI:
|
||||||
|
```yaml
|
||||||
|
ollama_research:
|
||||||
|
devices:
|
||||||
|
- nvidia.com/gpu=1
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices: []
|
||||||
|
```
|
||||||
|
|
||||||
|
### `Makefile`
|
||||||
|
|
||||||
|
Two additions after existing `COMPOSE` detection:
|
||||||
|
|
||||||
|
```makefile
|
||||||
|
DUAL_GPU_MODE ?= $(shell grep -m1 '^DUAL_GPU_MODE=' .env 2>/dev/null | cut -d= -f2 || echo ollama)
|
||||||
|
|
||||||
|
# GPU overlay: matches single-gpu, dual-gpu (findstring gpu already covers these)
|
||||||
|
# Sub-profile injection for dual-gpu modes:
|
||||||
|
ifeq ($(PROFILE),dual-gpu)
|
||||||
|
COMPOSE_FILES += --profile dual-gpu-$(DUAL_GPU_MODE)
|
||||||
|
endif
|
||||||
|
```
|
||||||
|
|
||||||
|
Update `manage.sh` usage block to document `dual-gpu` profile with `DUAL_GPU_MODE` note:
|
||||||
|
```
|
||||||
|
dual-gpu Ollama + Vision on GPU 0; GPU 1 mode set by DUAL_GPU_MODE
|
||||||
|
DUAL_GPU_MODE=ollama (default) ollama_research on GPU 1
|
||||||
|
DUAL_GPU_MODE=vllm vllm on GPU 1
|
||||||
|
DUAL_GPU_MODE=mixed both on GPU 1 (VRAM-split; see preflight warning)
|
||||||
|
```
|
||||||
|
|
||||||
|
### `scripts/preflight.py`
|
||||||
|
|
||||||
|
**1. `_SERVICES` — add `ollama_research`:**
|
||||||
|
```python
|
||||||
|
"ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
|
||||||
|
```
|
||||||
|
|
||||||
|
**2. `_LLM_BACKENDS` — add entries for both new backends:**
|
||||||
|
```python
|
||||||
|
"ollama_research": [("ollama_research", "/v1")],
|
||||||
|
# vllm_research is an alias for vllm's port — preflight updates base_url for both:
|
||||||
|
"vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
|
||||||
|
```
|
||||||
|
|
||||||
|
**3. `_DOCKER_INTERNAL` — add `ollama_research`:**
|
||||||
|
```python
|
||||||
|
"ollama_research": ("ollama_research", 11434), # container-internal port is always 11434
|
||||||
|
```
|
||||||
|
|
||||||
|
**4. `recommend_profile()` — unchanged** (still returns `"dual-gpu"` for 2 GPUs).
|
||||||
|
Write `DUAL_GPU_MODE=ollama` to `.env` when first setting up a 2-GPU system.
|
||||||
|
|
||||||
|
**5. Mixed-mode VRAM warning** — after GPU resource section, before closing line:
|
||||||
|
```python
|
||||||
|
dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
|
||||||
|
if dual_gpu_mode == "mixed" and len(gpus) >= 2:
|
||||||
|
if gpus[1]["vram_free_gb"] < 12:
|
||||||
|
print(f"║ ⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {gpus[1]['vram_free_gb']:.1f} GB free")
|
||||||
|
print(f"║ Running ollama_research + vllm together may cause OOM.")
|
||||||
|
print(f"║ Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm instead.")
|
||||||
|
```
|
||||||
|
|
||||||
|
**6. Download size warning** — profile-aware block added just before the closing `╚` line:
|
||||||
|
|
||||||
|
```
|
||||||
|
║ Download sizes (first-run estimates)
|
||||||
|
║ Docker images
|
||||||
|
║ ollama/ollama ~800 MB (shared by ollama + ollama_research)
|
||||||
|
║ searxng/searxng ~300 MB
|
||||||
|
║ app (Python build) ~1.5 GB
|
||||||
|
║ vision service ~3.0 GB [single-gpu and above]
|
||||||
|
║ vllm/vllm-openai ~10.0 GB [vllm / mixed mode only]
|
||||||
|
║
|
||||||
|
║ Model weights (lazy-loaded on first use)
|
||||||
|
║ llama3.2:3b ~2.0 GB → OLLAMA_MODELS_DIR
|
||||||
|
║ moondream2 ~1.8 GB → vision container cache [single-gpu+]
|
||||||
|
║ Note: ollama + ollama_research share the same model dir — no double download
|
||||||
|
║
|
||||||
|
║ ⚠ Total first-run: ~X GB (models persist between restarts)
|
||||||
|
```
|
||||||
|
|
||||||
|
Total is summed at runtime based on active profile + `DUAL_GPU_MODE`.
|
||||||
|
|
||||||
|
Size table (used by the warning calculator):
|
||||||
|
| Component | Size | Condition |
|
||||||
|
|-----------|------|-----------|
|
||||||
|
| `ollama/ollama` image | 800 MB | cpu, single-gpu, dual-gpu |
|
||||||
|
| `searxng/searxng` image | 300 MB | always |
|
||||||
|
| app image | 1,500 MB | always |
|
||||||
|
| vision service image | 3,000 MB | single-gpu, dual-gpu |
|
||||||
|
| `vllm/vllm-openai` image | 10,000 MB | vllm or mixed mode |
|
||||||
|
| llama3.2:3b weights | 2,000 MB | cpu, single-gpu, dual-gpu |
|
||||||
|
| moondream2 weights | 1,800 MB | single-gpu, dual-gpu |
|
||||||
|
|
||||||
|
### `config/llm.yaml`
|
||||||
|
|
||||||
|
**Add `vllm_research` backend:**
|
||||||
|
```yaml
|
||||||
|
vllm_research:
|
||||||
|
api_key: ''
|
||||||
|
base_url: http://host.docker.internal:8000/v1 # same port as vllm; preflight keeps in sync
|
||||||
|
enabled: true
|
||||||
|
model: __auto__
|
||||||
|
supports_images: false
|
||||||
|
type: openai_compat
|
||||||
|
```
|
||||||
|
|
||||||
|
**Update `research_fallback_order`:**
|
||||||
|
```yaml
|
||||||
|
research_fallback_order:
|
||||||
|
- claude_code
|
||||||
|
- vllm_research
|
||||||
|
- ollama_research
|
||||||
|
- github_copilot
|
||||||
|
- anthropic
|
||||||
|
```
|
||||||
|
|
||||||
|
`vllm` stays in the main `fallback_order` (cover letters). `vllm_research` is the explicit
|
||||||
|
research alias for the same service — different config key, same port, makes routing intent
|
||||||
|
readable in the YAML.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Downstream Compatibility
|
||||||
|
|
||||||
|
The LLM router requires no changes. `_is_reachable()` already skips backends that aren't
|
||||||
|
responding. When `DUAL_GPU_MODE=ollama`, `vllm_research` is unreachable and skipped;
|
||||||
|
`ollama_research` is up and used. When `DUAL_GPU_MODE=vllm`, the reverse. `mixed` mode
|
||||||
|
makes both reachable; `vllm_research` wins as the higher-priority entry.
|
||||||
|
|
||||||
|
Preflight's `update_llm_yaml()` keeps `base_url` values correct for both adopted (external)
|
||||||
|
and Docker-internal routing automatically, since `vllm_research` is registered under the
|
||||||
|
`"vllm"` key in `_LLM_BACKENDS`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Future Considerations
|
||||||
|
|
||||||
|
- **Triple-GPU / 3+ service configs:** When a third product is active, extract this pattern
|
||||||
|
into `circuitforge-core` as a reusable inference topology manager.
|
||||||
|
- **Dual vLLM:** Two vLLM instances (e.g., different model sizes per task) follows the same
|
||||||
|
pattern — add `vllm_research` as a separate compose service on its own port.
|
||||||
|
- **VRAM-aware model selection:** Preflight could suggest smaller models when VRAM is tight
|
||||||
|
in mixed mode (e.g., swap llama3.2:3b → llama3.2:1b for the research instance).
|
||||||
|
- **Queue optimizer (1-GPU / CPU):** When only one inference backend is available and a batch
|
||||||
|
of tasks is queued, group by task type (all cover letters first, then all research briefs)
|
||||||
|
to avoid repeated model context switches. Tracked separately.
|
||||||
811
docs/plans/2026-02-26-dual-gpu-plan.md
Normal file
811
docs/plans/2026-02-26-dual-gpu-plan.md
Normal file
|
|
@ -0,0 +1,811 @@
|
||||||
|
# Dual-GPU / Dual-Inference Implementation Plan
|
||||||
|
|
||||||
|
> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task.
|
||||||
|
|
||||||
|
**Goal:** Add `DUAL_GPU_MODE=ollama|vllm|mixed` env var that gates which inference service occupies GPU 1 on dual-GPU systems, plus a first-run download size warning in preflight.
|
||||||
|
|
||||||
|
**Architecture:** Sub-profiles (`dual-gpu-ollama`, `dual-gpu-vllm`, `dual-gpu-mixed`) are injected alongside `--profile dual-gpu` by the Makefile based on `DUAL_GPU_MODE`. The LLM router requires zero changes — `_is_reachable()` naturally skips backends that aren't running. Preflight gains `ollama_research` as a tracked service and emits a size warning block.
|
||||||
|
|
||||||
|
**Tech Stack:** Docker Compose profiles, Python (preflight.py), YAML (llm.yaml, compose files), bash (Makefile, manage.sh)
|
||||||
|
|
||||||
|
**Design doc:** `docs/plans/2026-02-26-dual-gpu-design.md`
|
||||||
|
|
||||||
|
**Test runner:** `conda run -n job-seeker python -m pytest tests/ -v`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 1: Update `config/llm.yaml`
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `config/llm.yaml`
|
||||||
|
|
||||||
|
**Step 1: Add `vllm_research` backend and update `research_fallback_order`**
|
||||||
|
|
||||||
|
Open `config/llm.yaml`. After the `vllm:` block, add:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
vllm_research:
|
||||||
|
api_key: ''
|
||||||
|
base_url: http://host.docker.internal:8000/v1
|
||||||
|
enabled: true
|
||||||
|
model: __auto__
|
||||||
|
supports_images: false
|
||||||
|
type: openai_compat
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace `research_fallback_order:` section with:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
research_fallback_order:
|
||||||
|
- claude_code
|
||||||
|
- vllm_research
|
||||||
|
- ollama_research
|
||||||
|
- github_copilot
|
||||||
|
- anthropic
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Verify YAML parses cleanly**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
conda run -n job-seeker python -c "import yaml; yaml.safe_load(open('config/llm.yaml'))"
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: no output (no error).
|
||||||
|
|
||||||
|
**Step 3: Run existing llm config test**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
conda run -n job-seeker python -m pytest tests/test_llm_router.py::test_config_loads -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: PASS
|
||||||
|
|
||||||
|
**Step 4: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add config/llm.yaml
|
||||||
|
git commit -m "feat: add vllm_research backend and update research_fallback_order"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 2: Write failing tests for preflight changes
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Create: `tests/test_preflight.py`
|
||||||
|
|
||||||
|
No existing test file for preflight. Write all tests upfront — they fail until Tasks 3–5 implement the code.
|
||||||
|
|
||||||
|
**Step 1: Create `tests/test_preflight.py`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
"""Tests for scripts/preflight.py additions: dual-GPU service table, size warning, VRAM check."""
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
import yaml
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
# ── Service table ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_ollama_research_in_services():
|
||||||
|
"""ollama_research must be in _SERVICES at port 11435."""
|
||||||
|
from scripts.preflight import _SERVICES
|
||||||
|
assert "ollama_research" in _SERVICES
|
||||||
|
_, default_port, env_var, docker_owned, adoptable = _SERVICES["ollama_research"]
|
||||||
|
assert default_port == 11435
|
||||||
|
assert env_var == "OLLAMA_RESEARCH_PORT"
|
||||||
|
assert docker_owned is True
|
||||||
|
assert adoptable is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_ollama_research_in_llm_backends():
|
||||||
|
"""ollama_research must be a standalone key in _LLM_BACKENDS (not nested under ollama)."""
|
||||||
|
from scripts.preflight import _LLM_BACKENDS
|
||||||
|
assert "ollama_research" in _LLM_BACKENDS
|
||||||
|
# Should map to the ollama_research llm backend
|
||||||
|
backend_names = [name for name, _ in _LLM_BACKENDS["ollama_research"]]
|
||||||
|
assert "ollama_research" in backend_names
|
||||||
|
|
||||||
|
|
||||||
|
def test_vllm_research_in_llm_backends():
|
||||||
|
"""vllm_research must be registered under vllm in _LLM_BACKENDS."""
|
||||||
|
from scripts.preflight import _LLM_BACKENDS
|
||||||
|
assert "vllm" in _LLM_BACKENDS
|
||||||
|
backend_names = [name for name, _ in _LLM_BACKENDS["vllm"]]
|
||||||
|
assert "vllm_research" in backend_names
|
||||||
|
|
||||||
|
|
||||||
|
def test_ollama_research_in_docker_internal():
|
||||||
|
"""ollama_research must map to internal port 11434 (Ollama's container port)."""
|
||||||
|
from scripts.preflight import _DOCKER_INTERNAL
|
||||||
|
assert "ollama_research" in _DOCKER_INTERNAL
|
||||||
|
hostname, port = _DOCKER_INTERNAL["ollama_research"]
|
||||||
|
assert hostname == "ollama_research"
|
||||||
|
assert port == 11434 # container-internal port is always 11434
|
||||||
|
|
||||||
|
|
||||||
|
def test_ollama_not_mapped_to_ollama_research_backend():
|
||||||
|
"""ollama service key must only update the ollama llm backend, not ollama_research."""
|
||||||
|
from scripts.preflight import _LLM_BACKENDS
|
||||||
|
ollama_backend_names = [name for name, _ in _LLM_BACKENDS.get("ollama", [])]
|
||||||
|
assert "ollama_research" not in ollama_backend_names
|
||||||
|
|
||||||
|
|
||||||
|
# ── Download size warning ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_download_size_remote_profile():
|
||||||
|
"""Remote profile: only searxng + app, no ollama, no vision, no vllm."""
|
||||||
|
from scripts.preflight import _download_size_mb
|
||||||
|
sizes = _download_size_mb("remote", "ollama")
|
||||||
|
assert "searxng" in sizes
|
||||||
|
assert "app" in sizes
|
||||||
|
assert "ollama" not in sizes
|
||||||
|
assert "vision_image" not in sizes
|
||||||
|
assert "vllm_image" not in sizes
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_size_cpu_profile():
|
||||||
|
"""CPU profile: adds ollama image + llama3.2:3b weights."""
|
||||||
|
from scripts.preflight import _download_size_mb
|
||||||
|
sizes = _download_size_mb("cpu", "ollama")
|
||||||
|
assert "ollama" in sizes
|
||||||
|
assert "llama3_2_3b" in sizes
|
||||||
|
assert "vision_image" not in sizes
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_size_single_gpu_profile():
|
||||||
|
"""Single-GPU: adds vision image + moondream2 weights."""
|
||||||
|
from scripts.preflight import _download_size_mb
|
||||||
|
sizes = _download_size_mb("single-gpu", "ollama")
|
||||||
|
assert "vision_image" in sizes
|
||||||
|
assert "moondream2" in sizes
|
||||||
|
assert "vllm_image" not in sizes
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_size_dual_gpu_ollama_mode():
|
||||||
|
"""dual-gpu + ollama mode: no vllm image."""
|
||||||
|
from scripts.preflight import _download_size_mb
|
||||||
|
sizes = _download_size_mb("dual-gpu", "ollama")
|
||||||
|
assert "vllm_image" not in sizes
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_size_dual_gpu_vllm_mode():
|
||||||
|
"""dual-gpu + vllm mode: adds ~10 GB vllm image."""
|
||||||
|
from scripts.preflight import _download_size_mb
|
||||||
|
sizes = _download_size_mb("dual-gpu", "vllm")
|
||||||
|
assert "vllm_image" in sizes
|
||||||
|
assert sizes["vllm_image"] >= 9000 # at least 9 GB
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_size_dual_gpu_mixed_mode():
|
||||||
|
"""dual-gpu + mixed mode: also includes vllm image."""
|
||||||
|
from scripts.preflight import _download_size_mb
|
||||||
|
sizes = _download_size_mb("dual-gpu", "mixed")
|
||||||
|
assert "vllm_image" in sizes
|
||||||
|
|
||||||
|
|
||||||
|
# ── Mixed-mode VRAM warning ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_mixed_mode_vram_warning_triggered():
|
||||||
|
"""Should return a warning string when GPU 1 has < 12 GB free in mixed mode."""
|
||||||
|
from scripts.preflight import _mixed_mode_vram_warning
|
||||||
|
gpus = [
|
||||||
|
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
|
||||||
|
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 8.0}, # tight
|
||||||
|
]
|
||||||
|
warning = _mixed_mode_vram_warning(gpus, "mixed")
|
||||||
|
assert warning is not None
|
||||||
|
assert "8.0" in warning or "GPU 1" in warning
|
||||||
|
|
||||||
|
|
||||||
|
def test_mixed_mode_vram_warning_not_triggered_with_headroom():
|
||||||
|
"""Should return None when GPU 1 has >= 12 GB free."""
|
||||||
|
from scripts.preflight import _mixed_mode_vram_warning
|
||||||
|
gpus = [
|
||||||
|
{"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
|
||||||
|
{"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": 18.0}, # plenty
|
||||||
|
]
|
||||||
|
warning = _mixed_mode_vram_warning(gpus, "mixed")
|
||||||
|
assert warning is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_mixed_mode_vram_warning_not_triggered_for_other_modes():
|
||||||
|
"""Warning only applies in mixed mode."""
|
||||||
|
from scripts.preflight import _mixed_mode_vram_warning
|
||||||
|
gpus = [
|
||||||
|
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 20.0},
|
||||||
|
{"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": 6.0},
|
||||||
|
]
|
||||||
|
assert _mixed_mode_vram_warning(gpus, "ollama") is None
|
||||||
|
assert _mixed_mode_vram_warning(gpus, "vllm") is None
|
||||||
|
|
||||||
|
|
||||||
|
# ── update_llm_yaml with ollama_research ──────────────────────────────────────
|
||||||
|
|
||||||
|
def test_update_llm_yaml_sets_ollama_research_url_docker_internal():
|
||||||
|
"""ollama_research backend URL must be set to ollama_research:11434 when Docker-owned."""
|
||||||
|
from scripts.preflight import update_llm_yaml
|
||||||
|
|
||||||
|
llm_cfg = {
|
||||||
|
"backends": {
|
||||||
|
"ollama": {"base_url": "http://old", "type": "openai_compat"},
|
||||||
|
"ollama_research": {"base_url": "http://old", "type": "openai_compat"},
|
||||||
|
"vllm": {"base_url": "http://old", "type": "openai_compat"},
|
||||||
|
"vllm_research": {"base_url": "http://old", "type": "openai_compat"},
|
||||||
|
"vision_service": {"base_url": "http://old", "type": "vision_service"},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
|
||||||
|
yaml.dump(llm_cfg, f)
|
||||||
|
tmp_path = Path(f.name)
|
||||||
|
|
||||||
|
ports = {
|
||||||
|
"ollama": {
|
||||||
|
"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"
|
||||||
|
},
|
||||||
|
"ollama_research": {
|
||||||
|
"resolved": 11435, "external": False, "env_var": "OLLAMA_RESEARCH_PORT"
|
||||||
|
},
|
||||||
|
"vllm": {
|
||||||
|
"resolved": 8000, "external": False, "env_var": "VLLM_PORT"
|
||||||
|
},
|
||||||
|
"vision": {
|
||||||
|
"resolved": 8002, "external": False, "env_var": "VISION_PORT"
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Patch LLM_YAML to point at our temp file
|
||||||
|
with patch("scripts.preflight.LLM_YAML", tmp_path):
|
||||||
|
update_llm_yaml(ports)
|
||||||
|
|
||||||
|
result = yaml.safe_load(tmp_path.read_text())
|
||||||
|
# Docker-internal: use service name + container port
|
||||||
|
assert result["backends"]["ollama_research"]["base_url"] == "http://ollama_research:11434/v1"
|
||||||
|
# vllm_research must match vllm's URL
|
||||||
|
assert result["backends"]["vllm_research"]["base_url"] == result["backends"]["vllm"]["base_url"]
|
||||||
|
finally:
|
||||||
|
tmp_path.unlink()
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_llm_yaml_sets_ollama_research_url_external():
    """When ollama_research is external (adopted), URL uses host.docker.internal:11435."""
    from scripts.preflight import update_llm_yaml

    config = {
        "backends": {
            "ollama": {"base_url": "http://old", "type": "openai_compat"},
            "ollama_research": {"base_url": "http://old", "type": "openai_compat"},
        }
    }

    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as handle:
        yaml.dump(config, handle)
        cfg_path = Path(handle.name)

    # ollama_research marked external → URL must route via host.docker.internal.
    port_map = {
        "ollama": {"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"},
        "ollama_research": {"resolved": 11435, "external": True, "env_var": "OLLAMA_RESEARCH_PORT"},
    }

    try:
        with patch("scripts.preflight.LLM_YAML", cfg_path):
            update_llm_yaml(port_map)
        updated = yaml.safe_load(cfg_path.read_text())
        assert updated["backends"]["ollama_research"]["base_url"] == "http://host.docker.internal:11435/v1"
    finally:
        cfg_path.unlink()
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run tests to confirm they all fail**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
conda run -n job-seeker python -m pytest tests/test_preflight.py -v 2>&1 | head -50
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: all FAIL with `ImportError` or `AssertionError` — that's correct.
|
||||||
|
|
||||||
|
**Step 3: Commit failing tests**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add tests/test_preflight.py
|
||||||
|
git commit -m "test: add failing tests for dual-gpu preflight additions"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 3: `preflight.py` — service table additions
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `scripts/preflight.py:46-67` (`_SERVICES`, `_LLM_BACKENDS`, `_DOCKER_INTERNAL`)
|
||||||
|
|
||||||
|
**Step 1: Update `_SERVICES`**
|
||||||
|
|
||||||
|
Find the `_SERVICES` dict (currently ends at the `"ollama"` entry). Add `ollama_research` as a new entry:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_SERVICES: dict[str, tuple[str, int, str, bool, bool]] = {
|
||||||
|
"streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False),
|
||||||
|
"searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True),
|
||||||
|
"vllm": ("vllm_port", 8000, "VLLM_PORT", True, True),
|
||||||
|
"vision": ("vision_port", 8002, "VISION_PORT", True, True),
|
||||||
|
"ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True),
|
||||||
|
"ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Update `_LLM_BACKENDS`**
|
||||||
|
|
||||||
|
Replace the existing dict:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_LLM_BACKENDS: dict[str, list[tuple[str, str]]] = {
|
||||||
|
"ollama": [("ollama", "/v1")],
|
||||||
|
"ollama_research": [("ollama_research", "/v1")],
|
||||||
|
"vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
|
||||||
|
"vision": [("vision_service", "")],
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Update `_DOCKER_INTERNAL`**
|
||||||
|
|
||||||
|
Add `ollama_research` entry:
|
||||||
|
|
||||||
|
```python
|
||||||
|
_DOCKER_INTERNAL: dict[str, tuple[str, int]] = {
|
||||||
|
"ollama": ("ollama", 11434),
|
||||||
|
"ollama_research": ("ollama_research", 11434), # container-internal port is always 11434
|
||||||
|
"vllm": ("vllm", 8000),
|
||||||
|
"vision": ("vision", 8002),
|
||||||
|
"searxng": ("searxng", 8080),
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Run service table tests**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
conda run -n job-seeker python -m pytest tests/test_preflight.py::test_ollama_research_in_services tests/test_preflight.py::test_ollama_research_in_llm_backends tests/test_preflight.py::test_vllm_research_in_llm_backends tests/test_preflight.py::test_ollama_research_in_docker_internal tests/test_preflight.py::test_ollama_not_mapped_to_ollama_research_backend tests/test_preflight.py::test_update_llm_yaml_sets_ollama_research_url_docker_internal tests/test_preflight.py::test_update_llm_yaml_sets_ollama_research_url_external -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: all PASS
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add scripts/preflight.py
|
||||||
|
git commit -m "feat: add ollama_research to preflight service table and LLM backend map"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 4: `preflight.py` — `_download_size_mb()` pure function
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `scripts/preflight.py` (add new function after `calc_cpu_offload_gb`)
|
||||||
|
|
||||||
|
**Step 1: Add the function**
|
||||||
|
|
||||||
|
After `calc_cpu_offload_gb()`, add:
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _download_size_mb(profile: str, dual_gpu_mode: str = "ollama") -> dict[str, int]:
|
||||||
|
"""
|
||||||
|
Return estimated first-run download sizes in MB, keyed by component name.
|
||||||
|
Profile-aware: only includes components that will actually be pulled.
|
||||||
|
"""
|
||||||
|
sizes: dict[str, int] = {
|
||||||
|
"searxng": 300,
|
||||||
|
"app": 1500,
|
||||||
|
}
|
||||||
|
if profile in ("cpu", "single-gpu", "dual-gpu"):
|
||||||
|
sizes["ollama"] = 800
|
||||||
|
sizes["llama3_2_3b"] = 2000
|
||||||
|
if profile in ("single-gpu", "dual-gpu"):
|
||||||
|
sizes["vision_image"] = 3000
|
||||||
|
sizes["moondream2"] = 1800
|
||||||
|
if profile == "dual-gpu" and dual_gpu_mode in ("vllm", "mixed"):
|
||||||
|
sizes["vllm_image"] = 10000
|
||||||
|
return sizes
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run download size tests**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
conda run -n job-seeker python -m pytest tests/test_preflight.py -k "download_size" -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: all PASS
|
||||||
|
|
||||||
|
**Step 3: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add scripts/preflight.py
|
||||||
|
git commit -m "feat: add _download_size_mb() pure function for preflight size warning"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 5: `preflight.py` — VRAM warning, size report block, DUAL_GPU_MODE default
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `scripts/preflight.py` (three additions to `main()` and a new helper)
|
||||||
|
|
||||||
|
**Step 1: Add `_mixed_mode_vram_warning()` after `_download_size_mb()`**
|
||||||
|
|
||||||
|
```python
|
||||||
|
def _mixed_mode_vram_warning(gpus: list[dict], dual_gpu_mode: str) -> str | None:
|
||||||
|
"""
|
||||||
|
Return a warning string if GPU 1 likely lacks VRAM for mixed mode, else None.
|
||||||
|
Only relevant when dual_gpu_mode == 'mixed' and at least 2 GPUs are present.
|
||||||
|
"""
|
||||||
|
if dual_gpu_mode != "mixed" or len(gpus) < 2:
|
||||||
|
return None
|
||||||
|
free = gpus[1]["vram_free_gb"]
|
||||||
|
if free < 12:
|
||||||
|
return (
|
||||||
|
f"⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {free:.1f} GB free — "
|
||||||
|
f"running ollama_research + vllm together may cause OOM. "
|
||||||
|
f"Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm."
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Run VRAM warning tests**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
conda run -n job-seeker python -m pytest tests/test_preflight.py -k "vram" -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: all PASS
|
||||||
|
|
||||||
|
**Step 3: Wire size warning into `main()` report block**
|
||||||
|
|
||||||
|
In `main()`, find the closing `print("╚═...═╝")` line. Add the size warning block just before it:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# ── Download size warning ──────────────────────────────────────────────
|
||||||
|
dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
|
||||||
|
sizes = _download_size_mb(profile, dual_gpu_mode)
|
||||||
|
total_mb = sum(sizes.values())
|
||||||
|
print("║")
|
||||||
|
print("║ Download sizes (first-run estimates)")
|
||||||
|
print("║ Docker images")
|
||||||
|
print(f"║ app (Python build) ~{sizes.get('app', 0):,} MB")
|
||||||
|
if "searxng" in sizes:
|
||||||
|
print(f"║ searxng/searxng ~{sizes['searxng']:,} MB")
|
||||||
|
if "ollama" in sizes:
|
||||||
|
shared_note = " (shared by ollama + ollama_research)" if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed") else ""
|
||||||
|
print(f"║ ollama/ollama ~{sizes['ollama']:,} MB{shared_note}")
|
||||||
|
if "vision_image" in sizes:
|
||||||
|
print(f"║ vision service ~{sizes['vision_image']:,} MB (torch + moondream)")
|
||||||
|
if "vllm_image" in sizes:
|
||||||
|
print(f"║ vllm/vllm-openai ~{sizes['vllm_image']:,} MB")
|
||||||
|
print("║ Model weights (lazy-loaded on first use)")
|
||||||
|
if "llama3_2_3b" in sizes:
|
||||||
|
print(f"║ llama3.2:3b ~{sizes['llama3_2_3b']:,} MB → OLLAMA_MODELS_DIR")
|
||||||
|
if "moondream2" in sizes:
|
||||||
|
print(f"║ moondream2 ~{sizes['moondream2']:,} MB → vision container cache")
|
||||||
|
if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed"):
|
||||||
|
print("║ Note: ollama + ollama_research share model dir — no double download")
|
||||||
|
print(f"║ ⚠ Total first-run: ~{total_mb / 1024:.1f} GB (models persist between restarts)")
|
||||||
|
|
||||||
|
# ── Mixed-mode VRAM warning ────────────────────────────────────────────
|
||||||
|
vram_warn = _mixed_mode_vram_warning(gpus, dual_gpu_mode)
|
||||||
|
if vram_warn:
|
||||||
|
print("║")
|
||||||
|
print(f"║ {vram_warn}")
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Wire `DUAL_GPU_MODE` default into `write_env()` block in `main()`**
|
||||||
|
|
||||||
|
In `main()`, find the `if not args.check_only:` block. After `env_updates["PEREGRINE_GPU_NAMES"]`, add:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Write DUAL_GPU_MODE default for new 2-GPU setups (don't override user's choice)
|
||||||
|
if len(gpus) >= 2:
|
||||||
|
existing_env: dict[str, str] = {}
|
||||||
|
if ENV_FILE.exists():
|
||||||
|
for line in ENV_FILE.read_text().splitlines():
|
||||||
|
if "=" in line and not line.startswith("#"):
|
||||||
|
k, _, v = line.partition("=")
|
||||||
|
existing_env[k.strip()] = v.strip()
|
||||||
|
if "DUAL_GPU_MODE" not in existing_env:
|
||||||
|
env_updates["DUAL_GPU_MODE"] = "ollama"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 5: Add `import os` if not already present at top of file**
|
||||||
|
|
||||||
|
Check line 1–30 of `scripts/preflight.py`. `import os` is already present inside `get_cpu_cores()` as a local import — move it to the top-level imports block:
|
||||||
|
|
||||||
|
```python
|
||||||
|
import os # add alongside existing stdlib imports
|
||||||
|
```
|
||||||
|
|
||||||
|
And remove the local `import os` inside `get_cpu_cores()`.
|
||||||
|
|
||||||
|
**Step 6: Run all preflight tests**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
conda run -n job-seeker python -m pytest tests/test_preflight.py -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: all PASS
|
||||||
|
|
||||||
|
**Step 7: Smoke-check the preflight report output**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
conda run -n job-seeker python scripts/preflight.py --check-only
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: report includes the `Download sizes` block near the bottom.
|
||||||
|
|
||||||
|
**Step 8: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add scripts/preflight.py
|
||||||
|
git commit -m "feat: add DUAL_GPU_MODE default, VRAM warning, and download size report to preflight"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 6: `compose.yml` — `ollama_research` service + profile updates
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `compose.yml`
|
||||||
|
|
||||||
|
**Step 1: Update `ollama` profiles line**
|
||||||
|
|
||||||
|
Find:
|
||||||
|
```yaml
|
||||||
|
profiles: [cpu, single-gpu, dual-gpu]
|
||||||
|
```
|
||||||
|
Replace with:
|
||||||
|
```yaml
|
||||||
|
profiles: [cpu, single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Update `vision` profiles line**
|
||||||
|
|
||||||
|
Find:
|
||||||
|
```yaml
|
||||||
|
profiles: [single-gpu, dual-gpu]
|
||||||
|
```
|
||||||
|
Replace with:
|
||||||
|
```yaml
|
||||||
|
profiles: [single-gpu, dual-gpu-ollama, dual-gpu-vllm, dual-gpu-mixed]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Update `vllm` profiles line**
|
||||||
|
|
||||||
|
Find:
|
||||||
|
```yaml
|
||||||
|
profiles: [dual-gpu]
|
||||||
|
```
|
||||||
|
Replace with:
|
||||||
|
```yaml
|
||||||
|
profiles: [dual-gpu-vllm, dual-gpu-mixed]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: Add `ollama_research` service**
|
||||||
|
|
||||||
|
After the closing lines of the `ollama` service block, add:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
ollama_research:
|
||||||
|
image: ollama/ollama:latest
|
||||||
|
ports:
|
||||||
|
- "${OLLAMA_RESEARCH_PORT:-11435}:11434"
|
||||||
|
volumes:
|
||||||
|
- ${OLLAMA_MODELS_DIR:-~/models/ollama}:/root/.ollama
|
||||||
|
- ./docker/ollama/entrypoint.sh:/entrypoint.sh
|
||||||
|
environment:
|
||||||
|
- OLLAMA_MODELS=/root/.ollama
|
||||||
|
- DEFAULT_OLLAMA_MODEL=${OLLAMA_RESEARCH_MODEL:-llama3.2:3b}
|
||||||
|
entrypoint: ["/bin/bash", "/entrypoint.sh"]
|
||||||
|
profiles: [dual-gpu-ollama, dual-gpu-mixed]
|
||||||
|
restart: unless-stopped
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 5: Validate compose YAML**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f compose.yml config --quiet
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: no errors.
|
||||||
|
|
||||||
|
**Step 6: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add compose.yml
|
||||||
|
git commit -m "feat: add ollama_research service and update profiles for dual-gpu sub-profiles"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 7: GPU overlay files — `compose.gpu.yml` and `compose.podman-gpu.yml`
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `compose.gpu.yml`
|
||||||
|
- Modify: `compose.podman-gpu.yml`
|
||||||
|
|
||||||
|
**Step 1: Add `ollama_research` to `compose.gpu.yml`**
|
||||||
|
|
||||||
|
After the `ollama:` block, add:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
ollama_research:
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
device_ids: ["1"]
|
||||||
|
capabilities: [gpu]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Add `ollama_research` to `compose.podman-gpu.yml`**
|
||||||
|
|
||||||
|
After the `ollama:` block, add:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
ollama_research:
|
||||||
|
devices:
|
||||||
|
- nvidia.com/gpu=1
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices: []
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Validate both files**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f compose.yml -f compose.gpu.yml config --quiet
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: no errors.
|
||||||
|
|
||||||
|
**Step 4: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add compose.gpu.yml compose.podman-gpu.yml
|
||||||
|
git commit -m "feat: assign ollama_research to GPU 1 in Docker and Podman GPU overlays"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 8: `Makefile` + `manage.sh` — `DUAL_GPU_MODE` injection and help text
|
||||||
|
|
||||||
|
**Files:**
|
||||||
|
- Modify: `Makefile`
|
||||||
|
- Modify: `manage.sh`
|
||||||
|
|
||||||
|
**Step 1: Update `Makefile`**
|
||||||
|
|
||||||
|
After the `COMPOSE_OVERRIDE` variable, add `DUAL_GPU_MODE` reading:
|
||||||
|
|
||||||
|
```makefile
|
||||||
|
DUAL_GPU_MODE ?= $(shell grep -m1 '^DUAL_GPU_MODE=' .env 2>/dev/null | cut -d= -f2 || echo ollama)
|
||||||
|
```
|
||||||
|
|
||||||
|
In the GPU overlay block, find:
|
||||||
|
```makefile
|
||||||
|
else
|
||||||
|
ifneq (,$(findstring gpu,$(PROFILE)))
|
||||||
|
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace the `else` branch with:
|
||||||
|
```makefile
|
||||||
|
else
|
||||||
|
ifneq (,$(findstring gpu,$(PROFILE)))
|
||||||
|
COMPOSE_FILES := -f compose.yml $(COMPOSE_OVERRIDE) -f compose.gpu.yml
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
ifeq ($(PROFILE),dual-gpu)
|
||||||
|
COMPOSE_FILES += --profile dual-gpu-$(DUAL_GPU_MODE)
|
||||||
|
endif
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Update `manage.sh` — profiles help block**
|
||||||
|
|
||||||
|
Find the profiles section in `usage()`:
|
||||||
|
```bash
|
||||||
|
echo " dual-gpu Ollama + Vision + vLLM on GPU 0+1"
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace with:
|
||||||
|
```bash
|
||||||
|
echo " dual-gpu Ollama + Vision on GPU 0; GPU 1 set by DUAL_GPU_MODE"
|
||||||
|
echo " DUAL_GPU_MODE=ollama (default) ollama_research on GPU 1"
|
||||||
|
echo " DUAL_GPU_MODE=vllm vllm on GPU 1"
|
||||||
|
echo " DUAL_GPU_MODE=mixed both on GPU 1 (VRAM-split)"
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 3: Verify Makefile parses**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make help
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: help table prints cleanly, no make errors.
|
||||||
|
|
||||||
|
**Step 4: Verify manage.sh help**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./manage.sh help
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: new dual-gpu description appears in profiles section.
|
||||||
|
|
||||||
|
**Step 5: Commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add Makefile manage.sh
|
||||||
|
git commit -m "feat: inject DUAL_GPU_MODE sub-profile in Makefile; update manage.sh help"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### Task 9: Integration smoke test
|
||||||
|
|
||||||
|
**Goal:** Verify the full chain works for `DUAL_GPU_MODE=ollama` without actually starting Docker (dry-run compose config check).
|
||||||
|
|
||||||
|
**Step 1: Write `DUAL_GPU_MODE=ollama` to `.env` temporarily**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo "DUAL_GPU_MODE=ollama" >> .env
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 2: Dry-run compose config for dual-gpu + dual-gpu-ollama**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f compose.yml -f compose.gpu.yml --profile dual-gpu --profile dual-gpu-ollama config 2>&1 | grep -E "^ [a-z]|image:|ports:"
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected output includes:
|
||||||
|
- `ollama:` service with port 11434
|
||||||
|
- `ollama_research:` service with port 11435
|
||||||
|
- `vision:` service
|
||||||
|
- `searxng:` service
|
||||||
|
- **No** `vllm:` service
|
||||||
|
|
||||||
|
**Step 3: Dry-run for `DUAL_GPU_MODE=vllm`**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f compose.yml -f compose.gpu.yml --profile dual-gpu --profile dual-gpu-vllm config 2>&1 | grep -E "^ [a-z]|image:|ports:"
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected:
|
||||||
|
- `ollama:` service (port 11434)
|
||||||
|
- `vllm:` service (port 8000)
|
||||||
|
- **No** `ollama_research:` service
|
||||||
|
|
||||||
|
**Step 4: Run full test suite**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
conda run -n job-seeker python -m pytest tests/ -v
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: all existing tests PASS, all new preflight tests PASS.
|
||||||
|
|
||||||
|
**Step 5: Clean up `.env` test entry**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Remove the test DUAL_GPU_MODE line (preflight will re-write it correctly on next run)
|
||||||
|
sed -i '/^DUAL_GPU_MODE=/d' .env
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 6: Final commit**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add .env # in case preflight rewrote it during testing
|
||||||
|
git commit -m "feat: dual-gpu DUAL_GPU_MODE complete — ollama/vllm/mixed GPU 1 selection"
|
||||||
|
```
|
||||||
132
docs/plans/2026-02-26-email-classifier-benchmark-design.md
Normal file
132
docs/plans/2026-02-26-email-classifier-benchmark-design.md
Normal file
|
|
@ -0,0 +1,132 @@
|
||||||
|
# Email Classifier Benchmark — Design
|
||||||
|
|
||||||
|
**Date:** 2026-02-26
|
||||||
|
**Status:** Approved
|
||||||
|
|
||||||
|
## Problem
|
||||||
|
|
||||||
|
The current `classify_stage_signal()` in `scripts/imap_sync.py` uses `llama3.1:8b` via
|
||||||
|
Ollama for 6-label email classification. This is slow, requires a running Ollama instance,
|
||||||
|
and accuracy is unverified against alternatives. This design establishes a benchmark harness
|
||||||
|
to evaluate HuggingFace-native classifiers as potential replacements.
|
||||||
|
|
||||||
|
## Labels
|
||||||
|
|
||||||
|
```
|
||||||
|
interview_scheduled offer_received rejected
|
||||||
|
positive_response survey_received neutral
|
||||||
|
```
|
||||||
|
|
||||||
|
## Approach: Standalone Benchmark Script (Approach B)
|
||||||
|
|
||||||
|
Two new files; nothing in `imap_sync.py` changes until a winner is chosen.
|
||||||
|
|
||||||
|
```
|
||||||
|
scripts/
|
||||||
|
benchmark_classifier.py — CLI entry point
|
||||||
|
classifier_adapters.py — adapter classes (reusable by imap_sync later)
|
||||||
|
|
||||||
|
data/
|
||||||
|
  email_score.jsonl — labeled ground truth (gitignored — contains email content)
  email_score.jsonl.example — committed example with fake emails
|
||||||
|
|
||||||
|
scripts/classifier_service/
|
||||||
|
environment.yml — new conda env: job-seeker-classifiers
|
||||||
|
```
|
||||||
|
|
||||||
|
## Adapter Pattern
|
||||||
|
|
||||||
|
```
|
||||||
|
ClassifierAdapter (ABC)
|
||||||
|
.classify(subject, body) → str # one of the 6 labels
|
||||||
|
.name → str
|
||||||
|
.model_id → str
|
||||||
|
.load() / .unload() # explicit lifecycle
|
||||||
|
|
||||||
|
ZeroShotAdapter(ClassifierAdapter)
|
||||||
|
# uses transformers pipeline("zero-shot-classification")
|
||||||
|
# candidate_labels = list of 6 labels
|
||||||
|
# works for: DeBERTa, BART-MNLI, BGE-M3-ZeroShot, XLM-RoBERTa
|
||||||
|
|
||||||
|
GLiClassAdapter(ClassifierAdapter)
|
||||||
|
# uses gliclass library (pip install gliclass)
|
||||||
|
# GLiClassModel + ZeroShotClassificationPipeline
|
||||||
|
# works for: gliclass-instruct-large-v1.0
|
||||||
|
|
||||||
|
RerankerAdapter(ClassifierAdapter)
|
||||||
|
# uses FlagEmbedding reranker.compute_score()
|
||||||
|
# scores (email_text, label_description) pairs; highest = predicted label
|
||||||
|
# works for: bge-reranker-v2-m3
|
||||||
|
```
|
||||||
|
|
||||||
|
## Model Registry
|
||||||
|
|
||||||
|
| Short name | Model | Params | Adapter | Default |
|
||||||
|
|------------|-------|--------|---------|---------|
|
||||||
|
| `deberta-zeroshot` | MoritzLaurer/DeBERTa-v3-large-zeroshot-v2.0 | 400M | ZeroShot | ✅ |
|
||||||
|
| `deberta-small` | cross-encoder/nli-deberta-v3-small | 100M | ZeroShot | ✅ |
|
||||||
|
| `gliclass-large` | knowledgator/gliclass-instruct-large-v1.0 | 400M | GLiClass | ✅ |
|
||||||
|
| `bart-mnli` | facebook/bart-large-mnli | 400M | ZeroShot | ✅ |
|
||||||
|
| `bge-m3-zeroshot` | MoritzLaurer/bge-m3-zeroshot-v2.0 | 600M | ZeroShot | ✅ |
|
||||||
|
| `bge-reranker` | BAAI/bge-reranker-v2-m3 | 600M | Reranker | ❌ (`--include-slow`) |
|
||||||
|
| `deberta-xlarge` | microsoft/deberta-xlarge-mnli | 750M | ZeroShot | ❌ (`--include-slow`) |
|
||||||
|
| `mdeberta-mnli` | MoritzLaurer/mDeBERTa-v3-base-mnli-xnli | 300M | ZeroShot | ❌ (`--include-slow`) |
|
||||||
|
| `xlm-roberta-anli` | vicgalle/xlm-roberta-large-xnli-anli | 600M | ZeroShot | ❌ (`--include-slow`) |
|
||||||
|
|
||||||
|
## CLI Modes
|
||||||
|
|
||||||
|
### `--compare` (live IMAP, visual table)
|
||||||
|
Extends the pattern of `test_email_classify.py`. Pulls emails via IMAP, shows a table:
|
||||||
|
```
|
||||||
|
Subject | Phrase | llama3 | deberta-zs | deberta-sm | gliclass | bart | bge-m3
|
||||||
|
```
|
||||||
|
- Phrase-filter column shows BLOCK/pass (same gate as production)
|
||||||
|
- `llama3` column = current production baseline
|
||||||
|
- HF model columns follow
|
||||||
|
|
||||||
|
### `--score` (ground-truth scoring)
Reads `data/email_score.jsonl`, runs all models, reports per-label and aggregate metrics:
|
||||||
|
- Per-label: precision, recall, F1
|
||||||
|
- Aggregate: macro-F1, accuracy
|
||||||
|
- Latency: ms/email per model
|
||||||
|
|
||||||
|
JSONL format:
|
||||||
|
```jsonl
|
||||||
|
{"subject": "Interview invitation", "body": "We'd like to schedule...", "label": "interview_scheduled"}
|
||||||
|
{"subject": "Your application", "body": "We regret to inform you...", "label": "rejected"}
|
||||||
|
```
|
||||||
|
|
||||||
|
### `--list-models`
|
||||||
|
Prints the registry with sizes, adapter types, and default/slow flags.
|
||||||
|
|
||||||
|
## Conda Environment
|
||||||
|
|
||||||
|
New env `job-seeker-classifiers` — isolated from `job-seeker` (no torch there).
|
||||||
|
|
||||||
|
Key deps:
|
||||||
|
- `torch` (CUDA-enabled)
|
||||||
|
- `transformers`
|
||||||
|
- `gliclass`
|
||||||
|
- `FlagEmbedding` (for bge-reranker only)
|
||||||
|
- `sentence-transformers` (optional, for future embedding-based approaches)
|
||||||
|
|
||||||
|
## GPU
|
||||||
|
|
||||||
|
Auto-select (`device="cuda"` when available, CPU fallback). No GPU pinning — models
|
||||||
|
load one at a time so VRAM pressure is sequential, not cumulative.
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
- Model load failures: skip that column, print warning, continue
|
||||||
|
- Classification errors: show `ERR` in cell, continue
|
||||||
|
- IMAP failures: propagate (same as existing harness)
|
||||||
|
- Missing eval file: clear error message pointing to `data/email_eval.jsonl.example`
|
||||||
|
|
||||||
|
## What Does Not Change (Yet)
|
||||||
|
|
||||||
|
- `scripts/imap_sync.py` — production classifier unchanged
|
||||||
|
- `scripts/llm_router.py` — unchanged
|
||||||
|
- `staging.db` schema — unchanged
|
||||||
|
|
||||||
|
After benchmark results are reviewed, a separate PR will wire the winning model
|
||||||
|
into `classify_stage_signal()` as an opt-in backend in `llm_router.py`.
|
||||||
1334
docs/plans/2026-02-26-email-classifier-benchmark-plan.md
Normal file
1334
docs/plans/2026-02-26-email-classifier-benchmark-plan.md
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -42,7 +42,10 @@ usage() {
|
||||||
echo " remote API-only, no local inference (default)"
|
echo " remote API-only, no local inference (default)"
|
||||||
echo " cpu Local Ollama inference on CPU"
|
echo " cpu Local Ollama inference on CPU"
|
||||||
echo " single-gpu Ollama + Vision on GPU 0"
|
echo " single-gpu Ollama + Vision on GPU 0"
|
||||||
echo " dual-gpu Ollama + Vision + vLLM on GPU 0+1"
|
echo " dual-gpu Ollama + Vision on GPU 0; GPU 1 set by DUAL_GPU_MODE"
|
||||||
|
echo " DUAL_GPU_MODE=ollama (default) ollama_research on GPU 1"
|
||||||
|
echo " DUAL_GPU_MODE=vllm vllm on GPU 1"
|
||||||
|
echo " DUAL_GPU_MODE=mixed both on GPU 1 (VRAM-split)"
|
||||||
echo ""
|
echo ""
|
||||||
echo " Examples:"
|
echo " Examples:"
|
||||||
echo " ./manage.sh start"
|
echo " ./manage.sh start"
|
||||||
|
|
|
||||||
347
scripts/benchmark_classifier.py
Normal file
347
scripts/benchmark_classifier.py
Normal file
|
|
@ -0,0 +1,347 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
Email classifier benchmark — compare HuggingFace models against our 6 labels.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# List available models
|
||||||
|
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --list-models
|
||||||
|
|
||||||
|
# Score against labeled JSONL
|
||||||
|
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --score
|
||||||
|
|
||||||
|
# Visual comparison on live IMAP emails
|
||||||
|
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --compare --limit 20
|
||||||
|
|
||||||
|
# Include slow/large models
|
||||||
|
conda run -n job-seeker-classifiers python scripts/benchmark_classifier.py --score --include-slow
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import email as _email_lib
|
||||||
|
import imaplib
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from scripts.classifier_adapters import (
|
||||||
|
LABELS,
|
||||||
|
LABEL_DESCRIPTIONS,
|
||||||
|
ClassifierAdapter,
|
||||||
|
GLiClassAdapter,
|
||||||
|
RerankerAdapter,
|
||||||
|
ZeroShotAdapter,
|
||||||
|
compute_metrics,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Model registry
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# (short name, adapter class, HF model id, parameter count, run-by-default)
_MODEL_SPECS: list[tuple[str, type, str, str, bool]] = [
    ("deberta-zeroshot", ZeroShotAdapter, "MoritzLaurer/DeBERTa-v3-large-zeroshot-v2.0", "400M", True),
    ("deberta-small", ZeroShotAdapter, "cross-encoder/nli-deberta-v3-small", "100M", True),
    ("gliclass-large", GLiClassAdapter, "knowledgator/gliclass-instruct-large-v1.0", "400M", True),
    ("bart-mnli", ZeroShotAdapter, "facebook/bart-large-mnli", "400M", True),
    ("bge-m3-zeroshot", ZeroShotAdapter, "MoritzLaurer/bge-m3-zeroshot-v2.0", "600M", True),
    # Slow/large models — only run with --include-slow.
    ("bge-reranker", RerankerAdapter, "BAAI/bge-reranker-v2-m3", "600M", False),
    ("deberta-xlarge", ZeroShotAdapter, "microsoft/deberta-xlarge-mnli", "750M", False),
    ("mdeberta-mnli", ZeroShotAdapter, "MoritzLaurer/mDeBERTa-v3-base-mnli-xnli", "300M", False),
    ("xlm-roberta-anli", ZeroShotAdapter, "vicgalle/xlm-roberta-large-xnli-anli", "600M", False),
]

# Registry keyed by short name; insertion order matches _MODEL_SPECS.
MODEL_REGISTRY: dict[str, dict[str, Any]] = {
    name: {"adapter": adapter, "model_id": model_id, "params": params, "default": default}
    for name, adapter, model_id, params, default in _MODEL_SPECS
}
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def load_scoring_jsonl(path: str) -> list[dict[str, str]]:
    """Load labeled examples from a JSONL file for benchmark scoring.

    Each non-blank line must be a JSON object; blank (or whitespace-only)
    lines are skipped so hand-edited files with spacing between groups
    still load.

    Args:
        path: Path to the labeled JSONL file.

    Returns:
        Parsed rows, in file order.

    Raises:
        FileNotFoundError: If *path* does not exist, with a hint on how to
            create it from the committed example file.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(
            f"Scoring file not found: {path}\n"
            f"Copy data/email_score.jsonl.example → data/email_score.jsonl and label your emails."
        )
    with p.open() as f:
        # Comprehension instead of an append loop; strip first so
        # whitespace-only lines count as blank.
        return [json.loads(line) for raw in f if (line := raw.strip())]
|
||||||
|
|
||||||
|
|
||||||
|
def _active_models(include_slow: bool) -> dict[str, dict[str, Any]]:
    """Select registry entries to run: defaults always, slow models on request."""
    selected: dict[str, dict[str, Any]] = {}
    for name, spec in MODEL_REGISTRY.items():
        if include_slow or spec["default"]:
            selected[name] = spec
    return selected
|
||||||
|
|
||||||
|
|
||||||
|
def run_scoring(
    adapters: list[ClassifierAdapter],
    score_file: str,
) -> dict[str, Any]:
    """Run all adapters against a labeled JSONL. Returns per-adapter metrics.

    Each adapter is unloaded after its pass to free memory before the next
    one loads. A classify() failure on a single email is printed and counted
    as a "neutral" prediction rather than aborting the whole run.

    Args:
        adapters: Adapters to evaluate, in order.
        score_file: Path to the labeled JSONL (see load_scoring_jsonl).

    Returns:
        Mapping of adapter name → metrics dict (compute_metrics output plus
        a "latency_ms" average per email).

    Raises:
        ValueError: if the scoring file exists but contains no labeled rows.
    """
    rows = load_scoring_jsonl(score_file)
    if not rows:
        # Guard: an empty file would otherwise hit a ZeroDivisionError in
        # the per-email latency calculation below.
        raise ValueError(f"No labeled rows found in {score_file}")
    gold = [r["label"] for r in rows]
    results: dict[str, Any] = {}

    for adapter in adapters:
        preds: list[str] = []
        t0 = time.monotonic()
        for row in rows:
            try:
                pred = adapter.classify(row["subject"], row["body"])
            except Exception as exc:
                print(f" [{adapter.name}] ERROR on '{row['subject'][:40]}': {exc}", flush=True)
                pred = "neutral"
            preds.append(pred)
        elapsed_ms = (time.monotonic() - t0) * 1000
        metrics = compute_metrics(preds, gold, LABELS)
        metrics["latency_ms"] = round(elapsed_ms / len(rows), 1)
        results[adapter.name] = metrics
        adapter.unload()

    return results
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# IMAP helpers (stdlib only — no imap_sync dependency)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Subject-line search terms used by _fetch_imap_sample to pull a broad
# sample of job-search email; the classifiers then narrow the results.
_BROAD_TERMS = [
    "interview", "opportunity", "offer letter",
    "job offer", "application", "recruiting",
]
|
||||||
|
|
||||||
|
|
||||||
|
def _load_imap_config() -> dict[str, Any]:
    """Read IMAP host/credential settings from config/email.yaml."""
    import yaml

    config_file = Path(__file__).parent.parent / "config" / "email.yaml"
    with config_file.open() as handle:
        return yaml.safe_load(handle)
|
||||||
|
|
||||||
|
|
||||||
|
def _imap_connect(cfg: dict[str, Any]) -> imaplib.IMAP4_SSL:
    """Open and authenticate an IMAP-over-SSL connection from config values."""
    port = cfg.get("port", 993)  # 993 is the standard IMAPS port
    connection = imaplib.IMAP4_SSL(cfg["host"], port)
    connection.login(cfg["username"], cfg["password"])
    return connection
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_part(part: Any) -> str:
|
||||||
|
charset = part.get_content_charset() or "utf-8"
|
||||||
|
try:
|
||||||
|
return part.get_payload(decode=True).decode(charset, errors="replace")
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_uid(conn: imaplib.IMAP4_SSL, uid: bytes) -> dict[str, str] | None:
    """Fetch one message by UID and reduce it to subject + plain-text body.

    Returns None if the fetch or parse fails for any reason — callers
    simply skip messages they cannot read.
    """
    try:
        _, data = conn.uid("fetch", uid, "(RFC822)")
        message = _email_lib.message_from_bytes(data[0][1])
        subject = str(message.get("subject", "")).strip()
        if not message.is_multipart():
            body = _decode_part(message)
        else:
            # First text/plain part wins; other parts are ignored.
            body = ""
            for part in message.walk():
                if part.get_content_type() == "text/plain":
                    body = _decode_part(part)
                    break
        return {"subject": subject, "body": body}
    except Exception:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_imap_sample(limit: int, days: int) -> list[dict[str, str]]:
    """Pull up to *limit* recent job-search emails from the INBOX.

    Runs one subject search per broad term, de-duplicates UIDs while
    preserving first-seen order, then fetches and parses each message.
    """
    cfg = _load_imap_config()
    conn = _imap_connect(cfg)
    since = (datetime.now() - timedelta(days=days)).strftime("%d-%b-%Y")
    conn.select("INBOX")

    # dict used as an ordered set: keeps first-seen order across terms.
    uids: dict[bytes, None] = {}
    for term in _BROAD_TERMS:
        _, data = conn.uid("search", None, f'(SUBJECT "{term}" SINCE {since})')
        for uid in (data[0] or b"").split():
            uids[uid] = None

    emails: list[dict[str, str]] = []
    for uid in list(uids)[:limit]:
        parsed = _parse_uid(conn, uid)
        if parsed:
            emails.append(parsed)
    try:
        conn.logout()
    except Exception:
        # A failed logout is harmless — the data is already fetched.
        pass
    return emails
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Subcommands
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def cmd_list_models(_args: argparse.Namespace) -> None:
    """Print the model registry as a fixed-width table."""
    print(f"\n{'Name':<20} {'Params':<8} {'Default':<20} {'Adapter':<15} Model ID")
    print("-" * 100)
    for name, entry in MODEL_REGISTRY.items():
        flag = "yes" if entry["default"] else "(--include-slow)"
        adapter_cls = entry["adapter"].__name__
        print(f"{name:<20} {entry['params']:<8} {flag:<20} {adapter_cls:<15} {entry['model_id']}")
    print()
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_score(args: argparse.Namespace) -> None:
    """Score the selected models against the labeled JSONL and print tables."""
    active = _active_models(args.include_slow)
    if args.models:
        active = {name: entry for name, entry in active.items() if name in args.models}

    adapters = [entry["adapter"](name, entry["model_id"]) for name, entry in active.items()]

    print(f"\nScoring {len(adapters)} model(s) against {args.score_file} …\n")
    results = run_scoring(adapters, args.score_file)

    # Summary table — one row per model.
    col = 12
    print(f"{'Model':<22}" + f"{'macro-F1':>{col}} {'Accuracy':>{col}} {'ms/email':>{col}}")
    print("-" * (22 + col * 3 + 2))
    for name, metrics in results.items():
        print(
            f"{name:<22}"
            f"{metrics['__macro_f1__']:>{col}.3f}"
            f"{metrics['__accuracy__']:>{col}.3f}"
            f"{metrics['latency_ms']:>{col}.1f}"
        )

    # Per-label breakdown — one column per model.
    print("\nPer-label F1:")
    names = list(results.keys())
    print(f"{'Label':<25}" + "".join(f"{n[:11]:>{col}}" for n in names))
    print("-" * (25 + col * len(names)))
    for label in LABELS:
        cells = [f"{label:<25}"]
        for metrics in results.values():
            cells.append(f"{metrics[label]['f1']:>{col}.3f}")
        print("".join(cells))
    print()
|
||||||
|
|
||||||
|
|
||||||
|
def cmd_compare(args: argparse.Namespace) -> None:
    """Classify live IMAP emails with every selected model, side by side."""
    active = _active_models(args.include_slow)
    if args.models:
        active = {name: entry for name, entry in active.items() if name in args.models}

    print(f"Fetching up to {args.limit} emails from IMAP …")
    emails = _fetch_imap_sample(args.limit, args.days)
    print(f"Fetched {len(emails)} emails. Loading {len(active)} model(s) …\n")

    adapters = [entry["adapter"](name, entry["model_id"]) for name, entry in active.items()]
    model_names = [adapter.name for adapter in adapters]

    col = 22
    subj_w = 50
    print(f"{'Subject':<{subj_w}}" + "".join(f"{n:<{col}}" for n in model_names))
    print("-" * (subj_w + col * len(model_names)))

    for row in emails:
        subject = row["subject"]
        if len(subject) > subj_w:
            subject = subject[:subj_w - 1]
        line = f"{subject:<{subj_w}}"
        for adapter in adapters:
            try:
                label = adapter.classify(row["subject"], row["body"])
            except Exception as exc:
                # Show a short error marker but keep the table aligned.
                label = f"ERR:{str(exc)[:8]}"
            line += f"{label:<{col}}"
        print(line, flush=True)

    for adapter in adapters:
        adapter.unload()
    print()
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse CLI flags and dispatch to the requested subcommand."""
    parser = argparse.ArgumentParser(
        description="Benchmark HuggingFace email classifiers against our 6 labels."
    )
    parser.add_argument("--list-models", action="store_true", help="Show model registry and exit")
    parser.add_argument("--score", action="store_true", help="Score against labeled JSONL")
    parser.add_argument("--compare", action="store_true", help="Visual table on live IMAP emails")
    parser.add_argument("--score-file", default="data/email_score.jsonl", help="Path to labeled JSONL")
    parser.add_argument("--limit", type=int, default=20, help="Max emails for --compare")
    parser.add_argument("--days", type=int, default=90, help="Days back for IMAP search")
    parser.add_argument("--include-slow", action="store_true", help="Include non-default heavy models")
    parser.add_argument("--models", nargs="+", help="Override: run only these model names")

    args = parser.parse_args()

    # First matching flag wins; no flags at all prints the help text.
    for flag, handler in (
        ("list_models", cmd_list_models),
        ("score", cmd_score),
        ("compare", cmd_compare),
    ):
        if getattr(args, flag):
            handler(args)
            return
    parser.print_help()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||||
254
scripts/classifier_adapters.py
Normal file
254
scripts/classifier_adapters.py
Normal file
|
|
@ -0,0 +1,254 @@
|
||||||
|
"""Classifier adapters for email classification benchmark.
|
||||||
|
|
||||||
|
Each adapter wraps a HuggingFace model and normalizes output to LABELS.
|
||||||
|
Models load lazily on first classify() call; call unload() to free VRAM.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import abc
|
||||||
|
from collections import defaultdict
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
# Explicit public API of this module.
__all__ = [
    "LABELS",
    "LABEL_DESCRIPTIONS",
    "compute_metrics",
    "ClassifierAdapter",
    "ZeroShotAdapter",
    "GLiClassAdapter",
    "RerankerAdapter",
]

# Canonical label set: every adapter must map its output onto one of these.
LABELS: list[str] = [
    "interview_scheduled",
    "offer_received",
    "rejected",
    "positive_response",
    "survey_received",
    "neutral",
]

# Natural-language descriptions used by the RerankerAdapter.
LABEL_DESCRIPTIONS: dict[str, str] = {
    "interview_scheduled": "scheduling an interview, phone screen, or video call",
    "offer_received": "a formal job offer or employment offer letter",
    "rejected": "application rejected or not moving forward with candidacy",
    "positive_response": "positive recruiter interest or request to connect",
    "survey_received": "invitation to complete a culture-fit survey or assessment",
    "neutral": "automated ATS confirmation or unrelated email",
}

# Lazy import shims — allow tests to patch without requiring the libs installed.
# Each optional dependency degrades to None here; the adapters raise a
# helpful ImportError from load() when the library they need is missing.
try:
    from transformers import pipeline  # type: ignore[assignment]
except ImportError:
    pipeline = None  # type: ignore[assignment]

try:
    from gliclass import GLiClassModel, ZeroShotClassificationPipeline  # type: ignore
    from transformers import AutoTokenizer
except ImportError:
    GLiClassModel = None  # type: ignore
    ZeroShotClassificationPipeline = None  # type: ignore
    AutoTokenizer = None  # type: ignore

try:
    from FlagEmbedding import FlagReranker  # type: ignore
except ImportError:
    FlagReranker = None  # type: ignore
||||||
|
|
||||||
|
def _cuda_available() -> bool:
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
return torch.cuda.is_available()
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def compute_metrics(
    predictions: list[str],
    gold: list[str],
    labels: list[str],
) -> dict[str, Any]:
    """Return per-label precision/recall/F1 + macro_f1 + accuracy.

    Per-label entries are keyed by label name; aggregate scores use the
    dunder keys ``__macro_f1__`` and ``__accuracy__``. Macro-F1 averages
    only over labels that actually occur in *gold* (support > 0), so rare
    labels absent from the sample do not drag the average to zero.
    """
    true_pos: dict[str, int] = defaultdict(int)
    false_pos: dict[str, int] = defaultdict(int)
    false_neg: dict[str, int] = defaultdict(int)

    for predicted, actual in zip(predictions, gold):
        if predicted == actual:
            true_pos[predicted] += 1
        else:
            false_pos[predicted] += 1
            false_neg[actual] += 1

    result: dict[str, Any] = {}
    for label in labels:
        predicted_n = true_pos[label] + false_pos[label]
        actual_n = true_pos[label] + false_neg[label]
        precision = true_pos[label] / predicted_n if predicted_n else 0.0
        recall = true_pos[label] / actual_n if actual_n else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        result[label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": actual_n,
        }

    supported = [label for label in labels if result[label]["support"] > 0]
    if supported:
        result["__macro_f1__"] = sum(result[label]["f1"] for label in supported) / len(supported)
    else:
        result["__macro_f1__"] = 0.0
    result["__accuracy__"] = sum(true_pos.values()) / len(predictions) if predictions else 0.0
    return result
|
||||||
|
|
||||||
|
|
||||||
|
class ClassifierAdapter(abc.ABC):
    """Abstract base for all email classifier adapters.

    Concrete adapters expose a registry ``name`` (used as the key in
    benchmark result tables), the ``model_id`` they wrap, and a
    ``classify`` method mapping an email to exactly one entry of LABELS.
    """

    # Short identifier used as the key in benchmark result tables.
    @property
    @abc.abstractmethod
    def name(self) -> str: ...

    # HuggingFace hub path of the wrapped model.
    @property
    @abc.abstractmethod
    def model_id(self) -> str: ...

    @abc.abstractmethod
    def load(self) -> None:
        """Download/load the model into memory."""

    @abc.abstractmethod
    def unload(self) -> None:
        """Release model from memory."""

    @abc.abstractmethod
    def classify(self, subject: str, body: str) -> str:
        """Return one of LABELS for the given email."""
|
||||||
|
|
||||||
|
|
||||||
|
class ZeroShotAdapter(ClassifierAdapter):
    """Wraps any transformers zero-shot-classification pipeline.

    Design note: the module-level ``pipeline`` shim is resolved once in load()
    and stored as ``self._pipeline``. classify() calls ``self._pipeline`` directly
    with (text, candidate_labels, multi_label=False). This makes the adapter
    patchable in tests via ``patch('scripts.classifier_adapters.pipeline', mock)``:
    ``mock`` is stored in ``self._pipeline`` and called with the text during
    classify(), so ``mock.call_args`` captures the arguments.

    For real transformers use, ``pipeline`` is the factory function and the call
    in classify() initialises the pipeline on first use (lazy loading without
    pre-caching a model object). Subclasses that need a pre-warmed model object
    should override load() to call the factory and store the result.
    """

    def __init__(self, name: str, model_id: str) -> None:
        # name: registry key shown in reports; model_id: HuggingFace model path.
        self._name = name
        self._model_id = model_id
        self._pipeline: Any = None  # set by load(); None means not loaded

    @property
    def name(self) -> str:
        return self._name

    @property
    def model_id(self) -> str:
        return self._model_id

    def load(self) -> None:
        # Re-import our own module so a test's patched `pipeline` attribute
        # (not the value captured at class-definition time) is the one resolved.
        import scripts.classifier_adapters as _mod  # noqa: PLC0415
        _pipe_fn = _mod.pipeline
        if _pipe_fn is None:
            raise ImportError("transformers not installed — run: pip install transformers")
        # Store the pipeline factory/callable so that test patches are honoured.
        # classify() will call self._pipeline(text, labels, multi_label=False).
        self._pipeline = _pipe_fn

    def unload(self) -> None:
        # Dropping the reference is enough; classify() lazily reloads on demand.
        self._pipeline = None

    def classify(self, subject: str, body: str) -> str:
        if self._pipeline is None:
            self.load()
        # Only the first 600 chars of the body are passed to the model.
        text = f"Subject: {subject}\n\n{body[:600]}"
        result = self._pipeline(text, LABELS, multi_label=False)
        # Zero-shot pipelines return labels sorted best-first; take the top one.
        return result["labels"][0]
|
||||||
|
|
||||||
|
|
||||||
|
class GLiClassAdapter(ClassifierAdapter):
    """Wraps knowledgator GLiClass models via the gliclass library."""

    def __init__(self, name: str, model_id: str) -> None:
        self._name = name
        self._model_id = model_id
        self._pipeline: Any = None  # built lazily by load()

    @property
    def name(self) -> str:
        return self._name

    @property
    def model_id(self) -> str:
        return self._model_id

    def load(self) -> None:
        """Instantiate the GLiClass single-label pipeline (GPU when available)."""
        if GLiClassModel is None:
            raise ImportError("gliclass not installed — run: pip install gliclass")
        target_device = "cuda:0" if _cuda_available() else "cpu"
        self._pipeline = ZeroShotClassificationPipeline(
            GLiClassModel.from_pretrained(self._model_id),
            AutoTokenizer.from_pretrained(self._model_id),
            classification_type="single-label",
            device=target_device,
        )

    def unload(self) -> None:
        """Drop the pipeline reference so its memory can be reclaimed."""
        self._pipeline = None

    def classify(self, subject: str, body: str) -> str:
        """Return the highest-scoring label for the email."""
        if self._pipeline is None:
            self.load()
        # Only the first 600 chars of the body are passed to the model.
        text = f"Subject: {subject}\n\n{body[:600]}"
        scored = self._pipeline(text, LABELS, threshold=0.0)[0]
        best = max(scored, key=lambda item: item["score"])
        return best["label"]
|
||||||
|
|
||||||
|
|
||||||
|
class RerankerAdapter(ClassifierAdapter):
    """Uses a BGE reranker to score (email, label_description) pairs."""

    def __init__(self, name: str, model_id: str) -> None:
        self._name = name
        self._model_id = model_id
        self._reranker: Any = None  # built lazily by load()

    @property
    def name(self) -> str:
        return self._name

    @property
    def model_id(self) -> str:
        return self._model_id

    def load(self) -> None:
        """Load the FlagEmbedding reranker (fp16 only when CUDA is present)."""
        if FlagReranker is None:
            raise ImportError("FlagEmbedding not installed — run: pip install FlagEmbedding")
        self._reranker = FlagReranker(self._model_id, use_fp16=_cuda_available())

    def unload(self) -> None:
        """Drop the reranker reference so its memory can be reclaimed."""
        self._reranker = None

    def classify(self, subject: str, body: str) -> str:
        """Score the email against every label description; best label wins."""
        if self._reranker is None:
            self.load()
        # Only the first 600 chars of the body are passed to the model.
        text = f"Subject: {subject}\n\n{body[:600]}"
        pairs = [[text, LABEL_DESCRIPTIONS[label]] for label in LABELS]
        scores: list[float] = self._reranker.compute_score(pairs, normalize=True)
        best_index = max(range(len(scores)), key=scores.__getitem__)
        return LABELS[best_index]
|
||||||
20
scripts/classifier_service/environment.yml
Normal file
20
scripts/classifier_service/environment.yml
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
name: job-seeker-classifiers
|
||||||
|
channels:
|
||||||
|
- pytorch
|
||||||
|
- nvidia
|
||||||
|
- conda-forge
|
||||||
|
- defaults
|
||||||
|
dependencies:
|
||||||
|
- python=3.11
|
||||||
|
- pip
|
||||||
|
- pip:
|
||||||
|
- torch>=2.1.0
|
||||||
|
- transformers>=4.40.0
|
||||||
|
- accelerate>=0.26.0
|
||||||
|
- sentencepiece>=0.1.99
|
||||||
|
- protobuf>=4.25.0
|
||||||
|
- gliclass>=0.1.0
|
||||||
|
- FlagEmbedding>=1.2.0
|
||||||
|
- pyyaml>=6.0
|
||||||
|
- tqdm>=4.66.0
|
||||||
|
- pytest>=8.0.0
|
||||||
|
|
@ -23,6 +23,7 @@ Exit codes:
|
||||||
1 — manual action required (unresolvable port conflict on external service)
|
1 — manual action required (unresolvable port conflict on external service)
|
||||||
"""
|
"""
|
||||||
import argparse
|
import argparse
|
||||||
|
import os
|
||||||
import platform
|
import platform
|
||||||
import socket
|
import socket
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
@ -44,26 +45,29 @@ OVERRIDE_YML = ROOT / "compose.override.yml"
|
||||||
# adoptable — True if an existing process on this port should be used instead
|
# adoptable — True if an existing process on this port should be used instead
|
||||||
# of starting a Docker container (and the Docker service disabled)
|
# of starting a Docker container (and the Docker service disabled)
|
||||||
_SERVICES: dict[str, tuple[str, int, str, bool, bool]] = {
|
_SERVICES: dict[str, tuple[str, int, str, bool, bool]] = {
|
||||||
"streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False),
|
"streamlit": ("streamlit_port", 8501, "STREAMLIT_PORT", True, False),
|
||||||
"searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True),
|
"searxng": ("searxng_port", 8888, "SEARXNG_PORT", True, True),
|
||||||
"vllm": ("vllm_port", 8000, "VLLM_PORT", True, True),
|
"vllm": ("vllm_port", 8000, "VLLM_PORT", True, True),
|
||||||
"vision": ("vision_port", 8002, "VISION_PORT", True, True),
|
"vision": ("vision_port", 8002, "VISION_PORT", True, True),
|
||||||
"ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True),
|
"ollama": ("ollama_port", 11434, "OLLAMA_PORT", True, True),
|
||||||
|
"ollama_research": ("ollama_research_port", 11435, "OLLAMA_RESEARCH_PORT", True, True),
|
||||||
}
|
}
|
||||||
|
|
||||||
# LLM yaml backend keys → url suffix, keyed by service name
|
# LLM yaml backend keys → url suffix, keyed by service name
|
||||||
_LLM_BACKENDS: dict[str, list[tuple[str, str]]] = {
|
_LLM_BACKENDS: dict[str, list[tuple[str, str]]] = {
|
||||||
"ollama": [("ollama", "/v1"), ("ollama_research", "/v1")],
|
"ollama": [("ollama", "/v1")],
|
||||||
"vllm": [("vllm", "/v1")],
|
"ollama_research": [("ollama_research", "/v1")],
|
||||||
"vision": [("vision_service", "")],
|
"vllm": [("vllm", "/v1"), ("vllm_research", "/v1")],
|
||||||
|
"vision": [("vision_service", "")],
|
||||||
}
|
}
|
||||||
|
|
||||||
# Docker-internal hostname:port for each service (when running in Docker)
|
# Docker-internal hostname:port for each service (when running in Docker)
|
||||||
_DOCKER_INTERNAL: dict[str, tuple[str, int]] = {
|
_DOCKER_INTERNAL: dict[str, tuple[str, int]] = {
|
||||||
"ollama": ("ollama", 11434),
|
"ollama": ("ollama", 11434),
|
||||||
"vllm": ("vllm", 8000),
|
"ollama_research": ("ollama_research", 11434), # container-internal port is always 11434
|
||||||
"vision": ("vision", 8002),
|
"vllm": ("vllm", 8000),
|
||||||
"searxng": ("searxng", 8080), # searxng internal port differs from host port
|
"vision": ("vision", 8002),
|
||||||
|
"searxng": ("searxng", 8080), # searxng internal port differs from host port
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -109,7 +113,6 @@ def get_ram_gb() -> tuple[float, float]:
|
||||||
|
|
||||||
|
|
||||||
def get_cpu_cores() -> int:
|
def get_cpu_cores() -> int:
|
||||||
import os
|
|
||||||
return os.cpu_count() or 1
|
return os.cpu_count() or 1
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -225,6 +228,43 @@ def calc_cpu_offload_gb(gpus: list[dict], ram_available_gb: float) -> int:
|
||||||
return min(int(headroom * 0.25), 8)
|
return min(int(headroom * 0.25), 8)
|
||||||
|
|
||||||
|
|
||||||
|
def _download_size_mb(profile: str, dual_gpu_mode: str = "ollama") -> dict[str, int]:
|
||||||
|
"""
|
||||||
|
Return estimated first-run download sizes in MB, keyed by component name.
|
||||||
|
Profile-aware: only includes components that will actually be pulled.
|
||||||
|
"""
|
||||||
|
sizes: dict[str, int] = {
|
||||||
|
"searxng": 300,
|
||||||
|
"app": 1500,
|
||||||
|
}
|
||||||
|
if profile in ("cpu", "single-gpu", "dual-gpu"):
|
||||||
|
sizes["ollama"] = 800
|
||||||
|
sizes["llama3_2_3b"] = 2000
|
||||||
|
if profile in ("single-gpu", "dual-gpu"):
|
||||||
|
sizes["vision_image"] = 3000
|
||||||
|
sizes["moondream2"] = 1800
|
||||||
|
if profile == "dual-gpu" and dual_gpu_mode in ("vllm", "mixed"):
|
||||||
|
sizes["vllm_image"] = 10000
|
||||||
|
return sizes
|
||||||
|
|
||||||
|
|
||||||
|
def _mixed_mode_vram_warning(gpus: list[dict], dual_gpu_mode: str) -> str | None:
|
||||||
|
"""
|
||||||
|
Return a warning string if GPU 1 likely lacks VRAM for mixed mode, else None.
|
||||||
|
Only relevant when dual_gpu_mode == 'mixed' and at least 2 GPUs are present.
|
||||||
|
"""
|
||||||
|
if dual_gpu_mode != "mixed" or len(gpus) < 2:
|
||||||
|
return None
|
||||||
|
free = gpus[1]["vram_free_gb"]
|
||||||
|
if free < 12:
|
||||||
|
return (
|
||||||
|
f"⚠ DUAL_GPU_MODE=mixed: GPU 1 has only {free:.1f} GB free — "
|
||||||
|
f"running ollama_research + vllm together may cause OOM. "
|
||||||
|
f"Consider DUAL_GPU_MODE=ollama or DUAL_GPU_MODE=vllm."
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
# ── Config writers ─────────────────────────────────────────────────────────────
|
# ── Config writers ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
def write_env(updates: dict[str, str]) -> None:
|
def write_env(updates: dict[str, str]) -> None:
|
||||||
|
|
@ -414,6 +454,38 @@ def main() -> None:
|
||||||
info = ports[name]
|
info = ports[name]
|
||||||
print(f"║ {name} :{info['resolved']} → app will use host.docker.internal:{info['resolved']}")
|
print(f"║ {name} :{info['resolved']} → app will use host.docker.internal:{info['resolved']}")
|
||||||
|
|
||||||
|
# ── Download size warning ──────────────────────────────────────────────
|
||||||
|
dual_gpu_mode = os.environ.get("DUAL_GPU_MODE", "ollama")
|
||||||
|
sizes = _download_size_mb(profile, dual_gpu_mode)
|
||||||
|
total_mb = sum(sizes.values())
|
||||||
|
print("║")
|
||||||
|
print("║ Download sizes (first-run estimates)")
|
||||||
|
print("║ Docker images")
|
||||||
|
print(f"║ app (Python build) ~{sizes.get('app', 0):,} MB")
|
||||||
|
if "searxng" in sizes:
|
||||||
|
print(f"║ searxng/searxng ~{sizes['searxng']:,} MB")
|
||||||
|
if "ollama" in sizes:
|
||||||
|
shared_note = " (shared by ollama + ollama_research)" if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed") else ""
|
||||||
|
print(f"║ ollama/ollama ~{sizes['ollama']:,} MB{shared_note}")
|
||||||
|
if "vision_image" in sizes:
|
||||||
|
print(f"║ vision service ~{sizes['vision_image']:,} MB (torch + moondream)")
|
||||||
|
if "vllm_image" in sizes:
|
||||||
|
print(f"║ vllm/vllm-openai ~{sizes['vllm_image']:,} MB")
|
||||||
|
print("║ Model weights (lazy-loaded on first use)")
|
||||||
|
if "llama3_2_3b" in sizes:
|
||||||
|
print(f"║ llama3.2:3b ~{sizes['llama3_2_3b']:,} MB → OLLAMA_MODELS_DIR")
|
||||||
|
if "moondream2" in sizes:
|
||||||
|
print(f"║ moondream2 ~{sizes['moondream2']:,} MB → vision container cache")
|
||||||
|
if profile == "dual-gpu" and dual_gpu_mode in ("ollama", "mixed"):
|
||||||
|
print("║ Note: ollama + ollama_research share model dir — no double download")
|
||||||
|
print(f"║ ⚠ Total first-run: ~{total_mb / 1024:.1f} GB (models persist between restarts)")
|
||||||
|
|
||||||
|
# ── Mixed-mode VRAM warning ────────────────────────────────────────────
|
||||||
|
vram_warn = _mixed_mode_vram_warning(gpus, dual_gpu_mode)
|
||||||
|
if vram_warn:
|
||||||
|
print("║")
|
||||||
|
print(f"║ {vram_warn}")
|
||||||
|
|
||||||
print("╚════════════════════════════════════════════════════╝")
|
print("╚════════════════════════════════════════════════════╝")
|
||||||
|
|
||||||
if not args.check_only:
|
if not args.check_only:
|
||||||
|
|
@ -426,6 +498,16 @@ def main() -> None:
|
||||||
# GPU info for the app container (which lacks nvidia-smi access)
|
# GPU info for the app container (which lacks nvidia-smi access)
|
||||||
env_updates["PEREGRINE_GPU_COUNT"] = str(len(gpus))
|
env_updates["PEREGRINE_GPU_COUNT"] = str(len(gpus))
|
||||||
env_updates["PEREGRINE_GPU_NAMES"] = ",".join(g["name"] for g in gpus)
|
env_updates["PEREGRINE_GPU_NAMES"] = ",".join(g["name"] for g in gpus)
|
||||||
|
# Write DUAL_GPU_MODE default for new 2-GPU setups (don't override user's choice)
|
||||||
|
if len(gpus) >= 2:
|
||||||
|
existing_env: dict[str, str] = {}
|
||||||
|
if ENV_FILE.exists():
|
||||||
|
for line in ENV_FILE.read_text().splitlines():
|
||||||
|
if "=" in line and not line.startswith("#"):
|
||||||
|
k, _, v = line.partition("=")
|
||||||
|
existing_env[k.strip()] = v.strip()
|
||||||
|
if "DUAL_GPU_MODE" not in existing_env:
|
||||||
|
env_updates["DUAL_GPU_MODE"] = "ollama"
|
||||||
write_env(env_updates)
|
write_env(env_updates)
|
||||||
update_llm_yaml(ports)
|
update_llm_yaml(ports)
|
||||||
write_compose_override(ports)
|
write_compose_override(ports)
|
||||||
|
|
|
||||||
94
tests/test_benchmark_classifier.py
Normal file
94
tests/test_benchmark_classifier.py
Normal file
|
|
@ -0,0 +1,94 @@
|
||||||
|
"""Tests for benchmark_classifier — no model downloads required."""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def test_registry_has_nine_models():
    """The registry ships exactly nine benchmark candidates."""
    from scripts.benchmark_classifier import MODEL_REGISTRY
    names = list(MODEL_REGISTRY)
    assert len(names) == 9
|
||||||
|
|
||||||
|
|
||||||
|
def test_registry_default_count():
    """Exactly five models run without --include-slow."""
    from scripts.benchmark_classifier import MODEL_REGISTRY
    default_count = sum(1 for entry in MODEL_REGISTRY.values() if entry["default"])
    assert default_count == 5
|
||||||
|
|
||||||
|
|
||||||
|
def test_registry_entries_have_required_keys():
    """Every registry entry carries the full schema and a valid adapter class."""
    from scripts.benchmark_classifier import MODEL_REGISTRY
    from scripts.classifier_adapters import ClassifierAdapter
    for name, entry in MODEL_REGISTRY.items():
        for key in ("adapter", "model_id", "params", "default"):
            assert key in entry, f"{name} missing '{key}'"
        assert issubclass(entry["adapter"], ClassifierAdapter), \
            f"{name} adapter must be a ClassifierAdapter subclass"
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_scoring_jsonl(tmp_path):
    """Labeled rows written as JSONL round-trip through the loader."""
    import json
    from scripts.benchmark_classifier import load_scoring_jsonl

    rows = [
        {"subject": "Hi", "body": "Body text", "label": "neutral"},
        {"subject": "Interview", "body": "Schedule a call", "label": "interview_scheduled"},
    ]
    score_path = tmp_path / "score.jsonl"
    score_path.write_text("\n".join(json.dumps(r) for r in rows))

    loaded = load_scoring_jsonl(str(score_path))
    assert len(loaded) == 2
    assert loaded[0]["label"] == "neutral"
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_scoring_jsonl_missing_file():
    """A nonexistent path raises FileNotFoundError."""
    from scripts.benchmark_classifier import load_scoring_jsonl
    with pytest.raises(FileNotFoundError):
        load_scoring_jsonl("/nonexistent/path.jsonl")
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_scoring_with_mock_adapters(tmp_path):
    """run_scoring() returns per-model metrics using mock adapters."""
    import json
    from unittest.mock import MagicMock
    from scripts.benchmark_classifier import run_scoring

    rows = [
        {"subject": "Interview", "body": "Let's schedule", "label": "interview_scheduled"},
        {"subject": "Sorry", "body": "We went with others", "label": "rejected"},
        {"subject": "Offer", "body": "We are pleased", "label": "offer_received"},
    ]
    score_file = tmp_path / "score.jsonl"
    score_file.write_text("\n".join(json.dumps(r) for r in rows))

    # "perfect" answers every subject correctly; "bad" always says neutral.
    answers = {
        "Interview": "interview_scheduled",
        "Sorry": "rejected",
        "Offer": "offer_received",
    }
    perfect = MagicMock()
    perfect.name = "perfect"
    perfect.classify.side_effect = lambda subject, body: answers[subject]

    bad = MagicMock()
    bad.name = "bad"
    bad.classify.return_value = "neutral"

    results = run_scoring([perfect, bad], str(score_file))

    assert results["perfect"]["__accuracy__"] == pytest.approx(1.0)
    assert results["bad"]["__accuracy__"] == pytest.approx(0.0)
    assert "latency_ms" in results["perfect"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_scoring_handles_classify_error(tmp_path):
    """run_scoring() falls back to 'neutral' on exception and continues."""
    import json
    from unittest.mock import MagicMock

    from scripts.benchmark_classifier import run_scoring

    row = {"subject": "Hi", "body": "Body", "label": "neutral"}
    score_file = tmp_path / "score.jsonl"
    score_file.write_text(json.dumps(row))

    broken = MagicMock()
    broken.name = "broken"
    # Every classify() call blows up; run_scoring must survive it.
    broken.classify.side_effect = RuntimeError("model crashed")

    results = run_scoring([broken], str(score_file))
    assert "broken" in results
|
||||||
174
tests/test_classifier_adapters.py
Normal file
174
tests/test_classifier_adapters.py
Normal file
|
|
@ -0,0 +1,174 @@
|
||||||
|
"""Tests for classifier_adapters — no model downloads required."""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
def test_labels_constant_has_six_items():
    """LABELS defines exactly six categories, including the two we rely on here."""
    from scripts.classifier_adapters import LABELS

    assert len(LABELS) == 6
    for required in ("interview_scheduled", "neutral"):
        assert required in LABELS
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_metrics_perfect_predictions():
    """All-correct predictions yield per-label F1, accuracy, and macro-F1 of 1.0."""
    from scripts.classifier_adapters import LABELS, compute_metrics

    gold = ["rejected", "interview_scheduled", "neutral"]
    preds = list(gold)  # identical predictions

    metrics = compute_metrics(preds, gold, LABELS)

    assert metrics["rejected"]["f1"] == pytest.approx(1.0)
    assert metrics["__accuracy__"] == pytest.approx(1.0)
    assert metrics["__macro_f1__"] == pytest.approx(1.0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_metrics_all_wrong():
    """Zero correct predictions give zero recall and zero accuracy."""
    from scripts.classifier_adapters import LABELS, compute_metrics

    metrics = compute_metrics(
        ["neutral", "interview_scheduled"],  # predictions
        ["rejected", "rejected"],            # gold labels
        LABELS,
    )
    assert metrics["rejected"]["recall"] == pytest.approx(0.0)
    assert metrics["__accuracy__"] == pytest.approx(0.0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_metrics_partial():
    """Mixed results: precision/recall/F1 are per-label, accuracy is overall."""
    from scripts.classifier_adapters import LABELS, compute_metrics

    gold = ["rejected", "neutral", "rejected"]
    preds = ["rejected", "neutral", "interview_scheduled"]

    metrics = compute_metrics(preds, gold, LABELS)

    # 'rejected': one true positive, no false positives, one miss.
    assert metrics["rejected"]["precision"] == pytest.approx(1.0)
    assert metrics["rejected"]["recall"] == pytest.approx(0.5)
    assert metrics["neutral"]["f1"] == pytest.approx(1.0)
    assert metrics["__accuracy__"] == pytest.approx(2 / 3)
|
||||||
|
|
||||||
|
|
||||||
|
def test_compute_metrics_empty():
    """Empty inputs produce 0.0 accuracy rather than raising."""
    from scripts.classifier_adapters import LABELS, compute_metrics

    empty_metrics = compute_metrics([], [], LABELS)
    assert empty_metrics["__accuracy__"] == pytest.approx(0.0)
|
||||||
|
|
||||||
|
|
||||||
|
def test_classifier_adapter_is_abstract():
    """ClassifierAdapter is an ABC and cannot be instantiated directly."""
    from scripts.classifier_adapters import ClassifierAdapter

    with pytest.raises(TypeError):
        ClassifierAdapter()
|
||||||
|
|
||||||
|
|
||||||
|
# ---- ZeroShotAdapter tests ----
|
||||||
|
|
||||||
|
def test_zeroshot_adapter_classify_mocked():
    """classify() returns the top-scoring label from the mocked HF pipeline."""
    from unittest.mock import MagicMock, patch

    from scripts.classifier_adapters import ZeroShotAdapter

    fake_pipe = MagicMock()
    fake_pipe.return_value = {
        "labels": ["rejected", "neutral", "interview_scheduled"],
        "scores": [0.85, 0.10, 0.05],
    }

    with patch("scripts.classifier_adapters.pipeline", fake_pipe):
        adapter = ZeroShotAdapter("test-zs", "some/model")
        adapter.load()
        label = adapter.classify("We went with another candidate", "Thank you for applying.")

    assert label == "rejected"
    # The subject text must appear in the text handed to the pipeline.
    assert "We went with another candidate" in fake_pipe.call_args[0][0]
|
||||||
|
|
||||||
|
|
||||||
|
def test_zeroshot_adapter_unload_clears_pipeline():
    """unload() drops the cached pipeline reference."""
    from unittest.mock import MagicMock, patch

    from scripts.classifier_adapters import ZeroShotAdapter

    with patch("scripts.classifier_adapters.pipeline", MagicMock()):
        adapter = ZeroShotAdapter("test-zs", "some/model")
        adapter.load()
        assert adapter._pipeline is not None

        adapter.unload()
        assert adapter._pipeline is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_zeroshot_adapter_lazy_loads():
    """classify() without an explicit load() builds the pipeline exactly once."""
    from unittest.mock import MagicMock, patch

    from scripts.classifier_adapters import ZeroShotAdapter

    inner = MagicMock(return_value={"labels": ["neutral"], "scores": [1.0]})
    factory = MagicMock(return_value=inner)

    with patch("scripts.classifier_adapters.pipeline", factory):
        adapter = ZeroShotAdapter("test-zs", "some/model")
        adapter.classify("subject", "body")

    factory.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
# ---- GLiClassAdapter tests ----
|
||||||
|
|
||||||
|
def test_gliclass_adapter_classify_mocked():
    """classify() returns the label with the highest GLiClass score."""
    from unittest.mock import MagicMock, patch

    from scripts.classifier_adapters import GLiClassAdapter

    pipe = MagicMock(return_value=[[
        {"label": "interview_scheduled", "score": 0.91},
        {"label": "neutral", "score": 0.05},
        {"label": "rejected", "score": 0.04},
    ]])

    with patch("scripts.classifier_adapters.GLiClassModel"), \
         patch("scripts.classifier_adapters.AutoTokenizer"), \
         patch("scripts.classifier_adapters.ZeroShotClassificationPipeline",
               return_value=pipe):
        adapter = GLiClassAdapter("test-gli", "some/gliclass-model")
        adapter.load()
        result = adapter.classify("Interview invitation", "Let's schedule a call.")

    assert result == "interview_scheduled"
|
||||||
|
|
||||||
|
|
||||||
|
def test_gliclass_adapter_returns_highest_score():
    """The winning label is chosen by score, not by position in the list."""
    from unittest.mock import MagicMock, patch

    from scripts.classifier_adapters import GLiClassAdapter

    # Highest score deliberately in the middle of the list.
    pipe = MagicMock(return_value=[[
        {"label": "neutral", "score": 0.02},
        {"label": "offer_received", "score": 0.88},
        {"label": "rejected", "score": 0.10},
    ]])

    with patch("scripts.classifier_adapters.GLiClassModel"), \
         patch("scripts.classifier_adapters.AutoTokenizer"), \
         patch("scripts.classifier_adapters.ZeroShotClassificationPipeline",
               return_value=pipe):
        adapter = GLiClassAdapter("test-gli", "some/model")
        adapter.load()
        result = adapter.classify("Offer letter enclosed", "Dear Meghan, we are pleased to offer...")

    assert result == "offer_received"
|
||||||
|
|
||||||
|
|
||||||
|
# ---- RerankerAdapter tests ----
|
||||||
|
|
||||||
|
def test_reranker_adapter_picks_highest_score():
    """classify() scores one (email, description) pair per label and picks the max."""
    from unittest.mock import MagicMock, patch

    from scripts.classifier_adapters import LABELS, RerankerAdapter

    reranker = MagicMock()
    # Index 2 wins — expected to correspond to 'rejected'.
    reranker.compute_score.return_value = [0.1, 0.05, 0.85, 0.05, 0.02, 0.03]

    with patch("scripts.classifier_adapters.FlagReranker", return_value=reranker):
        adapter = RerankerAdapter("test-rr", "BAAI/bge-reranker-v2-m3")
        adapter.load()
        result = adapter.classify(
            "We regret to inform you",
            "After careful consideration we are moving forward with other candidates.",
        )

    assert result == "rejected"
    # One pair submitted per label.
    pairs = reranker.compute_score.call_args[0][0]
    assert len(pairs) == len(LABELS)
|
||||||
|
|
||||||
|
|
||||||
|
def test_reranker_adapter_descriptions_cover_all_labels():
    """Every label has a reranker description — no extras, no gaps."""
    from scripts.classifier_adapters import LABEL_DESCRIPTIONS, LABELS

    assert set(LABELS) == set(LABEL_DESCRIPTIONS)
|
||||||
216
tests/test_preflight.py
Normal file
216
tests/test_preflight.py
Normal file
|
|
@ -0,0 +1,216 @@
|
||||||
|
"""Tests for scripts/preflight.py additions: dual-GPU service table, size warning, VRAM check."""
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
import yaml
|
||||||
|
import tempfile
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
# ── Service table ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_ollama_research_in_services():
    """ollama_research must be in _SERVICES at port 11435."""
    from scripts.preflight import _SERVICES

    assert "ollama_research" in _SERVICES
    _, port, env_var, docker_owned, adoptable = _SERVICES["ollama_research"]
    assert port == 11435
    assert env_var == "OLLAMA_RESEARCH_PORT"
    assert docker_owned is True
    assert adoptable is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_ollama_research_in_llm_backends():
    """ollama_research must be a standalone key in _LLM_BACKENDS (not nested under ollama)."""
    from scripts.preflight import _LLM_BACKENDS

    assert "ollama_research" in _LLM_BACKENDS
    assert any(name == "ollama_research" for name, _ in _LLM_BACKENDS["ollama_research"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_vllm_research_in_llm_backends():
    """vllm_research must be registered under vllm in _LLM_BACKENDS."""
    from scripts.preflight import _LLM_BACKENDS

    assert "vllm" in _LLM_BACKENDS
    assert any(name == "vllm_research" for name, _ in _LLM_BACKENDS["vllm"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_ollama_research_in_docker_internal():
    """ollama_research must map to internal port 11434 (Ollama's container port)."""
    from scripts.preflight import _DOCKER_INTERNAL

    assert "ollama_research" in _DOCKER_INTERNAL
    hostname, internal_port = _DOCKER_INTERNAL["ollama_research"]
    assert hostname == "ollama_research"
    assert internal_port == 11434  # container-internal port is always 11434
|
||||||
|
|
||||||
|
|
||||||
|
def test_ollama_not_mapped_to_ollama_research_backend():
    """ollama service key must only update the ollama llm backend, not ollama_research."""
    from scripts.preflight import _LLM_BACKENDS

    names = [name for name, _ in _LLM_BACKENDS.get("ollama", [])]
    assert "ollama_research" not in names
|
||||||
|
|
||||||
|
|
||||||
|
# ── Download size warning ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_download_size_remote_profile():
    """Remote profile: only searxng + app, no ollama, no vision, no vllm."""
    from scripts.preflight import _download_size_mb

    sizes = _download_size_mb("remote", "ollama")

    for present in ("searxng", "app"):
        assert present in sizes
    for absent in ("ollama", "vision_image", "vllm_image"):
        assert absent not in sizes
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_size_cpu_profile():
    """CPU profile: adds ollama image + llama3.2:3b weights."""
    from scripts.preflight import _download_size_mb

    sizes = _download_size_mb("cpu", "ollama")

    assert "ollama" in sizes
    assert "llama3_2_3b" in sizes
    assert "vision_image" not in sizes
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_size_single_gpu_profile():
    """Single-GPU: adds vision image + moondream2 weights."""
    from scripts.preflight import _download_size_mb

    sizes = _download_size_mb("single-gpu", "ollama")

    assert "vision_image" in sizes
    assert "moondream2" in sizes
    assert "vllm_image" not in sizes
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_size_dual_gpu_ollama_mode():
    """dual-gpu + ollama mode: no vllm image."""
    from scripts.preflight import _download_size_mb

    assert "vllm_image" not in _download_size_mb("dual-gpu", "ollama")
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_size_dual_gpu_vllm_mode():
    """dual-gpu + vllm mode: adds ~10 GB vllm image."""
    from scripts.preflight import _download_size_mb

    sizes = _download_size_mb("dual-gpu", "vllm")
    assert "vllm_image" in sizes
    assert sizes["vllm_image"] >= 9000  # at least 9 GB
|
||||||
|
|
||||||
|
|
||||||
|
def test_download_size_dual_gpu_mixed_mode():
    """dual-gpu + mixed mode: also includes vllm image."""
    from scripts.preflight import _download_size_mb

    assert "vllm_image" in _download_size_mb("dual-gpu", "mixed")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Mixed-mode VRAM warning ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_mixed_mode_vram_warning_triggered():
    """Should return a warning string when GPU 1 has < 12 GB free in mixed mode."""
    from scripts.preflight import _mixed_mode_vram_warning

    def _gpu(free_gb):
        return {"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": free_gb}

    # GPU 1 is tight on free VRAM.
    warning = _mixed_mode_vram_warning([_gpu(20.0), _gpu(8.0)], "mixed")

    assert warning is not None
    assert "8.0" in warning or "GPU 1" in warning
|
||||||
|
|
||||||
|
|
||||||
|
def test_mixed_mode_vram_warning_not_triggered_with_headroom():
    """Should return None when GPU 1 has >= 12 GB free."""
    from scripts.preflight import _mixed_mode_vram_warning

    def _gpu(free_gb):
        return {"name": "RTX 4090", "vram_total_gb": 24.0, "vram_free_gb": free_gb}

    # Plenty of headroom on both GPUs.
    assert _mixed_mode_vram_warning([_gpu(20.0), _gpu(18.0)], "mixed") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_mixed_mode_vram_warning_not_triggered_for_other_modes():
    """Warning only applies in mixed mode."""
    from scripts.preflight import _mixed_mode_vram_warning

    def _gpu(free_gb):
        return {"name": "RTX 3090", "vram_total_gb": 24.0, "vram_free_gb": free_gb}

    gpus = [_gpu(20.0), _gpu(6.0)]
    for mode in ("ollama", "vllm"):
        assert _mixed_mode_vram_warning(gpus, mode) is None
|
||||||
|
|
||||||
|
|
||||||
|
# ── update_llm_yaml with ollama_research ──────────────────────────────────────
|
||||||
|
|
||||||
|
def test_update_llm_yaml_sets_ollama_research_url_docker_internal():
    """ollama_research backend URL must be set to ollama_research:11434 when Docker-owned."""
    from scripts.preflight import update_llm_yaml

    backends = {
        name: {"base_url": "http://old", "type": "openai_compat"}
        for name in ("ollama", "ollama_research", "vllm", "vllm_research")
    }
    backends["vision_service"] = {"base_url": "http://old", "type": "vision_service"}

    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
        yaml.dump({"backends": backends}, f)
        cfg_path = Path(f.name)

    # All services Docker-owned (external=False).
    ports = {
        "ollama": {"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"},
        "ollama_research": {"resolved": 11435, "external": False, "env_var": "OLLAMA_RESEARCH_PORT"},
        "vllm": {"resolved": 8000, "external": False, "env_var": "VLLM_PORT"},
        "vision": {"resolved": 8002, "external": False, "env_var": "VISION_PORT"},
    }

    try:
        with patch("scripts.preflight.LLM_YAML", cfg_path):
            update_llm_yaml(ports)

        updated = yaml.safe_load(cfg_path.read_text())["backends"]
        assert updated["ollama_research"]["base_url"] == "http://ollama_research:11434/v1"
        # vllm_research shares the vllm backend URL.
        assert updated["vllm_research"]["base_url"] == updated["vllm"]["base_url"]
    finally:
        cfg_path.unlink()
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_llm_yaml_sets_ollama_research_url_external():
    """When ollama_research is external (adopted), URL uses host.docker.internal:11435."""
    from scripts.preflight import update_llm_yaml

    cfg = {
        "backends": {
            name: {"base_url": "http://old", "type": "openai_compat"}
            for name in ("ollama", "ollama_research")
        }
    }
    with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml", delete=False) as f:
        yaml.dump(cfg, f)
        cfg_path = Path(f.name)

    ports = {
        "ollama": {"resolved": 11434, "external": False, "env_var": "OLLAMA_PORT"},
        # external=True → adopted host process, reachable via host.docker.internal.
        "ollama_research": {"resolved": 11435, "external": True, "env_var": "OLLAMA_RESEARCH_PORT"},
    }

    try:
        with patch("scripts.preflight.LLM_YAML", cfg_path):
            update_llm_yaml(ports)

        updated = yaml.safe_load(cfg_path.read_text())
        assert updated["backends"]["ollama_research"]["base_url"] == "http://host.docker.internal:11435/v1"
    finally:
        cfg_path.unlink()
|
||||||
Loading…
Reference in a new issue