From 54de37e5fa378bb3e93dd6e9b75a74d37d15afc7 Mon Sep 17 00:00:00 2001
From: pyr0ball <pyroballpcs@gmail.com>
Date: Wed, 25 Feb 2026 16:22:48 -0800
Subject: [PATCH] feat: containerize fine-tune pipeline (Dockerfile.finetune +
 make finetune)

- Dockerfile.finetune: PyTorch 2.3/CUDA 12.1 base + unsloth + training stack
- finetune_local.py: auto-register model via Ollama HTTP API after GGUF
  export; path-translate between finetune container mount and Ollama's view;
  update config/llm.yaml automatically; DOCS_DIR env override for Docker
- prepare_training_data.py: DOCS_DIR env override so make prepare-training
  works correctly inside the app container
- compose.yml: add finetune service (cpu/single-gpu/dual-gpu profiles);
  DOCS_DIR=/docs injected into app + finetune containers
- compose.podman-gpu.yml: CDI device override for finetune service
- Makefile: make prepare-training + make finetune targets
---
 Dockerfile.finetune              |  38 +++++++++
 Makefile                         |  11 ++-
 compose.podman-gpu.yml           |   8 ++
 compose.yml                      |  20 +++++
 scripts/finetune_local.py        | 134 ++++++++++++++++++++++++-------
 scripts/prepare_training_data.py |   6 +-
 6 files changed, 183 insertions(+), 34 deletions(-)
 create mode 100644 Dockerfile.finetune
diff --git a/Dockerfile.finetune b/Dockerfile.finetune
new file mode 100644
index 0000000..bf3a70e
--- /dev/null
+++ b/Dockerfile.finetune
@@ -0,0 +1,38 @@
+# Dockerfile.finetune — Cover letter LoRA fine-tuner (QLoRA via unsloth)
+# Large image (~12-15 GB after build). Built once, cached on rebuilds.
+# GPU strongly recommended. CPU fallback works but training is very slow.
+#
+# Tested base: pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
+# If your GPU requires a different CUDA version, change the FROM line; the
+# bitsandbytes>=0.43 wheel installed below bundles binaries for several CUDA versions.
+FROM pytorch/pytorch:2.3.0-cuda12.1-cudnn8-runtime
+
+WORKDIR /app
+
+# Build tools needed by bitsandbytes CUDA kernels and unsloth
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc g++ git libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install training stack.
+# unsloth detects CUDA version automatically from the base image.
+RUN pip install --no-cache-dir \
+    "unsloth @ git+https://github.com/unslothai/unsloth.git" \
+    "datasets>=2.18" "trl>=0.8" peft transformers \
+    "bitsandbytes>=0.43.0" accelerate sentencepiece \
+    requests pyyaml
+
+COPY scripts/ /app/scripts/
+COPY config/  /app/config/
+
+ENV PYTHONUNBUFFERED=1
+# Pin to GPU 0; overridable at runtime with --env CUDA_VISIBLE_DEVICES=
+ENV CUDA_VISIBLE_DEVICES=0
+
+# Runtime env vars injected by compose.yml:
+#   OLLAMA_URL              — Ollama API base (compose sets http://ollama:11434; script fallback is http://localhost:11434)
+#   OLLAMA_MODELS_MOUNT     — finetune container's mount path for ollama models volume
+#   OLLAMA_MODELS_OLLAMA_PATH — Ollama container's mount path for same volume
+#   DOCS_DIR                — cover letters + training data root (compose sets /docs)
+
+ENTRYPOINT ["python", "scripts/finetune_local.py"]
diff --git a/Makefile b/Makefile
index 1e5a1f7..4576ebf 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 # Makefile — Peregrine convenience targets
 # Usage: make <target>
 
-.PHONY: setup preflight start stop restart logs test clean help
+.PHONY: setup preflight start stop restart logs test prepare-training finetune clean help
 
 PROFILE ?= remote
 PYTHON  ?= python3
@@ -43,7 +43,14 @@ logs:           ## Tail app logs
 	$(COMPOSE) logs -f app
 
 test:           ## Run the test suite
-	$(PYTHON) -m pytest tests/ -v
+	@$(PYTHON) -m pytest tests/ -v
+
+prepare-training: ## Scan docs_dir for cover letters and build training JSONL
+	$(COMPOSE) $(COMPOSE_FILES) run --rm app python scripts/prepare_training_data.py
+
+finetune:       ## Fine-tune your personal cover letter model (run prepare-training first)
+	@echo "Starting fine-tune (30-90 min on GPU, much longer on CPU)..."
+	$(COMPOSE) $(COMPOSE_FILES) --profile $(PROFILE) run --rm finetune
 
 clean:          ## Remove containers, images, and data volumes (DESTRUCTIVE)
 	@echo "WARNING: This will delete all Peregrine containers and data."
diff --git a/compose.podman-gpu.yml b/compose.podman-gpu.yml
index 46d5465..e812287 100644
--- a/compose.podman-gpu.yml
+++ b/compose.podman-gpu.yml
@@ -33,3 +33,11 @@ services:
       resources:
         reservations:
           devices: []
+
+  finetune:
+    devices:
+      - nvidia.com/gpu=0
+    deploy:
+      resources:
+        reservations:
+          devices: []
diff --git a/compose.yml b/compose.yml
index 79d8ba2..46b9bff 100644
--- a/compose.yml
+++ b/compose.yml
@@ -12,6 +12,7 @@ services:
       - ${DOCS_DIR:-~/Documents/JobSearch}:/docs
     environment:
       - STAGING_DB=/app/data/staging.db
+      - DOCS_DIR=/docs
       - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
       - OPENAI_COMPAT_URL=${OPENAI_COMPAT_URL:-}
       - OPENAI_COMPAT_KEY=${OPENAI_COMPAT_KEY:-}
@@ -101,3 +102,22 @@ services:
               capabilities: [gpu]
     profiles: [dual-gpu]
     restart: unless-stopped
+
+  finetune:
+    build:
+      context: .
+      dockerfile: Dockerfile.finetune
+    volumes:
+      - ${DOCS_DIR:-~/Documents/JobSearch}:/docs
+      - ${OLLAMA_MODELS_DIR:-~/models/ollama}:/ollama-models
+      - ./config:/app/config
+    environment:
+      - DOCS_DIR=/docs
+      - OLLAMA_URL=http://ollama:11434
+      - OLLAMA_MODELS_MOUNT=/ollama-models
+      - OLLAMA_MODELS_OLLAMA_PATH=/root/.ollama
+    depends_on:
+      ollama:
+        condition: service_started
+    profiles: [cpu, single-gpu, dual-gpu]
+    restart: "no"
diff --git a/scripts/finetune_local.py b/scripts/finetune_local.py
index bfbf199..c096e33 100644
--- a/scripts/finetune_local.py
+++ b/scripts/finetune_local.py
@@ -32,7 +32,12 @@ _profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
 # ── Config ────────────────────────────────────────────────────────────────────
 DEFAULT_MODEL   = "unsloth/Llama-3.2-3B-Instruct"   # safe on 8 GB VRAM
 
-_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
+# DOCS_DIR env var overrides user_profile when running inside Docker
+_docs_env = os.environ.get("DOCS_DIR", "")
+_docs = Path(_docs_env) if _docs_env else (
+    _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
+)
+
 LETTERS_JSONL   = _docs / "training_data" / "cover_letters.jsonl"
 OUTPUT_DIR      = _docs / "training_data" / "finetune_output"
 GGUF_DIR        = _docs / "training_data" / "gguf"
@@ -66,7 +71,7 @@ print(f"{'='*60}\n")
 # ── Load dataset ──────────────────────────────────────────────────────────────
 if not LETTERS_JSONL.exists():
     sys.exit(f"ERROR: Dataset not found at {LETTERS_JSONL}\n"
-             "Run: conda run -n job-seeker python scripts/prepare_training_data.py")
+             "Run: make prepare-training  (or: python scripts/prepare_training_data.py)")
 
 records = [json.loads(l) for l in LETTERS_JSONL.read_text().splitlines() if l.strip()]
 print(f"Loaded {len(records)} training examples.")
@@ -222,35 +227,102 @@ if not args.no_gguf and USE_UNSLOTH:
 else:
     gguf_path = None
 
-# ── Print next steps ──────────────────────────────────────────────────────────
-print(f"\n{'='*60}")
-print("  DONE — next steps to load into Ollama:")
-print(f"{'='*60}")
+# ── Register with Ollama (auto) ────────────────────────────────────────────────
+
+def _auto_register_ollama(gguf_path: Path, model_name: str, system_prompt: str) -> bool:
+    """
+    Copy GGUF into the shared Ollama models volume and register via the API.
+
+    Works in two modes:
+      Containerised — OLLAMA_MODELS_MOUNT + OLLAMA_MODELS_OLLAMA_PATH env vars
+                      translate the container path into Ollama's view of the file.
+      Local         — gguf_path is an absolute path Ollama can read directly.
+
+    Returns True only if Ollama accepted the model; config/llm.yaml is updated only then.
+    """
+    import shutil
+    import requests
+
+    ollama_url        = os.environ.get("OLLAMA_URL", "http://localhost:11434").rstrip("/")
+    models_mount      = os.environ.get("OLLAMA_MODELS_MOUNT", "")
+    ollama_models_dir = os.environ.get("OLLAMA_MODELS_OLLAMA_PATH", "")
+
+    # ── Place GGUF where Ollama can read it ───────────────────────────────────
+    if models_mount and ollama_models_dir:
+        # Containerised: write into the shared volume; Ollama reads from its own mount.
+        dest = Path(models_mount) / "custom" / gguf_path.name
+        dest.parent.mkdir(parents=True, exist_ok=True)
+        if dest.resolve() != gguf_path.resolve():  # never copy a file onto itself
+            print(f"Copying GGUF → shared volume: {dest}")
+            shutil.copy2(gguf_path, dest)
+        ollama_gguf = f"{ollama_models_dir}/custom/{gguf_path.name}"
+    else:
+        # Local: pass the absolute path directly.
+        ollama_gguf = str(gguf_path.resolve())
+
+    modelfile_text = (
+        f"FROM {ollama_gguf}\n"
+        f"SYSTEM \"\"\"\n{system_prompt}\n\"\"\"\n"
+        f"PARAMETER temperature 0.7\n"
+        f"PARAMETER top_p 0.9\n"
+        f"PARAMETER num_ctx 32768\n"
+    )
+
+    (OUTPUT_DIR / "Modelfile").write_text(modelfile_text, encoding="utf-8")  # reference copy for debugging
+
+    # ── Create via Ollama API ─────────────────────────────────────────────────
+    # NOTE(review): newer Ollama releases may reject the "modelfile" field — confirm.
+    print(f"\nRegistering '{model_name}' with Ollama at {ollama_url} …")
+    try:
+        with requests.post(
+            f"{ollama_url}/api/create",
+            json={"name": model_name, "modelfile": modelfile_text},
+            timeout=300, stream=True,
+        ) as r:
+            error = "" if r.status_code == 200 else f"WARNING: Ollama returned HTTP {r.status_code}"
+            for line in filter(None, r.iter_lines()):
+                try:
+                    event = json.loads(line)
+                    failure, msg = event.get("error"), event.get("status", "")
+                except Exception:
+                    failure, msg = None, line.decode(errors="replace")
+                if failure:  # treat a streamed "error" event as a failed registration
+                    error = f"ERROR: {failure} (HTTP {r.status_code})"
+                elif msg:
+                    print(f"  {msg}")
+    except Exception as exc:
+        error = f"Ollama registration failed: {exc}"
+    if error:
+        print(f"  {error}")
+        print(f"  Run manually: ollama create {model_name} -f {OUTPUT_DIR / 'Modelfile'}")
+        return False
+
+    # ── Update config/llm.yaml ────────────────────────────────────────────────
+    llm_yaml = Path(__file__).parent.parent / "config" / "llm.yaml"
+    if llm_yaml.exists():
+        try:
+            import yaml as _yaml
+            cfg = _yaml.safe_load(llm_yaml.read_text(encoding="utf-8")) or {}
+            if "backends" in cfg and "ollama" in cfg["backends"]:
+                cfg["backends"]["ollama"]["model"] = f"{model_name}:latest"
+                dumped = _yaml.dump(cfg, default_flow_style=False, allow_unicode=True, sort_keys=False)
+                llm_yaml.write_text(dumped, encoding="utf-8")  # key order kept; YAML comments are lost
+                print(f"  llm.yaml updated → ollama.model = {model_name}:latest")
+        except Exception as exc:
+            print(f"  Could not update llm.yaml automatically: {exc}")
+
+    print(f"\n{'='*60}")
+    print(f"  Model ready: {model_name}:latest")
+    print(f"  Test: ollama run {model_name} 'Write a cover letter for a Senior Engineer role at Acme Corp.'")
+    print(f"{'='*60}\n")
+    return True
+
 
 if gguf_path and gguf_path.exists():
-    modelfile = OUTPUT_DIR / "Modelfile"
-    modelfile.write_text(f"""FROM {gguf_path}
-SYSTEM \"\"\"
-{SYSTEM_PROMPT}
-\"\"\"
-PARAMETER temperature 0.7
-PARAMETER top_p 0.9
-PARAMETER num_ctx 32768
-""")
-    print(f"\n1. Modelfile written to: {modelfile}")
-    print(f"\n2. Create the Ollama model:")
-    print(f"     ollama create {OLLAMA_NAME} -f {modelfile}")
-    print(f"\n3. Test it:")
-    print(f"     ollama run {OLLAMA_NAME} 'Write a cover letter for a Senior Customer Success Manager position at Acme Corp.'")
-    print(f"\n4. Update llm.yaml to use '{OLLAMA_NAME}:latest' as the ollama model,")
-    print(f"   then pick it in Settings → LLM Backends → Ollama → Model.")
+    _auto_register_ollama(gguf_path, OLLAMA_NAME, SYSTEM_PROMPT)
 else:
-    print(f"\n  Adapter only (no GGUF). To convert manually:")
-    print(f"  1. Merge adapter:")
-    print(f"       conda run -n ogma python -c \"")
-    print(f"         from peft import AutoPeftModelForCausalLM")
-    print(f"         m = AutoPeftModelForCausalLM.from_pretrained('{adapter_path}')")
-    print(f"         m.merge_and_unload().save_pretrained('{OUTPUT_DIR}/merged')\"")
-    print(f"  2. Convert to GGUF using textgen env's convert_hf_to_gguf.py")
-    print(f"  3. ollama create {OLLAMA_NAME} -f Modelfile")
-print()
+    print(f"\n{'='*60}")
+    print("  Adapter saved (no GGUF produced).")
+    print("  GGUF export runs only with unsloth and without --no-gguf; re-run that way to register with Ollama.")
+    print(f"  Adapter path: {adapter_path}")
+    print(f"{'='*60}\n")
diff --git a/scripts/prepare_training_data.py b/scripts/prepare_training_data.py
index 9b7441c..e0bc046 100644
--- a/scripts/prepare_training_data.py
+++ b/scripts/prepare_training_data.py
@@ -12,6 +12,7 @@ Usage:
 """
 import argparse
 import json
+import os
 import re
 import sys
 from pathlib import Path
@@ -22,7 +23,10 @@ from scripts.user_profile import UserProfile
 _USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
 _profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
 
-_docs = _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
+_docs_env = os.environ.get("DOCS_DIR", "")
+_docs = Path(_docs_env) if _docs_env else (
+    _profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
+)
 LETTERS_DIR = _docs
 # Use two globs to handle mixed capitalisation ("Cover Letter" vs "cover letter")
 LETTER_GLOBS = ["*Cover Letter*.md", "*cover letter*.md"]