peregrine/scripts/prepare_training_data.py
pyr0ball 740b0ea45a feat: containerize fine-tune pipeline (Dockerfile.finetune + make finetune)
- Dockerfile.finetune: PyTorch 2.3/CUDA 12.1 base + unsloth + training stack
- finetune_local.py: auto-register model via Ollama HTTP API after GGUF
  export; path-translate between finetune container mount and Ollama's view;
  update config/llm.yaml automatically; DOCS_DIR env override for Docker
- prepare_training_data.py: DOCS_DIR env override so make prepare-training
  works correctly inside the app container
- compose.yml: add finetune service (cpu/single-gpu/dual-gpu profiles);
  DOCS_DIR=/docs injected into app + finetune containers
- compose.podman-gpu.yml: CDI device override for finetune service
- Makefile: make prepare-training + make finetune targets
2026-02-25 16:22:48 -08:00

145 lines
5.4 KiB
Python

# scripts/prepare_training_data.py
"""
Extract training pairs from the candidate's cover letter corpus for LoRA fine-tuning.
Outputs a JSONL file where each line is:
{"instruction": "Write a cover letter for the [role] position at [company].",
"output": "<full letter text>"}
Usage:
conda run -n job-seeker python scripts/prepare_training_data.py
conda run -n job-seeker python scripts/prepare_training_data.py --output /path/to/out.jsonl
"""
import argparse
import json
import os
import re
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from scripts.user_profile import UserProfile
_USER_YAML = Path(__file__).parent.parent / "config" / "user.yaml"
_profile = UserProfile(_USER_YAML) if UserProfile.exists(_USER_YAML) else None
_docs_env = os.environ.get("DOCS_DIR", "")
_docs = Path(_docs_env) if _docs_env else (
_profile.docs_dir if _profile else Path.home() / "Documents" / "JobSearch"
)
LETTERS_DIR = _docs
# Use two globs to handle mixed capitalisation ("Cover Letter" vs "cover letter")
LETTER_GLOBS = ["*Cover Letter*.md", "*cover letter*.md"]
DEFAULT_OUTPUT = _docs / "training_data" / "cover_letters.jsonl"
# Patterns that appear in opening sentences to extract role
ROLE_PATTERNS = [
r"apply for (?:the )?(.+?) (?:position|role|opportunity) at",
r"apply for (?:the )?(.+?) (?:at|with)\b",
]
def extract_role_from_text(text: str) -> str:
"""Try to extract the role title from the first ~500 chars of a cover letter."""
# Search the opening of the letter, skipping past any greeting line
search_text = text[:600]
for pattern in ROLE_PATTERNS:
m = re.search(pattern, search_text, re.IGNORECASE)
if m:
role = m.group(1).strip().rstrip(".")
# Filter out noise — role should be ≤6 words
if 1 <= len(role.split()) <= 6:
return role
return ""
def extract_company_from_filename(stem: str) -> str:
"""Extract company name from cover letter filename stem."""
return re.sub(r"\s*Cover Letter.*", "", stem, flags=re.IGNORECASE).strip()
def strip_greeting(text: str) -> str:
"""Remove the 'Dear X,' line so the output is just the letter body + sign-off."""
lines = text.splitlines()
for i, line in enumerate(lines):
if line.strip().lower().startswith("dear "):
# Skip the greeting line and any following blank lines
rest = lines[i + 1:]
while rest and not rest[0].strip():
rest = rest[1:]
return "\n".join(rest).strip()
return text.strip()
def build_records(letters_dir: Path = LETTERS_DIR) -> list[dict]:
"""Parse all cover letters and return list of training records."""
records = []
seen: set[Path] = set()
all_paths = []
for glob in LETTER_GLOBS:
for p in letters_dir.glob(glob):
if p not in seen:
seen.add(p)
all_paths.append(p)
for path in sorted(all_paths):
text = path.read_text(encoding="utf-8", errors="ignore").strip()
if not text or len(text) < 100:
continue
company = extract_company_from_filename(path.stem)
role = extract_role_from_text(text)
body = strip_greeting(text)
if not role:
# Use a generic instruction when role extraction fails
instruction = f"Write a cover letter for a position at {company}."
else:
instruction = f"Write a cover letter for the {role} position at {company}."
records.append({
"instruction": instruction,
"output": body,
"source_file": path.name,
})
return records
def write_jsonl(records: list[dict], output_path: Path) -> None:
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
for record in records:
f.write(json.dumps(record, ensure_ascii=False) + "\n")
def main() -> None:
parser = argparse.ArgumentParser(description="Prepare LoRA training data from cover letter corpus")
parser.add_argument("--output", default=str(DEFAULT_OUTPUT), help="Output JSONL path")
parser.add_argument("--letters-dir", default=str(LETTERS_DIR), help="Directory of cover letters")
parser.add_argument("--stats", action="store_true", help="Print statistics and exit")
args = parser.parse_args()
records = build_records(Path(args.letters_dir))
if args.stats:
print(f"Total letters: {len(records)}")
with_role = sum(1 for r in records if not r["instruction"].startswith("Write a cover letter for a position"))
print(f"Role extracted: {with_role}/{len(records)}")
avg_len = sum(len(r["output"]) for r in records) / max(len(records), 1)
print(f"Avg letter length: {avg_len:.0f} chars")
for r in records:
print(f" {r['source_file']!r:55s}{r['instruction'][:70]}")
return
output_path = Path(args.output)
write_jsonl(records, output_path)
print(f"Wrote {len(records)} training records to {output_path}")
print()
print("Next step for LoRA fine-tuning:")
print(" 1. Download base model: huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct")
print(" 2. Fine-tune with TRL: see docs/plans/lora-finetune.md (to be created)")
print(" 3. Or use HuggingFace Jobs: bash scripts/manage-ui.sh — hugging-face-model-trainer skill")
if __name__ == "__main__":
main()