App: Peregrine Company: Circuit Forge LLC Source: github.com/pyr0ball/job-seeker (personal fork, not linked)
134 lines
5 KiB
Python
134 lines
5 KiB
Python
# scripts/prepare_training_data.py
|
|
"""
|
|
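Extract training pairs from Alex's cover letter corpus for LoRA fine-tuning.

Outputs a JSONL file where each line is:
    {"instruction": "Write a cover letter for the [role] position at [company].",
     "output": "<letter text with the greeting removed>",
     "source_file": "<letter filename>"}

When no role can be extracted from the letter, the instruction falls back to
"Write a cover letter for a position at [company]."

Usage:
    conda run -n job-seeker python scripts/prepare_training_data.py
    conda run -n job-seeker python scripts/prepare_training_data.py --output /path/to/out.jsonl
    conda run -n job-seeker python scripts/prepare_training_data.py --letters-dir /path/to/letters --stats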
|
|
"""
|
|
import argparse
|
|
import json
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Directory that is searched for cover letter files by default.
LETTERS_DIR = Path("/Library/Documents/JobSearch")

# Filename globs for letters: one entry per capitalisation of the marker
# ("Cover Letter" and "cover letter").
LETTER_GLOBS = [
    "*Cover Letter*.md",
    "*cover letter*.md",
]

# Default JSONL destination, inside a subfolder of the letters directory.
DEFAULT_OUTPUT = LETTERS_DIR.joinpath("training_data", "cover_letters.jsonl")

# Regexes tried in order against a letter's opening text; group 1 captures
# the role title.
ROLE_PATTERNS = [
    # "apply for [the] <role> position|role|opportunity at ..."
    r"apply for (?:the )?(.+?) (?:position|role|opportunity) at",
    # Looser fallback: "apply for [the] <role> at|with ..."
    r"apply for (?:the )?(.+?) (?:at|with)\b",
]
|
|
|
|
|
|
def extract_role_from_text(
    text: str,
    *,
    max_chars: int = 600,
    patterns: "list[str] | None" = None,
) -> str:
    """Try to extract the role title from the opening of a cover letter.

    Only the first ``max_chars`` characters of ``text`` are searched. Each
    pattern is tried in order (case-insensitively); the first match whose
    captured group is between one and six words long is returned.

    Args:
        text: Full cover letter text.
        max_chars: Size of the leading window that is searched.
        patterns: Regexes whose group 1 captures the role. Defaults to the
            module-level ``ROLE_PATTERNS``.

    Returns:
        The role title with surrounding whitespace and trailing periods
        removed, or ``""`` when no pattern yields a plausible role.
    """
    if patterns is None:
        patterns = ROLE_PATTERNS

    opening = text[:max_chars]

    # The patterns use literal spaces and "." (which never matches a newline),
    # so a hard-wrapped sentence such as "apply for the Senior Data\nEngineer
    # position at" would be missed. Re-flow every paragraph onto a single line,
    # but keep paragraphs on separate lines so a match cannot run across a
    # blank-line boundary.
    paragraphs = (
        " ".join(chunk.split()) for chunk in re.split(r"\n\s*\n", opening)
    )
    search_text = "\n".join(paragraphs)

    for pattern in patterns:
        m = re.search(pattern, search_text, re.IGNORECASE)
        if m is None:
            continue
        role = m.group(1).strip().rstrip(".")
        # Filter out noise: a real role title should be at most 6 words.
        if 1 <= len(role.split()) <= 6:
            return role
    return ""
|
|
|
|
|
|
def extract_company_from_filename(stem: str) -> str:
    """Return the company portion of a cover letter filename stem.

    Everything from the (case-insensitive) phrase "Cover Letter" onwards,
    together with the whitespace directly before it, is removed and the
    remainder is trimmed.
    """
    marker = re.compile(r"\s*Cover Letter.*", re.IGNORECASE)
    company = marker.sub("", stem)
    return company.strip()
|
|
|
|
|
|
def strip_greeting(text: str) -> str:
    """Return the letter without its salutation.

    The first line that starts with "dear " (case-insensitive, ignoring
    leading whitespace) is dropped together with everything before it; the
    remaining lines are joined with "\\n" and trimmed. When no such line
    exists, the whole text is returned trimmed.
    """
    all_lines = text.splitlines()
    greeting_at = next(
        (
            idx
            for idx, candidate in enumerate(all_lines)
            if candidate.strip().lower().startswith("dear ")
        ),
        None,
    )
    if greeting_at is None:
        return text.strip()

    # Trimming the joined remainder also discards any blank lines that
    # directly follow the greeting.
    remainder = all_lines[greeting_at + 1:]
    return "\n".join(remainder).strip()
|
|
|
|
|
|
def build_records(letters_dir: Path = LETTERS_DIR) -> list[dict]:
    """Parse all cover letters in ``letters_dir`` and return training records.

    Files directly inside ``letters_dir`` whose names match any entry of
    ``LETTER_GLOBS`` are read in sorted path order. Matching ignores case so
    that variants such as "Cover letter" or "COVER LETTER" are picked up on
    case-sensitive filesystems too, consistent with the case-insensitive
    handling in ``extract_company_from_filename``.

    Args:
        letters_dir: Directory containing the cover letter files.

    Returns:
        One dict per usable letter with the keys ``"instruction"``,
        ``"output"`` (letter text with the greeting removed) and
        ``"source_file"`` (the file name). Letters shorter than 100
        characters after trimming are skipped. An empty list is returned
        when ``letters_dir`` is not an existing directory.
    """
    # Function-scope import so the module's import block needs no change.
    from fnmatch import fnmatchcase

    letters_dir = Path(letters_dir)
    if not letters_dir.is_dir():
        return []

    # Lower-case both sides and compare with fnmatchcase to get matching that
    # is case-insensitive on every platform.
    wanted = [pattern.lower() for pattern in LETTER_GLOBS]
    letter_paths = sorted(
        entry
        for entry in letters_dir.iterdir()
        # Skip directories that happen to match; read_text would fail on them.
        if entry.is_file()
        and any(fnmatchcase(entry.name.lower(), pattern) for pattern in wanted)
    )

    records = []
    for path in letter_paths:
        # Undecodable bytes are dropped rather than aborting the whole run.
        text = path.read_text(encoding="utf-8", errors="ignore").strip()
        if len(text) < 100:
            continue

        company = extract_company_from_filename(path.stem)
        role = extract_role_from_text(text)
        body = strip_greeting(text)

        if role:
            instruction = f"Write a cover letter for the {role} position at {company}."
        else:
            # Use a generic instruction when role extraction fails
            instruction = f"Write a cover letter for a position at {company}."

        records.append({
            "instruction": instruction,
            "output": body,
            "source_file": path.name,
        })

    return records
|
|
|
|
|
|
def write_jsonl(records: list[dict], output_path: Path) -> None:
    """Write ``records`` to ``output_path`` as UTF-8 JSON Lines.

    Missing parent directories are created first. Each record becomes one
    line of JSON (non-ASCII characters are written as-is) followed by a
    newline; an existing file is overwritten.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(entry, ensure_ascii=False) + "\n" for entry in records
        )
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point.

    Builds the training records from the letters directory, then either
    prints statistics (``--stats``) or writes the records to the JSONL
    output path and prints follow-up instructions.
    """
    cli = argparse.ArgumentParser(description="Prepare LoRA training data from cover letter corpus")
    cli.add_argument("--output", default=str(DEFAULT_OUTPUT), help="Output JSONL path")
    cli.add_argument("--letters-dir", default=str(LETTERS_DIR), help="Directory of cover letters")
    cli.add_argument("--stats", action="store_true", help="Print statistics and exit")
    options = cli.parse_args()

    records = build_records(Path(options.letters_dir))

    if options.stats:
        _print_stats(records)
        return

    destination = Path(options.output)
    write_jsonl(records, destination)

    for line in (
        f"Wrote {len(records)} training records to {destination}",
        "",
        "Next step for LoRA fine-tuning:",
        " 1. Download base model: huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct",
        " 2. Fine-tune with TRL: see docs/plans/lora-finetune.md (to be created)",
        " 3. Or use HuggingFace Jobs: bash scripts/manage-ui.sh — hugging-face-model-trainer skill",
    ):
        print(line)


def _print_stats(records: list[dict]) -> None:
    """Print record count, role-extraction rate, average length and a per-file listing."""
    total = len(records)
    print(f"Total letters: {total}")

    # Records whose instruction is not the generic fallback had a role extracted.
    with_role = sum(
        not rec["instruction"].startswith("Write a cover letter for a position")
        for rec in records
    )
    print(f"Role extracted: {with_role}/{total}")

    # max(..., 1) avoids dividing by zero when there are no records.
    mean_chars = sum(len(rec["output"]) for rec in records) / max(total, 1)
    print(f"Avg letter length: {mean_chars:.0f} chars")

    for rec in records:
        print(f" {rec['source_file']!r:55s} → {rec['instruction'][:70]}")


if __name__ == "__main__":
    main()
|