diff --git a/migrations/009_training_export.sql b/migrations/009_training_export.sql
new file mode 100644
index 0000000..87ce531
--- /dev/null
+++ b/migrations/009_training_export.sql
@@ -0,0 +1 @@
+ALTER TABLE jobs ADD COLUMN excluded_from_training INTEGER DEFAULT 0;
diff --git a/scripts/db.py b/scripts/db.py
index 6ea82a8..6daf69e 100644
--- a/scripts/db.py
+++ b/scripts/db.py
@@ -170,7 +170,8 @@ _MIGRATIONS = [
     ("optimized_resume", "TEXT"),  # ATS-rewritten resume text (paid tier)
     ("ats_gap_report", "TEXT"),  # JSON gap report (free tier)
     ("date_posted", "TEXT"),  # Original posting date from job board (shadow listing detection)
-    ("hired_feedback", "TEXT"),  # JSON: optional post-hire "what helped" response
+    ("hired_feedback", "TEXT"),  # JSON: optional post-hire "what helped" response
+    ("excluded_from_training", "INTEGER DEFAULT 0"),  # opt-out of training export
 ]
 
 
@@ -1142,3 +1143,126 @@ def set_job_resume(db_path: Path = DEFAULT_DB, job_id: int = 0, resume_id: int =
         conn.commit()
     finally:
         conn.close()
+
+# ── Training export helpers ───────────────────────────────────────────────────
+
+def _training_instruction(title, company) -> str:
+    """Build the instruction prompt; a missing title/company becomes 'unknown'.
+
+    Shared by get_db_pairs and get_training_pairs so the instruction previewed
+    in the curation UI is always the one that gets exported.
+    """
+    return (
+        f"Write a cover letter for the {title or 'unknown'} "
+        f"position at {company or 'unknown'}."
+    )
+
+
+def _strip_greeting(text: str) -> str:
+    """Remove the 'Dear X,' greeting line from cover letter text.
+
+    The greeting is the first line that starts with "dear " (case-insensitive)
+    and ends with "," or ":". Everything after it is returned with surrounding
+    whitespace trimmed; any lines before the greeting are dropped too.
+
+    If there is no greeting line, or only whitespace follows it, the original
+    text is returned stripped instead, so a greeting-only letter is not emptied.
+    """
+    lines = text.splitlines()
+    for i, line in enumerate(lines):
+        candidate = line.strip()
+        if candidate.lower().startswith("dear ") and candidate.endswith((",", ":")):
+            # strip() also discards the blank lines that follow the greeting.
+            body = "\n".join(lines[i + 1:]).strip()
+            return body or text.strip()
+    return text.strip()
+
+
+def get_db_pairs(db_path: Path) -> list[dict]:
+    """Return curation metadata for ALL qualifying jobs (included and excluded).
+
+    Used by the curation UI. Includes excluded=True rows so users can restore them.
+
+    Each dict has the keys job_id, title, company, status, instruction,
+    input_preview (first 200 characters of the description) and excluded.
+    Rows are ordered by applied_at, newest first.
+    """
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    try:
+        rows = conn.execute(
+            "SELECT id, title, company, description, status, "
+            "       excluded_from_training "
+            "FROM jobs "
+            "WHERE status IN ('applied','phone_screen','interviewing','offer','hired') "
+            "  AND cover_letter IS NOT NULL AND cover_letter != '' "
+            "ORDER BY applied_at DESC",
+        ).fetchall()
+    finally:
+        conn.close()
+    return [
+        {
+            "job_id": row["id"],
+            "title": row["title"] or "",
+            "company": row["company"] or "",
+            "status": row["status"],
+            "instruction": _training_instruction(row["title"], row["company"]),
+            "input_preview": (row["description"] or "")[:200],
+            # A NULL flag counts as "not excluded", matching get_training_pairs.
+            "excluded": bool(row["excluded_from_training"]),
+        }
+        for row in rows
+    ]
+
+
+def get_training_pairs(db_path: Path) -> list[dict]:
+    """Return Alpaca-format training pairs for non-excluded qualifying jobs.
+
+    Used by the JSONL export endpoint.
+
+    Each dict has the keys instruction, input (job description, "" when NULL),
+    output (cover letter with the greeting line stripped), source (always "db")
+    and job_id. Rows are ordered by applied_at, newest first.
+    """
+    conn = sqlite3.connect(db_path)
+    conn.row_factory = sqlite3.Row
+    try:
+        rows = conn.execute(
+            "SELECT id, title, company, description, cover_letter "
+            "FROM jobs "
+            "WHERE status IN ('applied','phone_screen','interviewing','offer','hired') "
+            "  AND cover_letter IS NOT NULL AND cover_letter != '' "
+            # COALESCE: a NULL flag means "not excluded", exactly as get_db_pairs
+            # reports it; a bare "= 0" would silently drop such rows.
+            "  AND COALESCE(excluded_from_training, 0) = 0 "
+            "ORDER BY applied_at DESC",
+        ).fetchall()
+    finally:
+        conn.close()
+    return [
+        {
+            "instruction": _training_instruction(row["title"], row["company"]),
+            "input": row["description"] or "",
+            "output": _strip_greeting(row["cover_letter"]),
+            "source": "db",
+            "job_id": row["id"],
+        }
+        for row in rows
+    ]
+
+
+def set_training_exclusion(db_path: Path, job_id: int, excluded: bool) -> None:
+    """Set the excluded_from_training flag on a job.
+
+    Stores 1 when *excluded* is truthy and 0 otherwise. If no job has the given
+    id the UPDATE matches no rows and the call is a silent no-op.
+    """
+    conn = sqlite3.connect(db_path)
+    try:
+        conn.execute(
+            "UPDATE jobs SET excluded_from_training = ? WHERE id = ?",
+            (1 if excluded else 0, job_id),
+        )
+        conn.commit()
+    finally:
+        conn.close()
diff --git a/tests/test_training_export.py b/tests/test_training_export.py
new file mode 100644
index 0000000..e32b06e
--- /dev/null
+++ b/tests/test_training_export.py
@@ -0,0 +1,133 @@
+"""Tests for cover letter training export helpers."""
+import json
+import sqlite3
+import pytest
+from pathlib import Path
+
+
+def _make_db(tmp_path: Path) -> Path:
+    """Create an initialised test database under tmp_path and return its path."""
+    from scripts.db import init_db
+    db = tmp_path / "test.db"
+    init_db(db)
+    # excluded_from_training column is added by _migrate_db via _MIGRATIONS — no manual ALTER needed
+    return db
+
+
+def _insert_job(db: Path, *, title="Engineer", company="Acme", status="applied",
+                cover_letter="Dear Hiring Manager,\n\nI am excited.", description="Build stuff.",
+                excluded=0) -> int:
+    """Insert one job row with the given fields and return its row id."""
+    conn = sqlite3.connect(db)
+    cur = conn.execute(
+        "INSERT INTO jobs (title, company, url, source, location, is_remote, salary, "
+        "description, date_found, status, cover_letter, excluded_from_training) "
+        "VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
+        (title, company, f"https://example.com/{title}", "test", "Remote", 1, "",
+         description, "2026-01-01", status, cover_letter, excluded),
+    )
+    conn.commit()
+    job_id = cur.lastrowid
+    conn.close()
+    return job_id
+
+
+def test_get_training_pairs_returns_applied_jobs(tmp_path):
+    from scripts.db import get_training_pairs
+    db = _make_db(tmp_path)
+    _insert_job(db, title="Engineer", company="Acme", status="applied")
+    pairs = get_training_pairs(db)
+    assert len(pairs) == 1
+    assert pairs[0]["source"] == "db"
+    assert pairs[0]["instruction"] == "Write a cover letter for the Engineer position at Acme."
+    assert "job_id" in pairs[0]
+
+
+def test_get_training_pairs_strips_greeting(tmp_path):
+    from scripts.db import get_training_pairs
+    db = _make_db(tmp_path)
+    _insert_job(db, cover_letter="Dear Hiring Manager,\n\nI am excited to apply.\n\nSincerely, Me")
+    pairs = get_training_pairs(db)
+    assert not pairs[0]["output"].startswith("Dear")
+    assert "I am excited" in pairs[0]["output"]
+
+
+def test_get_training_pairs_excludes_non_applied(tmp_path):
+    from scripts.db import get_training_pairs
+    db = _make_db(tmp_path)
+    _insert_job(db, title="PendingJob", status="pending")
+    _insert_job(db, title="ApprovedJob", status="approved")
+    pairs = get_training_pairs(db)
+    assert len(pairs) == 0
+
+
+def test_get_training_pairs_excludes_opted_out(tmp_path):
+    from scripts.db import get_training_pairs
+    db = _make_db(tmp_path)
+    _insert_job(db, excluded=1)
+    pairs = get_training_pairs(db)
+    assert len(pairs) == 0
+
+
+def test_null_exclusion_flag_is_treated_as_included(tmp_path):
+    from scripts.db import get_db_pairs, get_training_pairs
+    db = _make_db(tmp_path)
+    # An explicit NULL flag must export the job, matching excluded=False in the UI list
+    _insert_job(db, excluded=None)
+    assert get_db_pairs(db)[0]["excluded"] is False
+    assert len(get_training_pairs(db)) == 1
+
+
+def test_get_training_pairs_null_description_gives_empty_input(tmp_path):
+    from scripts.db import get_training_pairs
+    db = _make_db(tmp_path)
+    conn = sqlite3.connect(db)
+    conn.execute(
+        "INSERT INTO jobs (title, company, url, source, location, is_remote, salary, "
+        "date_found, status, cover_letter, excluded_from_training) "
+        "VALUES (?,?,?,?,?,?,?,?,?,?,?)",
+        ("Dev", "Corp", "https://x.com/1", "test", "Remote", 1, "",
+         "2026-01-01", "applied", "Great letter body", 0),
+    )
+    conn.commit()
+    conn.close()
+    pairs = get_training_pairs(db)
+    assert pairs[0]["input"] == ""
+
+
+def test_get_db_pairs_includes_excluded_with_flag(tmp_path):
+    from scripts.db import get_db_pairs
+    db = _make_db(tmp_path)
+    _insert_job(db, excluded=0)
+    _insert_job(db, title="Other", excluded=1)
+    pairs = get_db_pairs(db)
+    assert len(pairs) == 2
+    excluded = [p for p in pairs if p["excluded"]]
+    included = [p for p in pairs if not p["excluded"]]
+    assert len(excluded) == 1
+    assert len(included) == 1
+
+
+def test_set_training_exclusion_excludes(tmp_path):
+    from scripts.db import get_training_pairs, set_training_exclusion
+    db = _make_db(tmp_path)
+    job_id = _insert_job(db)
+    assert len(get_training_pairs(db)) == 1
+    set_training_exclusion(db, job_id, excluded=True)
+    assert len(get_training_pairs(db)) == 0
+
+
+def test_set_training_exclusion_restores(tmp_path):
+    from scripts.db import get_training_pairs, set_training_exclusion
+    db = _make_db(tmp_path)
+    job_id = _insert_job(db, excluded=1)
+    assert len(get_training_pairs(db)) == 0
+    set_training_exclusion(db, job_id, excluded=False)
+    assert len(get_training_pairs(db)) == 1
+
+
+def test_strip_greeting_returns_original_when_no_body(tmp_path):
+    from scripts.db import _strip_greeting
+    # A letter that is only a salutation with no body should return the original text
+    result = _strip_greeting("Dear Hiring Manager,")
+    assert result == "Dear Hiring Manager,"