feat: add training export DB migration and db.py helpers
Add excluded_from_training column to jobs table (migration 009 + _MIGRATIONS entry for existing DBs). Add get_db_pairs(), get_training_pairs(), and set_training_exclusion() helpers for the cover letter training export pipeline. Add test_training_export.py with 9 tests covering all helpers (all passing).
This commit is contained in:
parent
b03add8663
commit
148aaf00cb
3 changed files with 221 additions and 1 deletions
1
migrations/009_training_export.sql
Normal file
1
migrations/009_training_export.sql
Normal file
|
|
@ -0,0 +1 @@
|
|||
-- Add the per-job training-export opt-out flag to jobs; the column defaults to 0.
ALTER TABLE jobs ADD COLUMN excluded_from_training INTEGER DEFAULT 0;
|
||||
|
|
@ -170,7 +170,8 @@ _MIGRATIONS = [
|
|||
("optimized_resume", "TEXT"), # ATS-rewritten resume text (paid tier)
|
||||
("ats_gap_report", "TEXT"), # JSON gap report (free tier)
|
||||
("date_posted", "TEXT"), # Original posting date from job board (shadow listing detection)
|
||||
("hired_feedback", "TEXT"), # JSON: optional post-hire "what helped" response
|
||||
("hired_feedback", "TEXT"), # JSON: optional post-hire "what helped" response
|
||||
("excluded_from_training", "INTEGER DEFAULT 0"), # opt-out of training export
|
||||
]
|
||||
|
||||
|
||||
|
|
@ -1142,3 +1143,99 @@ def set_job_resume(db_path: Path = DEFAULT_DB, job_id: int = 0, resume_id: int =
|
|||
conn.commit()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
# ── Training export helpers ───────────────────────────────────────────────────
|
||||
|
||||
def _strip_greeting(text: str) -> str:
|
||||
"""Remove 'Dear X,' greeting line from cover letter text."""
|
||||
lines = text.splitlines()
|
||||
for i, line in enumerate(lines):
|
||||
stripped_line = line.strip()
|
||||
if stripped_line.lower().startswith("dear ") and stripped_line.endswith((",", ":")):
|
||||
rest = lines[i + 1:]
|
||||
while rest and not rest[0].strip():
|
||||
rest = rest[1:]
|
||||
result = "\n".join(rest).strip()
|
||||
return result if result else text.strip()
|
||||
return text.strip()
|
||||
|
||||
|
||||
def get_db_pairs(db_path: Path) -> list[dict]:
    """List curation metadata for every qualifying job, opted-out ones included.

    A job qualifies when its status is applied, phone_screen, interviewing,
    offer or hired and its cover letter is non-empty.  Rows are ordered by
    ``applied_at`` descending.  Excluded rows are kept (``excluded=True``) so
    the curation UI can offer to restore them.

    Each returned dict has the keys ``job_id``, ``title``, ``company``,
    ``status``, ``instruction``, ``input_preview`` (first 200 characters of
    the description) and ``excluded``.
    """
    query = (
        "SELECT id, title, company, description, status, "
        " excluded_from_training "
        "FROM jobs "
        "WHERE status IN ('applied','phone_screen','interviewing','offer','hired') "
        " AND cover_letter IS NOT NULL AND cover_letter != '' "
        "ORDER BY applied_at DESC"
    )

    connection = sqlite3.connect(db_path)
    connection.row_factory = sqlite3.Row
    try:
        records = connection.execute(query).fetchall()
    finally:
        connection.close()

    pairs = []
    for record in records:
        # NULL title/company/description collapse to "" in the payload.
        title = record["title"] or ""
        company = record["company"] or ""
        description = record["description"] or ""
        instruction = (
            f"Write a cover letter for the {title or 'unknown'} "
            f"position at {company or 'unknown'}."
        )
        pairs.append(
            {
                "job_id": record["id"],
                "title": title,
                "company": company,
                "status": record["status"],
                "instruction": instruction,
                "input_preview": description[:200],
                "excluded": bool(record["excluded_from_training"]),
            }
        )
    return pairs
|
||||
|
||||
|
||||
def get_training_pairs(db_path: Path) -> list[dict]:
    """Return Alpaca-format training pairs for non-excluded qualifying jobs.

    Used by the JSONL export endpoint.

    A job qualifies when its status is applied, phone_screen, interviewing,
    offer or hired, its cover letter is non-empty, and it has not been opted
    out.  A NULL ``excluded_from_training`` counts as "not excluded", matching
    get_db_pairs(), which reports such rows with ``excluded=False``.  Rows are
    ordered by ``applied_at`` descending.

    Each returned dict has the keys ``instruction``, ``input`` (the job
    description, "" when NULL), ``output`` (the cover letter with its greeting
    removed), ``source`` (always "db") and ``job_id``.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        rows = conn.execute(
            "SELECT id, title, company, description, cover_letter "
            "FROM jobs "
            "WHERE status IN ('applied','phone_screen','interviewing','offer','hired') "
            " AND cover_letter IS NOT NULL AND cover_letter != '' "
            # The column is nullable and `NULL = 0` is never true in SQL, so
            # fold NULL into 0 rather than silently dropping those rows.
            " AND COALESCE(excluded_from_training, 0) = 0 "
            "ORDER BY applied_at DESC",
        ).fetchall()
    finally:
        conn.close()
    return [
        {
            "instruction": (
                f"Write a cover letter for the {row['title'] or 'unknown'} "
                f"position at {row['company'] or 'unknown'}."
            ),
            "input": row["description"] or "",
            "output": _strip_greeting(row["cover_letter"]),
            "source": "db",
            "job_id": row["id"],
        }
        for row in rows
    ]
|
||||
|
||||
|
||||
def set_training_exclusion(db_path: Path, job_id: int, excluded: bool) -> None:
    """Store the training-export opt-out flag for a single job.

    Writes 1 to ``excluded_from_training`` when *excluded* is truthy and 0
    otherwise, on the row whose ``id`` equals *job_id*, then commits.  If no
    row matches, the UPDATE changes nothing.
    """
    flag = int(bool(excluded))
    statement = "UPDATE jobs SET excluded_from_training = ? WHERE id = ?"
    connection = sqlite3.connect(db_path)
    try:
        connection.execute(statement, (flag, job_id))
        connection.commit()
    finally:
        connection.close()
|
||||
|
|
|
|||
122
tests/test_training_export.py
Normal file
122
tests/test_training_export.py
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
"""Tests for cover letter training export helpers."""
|
||||
import json
|
||||
import sqlite3
|
||||
import pytest
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _make_db(tmp_path: Path) -> Path:
    """Initialise a fresh test database at ``tmp_path / "test.db"`` and return its path."""
    from scripts.db import init_db

    db_file = tmp_path / "test.db"
    # No manual ALTER needed: _migrate_db adds excluded_from_training via _MIGRATIONS.
    init_db(db_file)
    return db_file
|
||||
|
||||
|
||||
def _insert_job(db: Path, *, title="Engineer", company="Acme", status="applied",
|
||||
cover_letter="Dear Hiring Manager,\n\nI am excited.", description="Build stuff.",
|
||||
excluded=0) -> int:
|
||||
conn = sqlite3.connect(db)
|
||||
cur = conn.execute(
|
||||
"INSERT INTO jobs (title, company, url, source, location, is_remote, salary, "
|
||||
"description, date_found, status, cover_letter, excluded_from_training) "
|
||||
"VALUES (?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||
(title, company, f"https://example.com/{title}", "test", "Remote", 1, "",
|
||||
description, "2026-01-01", status, cover_letter, excluded),
|
||||
)
|
||||
conn.commit()
|
||||
job_id = cur.lastrowid
|
||||
conn.close()
|
||||
return job_id
|
||||
|
||||
|
||||
def test_get_training_pairs_returns_applied_jobs(tmp_path):
    """An applied job with a cover letter is exported as exactly one DB-sourced pair."""
    from scripts.db import get_training_pairs

    db_file = _make_db(tmp_path)
    _insert_job(db_file, title="Engineer", company="Acme", status="applied")

    exported = get_training_pairs(db_file)

    assert len(exported) == 1
    only_pair = exported[0]
    assert only_pair["source"] == "db"
    assert only_pair["instruction"] == "Write a cover letter for the Engineer position at Acme."
    assert "job_id" in only_pair
|
||||
|
||||
|
||||
def test_get_training_pairs_strips_greeting(tmp_path):
    """The exported output drops the 'Dear ...' salutation but keeps the letter body."""
    from scripts.db import get_training_pairs

    db_file = _make_db(tmp_path)
    letter = "Dear Hiring Manager,\n\nI am excited to apply.\n\nSincerely, Me"
    _insert_job(db_file, cover_letter=letter)

    output = get_training_pairs(db_file)[0]["output"]

    assert not output.startswith("Dear")
    assert "I am excited" in output
|
||||
|
||||
|
||||
def test_get_training_pairs_excludes_non_applied(tmp_path):
    """Jobs whose status is pending or approved never reach the export."""
    from scripts.db import get_training_pairs

    db_file = _make_db(tmp_path)
    for job_title, job_status in (("PendingJob", "pending"), ("ApprovedJob", "approved")):
        _insert_job(db_file, title=job_title, status=job_status)

    exported = get_training_pairs(db_file)

    assert len(exported) == 0
|
||||
|
||||
|
||||
def test_get_training_pairs_excludes_opted_out(tmp_path):
    """A job inserted with excluded_from_training=1 is left out of the export."""
    from scripts.db import get_training_pairs

    db_file = _make_db(tmp_path)
    _insert_job(db_file, excluded=1)

    exported = get_training_pairs(db_file)

    assert len(exported) == 0
|
||||
|
||||
|
||||
def test_get_training_pairs_null_description_gives_empty_input(tmp_path):
    """A job stored without a description exports with an empty-string input."""
    from scripts.db import get_training_pairs

    db_file = _make_db(tmp_path)

    # The description column is deliberately left out of the INSERT.
    statement = (
        "INSERT INTO jobs (title, company, url, source, location, is_remote, salary, "
        "date_found, status, cover_letter, excluded_from_training) "
        "VALUES (?,?,?,?,?,?,?,?,?,?,?)"
    )
    values = (
        "Dev", "Corp", "https://x.com/1", "test", "Remote", 1, "",
        "2026-01-01", "applied", "Great letter body", 0,
    )
    connection = sqlite3.connect(db_file)
    connection.execute(statement, values)
    connection.commit()
    connection.close()

    exported = get_training_pairs(db_file)

    assert exported[0]["input"] == ""
|
||||
|
||||
|
||||
def test_get_db_pairs_includes_excluded_with_flag(tmp_path):
    """The curation listing returns both included and excluded jobs, flagged accordingly."""
    from scripts.db import get_db_pairs

    db_file = _make_db(tmp_path)
    _insert_job(db_file, excluded=0)
    _insert_job(db_file, title="Other", excluded=1)

    listing = get_db_pairs(db_file)

    assert len(listing) == 2
    opted_out_count = sum(1 for entry in listing if entry["excluded"])
    kept_count = sum(1 for entry in listing if not entry["excluded"])
    assert opted_out_count == 1
    assert kept_count == 1
|
||||
|
||||
|
||||
def test_set_training_exclusion_excludes(tmp_path):
    """Flagging a job as excluded removes it from the training export."""
    from scripts.db import get_training_pairs, set_training_exclusion

    db_file = _make_db(tmp_path)
    new_id = _insert_job(db_file)

    before = get_training_pairs(db_file)
    assert len(before) == 1

    set_training_exclusion(db_file, new_id, excluded=True)

    after = get_training_pairs(db_file)
    assert len(after) == 0
|
||||
|
||||
|
||||
def test_set_training_exclusion_restores(tmp_path):
    """Clearing the exclusion flag brings a job back into the training export."""
    from scripts.db import get_training_pairs, set_training_exclusion

    db_file = _make_db(tmp_path)
    new_id = _insert_job(db_file, excluded=1)

    before = get_training_pairs(db_file)
    assert len(before) == 0

    set_training_exclusion(db_file, new_id, excluded=False)

    after = get_training_pairs(db_file)
    assert len(after) == 1
|
||||
|
||||
|
||||
def test_strip_greeting_returns_original_when_no_body(tmp_path):
    """A letter consisting only of a salutation comes back unchanged."""
    from scripts.db import _strip_greeting

    salutation_only = "Dear Hiring Manager,"

    # With no body after the greeting, the original text is returned.
    assert _strip_greeting(salutation_only) == "Dear Hiring Manager,"
|
||||
Loading…
Reference in a new issue