chore: seed Peregrine from personal job-seeker (pre-generalization)
App: Peregrine Company: Circuit Forge LLC Source: github.com/pyr0ball/job-seeker (personal fork, not linked)
This commit is contained in:
commit
1dc1ca89d7
61 changed files with 11370 additions and 0 deletions
20
.gitignore
vendored
Normal file
20
.gitignore
vendored
Normal file
|
|
@ -0,0 +1,20 @@
|
||||||
|
.env
|
||||||
|
config/notion.yaml
|
||||||
|
config/tokens.yaml
|
||||||
|
config/email.yaml
|
||||||
|
config/adzuna.yaml
|
||||||
|
config/craigslist.yaml
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.pytest_cache/
|
||||||
|
output/
|
||||||
|
aihawk/
|
||||||
|
resume_matcher/
|
||||||
|
staging.db
|
||||||
|
.streamlit.log
|
||||||
|
.streamlit.pid
|
||||||
|
.coverage
|
||||||
|
log/
|
||||||
|
unsloth_compiled_cache/
|
||||||
|
data/survey_screenshots/*
|
||||||
|
!data/survey_screenshots/.gitkeep
|
||||||
7
app/.streamlit/config.toml
Normal file
7
app/.streamlit/config.toml
Normal file
|
|
@ -0,0 +1,7 @@
|
||||||
|
[theme]
|
||||||
|
base = "dark"
|
||||||
|
primaryColor = "#2DD4BF"
|
||||||
|
backgroundColor = "#0F172A"
|
||||||
|
secondaryBackgroundColor = "#1E293B"
|
||||||
|
textColor = "#F1F5F9"
|
||||||
|
font = "sans serif"
|
||||||
475
app/Home.py
Normal file
475
app/Home.py
Normal file
|
|
@ -0,0 +1,475 @@
|
||||||
|
# app/Home.py
|
||||||
|
"""
|
||||||
|
Job Seeker Dashboard — Home page.
|
||||||
|
Shows counts, Run Discovery button, and Sync to Notion button.
|
||||||
|
"""
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from scripts.db import DEFAULT_DB, init_db, get_job_counts, purge_jobs, purge_email_data, \
|
||||||
|
purge_non_remote, archive_jobs, kill_stuck_tasks, get_task_for_job, get_active_tasks, \
|
||||||
|
insert_job, get_existing_urls
|
||||||
|
from scripts.task_runner import submit_task
|
||||||
|
|
||||||
|
init_db(DEFAULT_DB)
|
||||||
|
|
||||||
|
|
||||||
|
def _dismissible(key: str, status: str, msg: str) -> None:
    """Show a success/error banner with an inline ✕ button that hides it.

    ``key`` must be unique per task result so each message's dismissed
    state is tracked independently in ``st.session_state``.
    """
    dismissed_flag = f"dismissed_{key}"
    if st.session_state.get(dismissed_flag):
        return
    body_col, close_col = st.columns([10, 1])
    with body_col:
        # Pick the renderer up front instead of branching twice.
        render = st.success if status == "completed" else st.error
        render(msg)
    with close_col:
        st.write("")  # vertical spacer so the ✕ lines up with the banner
        if st.button("✕", key=f"dismiss_{key}", help="Dismiss"):
            st.session_state[dismissed_flag] = True
            st.rerun()
|
|
||||||
|
|
||||||
|
def _queue_url_imports(db_path: Path, urls: list) -> int:
    """Insert each URL as a pending manual job and queue a scrape_url task.

    Each URL is canonicalized first; anything that is not http(s), already
    present in the database, or repeated within *urls* itself is skipped.

    Returns the count of newly queued jobs.
    """
    from datetime import datetime
    from scripts.scrape_url import canonicalize_url

    # Copy into a set so in-batch duplicates are also deduplicated — the
    # original only checked against URLs already in the DB, so pasting the
    # same link twice queued it twice.
    seen = set(get_existing_urls(db_path))
    queued = 0
    for raw in urls:
        url = canonicalize_url(raw.strip())
        if not url.startswith("http"):
            continue
        if url in seen:
            continue
        seen.add(url)
        job_id = insert_job(db_path, {
            "title": "Importing…",  # placeholder until the scraper fills it in
            "company": "",
            "url": url,
            "source": "manual",
            "location": "",
            "description": "",
            "date_found": datetime.now().isoformat()[:10],
        })
        if job_id:
            submit_task(db_path, "scrape_url", job_id)
            queued += 1
    return queued
|
|
||||||
|
|
||||||
|
st.title("🔍 Alex's Job Search")
|
||||||
|
st.caption("Discover → Review → Sync to Notion")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
|
||||||
|
@st.fragment(run_every=10)
def _live_counts():
    """Headline metrics for each pipeline stage; auto-refreshes every 10 s."""
    counts = get_job_counts(DEFAULT_DB)
    # (display label, status key) pairs, one per column, left to right.
    stages = [
        ("Pending Review", "pending"),
        ("Approved", "approved"),
        ("Applied", "applied"),
        ("Synced to Notion", "synced"),
        ("Rejected", "rejected"),
    ]
    for column, (label, status) in zip(st.columns(5), stages):
        column.metric(label, counts.get(status, 0))


_live_counts()
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
left, enrich_col, mid, right = st.columns(4)

with left:
    st.subheader("Find New Jobs")
    st.caption("Scrapes all configured boards and adds new listings to your review queue.")

    # job_id 0 is the sentinel for the singleton discovery task — it is not
    # tied to a real job row.
    _disc_task = get_task_for_job(DEFAULT_DB, "discovery", 0)
    _disc_running = _disc_task and _disc_task["status"] in ("queued", "running")

    if st.button("🚀 Run Discovery", use_container_width=True, type="primary",
                 disabled=bool(_disc_running)):
        submit_task(DEFAULT_DB, "discovery", 0)
        st.rerun()

    if _disc_running:
        # Fragment polls every 4 s while active; once the task leaves
        # queued/running it triggers a full rerun to show the result banner.
        @st.fragment(run_every=4)
        def _disc_status():
            t = get_task_for_job(DEFAULT_DB, "discovery", 0)
            if t and t["status"] in ("queued", "running"):
                lbl = "Queued…" if t["status"] == "queued" else "Scraping job boards… this may take a minute"
                st.info(f"⏳ {lbl}")
            else:
                st.rerun()
        _disc_status()
    elif _disc_task and _disc_task["status"] == "completed":
        # NOTE(review): the success summary appears to be stored in the task's
        # 'error' column — confirm against scripts.task_runner.
        _dismissible(f"disc_{_disc_task['id']}", "completed",
                     f"✅ Discovery complete — {_disc_task.get('error', '')}. Head to Job Review.")
    elif _disc_task and _disc_task["status"] == "failed":
        _dismissible(f"disc_{_disc_task['id']}", "failed",
                     f"Discovery failed: {_disc_task.get('error', '')}")

with enrich_col:
    st.subheader("Enrich Descriptions")
    st.caption("Re-fetch missing descriptions for any listing (LinkedIn, Indeed, Glassdoor, Adzuna, The Ladders, generic).")

    # Same singleton-task pattern as discovery above (job_id 0).
    _enrich_task = get_task_for_job(DEFAULT_DB, "enrich_descriptions", 0)
    _enrich_running = _enrich_task and _enrich_task["status"] in ("queued", "running")

    if st.button("🔍 Fill Missing Descriptions", use_container_width=True, type="primary",
                 disabled=bool(_enrich_running)):
        submit_task(DEFAULT_DB, "enrich_descriptions", 0)
        st.rerun()

    if _enrich_running:
        @st.fragment(run_every=4)
        def _enrich_status():
            t = get_task_for_job(DEFAULT_DB, "enrich_descriptions", 0)
            if t and t["status"] in ("queued", "running"):
                st.info("⏳ Fetching descriptions…")
            else:
                st.rerun()
        _enrich_status()
    elif _enrich_task and _enrich_task["status"] == "completed":
        _dismissible(f"enrich_{_enrich_task['id']}", "completed",
                     f"✅ {_enrich_task.get('error', 'Done')}")
    elif _enrich_task and _enrich_task["status"] == "failed":
        _dismissible(f"enrich_{_enrich_task['id']}", "failed",
                     f"Enrich failed: {_enrich_task.get('error', '')}")
|
||||||
|
|
||||||
|
with mid:
    # Plain import replaces the original
    # __import__("scripts.db", fromlist=["get_jobs_by_status"]) hack —
    # same module, vastly more readable, and importable by linters.
    from scripts.db import get_jobs_by_status

    # Count pending jobs that have a description but no match score yet.
    unscored = sum(
        1
        for j in get_jobs_by_status(DEFAULT_DB, "pending")
        if j.get("match_score") is None and j.get("description")
    )
    st.subheader("Score Listings")
    st.caption(f"Run TF-IDF match scoring against Alex's resume. {unscored} pending job{'s' if unscored != 1 else ''} unscored.")
    if st.button("📊 Score All Unscored Jobs", use_container_width=True, type="primary",
                 disabled=unscored == 0):
        with st.spinner("Scoring…"):
            # Run the scorer inside the project's conda env so its
            # dependencies resolve, from the repo root so relative paths work.
            result = subprocess.run(
                ["conda", "run", "-n", "job-seeker", "python", "scripts/match.py"],
                capture_output=True, text=True,
                cwd=str(Path(__file__).parent.parent),
            )
        if result.returncode == 0:
            st.success("Scoring complete!")
            st.code(result.stdout)
        else:
            st.error("Scoring failed.")
            st.code(result.stderr)
        st.rerun()

with right:
    approved_count = get_job_counts(DEFAULT_DB).get("approved", 0)
    st.subheader("Send to Notion")
    st.caption("Push all approved jobs to your Notion tracking database.")
    if approved_count == 0:
        st.info("No approved jobs yet. Review and approve some listings first.")
    else:
        if st.button(
            f"📤 Sync {approved_count} approved job{'s' if approved_count != 1 else ''} → Notion",
            use_container_width=True, type="primary",
        ):
            with st.spinner("Syncing to Notion…"):
                # Imported lazily so the Notion client is only loaded on demand.
                from scripts.sync import sync_to_notion
                count = sync_to_notion(DEFAULT_DB)
            st.success(f"Synced {count} job{'s' if count != 1 else ''} to Notion!")
            st.rerun()
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Email Sync ────────────────────────────────────────────────────────────────
email_left, email_right = st.columns([3, 1])

with email_left:
    st.subheader("Sync Emails")
    st.caption("Pull inbound recruiter emails and match them to active applications. "
               "New recruiter outreach is added to your Job Review queue.")

with email_right:
    # job_id 0 marks the singleton email_sync task (same convention as
    # the discovery/enrich tasks above).
    _email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0)
    _email_running = _email_task and _email_task["status"] in ("queued", "running")

    if st.button("📧 Sync Emails", use_container_width=True, type="primary",
                 disabled=bool(_email_running)):
        submit_task(DEFAULT_DB, "email_sync", 0)
        st.rerun()

    if _email_running:
        # Poll every 4 s while the task is active; rerun when it finishes.
        @st.fragment(run_every=4)
        def _email_status():
            t = get_task_for_job(DEFAULT_DB, "email_sync", 0)
            if t and t["status"] in ("queued", "running"):
                st.info("⏳ Syncing emails…")
            else:
                st.rerun()
        _email_status()
    elif _email_task and _email_task["status"] == "completed":
        # NOTE(review): success summary seems to live in the 'error' column —
        # confirm against the task runner.
        _dismissible(f"email_{_email_task['id']}", "completed",
                     f"✅ {_email_task.get('error', 'Done')}")
    elif _email_task and _email_task["status"] == "failed":
        _dismissible(f"email_{_email_task['id']}", "failed",
                     f"Sync failed: {_email_task.get('error', '')}")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Add Jobs by URL ───────────────────────────────────────────────────────────
add_left, _add_right = st.columns([3, 1])
with add_left:
    st.subheader("Add Jobs by URL")
    st.caption("Paste job listing URLs to import and scrape in the background. "
               "Supports LinkedIn, Indeed, Glassdoor, and most job boards.")

url_tab, csv_tab = st.tabs(["Paste URLs", "Upload CSV"])

with url_tab:
    url_text = st.text_area(
        "urls",
        placeholder="https://www.linkedin.com/jobs/view/1234567/\nhttps://www.indeed.com/viewjob?jk=abc",
        height=100,
        label_visibility="collapsed",
    )
    if st.button("📥 Add Jobs", key="add_urls_btn", use_container_width=True,
                 disabled=not (url_text or "").strip()):
        # One URL per line; anything that doesn't start with http is ignored.
        _urls = [u.strip() for u in url_text.strip().splitlines() if u.strip().startswith("http")]
        if _urls:
            _n = _queue_url_imports(DEFAULT_DB, _urls)
            if _n:
                st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import. Check Job Review shortly.")
            else:
                st.info("All URLs already in the database.")
            st.rerun()

with csv_tab:
    csv_file = st.file_uploader("CSV with a URL column", type=["csv"],
                                label_visibility="collapsed")
    if csv_file:
        import csv as _csv
        import io as _io
        reader = _csv.DictReader(_io.StringIO(csv_file.read().decode("utf-8", errors="replace")))
        _csv_urls = []
        for row in reader:
            # Take the first http-looking value in each row, regardless of
            # which column it lives in.
            for val in row.values():
                if val and val.strip().startswith("http"):
                    _csv_urls.append(val.strip())
                    break
        if _csv_urls:
            st.caption(f"Found {len(_csv_urls)} URL(s) in CSV.")
            if st.button("📥 Import CSV Jobs", key="add_csv_btn", use_container_width=True):
                _n = _queue_url_imports(DEFAULT_DB, _csv_urls)
                st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import.")
                st.rerun()
        else:
            st.warning("No URLs found — CSV must have a column whose values start with http.")
|
||||||
|
|
||||||
|
|
||||||
|
@st.fragment(run_every=3)
def _scrape_status():
    """Show one status line per recent (last 5 min) scrape_url task.

    Polls every 3 s so URL-import progress updates live without a full
    page reload.
    """
    import sqlite3 as _sq
    conn = _sq.connect(DEFAULT_DB)
    # try/finally ensures the connection is closed even if the query raises
    # (the original leaked the connection on error).
    try:
        conn.row_factory = _sq.Row
        rows = conn.execute(
            """SELECT bt.status, bt.error, j.title, j.company, j.url
                   FROM background_tasks bt
                   JOIN jobs j ON j.id = bt.job_id
                   WHERE bt.task_type = 'scrape_url'
                   AND bt.updated_at >= datetime('now', '-5 minutes')
                   ORDER BY bt.updated_at DESC LIMIT 20"""
        ).fetchall()
    finally:
        conn.close()
    if not rows:
        return
    st.caption("Recent URL imports:")
    for r in rows:
        if r["status"] == "running":
            st.info(f"⏳ Scraping {r['url']}")
        elif r["status"] == "completed":
            label = r["title"] + (f" @ {r['company']}" if r["company"] else "")
            st.success(f"✅ {label}")
        elif r["status"] == "failed":
            st.error(f"❌ {r['url']} — {r['error'] or 'scrape failed'}")


_scrape_status()
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Danger zone: purge + re-scrape ────────────────────────────────────────────
# Every destructive action uses the same two-step pattern: the first click
# arms st.session_state["confirm_purge"] with an action tag; the matching
# confirm block then renders Yes/Cancel buttons. Only one tag can be armed
# at a time, so only one confirm UI ever shows.
with st.expander("⚠️ Danger Zone", expanded=False):
    st.caption(
        "**Purge** permanently deletes jobs from the local database. "
        "Applied and synced jobs are never touched."
    )

    purge_col, rescrape_col, email_col, tasks_col = st.columns(4)

    with purge_col:
        st.markdown("**Purge pending & rejected**")
        st.caption("Removes all _pending_ and _rejected_ listings so the next discovery starts fresh.")
        if st.button("🗑 Purge Pending + Rejected", use_container_width=True):
            st.session_state["confirm_purge"] = "partial"

        if st.session_state.get("confirm_purge") == "partial":
            st.warning("Are you sure? This cannot be undone.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, purge", type="primary", use_container_width=True):
                deleted = purge_jobs(DEFAULT_DB, statuses=["pending", "rejected"])
                st.success(f"Purged {deleted} jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()

    with email_col:
        st.markdown("**Purge email data**")
        st.caption("Clears all email thread logs and email-sourced pending jobs so the next sync starts fresh.")
        if st.button("📧 Purge Email Data", use_container_width=True):
            st.session_state["confirm_purge"] = "email"

        if st.session_state.get("confirm_purge") == "email":
            st.warning("This deletes all email contacts and email-sourced jobs. Cannot be undone.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, purge emails", type="primary", use_container_width=True):
                contacts, jobs = purge_email_data(DEFAULT_DB)
                st.success(f"Purged {contacts} email contacts, {jobs} email jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            # NOTE(review): the trailing space in "Cancel " appears deliberate —
            # it keeps Streamlit's label-derived widget key distinct from the
            # other Cancel buttons. Confirm before "fixing".
            if c2.button("Cancel ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()

    with tasks_col:
        # Kill-switch is immediate (no confirm step) — failing tasks is
        # recoverable, unlike the purges.
        _active = get_active_tasks(DEFAULT_DB)
        st.markdown("**Kill stuck tasks**")
        st.caption(f"Force-fail all queued/running background tasks. Currently **{len(_active)}** active.")
        if st.button("⏹ Kill All Tasks", use_container_width=True, disabled=len(_active) == 0):
            killed = kill_stuck_tasks(DEFAULT_DB)
            st.success(f"Killed {killed} task(s).")
            st.rerun()

    with rescrape_col:
        st.markdown("**Purge all & re-scrape**")
        st.caption("Wipes _all_ non-applied, non-synced jobs then immediately runs a fresh discovery.")
        if st.button("🔄 Purge All + Re-scrape", use_container_width=True):
            st.session_state["confirm_purge"] = "full"

        if st.session_state.get("confirm_purge") == "full":
            st.warning("This will delete ALL pending, approved, and rejected jobs, then re-scrape. Applied and synced records are kept.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, wipe + scrape", type="primary", use_container_width=True):
                purge_jobs(DEFAULT_DB, statuses=["pending", "approved", "rejected"])
                submit_task(DEFAULT_DB, "discovery", 0)
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()

    st.divider()

    # Second row of narrower purge actions.
    pending_col, nonremote_col, approved_col, _ = st.columns(4)

    with pending_col:
        st.markdown("**Purge pending review**")
        st.caption("Removes only _pending_ listings, keeping your rejected history intact.")
        if st.button("🗑 Purge Pending Only", use_container_width=True):
            st.session_state["confirm_purge"] = "pending_only"

        if st.session_state.get("confirm_purge") == "pending_only":
            st.warning("Deletes all pending jobs. Rejected jobs are kept. Cannot be undone.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, purge pending", type="primary", use_container_width=True):
                deleted = purge_jobs(DEFAULT_DB, statuses=["pending"])
                st.success(f"Purged {deleted} pending jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()

    with nonremote_col:
        st.markdown("**Purge non-remote**")
        st.caption("Removes pending/approved/rejected jobs where remote is not set. Keeps anything already in the pipeline.")
        if st.button("🏢 Purge On-site Jobs", use_container_width=True):
            st.session_state["confirm_purge"] = "non_remote"

        if st.session_state.get("confirm_purge") == "non_remote":
            st.warning("Deletes all non-remote jobs not yet applied to. Cannot be undone.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, purge on-site", type="primary", use_container_width=True):
                deleted = purge_non_remote(DEFAULT_DB)
                st.success(f"Purged {deleted} non-remote jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()

    with approved_col:
        st.markdown("**Purge approved (unapplied)**")
        st.caption("Removes _approved_ jobs you haven't applied to yet — e.g. to reset after a review pass.")
        if st.button("🗑 Purge Approved", use_container_width=True):
            st.session_state["confirm_purge"] = "approved_only"

        if st.session_state.get("confirm_purge") == "approved_only":
            st.warning("Deletes all approved-but-not-applied jobs. Cannot be undone.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, purge approved", type="primary", use_container_width=True):
                deleted = purge_jobs(DEFAULT_DB, statuses=["approved"])
                st.success(f"Purged {deleted} approved jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()

    st.divider()

    # Third row: non-destructive archive actions (rows are kept for dedup).
    archive_col1, archive_col2, _, _ = st.columns(4)

    with archive_col1:
        st.markdown("**Archive remaining**")
        st.caption(
            "Move all _pending_ and _rejected_ jobs to archived status. "
            "Archived jobs stay in the DB for dedup — they just won't appear in Job Review."
        )
        if st.button("📦 Archive Pending + Rejected", use_container_width=True):
            st.session_state["confirm_purge"] = "archive_remaining"

        if st.session_state.get("confirm_purge") == "archive_remaining":
            st.info("Jobs will be archived (not deleted) — URLs are kept for dedup.")
            c1, c2 = st.columns(2)
            if c1.button("Yes, archive", type="primary", use_container_width=True):
                archived = archive_jobs(DEFAULT_DB, statuses=["pending", "rejected"])
                st.success(f"Archived {archived} jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()

    with archive_col2:
        st.markdown("**Archive approved (unapplied)**")
        st.caption("Archive _approved_ listings you decided to skip — keeps history without cluttering the apply queue.")
        if st.button("📦 Archive Approved", use_container_width=True):
            st.session_state["confirm_purge"] = "archive_approved"

        if st.session_state.get("confirm_purge") == "archive_approved":
            st.info("Approved jobs will be archived (not deleted).")
            c1, c2 = st.columns(2)
            if c1.button("Yes, archive approved", type="primary", use_container_width=True):
                archived = archive_jobs(DEFAULT_DB, statuses=["approved"])
                st.success(f"Archived {archived} approved jobs.")
                st.session_state.pop("confirm_purge", None)
                st.rerun()
            if c2.button("Cancel ", use_container_width=True):
                st.session_state.pop("confirm_purge", None)
                st.rerun()
|
||||||
119
app/app.py
Normal file
119
app/app.py
Normal file
|
|
@ -0,0 +1,119 @@
|
||||||
|
# app/app.py
|
||||||
|
"""
|
||||||
|
Streamlit entry point — uses st.navigation() to control the sidebar.
|
||||||
|
Main workflow pages are listed at the top; Settings is separated into
|
||||||
|
a "System" section so it doesn't crowd the navigation.
|
||||||
|
|
||||||
|
Run: streamlit run app/app.py
|
||||||
|
bash scripts/manage-ui.sh start
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
from scripts.db import DEFAULT_DB, init_db, get_active_tasks
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
st.set_page_config(
|
||||||
|
page_title="Job Seeker",
|
||||||
|
page_icon="💼",
|
||||||
|
layout="wide",
|
||||||
|
)
|
||||||
|
|
||||||
|
init_db(DEFAULT_DB)
|
||||||
|
|
||||||
|
# ── Startup cleanup — runs once per server process via cache_resource ──────────
@st.cache_resource
def _startup() -> None:
    """Runs exactly once per server lifetime (st.cache_resource).

    1. Marks zombie tasks as failed.
    2. Auto-queues re-runs for any research generated without SearXNG data,
       if SearXNG is now reachable.
    """
    conn = sqlite3.connect(DEFAULT_DB)
    # (1) Any task still queued/running at startup was orphaned by the
    # previous server process — fail it so the UI doesn't show it as live.
    conn.execute(
        "UPDATE background_tasks SET status='failed', error='Interrupted by server restart',"
        " finished_at=datetime('now') WHERE status IN ('queued','running')"
    )
    conn.commit()

    # (2) Auto-recovery: re-run LLM-only research when SearXNG is available.
    # NOTE(review): the SearXNG URL/port is hard-coded here — consider config.
    try:
        import requests as _req
        if _req.get("http://localhost:8888/", timeout=3).status_code == 200:
            from scripts.task_runner import submit_task
            _ACTIVE_STAGES = ("phone_screen", "interviewing", "offer", "hired")
            # .format() only injects the "?,?,?,?" placeholder list; the
            # stage values themselves are bound as parameters.
            rows = conn.execute(
                """SELECT cr.job_id FROM company_research cr
                   JOIN jobs j ON j.id = cr.job_id
                   WHERE (cr.scrape_used IS NULL OR cr.scrape_used = 0)
                   AND j.status IN ({})""".format(",".join("?" * len(_ACTIVE_STAGES))),
                _ACTIVE_STAGES,
            ).fetchall()
            for (job_id,) in rows:
                submit_task(str(DEFAULT_DB), "company_research", job_id)
    except Exception:
        pass  # never block startup

    conn.close()


_startup()
|
||||||
|
|
||||||
|
# ── Navigation ─────────────────────────────────────────────────────────────────
# st.navigation() must be called before any sidebar writes so it can establish
# the navigation structure first; sidebar additions come after.
# An empty-string section key renders the pages without a group heading.
pages = {
    "": [
        st.Page("Home.py", title="Home", icon="🏠"),
        st.Page("pages/1_Job_Review.py", title="Job Review", icon="📋"),
        st.Page("pages/4_Apply.py", title="Apply Workspace", icon="🚀"),
        st.Page("pages/5_Interviews.py", title="Interviews", icon="🎯"),
        st.Page("pages/6_Interview_Prep.py", title="Interview Prep", icon="📞"),
        st.Page("pages/7_Survey.py", title="Survey Assistant", icon="📋"),
    ],
    "System": [
        st.Page("pages/2_Settings.py", title="Settings", icon="⚙️"),
    ],
}

pg = st.navigation(pages)
|
||||||
|
|
||||||
|
# ── Background task sidebar indicator ─────────────────────────────────────────
|
||||||
|
# Fragment polls every 3s so stage labels update live without a full page reload.
|
||||||
|
# The sidebar context WRAPS the fragment call — do not write to st.sidebar inside it.
|
||||||
|
@st.fragment(run_every=3)
def _task_indicator():
    """List active background tasks in the sidebar; refreshes every 3 s.

    The sidebar context WRAPS the fragment call — do not write to
    st.sidebar inside it.
    """
    tasks = get_active_tasks(DEFAULT_DB)
    if not tasks:
        return
    st.divider()
    st.markdown(f"**⏳ {len(tasks)} task(s) running**")
    # task_type → human label; unknown types fall back to title-cased words.
    friendly = {
        "cover_letter": "Cover letter",
        "company_research": "Research",
        "email_sync": "Email sync",
        "discovery": "Discovery",
        "enrich_descriptions": "Enriching",
        "scrape_url": "Scraping URL",
        "enrich_craigslist": "Enriching listing",
    }
    for task in tasks:
        icon = "⏳" if task["status"] == "running" else "🕐"
        kind = task["task_type"]
        label = friendly.get(kind, kind.replace("_", " ").title())
        stage = task.get("stage") or ""
        if stage:
            detail = f" · {stage}"
        elif task.get("company"):
            detail = f" — {task.get('company')}"
        else:
            detail = ""
        st.caption(f"{icon} {label}{detail}")


with st.sidebar:
    _task_indicator()


pg.run()
|
||||||
203
app/pages/1_Job_Review.py
Normal file
203
app/pages/1_Job_Review.py
Normal file
|
|
@ -0,0 +1,203 @@
|
||||||
|
# app/pages/1_Job_Review.py
|
||||||
|
"""
|
||||||
|
Job Review — browse listings, approve/reject inline, generate cover letters,
|
||||||
|
and mark approved jobs as applied.
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
from scripts.db import (
|
||||||
|
DEFAULT_DB, init_db, get_jobs_by_status, update_job_status,
|
||||||
|
update_cover_letter, mark_applied, get_email_leads,
|
||||||
|
)
|
||||||
|
|
||||||
|
st.title("📋 Job Review")
|
||||||
|
|
||||||
|
init_db(DEFAULT_DB)
|
||||||
|
|
||||||
|
_email_leads = get_email_leads(DEFAULT_DB)
|
||||||
|
|
||||||
|
# ── Sidebar filters ────────────────────────────────────────────────────────────
# (label → (sort key, reverse?)) dispatch table replaces an if/elif chain;
# dict insertion order doubles as the selectbox option order.
_SORT_SPECS = {
    "Date Found (newest)": (lambda j: j.get("date_found") or "", True),
    "Date Found (oldest)": (lambda j: j.get("date_found") or "", False),
    "Match Score (high→low)": (lambda j: j.get("match_score") or 0, True),
    "Match Score (low→high)": (lambda j: j.get("match_score") or 0, False),
    "Company A–Z": (lambda j: (j.get("company") or "").lower(), False),
    "Title A–Z": (lambda j: (j.get("title") or "").lower(), False),
}

with st.sidebar:
    st.header("Filters")
    show_status = st.selectbox(
        "Show",
        ["pending", "approved", "applied", "rejected", "synced"],
        index=0,
    )
    remote_only = st.checkbox("Remote only", value=False)
    min_score = st.slider("Min match score", 0, 100, 0)

    st.header("Sort")
    sort_by = st.selectbox(
        "Sort by",
        list(_SORT_SPECS),
        index=0,
    )

jobs = get_jobs_by_status(DEFAULT_DB, show_status)

if remote_only:
    jobs = [j for j in jobs if j.get("is_remote")]
if min_score > 0:
    jobs = [j for j in jobs if (j.get("match_score") or 0) >= min_score]

# Apply the selected ordering.
_sort_key, _sort_reverse = _SORT_SPECS[sort_by]
jobs = sorted(jobs, key=_sort_key, reverse=_sort_reverse)

if not jobs:
    st.info(f"No {show_status} jobs matching your filters.")
    st.stop()

st.caption(f"Showing {len(jobs)} {show_status} job{'s' if len(jobs) != 1 else ''}")
st.divider()
|
||||||
|
|
||||||
|
# Email leads are shown as their own section above the main pending list.
if show_status == "pending" and _email_leads:
    st.subheader(f"📧 Email Leads ({len(_email_leads)})")
    st.caption(
        "Inbound recruiter emails not yet matched to a scraped listing. "
        "Approve to add to Job Review; Reject to dismiss."
    )
    for lead in _email_leads:
        lead_id = lead["id"]
        with st.container(border=True):
            left_l, right_l = st.columns([7, 3])
            with left_l:
                st.markdown(f"**{lead['title']}** — {lead['company']}")
                badge_cols = st.columns(4)
                badge_cols[0].caption("📧 Email Lead")
                badge_cols[1].caption(f"📅 {lead.get('date_found', '')}")
                if lead.get("description"):
                    # Only the first 500 chars — enough context to triage.
                    with st.expander("📄 Email excerpt", expanded=False):
                        st.text(lead["description"][:500])
            with right_l:
                if st.button("✅ Approve", key=f"el_approve_{lead_id}",
                             type="primary", use_container_width=True):
                    update_job_status(DEFAULT_DB, [lead_id], "approved")
                    st.rerun()
                if st.button("❌ Reject", key=f"el_reject_{lead_id}",
                             use_container_width=True):
                    update_job_status(DEFAULT_DB, [lead_id], "rejected")
                    st.rerun()
    st.divider()

# Filter email leads out of the main pending list (already shown above)
if show_status == "pending":
    jobs = [j for j in jobs if j.get("source") != "email"]
|
||||||
|
|
||||||
|
# ── Job cards ──────────────────────────────────────────────────────────────────
|
||||||
|
for job in jobs:
|
||||||
|
job_id = job["id"]
|
||||||
|
|
||||||
|
score = job.get("match_score")
|
||||||
|
if score is None:
|
||||||
|
score_badge = "⬜ No score"
|
||||||
|
elif score >= 70:
|
||||||
|
score_badge = f"🟢 {score:.0f}%"
|
||||||
|
elif score >= 40:
|
||||||
|
score_badge = f"🟡 {score:.0f}%"
|
||||||
|
else:
|
||||||
|
score_badge = f"🔴 {score:.0f}%"
|
||||||
|
|
||||||
|
remote_badge = "🌐 Remote" if job.get("is_remote") else "🏢 On-site"
|
||||||
|
src = (job.get("source") or "").lower()
|
||||||
|
source_badge = f"🤖 {src.title()}" if src == "linkedin" else f"👤 {src.title() or 'Manual'}"
|
||||||
|
|
||||||
|
with st.container(border=True):
|
||||||
|
left, right = st.columns([7, 3])
|
||||||
|
|
||||||
|
# ── Left: job info ─────────────────────────────────────────────────────
|
||||||
|
with left:
|
||||||
|
st.markdown(f"**{job['title']}** — {job['company']}")
|
||||||
|
|
||||||
|
badge_cols = st.columns(4)
|
||||||
|
badge_cols[0].caption(remote_badge)
|
||||||
|
badge_cols[1].caption(source_badge)
|
||||||
|
badge_cols[2].caption(score_badge)
|
||||||
|
badge_cols[3].caption(f"📅 {job.get('date_found', '')}")
|
||||||
|
|
||||||
|
if job.get("keyword_gaps"):
|
||||||
|
st.caption(f"**Keyword gaps:** {job['keyword_gaps']}")
|
||||||
|
|
||||||
|
# Cover letter expander (approved view)
|
||||||
|
if show_status == "approved":
|
||||||
|
_cl_key = f"cl_{job_id}"
|
||||||
|
if _cl_key not in st.session_state:
|
||||||
|
st.session_state[_cl_key] = job.get("cover_letter") or ""
|
||||||
|
|
||||||
|
cl_exists = bool(st.session_state[_cl_key])
|
||||||
|
with st.expander("📝 Cover Letter", expanded=cl_exists):
|
||||||
|
gen_label = "Regenerate" if cl_exists else "Generate Cover Letter"
|
||||||
|
if st.button(gen_label, key=f"gen_{job_id}"):
|
||||||
|
with st.spinner("Generating via LLM…"):
|
||||||
|
try:
|
||||||
|
from scripts.generate_cover_letter import generate as _gen
|
||||||
|
st.session_state[_cl_key] = _gen(
|
||||||
|
job.get("title", ""),
|
||||||
|
job.get("company", ""),
|
||||||
|
job.get("description", ""),
|
||||||
|
)
|
||||||
|
st.rerun()
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"Generation failed: {e}")
|
||||||
|
|
||||||
|
st.text_area(
|
||||||
|
"cover_letter_edit",
|
||||||
|
key=_cl_key,
|
||||||
|
height=300,
|
||||||
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
|
save_col, _ = st.columns([2, 5])
|
||||||
|
if save_col.button("💾 Save draft", key=f"save_cl_{job_id}"):
|
||||||
|
update_cover_letter(DEFAULT_DB, job_id, st.session_state[_cl_key])
|
||||||
|
st.success("Saved!")
|
||||||
|
|
||||||
|
# Applied date + cover letter preview (applied/synced)
|
||||||
|
if show_status in ("applied", "synced") and job.get("applied_at"):
|
||||||
|
st.caption(f"✅ Applied: {job['applied_at']}")
|
||||||
|
if show_status in ("applied", "synced") and job.get("cover_letter"):
|
||||||
|
with st.expander("📝 Cover Letter (sent)"):
|
||||||
|
st.text(job["cover_letter"])
|
||||||
|
|
||||||
|
# ── Right: actions ─────────────────────────────────────────────────────
|
||||||
|
with right:
|
||||||
|
if job.get("url"):
|
||||||
|
st.link_button("View listing →", job["url"], use_container_width=True)
|
||||||
|
if job.get("salary"):
|
||||||
|
st.caption(f"💰 {job['salary']}")
|
||||||
|
|
||||||
|
if show_status == "pending":
|
||||||
|
if st.button("✅ Approve", key=f"approve_{job_id}",
|
||||||
|
type="primary", use_container_width=True):
|
||||||
|
update_job_status(DEFAULT_DB, [job_id], "approved")
|
||||||
|
st.rerun()
|
||||||
|
if st.button("❌ Reject", key=f"reject_{job_id}",
|
||||||
|
use_container_width=True):
|
||||||
|
update_job_status(DEFAULT_DB, [job_id], "rejected")
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
elif show_status == "approved":
|
||||||
|
if st.button("🚀 Apply →", key=f"apply_page_{job_id}",
|
||||||
|
type="primary", use_container_width=True):
|
||||||
|
st.session_state["apply_job_id"] = job_id
|
||||||
|
st.switch_page("pages/4_Apply.py")
|
||||||
|
if st.button("✅ Mark Applied", key=f"applied_{job_id}",
|
||||||
|
use_container_width=True):
|
||||||
|
cl_text = st.session_state.get(f"cl_{job_id}", "")
|
||||||
|
if cl_text:
|
||||||
|
update_cover_letter(DEFAULT_DB, job_id, cl_text)
|
||||||
|
mark_applied(DEFAULT_DB, [job_id])
|
||||||
|
st.rerun()
|
||||||
842
app/pages/2_Settings.py
Normal file
842
app/pages/2_Settings.py
Normal file
|
|
@ -0,0 +1,842 @@
|
||||||
|
# app/pages/2_Settings.py
|
||||||
|
"""
|
||||||
|
Settings — edit search profiles, LLM backends, Notion connection, services,
|
||||||
|
and resume profile (paste-able bullets used in Apply Workspace).
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
st.title("⚙️ Settings")
|
||||||
|
|
||||||
|
CONFIG_DIR = Path(__file__).parent.parent.parent / "config"
|
||||||
|
SEARCH_CFG = CONFIG_DIR / "search_profiles.yaml"
|
||||||
|
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"
|
||||||
|
LLM_CFG = CONFIG_DIR / "llm.yaml"
|
||||||
|
NOTION_CFG = CONFIG_DIR / "notion.yaml"
|
||||||
|
RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
||||||
|
KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml"
|
||||||
|
|
||||||
|
def load_yaml(path: Path) -> dict:
    """Parse a YAML config file into a dict.

    Missing files and empty/null documents both yield ``{}`` so callers can
    chain ``.get(...)`` without existence checks.
    """
    if not path.exists():
        return {}
    parsed = yaml.safe_load(path.read_text())
    return parsed if parsed else {}
|
||||||
|
|
||||||
|
def save_yaml(path: Path, data: dict) -> None:
    """Serialize *data* to *path* as block-style YAML, preserving unicode."""
    rendered = yaml.dump(data, default_flow_style=False, allow_unicode=True)
    path.write_text(rendered)
|
||||||
|
|
||||||
|
|
||||||
|
def _suggest_search_terms(current_titles: list[str], resume_path: Path) -> dict:
    """Call LLM to suggest additional job titles and exclude keywords.

    Builds a compact resume summary (top 3 roles, 5 skills each) for context,
    asks the configured LLM for suggestions, and parses the first JSON object
    found in the reply.

    Args:
        current_titles: Titles already configured in the search profile.
        resume_path: Path to the AIHawk plain-text resume YAML.

    Returns:
        Dict with ``suggested_titles`` and ``suggested_excludes`` lists; both
        empty when the reply contains no parseable JSON object.
    """
    import json
    import re
    # Imported lazily so the Settings page renders even if the LLM stack
    # is misconfigured.
    from scripts.llm_router import LLMRouter

    # Summarise only the three most recent roles to keep the prompt small.
    resume_context = ""
    if resume_path.exists():
        resume = load_yaml(resume_path)
        lines = []
        for exp in (resume.get("experience_details") or [])[:3]:
            pos = exp.get("position", "")
            co = exp.get("company", "")
            skills = ", ".join((exp.get("skills_acquired") or [])[:5])
            lines.append(f"- {pos} at {co}: {skills}")
        resume_context = "\n".join(lines)

    titles_str = "\n".join(f"- {t}" for t in current_titles)
    prompt = f"""You are helping a job seeker optimize their search criteria.

Their background (from resume):
{resume_context or "Customer success and technical account management leader"}

Current job titles being searched:
{titles_str}

Suggest:
1. 5-8 additional job titles they might be missing (alternative names, adjacent roles, senior variants)
2. 3-5 keywords to add to the exclusion filter (to screen out irrelevant postings)

Return ONLY valid JSON in this exact format:
{{"suggested_titles": ["Title 1", "Title 2"], "suggested_excludes": ["keyword 1", "keyword 2"]}}"""

    result = LLMRouter().complete(prompt).strip()
    # Grab the first {...} span — models often wrap JSON in prose/code fences.
    m = re.search(r"\{.*\}", result, re.DOTALL)
    if m:
        try:
            return json.loads(m.group())
        # Narrowed from `except Exception`: only malformed JSON is expected
        # here (json.JSONDecodeError is a ValueError subclass); anything else
        # is a real bug and should surface.
        except ValueError:
            pass
    return {"suggested_titles": [], "suggested_excludes": []}
|
||||||
|
|
||||||
|
tab_search, tab_llm, tab_notion, tab_services, tab_resume, tab_email, tab_skills = st.tabs(
|
||||||
|
["🔎 Search", "🤖 LLM Backends", "📚 Notion", "🔌 Services", "📝 Resume Profile", "📧 Email", "🏷️ Skills"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Search tab ───────────────────────────────────────────────────────────────
|
||||||
|
with tab_search:
|
||||||
|
cfg = load_yaml(SEARCH_CFG)
|
||||||
|
profiles = cfg.get("profiles", [{}])
|
||||||
|
p = profiles[0] if profiles else {}
|
||||||
|
|
||||||
|
# Seed session state from config on first load (or when config changes after save)
|
||||||
|
_sp_hash = str(p.get("titles", [])) + str(p.get("exclude_keywords", []))
|
||||||
|
if st.session_state.get("_sp_hash") != _sp_hash:
|
||||||
|
st.session_state["_sp_titles"] = "\n".join(p.get("titles", []))
|
||||||
|
st.session_state["_sp_excludes"] = "\n".join(p.get("exclude_keywords", []))
|
||||||
|
st.session_state["_sp_hash"] = _sp_hash
|
||||||
|
|
||||||
|
# ── Titles ────────────────────────────────────────────────────────────────
|
||||||
|
title_row, suggest_btn_col = st.columns([4, 1])
|
||||||
|
with title_row:
|
||||||
|
st.subheader("Job Titles to Search")
|
||||||
|
with suggest_btn_col:
|
||||||
|
st.write("") # vertical align
|
||||||
|
_run_suggest = st.button("✨ Suggest", key="sp_suggest_btn",
|
||||||
|
help="Ask the LLM to suggest additional titles and exclude keywords based on your resume")
|
||||||
|
|
||||||
|
titles_text = st.text_area(
|
||||||
|
"One title per line",
|
||||||
|
key="_sp_titles",
|
||||||
|
height=150,
|
||||||
|
help="JobSpy will search for any of these titles across all configured boards.",
|
||||||
|
label_visibility="visible",
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── LLM suggestions panel ────────────────────────────────────────────────
|
||||||
|
if _run_suggest:
|
||||||
|
current = [t.strip() for t in titles_text.splitlines() if t.strip()]
|
||||||
|
with st.spinner("Asking LLM for suggestions…"):
|
||||||
|
suggestions = _suggest_search_terms(current, RESUME_PATH)
|
||||||
|
st.session_state["_sp_suggestions"] = suggestions
|
||||||
|
|
||||||
|
if st.session_state.get("_sp_suggestions"):
|
||||||
|
sugg = st.session_state["_sp_suggestions"]
|
||||||
|
s_titles = sugg.get("suggested_titles", [])
|
||||||
|
s_excl = sugg.get("suggested_excludes", [])
|
||||||
|
|
||||||
|
existing_titles = {t.lower() for t in titles_text.splitlines() if t.strip()}
|
||||||
|
existing_excl = {e.lower() for e in st.session_state.get("_sp_excludes", "").splitlines() if e.strip()}
|
||||||
|
|
||||||
|
if s_titles:
|
||||||
|
st.caption("**Suggested titles** — click to add:")
|
||||||
|
cols = st.columns(min(len(s_titles), 4))
|
||||||
|
for i, title in enumerate(s_titles):
|
||||||
|
with cols[i % 4]:
|
||||||
|
if title.lower() not in existing_titles:
|
||||||
|
if st.button(f"+ {title}", key=f"sp_add_title_{i}"):
|
||||||
|
st.session_state["_sp_titles"] = (
|
||||||
|
st.session_state.get("_sp_titles", "").rstrip("\n") + f"\n{title}"
|
||||||
|
)
|
||||||
|
st.rerun()
|
||||||
|
else:
|
||||||
|
st.caption(f"✓ {title}")
|
||||||
|
|
||||||
|
if s_excl:
|
||||||
|
st.caption("**Suggested exclusions** — click to add:")
|
||||||
|
cols2 = st.columns(min(len(s_excl), 4))
|
||||||
|
for i, kw in enumerate(s_excl):
|
||||||
|
with cols2[i % 4]:
|
||||||
|
if kw.lower() not in existing_excl:
|
||||||
|
if st.button(f"+ {kw}", key=f"sp_add_excl_{i}"):
|
||||||
|
st.session_state["_sp_excludes"] = (
|
||||||
|
st.session_state.get("_sp_excludes", "").rstrip("\n") + f"\n{kw}"
|
||||||
|
)
|
||||||
|
st.rerun()
|
||||||
|
else:
|
||||||
|
st.caption(f"✓ {kw}")
|
||||||
|
|
||||||
|
if st.button("✕ Clear suggestions", key="sp_clear_sugg"):
|
||||||
|
st.session_state.pop("_sp_suggestions", None)
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
st.subheader("Locations")
|
||||||
|
locations_text = st.text_area(
|
||||||
|
"One location per line",
|
||||||
|
value="\n".join(p.get("locations", [])),
|
||||||
|
height=100,
|
||||||
|
)
|
||||||
|
|
||||||
|
st.subheader("Exclude Keywords")
|
||||||
|
st.caption("Jobs whose **title or description** contain any of these words are silently dropped before entering the queue. Case-insensitive.")
|
||||||
|
exclude_text = st.text_area(
|
||||||
|
"One keyword or phrase per line",
|
||||||
|
key="_sp_excludes",
|
||||||
|
height=150,
|
||||||
|
help="e.g. 'sales', 'account executive', 'SDR'",
|
||||||
|
)
|
||||||
|
|
||||||
|
st.subheader("Job Boards")
|
||||||
|
board_options = ["linkedin", "indeed", "glassdoor", "zip_recruiter", "google"]
|
||||||
|
selected_boards = st.multiselect(
|
||||||
|
"Standard boards (via JobSpy)", board_options,
|
||||||
|
default=[b for b in p.get("boards", board_options) if b in board_options],
|
||||||
|
help="Google Jobs aggregates listings from many sources and often finds roles the other boards miss.",
|
||||||
|
)
|
||||||
|
|
||||||
|
_custom_board_options = ["adzuna", "theladders"]
|
||||||
|
_custom_board_labels = {
|
||||||
|
"adzuna": "Adzuna (free API — requires app_id + app_key in config/adzuna.yaml)",
|
||||||
|
"theladders": "The Ladders (curl_cffi scraper — $100K+ roles, requires curl_cffi)",
|
||||||
|
}
|
||||||
|
st.caption("**Custom boards** — scrapers built into this app, not part of JobSpy.")
|
||||||
|
selected_custom = st.multiselect(
|
||||||
|
"Custom boards",
|
||||||
|
options=_custom_board_options,
|
||||||
|
default=[b for b in p.get("custom_boards", []) if b in _custom_board_options],
|
||||||
|
format_func=lambda b: _custom_board_labels.get(b, b),
|
||||||
|
)
|
||||||
|
|
||||||
|
col1, col2 = st.columns(2)
|
||||||
|
results_per = col1.slider("Results per board", 5, 100, p.get("results_per_board", 25))
|
||||||
|
hours_old = col2.slider("How far back to look (hours)", 24, 720, p.get("hours_old", 72))
|
||||||
|
|
||||||
|
if st.button("💾 Save search settings", type="primary"):
|
||||||
|
profiles[0] = {
|
||||||
|
**p,
|
||||||
|
"titles": [t.strip() for t in titles_text.splitlines() if t.strip()],
|
||||||
|
"locations": [loc.strip() for loc in locations_text.splitlines() if loc.strip()],
|
||||||
|
"boards": selected_boards,
|
||||||
|
"custom_boards": selected_custom,
|
||||||
|
"results_per_board": results_per,
|
||||||
|
"hours_old": hours_old,
|
||||||
|
"exclude_keywords": [k.strip() for k in exclude_text.splitlines() if k.strip()],
|
||||||
|
}
|
||||||
|
save_yaml(SEARCH_CFG, {"profiles": profiles})
|
||||||
|
st.session_state["_sp_hash"] = "" # force re-seed on next load
|
||||||
|
st.session_state.pop("_sp_suggestions", None)
|
||||||
|
st.success("Search settings saved!")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Blocklist ──────────────────────────────────────────────────────────────
# Hard filters applied during discovery: matching listings never reach the
# review queue. Three independent rule lists, all substring/case-insensitive
# per the help text below.
with st.expander("🚫 Blocklist — companies, industries, and locations I will never work at", expanded=False):
    st.caption(
        "Listings matching any rule below are **silently dropped before entering the review queue**, "
        "across all search profiles and custom boards. Changes take effect on the next discovery run."
    )
    bl = load_yaml(BLOCKLIST_CFG)

    bl_companies = st.text_area(
        "Company names (partial match, one per line)",
        value="\n".join(bl.get("companies", [])),
        height=120,
        help="e.g. 'Amazon' blocks any listing where the company name contains 'amazon' (case-insensitive).",
        key="bl_companies",
    )
    bl_industries = st.text_area(
        "Industry / content keywords (one per line)",
        value="\n".join(bl.get("industries", [])),
        height=100,
        help="Blocked if the keyword appears in the company name OR job description. "
             "e.g. 'gambling', 'crypto', 'tobacco', 'defense contractor'.",
        key="bl_industries",
    )
    bl_locations = st.text_area(
        "Location strings to exclude (one per line)",
        value="\n".join(bl.get("locations", [])),
        height=80,
        help="e.g. 'Dallas' blocks any listing whose location contains 'dallas'.",
        key="bl_locations",
    )

    if st.button("💾 Save blocklist", type="primary", key="save_blocklist"):
        # Normalize on save: trim whitespace, drop blank lines.
        save_yaml(BLOCKLIST_CFG, {
            "companies": [c.strip() for c in bl_companies.splitlines() if c.strip()],
            "industries": [i.strip() for i in bl_industries.splitlines() if i.strip()],
            "locations": [loc.strip() for loc in bl_locations.splitlines() if loc.strip()],
        })
        st.success("Blocklist saved — takes effect on next discovery run.")
|
||||||
|
|
||||||
|
# ── LLM Backends tab ─────────────────────────────────────────────────────────
|
||||||
|
with tab_llm:
|
||||||
|
import requests as _req
|
||||||
|
|
||||||
|
def _ollama_models(base_url: str) -> list[str]:
    """Fetch installed model names from the Ollama /api/tags endpoint.

    Returns an empty list when Ollama is unreachable or replies with an
    error, so callers can fall back to a manual model-name input.
    """
    try:
        # The configured URL is usually the OpenAI-compat base ending in
        # "/v1", but the tags endpoint lives at the server root.
        # BUG FIX: the original used base_url.rstrip("/v1"), which strips
        # a *character set* ({'/', 'v', '1'}), not the suffix — e.g.
        # "http://host:8001" would be mangled to "http://host:800".
        # removesuffix() strips only the literal "/v1" suffix.
        root = base_url.rstrip("/").removesuffix("/v1").rstrip("/")
        r = _req.get(root + "/api/tags", timeout=2)
        if r.ok:
            return [m["name"] for m in r.json().get("models", [])]
    except Exception:
        # Best-effort probe — any network/parse failure means "no models".
        pass
    return []
|
||||||
|
|
||||||
|
cfg = load_yaml(LLM_CFG)
|
||||||
|
backends = cfg.get("backends", {})
|
||||||
|
fallback_order = cfg.get("fallback_order", list(backends.keys()))
|
||||||
|
|
||||||
|
# Persist reordering across reruns triggered by ↑↓ buttons.
|
||||||
|
# Reset to config order whenever the config file is fresher than the session key.
|
||||||
|
_cfg_key = str(fallback_order)
|
||||||
|
if st.session_state.get("_llm_order_cfg_key") != _cfg_key:
|
||||||
|
st.session_state["_llm_order"] = list(fallback_order)
|
||||||
|
st.session_state["_llm_order_cfg_key"] = _cfg_key
|
||||||
|
new_order: list[str] = st.session_state["_llm_order"]
|
||||||
|
|
||||||
|
# All known backends (in current order first, then any extras)
|
||||||
|
all_names = list(new_order) + [n for n in backends if n not in new_order]
|
||||||
|
|
||||||
|
st.caption("Enable/disable backends and drag their priority with the ↑ ↓ buttons. "
|
||||||
|
"First enabled + reachable backend wins on each call.")
|
||||||
|
|
||||||
|
updated_backends = {}
|
||||||
|
|
||||||
|
for name in all_names:
|
||||||
|
b = backends.get(name, {})
|
||||||
|
enabled = b.get("enabled", True)
|
||||||
|
label = name.replace("_", " ").title()
|
||||||
|
pos = new_order.index(name) + 1 if name in new_order else "—"
|
||||||
|
header = f"{'🟢' if enabled else '⚫'} **{pos}. {label}**"
|
||||||
|
|
||||||
|
with st.expander(header, expanded=False):
|
||||||
|
col_tog, col_up, col_dn, col_spacer = st.columns([2, 1, 1, 4])
|
||||||
|
|
||||||
|
new_enabled = col_tog.checkbox("Enabled", value=enabled, key=f"{name}_enabled")
|
||||||
|
|
||||||
|
# Up / Down only apply to backends currently in the order
|
||||||
|
if name in new_order:
|
||||||
|
idx = new_order.index(name)
|
||||||
|
if col_up.button("↑", key=f"{name}_up", disabled=idx == 0):
|
||||||
|
new_order[idx], new_order[idx - 1] = new_order[idx - 1], new_order[idx]
|
||||||
|
st.session_state["_llm_order"] = new_order
|
||||||
|
st.rerun()
|
||||||
|
if col_dn.button("↓", key=f"{name}_dn", disabled=idx == len(new_order) - 1):
|
||||||
|
new_order[idx], new_order[idx + 1] = new_order[idx + 1], new_order[idx]
|
||||||
|
st.session_state["_llm_order"] = new_order
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
if b.get("type") == "openai_compat":
|
||||||
|
url = st.text_input("URL", value=b.get("base_url", ""), key=f"{name}_url")
|
||||||
|
|
||||||
|
# Ollama gets a live model picker; other backends get a text input
|
||||||
|
if name == "ollama":
|
||||||
|
ollama_models = _ollama_models(b.get("base_url", "http://localhost:11434"))
|
||||||
|
current_model = b.get("model", "")
|
||||||
|
if ollama_models:
|
||||||
|
options = ollama_models
|
||||||
|
idx_default = options.index(current_model) if current_model in options else 0
|
||||||
|
model = st.selectbox(
|
||||||
|
"Model",
|
||||||
|
options,
|
||||||
|
index=idx_default,
|
||||||
|
key=f"{name}_model",
|
||||||
|
help="Lists models currently installed in Ollama. Pull new ones with `ollama pull <name>`.",
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
st.caption("_Ollama not reachable — enter model name manually_")
|
||||||
|
model = st.text_input("Model", value=current_model, key=f"{name}_model")
|
||||||
|
else:
|
||||||
|
model = st.text_input("Model", value=b.get("model", ""), key=f"{name}_model")
|
||||||
|
|
||||||
|
updated_backends[name] = {**b, "base_url": url, "model": model, "enabled": new_enabled}
|
||||||
|
elif b.get("type") == "anthropic":
|
||||||
|
model = st.text_input("Model", value=b.get("model", ""), key=f"{name}_model")
|
||||||
|
updated_backends[name] = {**b, "model": model, "enabled": new_enabled}
|
||||||
|
else:
|
||||||
|
updated_backends[name] = {**b, "enabled": new_enabled}
|
||||||
|
|
||||||
|
if b.get("type") == "openai_compat":
|
||||||
|
if st.button(f"Test connection", key=f"test_{name}"):
|
||||||
|
with st.spinner("Testing…"):
|
||||||
|
try:
|
||||||
|
from scripts.llm_router import LLMRouter
|
||||||
|
r = LLMRouter()
|
||||||
|
reachable = r._is_reachable(b.get("base_url", ""))
|
||||||
|
if reachable:
|
||||||
|
st.success("Reachable ✓")
|
||||||
|
else:
|
||||||
|
st.warning("Not reachable ✗")
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"Error: {e}")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.caption("Current priority: " + " → ".join(
|
||||||
|
f"{'✓' if backends.get(n, {}).get('enabled', True) else '✗'} {n}"
|
||||||
|
for n in new_order
|
||||||
|
))
|
||||||
|
|
||||||
|
if st.button("💾 Save LLM settings", type="primary"):
|
||||||
|
save_yaml(LLM_CFG, {**cfg, "backends": updated_backends, "fallback_order": new_order})
|
||||||
|
st.session_state.pop("_llm_order", None)
|
||||||
|
st.session_state.pop("_llm_order_cfg_key", None)
|
||||||
|
st.success("LLM settings saved!")
|
||||||
|
|
||||||
|
# ── Notion tab ────────────────────────────────────────────────────────────────
# Credentials for the Notion sync target. notion.yaml is git-ignored (see
# .gitignore), so it may not exist on a fresh checkout.
with tab_notion:
    cfg = load_yaml(NOTION_CFG) if NOTION_CFG.exists() else {}

    st.subheader("Notion Connection")
    token = st.text_input(
        "Integration Token",
        value=cfg.get("token", ""),
        type="password",
        help="Find this at notion.so/my-integrations → your integration → Internal Integration Token",
    )
    db_id = st.text_input(
        "Database ID",
        value=cfg.get("database_id", ""),
        help="The 32-character ID from your Notion database URL",
    )

    col_save, col_test = st.columns(2)
    if col_save.button("💾 Save Notion settings", type="primary"):
        # Merge onto the existing config so unrelated keys survive the save.
        save_yaml(NOTION_CFG, {**cfg, "token": token, "database_id": db_id})
        st.success("Notion settings saved!")

    if col_test.button("🔌 Test connection"):
        with st.spinner("Connecting…"):
            try:
                # Imported lazily: notion-client is only needed for sync.
                from notion_client import Client
                n = Client(auth=token)
                db = n.databases.retrieve(db_id)
                # NOTE(review): assumes the database title has at least one
                # rich-text segment — an untitled DB would raise IndexError
                # here and be reported as a connection failure; confirm.
                st.success(f"Connected to: **{db['title'][0]['plain_text']}**")
            except Exception as e:
                st.error(f"Connection failed: {e}")
|
||||||
|
|
||||||
|
# ── Services tab ───────────────────────────────────────────────────────────────
|
||||||
|
with tab_services:
|
||||||
|
import socket
|
||||||
|
import subprocess as _sp
|
||||||
|
|
||||||
|
TOKENS_CFG = CONFIG_DIR / "tokens.yaml"
|
||||||
|
PFP_DIR = Path("/Library/Documents/Post Fight Processing")
|
||||||
|
|
||||||
|
# Service definitions: (display_name, port, start_cmd, stop_cmd, notes)
|
||||||
|
SERVICES = [
|
||||||
|
{
|
||||||
|
"name": "Streamlit UI",
|
||||||
|
"port": 8501,
|
||||||
|
"start": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-ui.sh"), "start"],
|
||||||
|
"stop": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-ui.sh"), "stop"],
|
||||||
|
"cwd": str(Path(__file__).parent.parent.parent),
|
||||||
|
"note": "Job Seeker web interface",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Ollama (local LLM)",
|
||||||
|
"port": 11434,
|
||||||
|
"start": ["sudo", "systemctl", "start", "ollama"],
|
||||||
|
"stop": ["sudo", "systemctl", "stop", "ollama"],
|
||||||
|
"cwd": "/",
|
||||||
|
"note": "Local inference engine — systemd service",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Claude Code Wrapper",
|
||||||
|
"port": 3009,
|
||||||
|
"start": ["bash", str(PFP_DIR / "manage-services.sh"), "start"],
|
||||||
|
"stop": ["bash", str(PFP_DIR / "manage-services.sh"), "stop"],
|
||||||
|
"cwd": str(PFP_DIR),
|
||||||
|
"note": "OpenAI-compat proxy → Claude Code (port 3009)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "GitHub Copilot Wrapper",
|
||||||
|
"port": 3010,
|
||||||
|
"start": ["bash", str(PFP_DIR / "manage-copilot.sh"), "start"],
|
||||||
|
"stop": ["bash", str(PFP_DIR / "manage-copilot.sh"), "stop"],
|
||||||
|
"cwd": str(PFP_DIR),
|
||||||
|
"note": "OpenAI-compat proxy → GitHub Copilot (port 3010)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "vLLM Server",
|
||||||
|
"port": 8000,
|
||||||
|
"start": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-vllm.sh"), "start"],
|
||||||
|
"stop": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-vllm.sh"), "stop"],
|
||||||
|
"cwd": str(Path(__file__).parent.parent.parent),
|
||||||
|
"model_dir": "/Library/Assets/LLM/vllm/models",
|
||||||
|
"note": "Local vLLM inference — Ouro model family (port 8000, GPU 1)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Vision Service (moondream2)",
|
||||||
|
"port": 8002,
|
||||||
|
"start": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-vision.sh"), "start"],
|
||||||
|
"stop": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-vision.sh"), "stop"],
|
||||||
|
"cwd": str(Path(__file__).parent.parent.parent),
|
||||||
|
"note": "Survey screenshot analysis — moondream2 (port 8002, optional)",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "SearXNG (company scraper)",
|
||||||
|
"port": 8888,
|
||||||
|
"start": ["docker", "compose", "up", "-d"],
|
||||||
|
"stop": ["docker", "compose", "down"],
|
||||||
|
"cwd": str(Path("/Library/Development/scrapers/SearXNG")),
|
||||||
|
"note": "Privacy-respecting meta-search used for company research (port 8888)",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
def _port_open(port: int) -> bool:
|
||||||
|
try:
|
||||||
|
with socket.create_connection(("127.0.0.1", port), timeout=1):
|
||||||
|
return True
|
||||||
|
except OSError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
st.caption("Monitor and control the LLM backend services. Status is checked live on each page load.")
|
||||||
|
|
||||||
|
for svc in SERVICES:
|
||||||
|
up = _port_open(svc["port"])
|
||||||
|
badge = "🟢 Running" if up else "🔴 Stopped"
|
||||||
|
header = f"**{svc['name']}** — {badge}"
|
||||||
|
|
||||||
|
with st.container(border=True):
|
||||||
|
left_col, right_col = st.columns([3, 1])
|
||||||
|
with left_col:
|
||||||
|
st.markdown(header)
|
||||||
|
st.caption(f"Port {svc['port']} · {svc['note']}")
|
||||||
|
|
||||||
|
# Model selector for services backed by a local model directory (e.g. vLLM)
|
||||||
|
if "model_dir" in svc:
|
||||||
|
_mdir = Path(svc["model_dir"])
|
||||||
|
_models = (
|
||||||
|
sorted(d.name for d in _mdir.iterdir() if d.is_dir())
|
||||||
|
if _mdir.exists() else []
|
||||||
|
)
|
||||||
|
_mk = f"svc_model_{svc['port']}"
|
||||||
|
_loaded_file = Path("/tmp/vllm-server.model")
|
||||||
|
_loaded = _loaded_file.read_text().strip() if (_loaded_file.exists()) else ""
|
||||||
|
if _models:
|
||||||
|
_default = _models.index(_loaded) if _loaded in _models else 0
|
||||||
|
st.selectbox(
|
||||||
|
"Model",
|
||||||
|
_models,
|
||||||
|
index=_default,
|
||||||
|
key=_mk,
|
||||||
|
disabled=up,
|
||||||
|
help="Model to load on start. Stop then Start to swap models.",
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
st.caption(f"_No models found in {svc['model_dir']}_")
|
||||||
|
|
||||||
|
with right_col:
|
||||||
|
if svc["start"] is None:
|
||||||
|
st.caption("_Manual start only_")
|
||||||
|
elif up:
|
||||||
|
if st.button("⏹ Stop", key=f"svc_stop_{svc['port']}", use_container_width=True):
|
||||||
|
with st.spinner(f"Stopping {svc['name']}…"):
|
||||||
|
r = _sp.run(svc["stop"], capture_output=True, text=True, cwd=svc["cwd"])
|
||||||
|
if r.returncode == 0:
|
||||||
|
st.success("Stopped.")
|
||||||
|
else:
|
||||||
|
st.error(f"Error: {r.stderr or r.stdout}")
|
||||||
|
st.rerun()
|
||||||
|
else:
|
||||||
|
# Build start command, appending selected model for services with model_dir
|
||||||
|
_start_cmd = list(svc["start"])
|
||||||
|
if "model_dir" in svc:
|
||||||
|
_sel = st.session_state.get(f"svc_model_{svc['port']}")
|
||||||
|
if _sel:
|
||||||
|
_start_cmd.append(_sel)
|
||||||
|
if st.button("▶ Start", key=f"svc_start_{svc['port']}", use_container_width=True, type="primary"):
|
||||||
|
with st.spinner(f"Starting {svc['name']}…"):
|
||||||
|
r = _sp.run(_start_cmd, capture_output=True, text=True, cwd=svc["cwd"])
|
||||||
|
if r.returncode == 0:
|
||||||
|
st.success("Started!")
|
||||||
|
else:
|
||||||
|
st.error(f"Error: {r.stderr or r.stdout}")
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.subheader("🤗 Hugging Face")
|
||||||
|
st.caption(
|
||||||
|
"Used for uploading training data and running fine-tune jobs on HF infrastructure. "
|
||||||
|
"Token is stored in `config/tokens.yaml` (git-ignored). "
|
||||||
|
"Create a **write-permission** token at huggingface.co/settings/tokens."
|
||||||
|
)
|
||||||
|
|
||||||
|
tok_cfg = load_yaml(TOKENS_CFG) if TOKENS_CFG.exists() else {}
|
||||||
|
hf_token = st.text_input(
|
||||||
|
"HF Token",
|
||||||
|
value=tok_cfg.get("hf_token", ""),
|
||||||
|
type="password",
|
||||||
|
placeholder="hf_…",
|
||||||
|
)
|
||||||
|
|
||||||
|
col_save_hf, col_test_hf = st.columns(2)
|
||||||
|
if col_save_hf.button("💾 Save HF token", type="primary"):
|
||||||
|
save_yaml(TOKENS_CFG, {**tok_cfg, "hf_token": hf_token})
|
||||||
|
TOKENS_CFG.chmod(0o600)
|
||||||
|
st.success("Saved!")
|
||||||
|
|
||||||
|
if col_test_hf.button("🔌 Test HF token"):
|
||||||
|
with st.spinner("Checking…"):
|
||||||
|
try:
|
||||||
|
import requests as _r
|
||||||
|
resp = _r.get(
|
||||||
|
"https://huggingface.co/api/whoami",
|
||||||
|
headers={"Authorization": f"Bearer {hf_token}"},
|
||||||
|
timeout=5,
|
||||||
|
)
|
||||||
|
if resp.ok:
|
||||||
|
info = resp.json()
|
||||||
|
name = info.get("name") or info.get("fullname") or "unknown"
|
||||||
|
auth = info.get("auth", {})
|
||||||
|
perm = auth.get("accessToken", {}).get("role", "read")
|
||||||
|
st.success(f"Logged in as **{name}** · permission: `{perm}`")
|
||||||
|
if perm == "read":
|
||||||
|
st.warning("Token is read-only — create a **write** token to upload datasets and run Jobs.")
|
||||||
|
else:
|
||||||
|
st.error(f"Invalid token ({resp.status_code})")
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"Error: {e}")
|
||||||
|
|
||||||
|
# ── Resume Profile tab ────────────────────────────────────────────────────────
|
||||||
|
with tab_resume:
|
||||||
|
st.caption(
|
||||||
|
"Edit Alex's application profile. "
|
||||||
|
"Bullets are used as paste-able shortcuts in the Apply Workspace."
|
||||||
|
)
|
||||||
|
|
||||||
|
if not RESUME_PATH.exists():
|
||||||
|
st.error(f"Resume YAML not found at `{RESUME_PATH}`. Is AIHawk cloned?")
|
||||||
|
st.stop()
|
||||||
|
|
||||||
|
_data = yaml.safe_load(RESUME_PATH.read_text()) or {}
|
||||||
|
|
||||||
|
def _field(label: str, value: str, key: str, help: str = "", password: bool = False) -> str:
    """Render a text input, flagging placeholder or missing values.

    Shows an amber "Needs attention" marker above the input when *value*
    is missing (None/empty) or still carries the FILL_IN placeholder, so
    the user can see at a glance which profile fields remain incomplete.

    Args:
        label: Field label shown to the user.
        value: Current value from the resume YAML (may be None).
        key: Unique Streamlit widget key.
        help: Optional tooltip text. (Parameter name kept for caller
            compatibility even though it shadows the builtin.)
        password: Mask the input when True.

    Returns:
        The (possibly edited) text entered by the user.
    """
    # `not value` also covers None, which the previous `value == ""`
    # comparison missed — a None from the YAML displayed as a blank
    # input with no warning marker.
    needs_attention = not value or str(value).startswith("FILL_IN")
    if needs_attention:
        st.markdown(
            '<p style="color:#F59E0B;font-size:0.8em;margin-bottom:2px">⚠️ Needs attention</p>',
            unsafe_allow_html=True,
        )
    return st.text_input(label, value=value or "", key=key, help=help,
                         type="password" if password else "default")
|
||||||
|
|
||||||
|
# ── Personal Info ─────────────────────────────────────────────────────────
|
||||||
|
with st.expander("👤 Personal Information", expanded=True):
|
||||||
|
_info = _data.get("personal_information", {})
|
||||||
|
_c1, _c2 = st.columns(2)
|
||||||
|
with _c1:
|
||||||
|
_name = _field("First Name", _info.get("name", ""), "rp_name")
|
||||||
|
_email = _field("Email", _info.get("email", ""), "rp_email")
|
||||||
|
_phone = _field("Phone", _info.get("phone", ""), "rp_phone")
|
||||||
|
_city = _field("City", _info.get("city", ""), "rp_city")
|
||||||
|
with _c2:
|
||||||
|
_surname = _field("Last Name", _info.get("surname", ""), "rp_surname")
|
||||||
|
_linkedin = _field("LinkedIn URL", _info.get("linkedin", ""), "rp_linkedin")
|
||||||
|
_zip_code = _field("Zip Code", _info.get("zip_code", ""), "rp_zip")
|
||||||
|
_dob = _field("Date of Birth", _info.get("date_of_birth", ""), "rp_dob",
|
||||||
|
help="MM/DD/YYYY")
|
||||||
|
|
||||||
|
# ── Experience ────────────────────────────────────────────────────────────
|
||||||
|
with st.expander("💼 Work Experience"):
|
||||||
|
_exp_list = _data.get("experience_details", [{}])
|
||||||
|
if "rp_exp_count" not in st.session_state:
|
||||||
|
st.session_state.rp_exp_count = len(_exp_list)
|
||||||
|
if st.button("+ Add Experience Entry", key="rp_add_exp"):
|
||||||
|
st.session_state.rp_exp_count += 1
|
||||||
|
_exp_list.append({})
|
||||||
|
|
||||||
|
_updated_exp = []
|
||||||
|
for _i in range(st.session_state.rp_exp_count):
|
||||||
|
_exp = _exp_list[_i] if _i < len(_exp_list) else {}
|
||||||
|
st.markdown(f"**Position {_i + 1}**")
|
||||||
|
_ec1, _ec2 = st.columns(2)
|
||||||
|
with _ec1:
|
||||||
|
_pos = _field("Job Title", _exp.get("position", ""), f"rp_pos_{_i}")
|
||||||
|
_co = _field("Company", _exp.get("company", ""), f"rp_co_{_i}")
|
||||||
|
_period = _field("Period", _exp.get("employment_period", ""), f"rp_period_{_i}",
|
||||||
|
help="e.g. 01/2022 - Present")
|
||||||
|
with _ec2:
|
||||||
|
_loc = st.text_input("Location", _exp.get("location", ""), key=f"rp_loc_{_i}")
|
||||||
|
_ind = st.text_input("Industry", _exp.get("industry", ""), key=f"rp_ind_{_i}")
|
||||||
|
_resp_raw = st.text_area(
|
||||||
|
"Key Responsibilities (one per line)",
|
||||||
|
value="\n".join(
|
||||||
|
r.get(f"responsibility_{j+1}", "") if isinstance(r, dict) else str(r)
|
||||||
|
for j, r in enumerate(_exp.get("key_responsibilities", []))
|
||||||
|
),
|
||||||
|
key=f"rp_resp_{_i}", height=100,
|
||||||
|
)
|
||||||
|
_skills_raw = st.text_input(
|
||||||
|
"Skills (comma-separated)",
|
||||||
|
value=", ".join(_exp.get("skills_acquired", [])),
|
||||||
|
key=f"rp_skills_{_i}",
|
||||||
|
)
|
||||||
|
_updated_exp.append({
|
||||||
|
"position": _pos, "company": _co, "employment_period": _period,
|
||||||
|
"location": _loc, "industry": _ind,
|
||||||
|
"key_responsibilities": [{"responsibility_1": r.strip()} for r in _resp_raw.splitlines() if r.strip()],
|
||||||
|
"skills_acquired": [s.strip() for s in _skills_raw.split(",") if s.strip()],
|
||||||
|
})
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Preferences ───────────────────────────────────────────────────────────
|
||||||
|
with st.expander("⚙️ Preferences & Availability"):
|
||||||
|
_wp = _data.get("work_preferences", {})
|
||||||
|
_sal = _data.get("salary_expectations", {})
|
||||||
|
_avail = _data.get("availability", {})
|
||||||
|
_pc1, _pc2 = st.columns(2)
|
||||||
|
with _pc1:
|
||||||
|
_salary_range = st.text_input("Salary Range (USD)", _sal.get("salary_range_usd", ""),
|
||||||
|
key="rp_salary", help="e.g. 120000 - 180000")
|
||||||
|
_notice = st.text_input("Notice Period", _avail.get("notice_period", "2 weeks"), key="rp_notice")
|
||||||
|
with _pc2:
|
||||||
|
_remote = st.checkbox("Open to Remote", value=_wp.get("remote_work", "Yes") == "Yes", key="rp_remote")
|
||||||
|
_reloc = st.checkbox("Open to Relocation", value=_wp.get("open_to_relocation", "No") == "Yes", key="rp_reloc")
|
||||||
|
_assessments = st.checkbox("Willing to complete assessments",
|
||||||
|
value=_wp.get("willing_to_complete_assessments", "Yes") == "Yes", key="rp_assess")
|
||||||
|
_bg = st.checkbox("Willing to undergo background checks",
|
||||||
|
value=_wp.get("willing_to_undergo_background_checks", "Yes") == "Yes", key="rp_bg")
|
||||||
|
|
||||||
|
# ── Self-ID ───────────────────────────────────────────────────────────────
|
||||||
|
with st.expander("🏳️🌈 Self-Identification (optional)"):
|
||||||
|
_sid = _data.get("self_identification", {})
|
||||||
|
_sc1, _sc2 = st.columns(2)
|
||||||
|
with _sc1:
|
||||||
|
_gender = st.text_input("Gender identity", _sid.get("gender", "Non-binary"), key="rp_gender")
|
||||||
|
_pronouns = st.text_input("Pronouns", _sid.get("pronouns", "Any"), key="rp_pronouns")
|
||||||
|
_ethnicity = _field("Ethnicity", _sid.get("ethnicity", ""), "rp_ethnicity")
|
||||||
|
with _sc2:
|
||||||
|
_vet_opts = ["No", "Yes", "Prefer not to say"]
|
||||||
|
_veteran = st.selectbox("Veteran status", _vet_opts,
|
||||||
|
index=_vet_opts.index(_sid.get("veteran", "No")), key="rp_vet")
|
||||||
|
_dis_opts = ["Prefer not to say", "No", "Yes"]
|
||||||
|
_disability = st.selectbox("Disability disclosure", _dis_opts,
|
||||||
|
index=_dis_opts.index(_sid.get("disability", "Prefer not to say")),
|
||||||
|
key="rp_dis")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
if st.button("💾 Save Resume Profile", type="primary", use_container_width=True, key="rp_save"):
|
||||||
|
_data["personal_information"] = {
|
||||||
|
**_data.get("personal_information", {}),
|
||||||
|
"name": _name, "surname": _surname, "email": _email, "phone": _phone,
|
||||||
|
"city": _city, "zip_code": _zip_code, "linkedin": _linkedin, "date_of_birth": _dob,
|
||||||
|
}
|
||||||
|
_data["experience_details"] = _updated_exp
|
||||||
|
_data["salary_expectations"] = {"salary_range_usd": _salary_range}
|
||||||
|
_data["availability"] = {"notice_period": _notice}
|
||||||
|
_data["work_preferences"] = {
|
||||||
|
**_data.get("work_preferences", {}),
|
||||||
|
"remote_work": "Yes" if _remote else "No",
|
||||||
|
"open_to_relocation": "Yes" if _reloc else "No",
|
||||||
|
"willing_to_complete_assessments": "Yes" if _assessments else "No",
|
||||||
|
"willing_to_undergo_background_checks": "Yes" if _bg else "No",
|
||||||
|
}
|
||||||
|
_data["self_identification"] = {
|
||||||
|
"gender": _gender, "pronouns": _pronouns, "veteran": _veteran,
|
||||||
|
"disability": _disability, "ethnicity": _ethnicity,
|
||||||
|
}
|
||||||
|
RESUME_PATH.write_text(yaml.dump(_data, default_flow_style=False, allow_unicode=True))
|
||||||
|
st.success("✅ Resume profile saved!")
|
||||||
|
st.balloons()
|
||||||
|
|
||||||
|
# ── Email tab ─────────────────────────────────────────────────────────────────
|
||||||
|
with tab_email:
|
||||||
|
EMAIL_CFG = CONFIG_DIR / "email.yaml"
|
||||||
|
EMAIL_EXAMPLE = CONFIG_DIR / "email.yaml.example"
|
||||||
|
|
||||||
|
st.caption(
|
||||||
|
"Connect Alex's email via IMAP to automatically associate recruitment "
|
||||||
|
"emails with job applications. Only emails that mention the company name "
|
||||||
|
"AND contain a recruitment keyword are ever imported — no personal emails "
|
||||||
|
"are touched."
|
||||||
|
)
|
||||||
|
|
||||||
|
if not EMAIL_CFG.exists():
|
||||||
|
st.info("No email config found — fill in your credentials below and click **Save** to create it.")
|
||||||
|
|
||||||
|
em_cfg = load_yaml(EMAIL_CFG) if EMAIL_CFG.exists() else {}
|
||||||
|
|
||||||
|
col_a, col_b = st.columns(2)
|
||||||
|
with col_a:
|
||||||
|
em_host = st.text_input("IMAP Host", em_cfg.get("host", "imap.gmail.com"), key="em_host")
|
||||||
|
em_port = st.number_input("Port", value=int(em_cfg.get("port", 993)),
|
||||||
|
min_value=1, max_value=65535, key="em_port")
|
||||||
|
em_ssl = st.checkbox("Use SSL", value=em_cfg.get("use_ssl", True), key="em_ssl")
|
||||||
|
with col_b:
|
||||||
|
em_user = st.text_input("Username (email address)", em_cfg.get("username", ""), key="em_user")
|
||||||
|
em_pass = st.text_input("Password / App Password", em_cfg.get("password", ""),
|
||||||
|
type="password", key="em_pass")
|
||||||
|
em_sent = st.text_input("Sent folder (blank = auto-detect)",
|
||||||
|
em_cfg.get("sent_folder", ""), key="em_sent",
|
||||||
|
placeholder='e.g. "[Gmail]/Sent Mail"')
|
||||||
|
|
||||||
|
em_days = st.slider("Look-back window (days)", 14, 365,
|
||||||
|
int(em_cfg.get("lookback_days", 90)), key="em_days")
|
||||||
|
|
||||||
|
st.caption(
|
||||||
|
"**Gmail users:** create an App Password at "
|
||||||
|
"myaccount.google.com/apppasswords (requires 2-Step Verification). "
|
||||||
|
"Enable IMAP at Gmail Settings → Forwarding and POP/IMAP."
|
||||||
|
)
|
||||||
|
|
||||||
|
col_save, col_test = st.columns(2)
|
||||||
|
|
||||||
|
if col_save.button("💾 Save email settings", type="primary", key="em_save"):
|
||||||
|
save_yaml(EMAIL_CFG, {
|
||||||
|
"host": em_host, "port": int(em_port), "use_ssl": em_ssl,
|
||||||
|
"username": em_user, "password": em_pass,
|
||||||
|
"sent_folder": em_sent, "lookback_days": int(em_days),
|
||||||
|
})
|
||||||
|
EMAIL_CFG.chmod(0o600)
|
||||||
|
st.success("Saved!")
|
||||||
|
|
||||||
|
if col_test.button("🔌 Test connection", key="em_test"):
|
||||||
|
with st.spinner("Connecting…"):
|
||||||
|
try:
|
||||||
|
import imaplib as _imap
|
||||||
|
_conn = (_imap.IMAP4_SSL if em_ssl else _imap.IMAP4)(em_host, int(em_port))
|
||||||
|
_conn.login(em_user, em_pass)
|
||||||
|
_, _caps = _conn.capability()
|
||||||
|
_conn.logout()
|
||||||
|
st.success(f"Connected successfully to {em_host}")
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"Connection failed: {e}")
|
||||||
|
|
||||||
|
# ── Skills & Keywords tab ─────────────────────────────────────────────────────
|
||||||
|
with tab_skills:
|
||||||
|
st.subheader("🏷️ Skills & Keywords")
|
||||||
|
st.caption(
|
||||||
|
"These are matched against job descriptions to select Alex's most relevant "
|
||||||
|
"experience and highlight keyword overlap in the research brief."
|
||||||
|
)
|
||||||
|
|
||||||
|
if not KEYWORDS_CFG.exists():
|
||||||
|
st.warning("resume_keywords.yaml not found — create it at config/resume_keywords.yaml")
|
||||||
|
else:
|
||||||
|
kw_data = load_yaml(KEYWORDS_CFG)
|
||||||
|
|
||||||
|
changed = False
|
||||||
|
for category in ["skills", "domains", "keywords"]:
|
||||||
|
st.markdown(f"**{category.title()}**")
|
||||||
|
tags: list[str] = kw_data.get(category, [])
|
||||||
|
|
||||||
|
if not tags:
|
||||||
|
st.caption("No tags yet — add one below.")
|
||||||
|
|
||||||
|
# Render existing tags as removable chips (value-based keys for stability)
|
||||||
|
n_cols = min(max(len(tags), 1), 6)
|
||||||
|
cols = st.columns(n_cols)
|
||||||
|
to_remove = None
|
||||||
|
for i, tag in enumerate(tags):
|
||||||
|
with cols[i % n_cols]:
|
||||||
|
if st.button(f"× {tag}", key=f"rm_{category}_{tag}", use_container_width=True):
|
||||||
|
to_remove = tag
|
||||||
|
if to_remove:
|
||||||
|
tags.remove(to_remove)
|
||||||
|
kw_data[category] = tags
|
||||||
|
changed = True
|
||||||
|
|
||||||
|
# Add new tag
|
||||||
|
new_col, btn_col = st.columns([4, 1])
|
||||||
|
new_tag = new_col.text_input(
|
||||||
|
"Add",
|
||||||
|
key=f"new_{category}",
|
||||||
|
label_visibility="collapsed",
|
||||||
|
placeholder=f"Add {category[:-1] if category.endswith('s') else category}…",
|
||||||
|
)
|
||||||
|
if btn_col.button("+ Add", key=f"add_{category}"):
|
||||||
|
tag = new_tag.strip()
|
||||||
|
if tag and tag not in tags:
|
||||||
|
tags.append(tag)
|
||||||
|
kw_data[category] = tags
|
||||||
|
changed = True
|
||||||
|
|
||||||
|
st.markdown("---")
|
||||||
|
|
||||||
|
if changed:
|
||||||
|
save_yaml(KEYWORDS_CFG, kw_data)
|
||||||
|
st.success("Saved.")
|
||||||
|
st.rerun()
|
||||||
191
app/pages/3_Resume_Editor.py
Normal file
191
app/pages/3_Resume_Editor.py
Normal file
|
|
@ -0,0 +1,191 @@
|
||||||
|
# app/pages/3_Resume_Editor.py
|
||||||
|
"""
|
||||||
|
Resume Editor — form-based editor for Alex's AIHawk profile YAML.
|
||||||
|
FILL_IN fields highlighted in amber.
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
st.set_page_config(page_title="Resume Editor", page_icon="📝", layout="wide")
|
||||||
|
st.title("📝 Resume Editor")
|
||||||
|
st.caption("Edit Alex's application profile used by AIHawk for LinkedIn Easy Apply.")
|
||||||
|
|
||||||
|
RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
||||||
|
|
||||||
|
if not RESUME_PATH.exists():
|
||||||
|
st.error(f"Resume file not found at `{RESUME_PATH}`. Is AIHawk cloned?")
|
||||||
|
st.stop()
|
||||||
|
|
||||||
|
data = yaml.safe_load(RESUME_PATH.read_text()) or {}
|
||||||
|
|
||||||
|
|
||||||
|
def field(label: str, value: str, key: str, help: str = "", password: bool = False) -> str:
    """Render a text input, highlighted amber if value is FILL_IN or empty.

    Args:
        label: Field label shown to the user.
        value: Current value from the resume YAML (may be None).
        key: Unique Streamlit widget key.
        help: Optional tooltip text. (Parameter name kept for caller
            compatibility even though it shadows the builtin.)
        password: Mask the input when True.

    Returns:
        The (possibly edited) text entered by the user.
    """
    # `not value` also covers None from the YAML, which the previous
    # `value == ""` comparison missed (None rendered blank with no flag).
    needs_attention = not value or str(value).startswith("FILL_IN")
    if needs_attention:
        st.markdown(
            '<p style="color:#F59E0B;font-size:0.8em;margin-bottom:2px">⚠️ Needs your attention</p>',
            unsafe_allow_html=True,
        )
    return st.text_input(label, value=value or "", key=key, help=help,
                         type="password" if password else "default")
|
||||||
|
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Personal Info ─────────────────────────────────────────────────────────────
|
||||||
|
with st.expander("👤 Personal Information", expanded=True):
|
||||||
|
info = data.get("personal_information", {})
|
||||||
|
col1, col2 = st.columns(2)
|
||||||
|
with col1:
|
||||||
|
name = field("First Name", info.get("name", ""), "pi_name")
|
||||||
|
email = field("Email", info.get("email", ""), "pi_email")
|
||||||
|
phone = field("Phone", info.get("phone", ""), "pi_phone")
|
||||||
|
city = field("City", info.get("city", ""), "pi_city")
|
||||||
|
with col2:
|
||||||
|
surname = field("Last Name", info.get("surname", ""), "pi_surname")
|
||||||
|
linkedin = field("LinkedIn URL", info.get("linkedin", ""), "pi_linkedin")
|
||||||
|
zip_code = field("Zip Code", info.get("zip_code", ""), "pi_zip")
|
||||||
|
dob = field("Date of Birth", info.get("date_of_birth", ""), "pi_dob",
|
||||||
|
help="Format: MM/DD/YYYY")
|
||||||
|
|
||||||
|
# ── Education ─────────────────────────────────────────────────────────────────
|
||||||
|
with st.expander("🎓 Education"):
|
||||||
|
edu_list = data.get("education_details", [{}])
|
||||||
|
updated_edu = []
|
||||||
|
degree_options = ["Bachelor's Degree", "Master's Degree", "Some College",
|
||||||
|
"Associate's Degree", "High School", "Other"]
|
||||||
|
for i, edu in enumerate(edu_list):
|
||||||
|
st.markdown(f"**Entry {i+1}**")
|
||||||
|
col1, col2 = st.columns(2)
|
||||||
|
with col1:
|
||||||
|
inst = field("Institution", edu.get("institution", ""), f"edu_inst_{i}")
|
||||||
|
field_study = st.text_input("Field of Study", edu.get("field_of_study", ""), key=f"edu_field_{i}")
|
||||||
|
start = st.text_input("Start Year", edu.get("start_date", ""), key=f"edu_start_{i}")
|
||||||
|
with col2:
|
||||||
|
current_level = edu.get("education_level", "Some College")
|
||||||
|
level_idx = degree_options.index(current_level) if current_level in degree_options else 2
|
||||||
|
level = st.selectbox("Degree Level", degree_options, index=level_idx, key=f"edu_level_{i}")
|
||||||
|
end = st.text_input("Completion Year", edu.get("year_of_completion", ""), key=f"edu_end_{i}")
|
||||||
|
updated_edu.append({
|
||||||
|
"education_level": level, "institution": inst, "field_of_study": field_study,
|
||||||
|
"start_date": start, "year_of_completion": end, "final_evaluation_grade": "", "exam": {},
|
||||||
|
})
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Experience ────────────────────────────────────────────────────────────────
|
||||||
|
with st.expander("💼 Work Experience"):
|
||||||
|
exp_list = data.get("experience_details", [{}])
|
||||||
|
if "exp_count" not in st.session_state:
|
||||||
|
st.session_state.exp_count = len(exp_list)
|
||||||
|
if st.button("+ Add Experience Entry"):
|
||||||
|
st.session_state.exp_count += 1
|
||||||
|
exp_list.append({})
|
||||||
|
|
||||||
|
updated_exp = []
|
||||||
|
for i in range(st.session_state.exp_count):
|
||||||
|
exp = exp_list[i] if i < len(exp_list) else {}
|
||||||
|
st.markdown(f"**Position {i+1}**")
|
||||||
|
col1, col2 = st.columns(2)
|
||||||
|
with col1:
|
||||||
|
pos = field("Job Title", exp.get("position", ""), f"exp_pos_{i}")
|
||||||
|
company = field("Company", exp.get("company", ""), f"exp_co_{i}")
|
||||||
|
period = field("Employment Period", exp.get("employment_period", ""), f"exp_period_{i}",
|
||||||
|
help="e.g. 01/2022 - Present")
|
||||||
|
with col2:
|
||||||
|
location = st.text_input("Location", exp.get("location", ""), key=f"exp_loc_{i}")
|
||||||
|
industry = st.text_input("Industry", exp.get("industry", ""), key=f"exp_ind_{i}")
|
||||||
|
|
||||||
|
responsibilities = st.text_area(
|
||||||
|
"Key Responsibilities (one per line)",
|
||||||
|
value="\n".join(
|
||||||
|
r.get(f"responsibility_{j+1}", "") if isinstance(r, dict) else str(r)
|
||||||
|
for j, r in enumerate(exp.get("key_responsibilities", []))
|
||||||
|
),
|
||||||
|
key=f"exp_resp_{i}", height=100,
|
||||||
|
)
|
||||||
|
skills = st.text_input(
|
||||||
|
"Skills (comma-separated)",
|
||||||
|
value=", ".join(exp.get("skills_acquired", [])),
|
||||||
|
key=f"exp_skills_{i}",
|
||||||
|
)
|
||||||
|
resp_list = [{"responsibility_1": r.strip()} for r in responsibilities.splitlines() if r.strip()]
|
||||||
|
skill_list = [s.strip() for s in skills.split(",") if s.strip()]
|
||||||
|
updated_exp.append({
|
||||||
|
"position": pos, "company": company, "employment_period": period,
|
||||||
|
"location": location, "industry": industry,
|
||||||
|
"key_responsibilities": resp_list, "skills_acquired": skill_list,
|
||||||
|
})
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Preferences ───────────────────────────────────────────────────────────────
|
||||||
|
with st.expander("⚙️ Preferences & Availability"):
|
||||||
|
wp = data.get("work_preferences", {})
|
||||||
|
sal = data.get("salary_expectations", {})
|
||||||
|
avail = data.get("availability", {})
|
||||||
|
col1, col2 = st.columns(2)
|
||||||
|
with col1:
|
||||||
|
salary_range = st.text_input("Salary Range (USD)", sal.get("salary_range_usd", ""),
|
||||||
|
key="pref_salary", help="e.g. 120000 - 180000")
|
||||||
|
notice = st.text_input("Notice Period", avail.get("notice_period", "2 weeks"), key="pref_notice")
|
||||||
|
with col2:
|
||||||
|
remote_work = st.checkbox("Open to Remote", value=wp.get("remote_work", "Yes") == "Yes", key="pref_remote")
|
||||||
|
relocation = st.checkbox("Open to Relocation", value=wp.get("open_to_relocation", "No") == "Yes", key="pref_reloc")
|
||||||
|
assessments = st.checkbox("Willing to complete assessments",
|
||||||
|
value=wp.get("willing_to_complete_assessments", "Yes") == "Yes", key="pref_assess")
|
||||||
|
bg_checks = st.checkbox("Willing to undergo background checks",
|
||||||
|
value=wp.get("willing_to_undergo_background_checks", "Yes") == "Yes", key="pref_bg")
|
||||||
|
drug_tests = st.checkbox("Willing to undergo drug tests",
|
||||||
|
value=wp.get("willing_to_undergo_drug_tests", "No") == "Yes", key="pref_drug")
|
||||||
|
|
||||||
|
# ── Self-ID ───────────────────────────────────────────────────────────────────
|
||||||
|
with st.expander("🏳️🌈 Self-Identification (optional)"):
|
||||||
|
sid = data.get("self_identification", {})
|
||||||
|
col1, col2 = st.columns(2)
|
||||||
|
with col1:
|
||||||
|
gender = st.text_input("Gender identity", sid.get("gender", "Non-binary"), key="sid_gender",
|
||||||
|
help="Select 'Non-binary' or 'Prefer not to say' when options allow")
|
||||||
|
pronouns = st.text_input("Pronouns", sid.get("pronouns", "Any"), key="sid_pronouns")
|
||||||
|
ethnicity = field("Ethnicity", sid.get("ethnicity", ""), "sid_ethnicity",
|
||||||
|
help="'Prefer not to say' is always an option")
|
||||||
|
with col2:
|
||||||
|
vet_options = ["No", "Yes", "Prefer not to say"]
|
||||||
|
veteran = st.selectbox("Veteran status", vet_options,
|
||||||
|
index=vet_options.index(sid.get("veteran", "No")), key="sid_vet")
|
||||||
|
dis_options = ["Prefer not to say", "No", "Yes"]
|
||||||
|
disability = st.selectbox("Disability disclosure", dis_options,
|
||||||
|
index=dis_options.index(sid.get("disability", "Prefer not to say")),
|
||||||
|
key="sid_dis")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Save ──────────────────────────────────────────────────────────────────────
|
||||||
|
if st.button("💾 Save Resume Profile", type="primary", use_container_width=True):
|
||||||
|
data["personal_information"] = {
|
||||||
|
**data.get("personal_information", {}),
|
||||||
|
"name": name, "surname": surname, "email": email, "phone": phone,
|
||||||
|
"city": city, "zip_code": zip_code, "linkedin": linkedin, "date_of_birth": dob,
|
||||||
|
}
|
||||||
|
data["education_details"] = updated_edu
|
||||||
|
data["experience_details"] = updated_exp
|
||||||
|
data["salary_expectations"] = {"salary_range_usd": salary_range}
|
||||||
|
data["availability"] = {"notice_period": notice}
|
||||||
|
data["work_preferences"] = {
|
||||||
|
**data.get("work_preferences", {}),
|
||||||
|
"remote_work": "Yes" if remote_work else "No",
|
||||||
|
"open_to_relocation": "Yes" if relocation else "No",
|
||||||
|
"willing_to_complete_assessments": "Yes" if assessments else "No",
|
||||||
|
"willing_to_undergo_background_checks": "Yes" if bg_checks else "No",
|
||||||
|
"willing_to_undergo_drug_tests": "Yes" if drug_tests else "No",
|
||||||
|
}
|
||||||
|
data["self_identification"] = {
|
||||||
|
"gender": gender, "pronouns": pronouns, "veteran": veteran,
|
||||||
|
"disability": disability, "ethnicity": ethnicity,
|
||||||
|
}
|
||||||
|
RESUME_PATH.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True))
|
||||||
|
st.success("✅ Profile saved!")
|
||||||
|
st.balloons()
|
||||||
388
app/pages/4_Apply.py
Normal file
388
app/pages/4_Apply.py
Normal file
|
|
@ -0,0 +1,388 @@
|
||||||
|
# app/pages/4_Apply.py
|
||||||
|
"""
|
||||||
|
Apply Workspace — side-by-side cover letter tools and job description.
|
||||||
|
Generates a PDF cover letter saved to the JobSearch docs folder.
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
import streamlit.components.v1 as components
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
from scripts.db import (
|
||||||
|
DEFAULT_DB, init_db, get_jobs_by_status,
|
||||||
|
update_cover_letter, mark_applied, update_job_status,
|
||||||
|
get_task_for_job,
|
||||||
|
)
|
||||||
|
from scripts.task_runner import submit_task
|
||||||
|
|
||||||
|
DOCS_DIR = Path("/Library/Documents/JobSearch")
|
||||||
|
RESUME_YAML = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
|
||||||
|
|
||||||
|
st.title("🚀 Apply Workspace")
|
||||||
|
|
||||||
|
init_db(DEFAULT_DB)
|
||||||
|
|
||||||
|
# ── PDF generation ─────────────────────────────────────────────────────────────
|
||||||
|
def _make_cover_letter_pdf(job: dict, cover_letter: str, output_dir: Path) -> Path:
    """Render *cover_letter* as a branded one-page PDF and return its path.

    The file is written to *output_dir* (created if missing) as
    ``CoverLetter_<Company>_<YYYY-MM-DD>.pdf``, with the company name
    stripped to alphanumerics so it is filesystem-safe.
    """
    # Imported lazily so the page loads even when reportlab is absent.
    from reportlab.lib.pagesizes import letter
    from reportlab.lib.units import inch
    from reportlab.lib.colors import HexColor
    from reportlab.lib.styles import ParagraphStyle
    from reportlab.lib.enums import TA_LEFT
    from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, HRFlowable

    output_dir.mkdir(parents=True, exist_ok=True)

    safe_company = re.sub(r"[^a-zA-Z0-9]", "", job.get("company", "Company"))
    stamp = datetime.now().strftime("%Y-%m-%d")
    pdf_path = output_dir / f"CoverLetter_{safe_company}_{stamp}.pdf"

    document = SimpleDocTemplate(
        str(pdf_path),
        pagesize=letter,
        leftMargin=inch, rightMargin=inch,
        topMargin=inch, bottomMargin=inch,
    )

    # Brand palette (matches the dashboard theme).
    teal = HexColor("#2DD4BF")
    dark = HexColor("#0F172A")
    slate = HexColor("#64748B")

    name_style = ParagraphStyle(
        "Name", fontName="Helvetica-Bold", fontSize=22,
        textColor=teal, spaceAfter=6,
    )
    contact_style = ParagraphStyle(
        "Contact", fontName="Helvetica", fontSize=9,
        textColor=slate, spaceAfter=4,
    )
    date_style = ParagraphStyle(
        "Date", fontName="Helvetica", fontSize=11,
        textColor=dark, spaceBefore=16, spaceAfter=14,
    )
    body_style = ParagraphStyle(
        "Body", fontName="Helvetica", fontSize=11,
        textColor=dark, leading=16, spaceAfter=12, alignment=TA_LEFT,
    )

    # Letterhead: name, contact line, teal rule, then today's date.
    flowables = [
        Paragraph("ALEX RIVERA", name_style),
        Paragraph(
            "alex@example.com · (555) 867-5309 · "
            "linkedin.com/in/AlexMcCann · hirealexmccann.site",
            contact_style,
        ),
        HRFlowable(width="100%", thickness=1, color=teal, spaceBefore=8, spaceAfter=0),
        Paragraph(datetime.now().strftime("%B %d, %Y"), date_style),
    ]

    # Blank-line-separated paragraphs; single newlines become <br/> so
    # intra-paragraph line breaks survive the flowable layout.
    flowables.extend(
        Paragraph(chunk.strip().replace("\n", "<br/>"), body_style)
        for chunk in cover_letter.strip().split("\n\n")
        if chunk.strip()
    )

    flowables.append(Spacer(1, 6))
    flowables.append(Paragraph("Warm regards,<br/><br/>Alex Rivera", body_style))

    document.build(flowables)
    return pdf_path
|
||||||
|
|
||||||
|
# ── Application Q&A helper ─────────────────────────────────────────────────────
|
||||||
|
def _answer_question(job: dict, question: str) -> str:
    """Call the LLM to answer an application question in Alex's voice.

    Uses research_fallback_order (claude_code → vllm → ollama_research)
    rather than the default cover-letter order — the fine-tuned cover letter
    model is not suited for answering general application questions.

    Args:
        job: Job row dict; reads "title", "company", and "description".
        question: The application question text, pasted verbatim by the user.

    Returns:
        The model's answer, stripped of surrounding whitespace.
    """
    # Local import keeps the LLM stack out of page-load time.
    from scripts.llm_router import LLMRouter
    router = LLMRouter()
    # Falls back to the default order if no research-specific order is configured.
    fallback = router.config.get("research_fallback_order") or router.config.get("fallback_order")
    # Truncate to 1200 chars to bound prompt size; empty/None descriptions
    # collapse to "" and the excerpt section is omitted from the prompt below.
    description_snippet = (job.get("description") or "")[:1200].strip()
    prompt = f"""You are answering job application questions for Alex Rivera, a customer success leader.

Background:
- 6+ years in customer success, technical account management, and CS leadership
- Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), NPS consistently ≥95
- Also founder of M3 Consulting, a CS advisory practice for SaaS startups
- Based in SF Bay Area; open to remote/hybrid; pronouns: any

Role she's applying to: {job.get("title", "")} at {job.get("company", "")}
{f"Job description excerpt:{chr(10)}{description_snippet}" if description_snippet else ""}

Application Question:
{question}

Answer in Alex's voice — specific, warm, and confident. If the question specifies a word or character limit, respect it. Answer only the question with no preamble or sign-off."""
    return router.complete(prompt, fallback_order=fallback).strip()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Copy-to-clipboard button ───────────────────────────────────────────────────
|
||||||
|
def _copy_btn(text: str, label: str = "📋 Copy", done: str = "✅ Copied!", height: int = 44) -> None:
    """Render a copy-to-clipboard button for *text* in its own component iframe.

    The button briefly swaps its caption to *done* after a click, then
    restores *label* two seconds later.
    """
    import json

    # json.dumps emits valid JS string literals for arbitrary text
    # (quotes, newlines, backslashes) — no hand-rolled escaping needed.
    js_text = json.dumps(text)
    js_done = json.dumps(done)
    js_label = json.dumps(label)

    # Each components.html call renders in its own sandboxed iframe, so
    # the fixed element id "b" never collides between buttons.
    markup = f"""<button id="b"
    style="width:100%;background:#2DD4BF;color:#0F172A;border:none;
           padding:6px 10px;border-radius:6px;cursor:pointer;
           font-size:13px;font-weight:600">{label}</button>
<script>
document.getElementById('b').addEventListener('click', function() {{
  navigator.clipboard.writeText({js_text});
  this.textContent = {js_done};
  setTimeout(() => this.textContent = {js_label}, 2000);
}});
</script>"""
    components.html(markup, height=height)
|
||||||
|
|
||||||
|
# ── Job selection ──────────────────────────────────────────────────────────────
|
||||||
|
approved = get_jobs_by_status(DEFAULT_DB, "approved")
|
||||||
|
if not approved:
|
||||||
|
st.info("No approved jobs — head to Job Review to approve some listings first.")
|
||||||
|
st.stop()
|
||||||
|
|
||||||
|
preselect_id = st.session_state.pop("apply_job_id", None)
|
||||||
|
job_options = {j["id"]: f"{j['title']} — {j['company']}" for j in approved}
|
||||||
|
ids = list(job_options.keys())
|
||||||
|
default_idx = ids.index(preselect_id) if preselect_id in ids else 0
|
||||||
|
|
||||||
|
selected_id = st.selectbox(
|
||||||
|
"Job",
|
||||||
|
options=ids,
|
||||||
|
format_func=lambda x: job_options[x],
|
||||||
|
index=default_idx,
|
||||||
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
|
job = next(j for j in approved if j["id"] == selected_id)
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Two-column workspace ───────────────────────────────────────────────────────
|
||||||
|
col_tools, col_jd = st.columns([2, 3])
|
||||||
|
|
||||||
|
# ════════════════════════════════════════════════
|
||||||
|
# RIGHT — job description
|
||||||
|
# ════════════════════════════════════════════════
|
||||||
|
with col_jd:
|
||||||
|
score = job.get("match_score")
|
||||||
|
score_badge = (
|
||||||
|
"⬜ No score" if score is None else
|
||||||
|
f"🟢 {score:.0f}%" if score >= 70 else
|
||||||
|
f"🟡 {score:.0f}%" if score >= 40 else f"🔴 {score:.0f}%"
|
||||||
|
)
|
||||||
|
remote_badge = "🌐 Remote" if job.get("is_remote") else "🏢 On-site"
|
||||||
|
src = (job.get("source") or "").lower()
|
||||||
|
source_badge = f"🤖 {src.title()}" if src == "linkedin" else f"👤 {src.title() or 'Manual'}"
|
||||||
|
|
||||||
|
st.subheader(job["title"])
|
||||||
|
st.caption(
|
||||||
|
f"**{job['company']}** · {job.get('location', '')} · "
|
||||||
|
f"{remote_badge} · {source_badge} · {score_badge}"
|
||||||
|
)
|
||||||
|
if job.get("salary"):
|
||||||
|
st.caption(f"💰 {job['salary']}")
|
||||||
|
if job.get("keyword_gaps"):
|
||||||
|
st.caption(f"**Gaps to address in letter:** {job['keyword_gaps']}")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
st.markdown(job.get("description") or "_No description scraped for this listing._")
|
||||||
|
|
||||||
|
# ════════════════════════════════════════════════
|
||||||
|
# LEFT — copy tools
|
||||||
|
# ════════════════════════════════════════════════
|
||||||
|
with col_tools:
|
||||||
|
|
||||||
|
# ── Cover letter ──────────────────────────────
|
||||||
|
st.subheader("📝 Cover Letter")
|
||||||
|
|
||||||
|
_cl_key = f"cl_{selected_id}"
|
||||||
|
if _cl_key not in st.session_state:
|
||||||
|
st.session_state[_cl_key] = job.get("cover_letter") or ""
|
||||||
|
|
||||||
|
_cl_task = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id)
|
||||||
|
_cl_running = _cl_task and _cl_task["status"] in ("queued", "running")
|
||||||
|
|
||||||
|
if st.button("✨ Generate / Regenerate", use_container_width=True, disabled=bool(_cl_running)):
|
||||||
|
submit_task(DEFAULT_DB, "cover_letter", selected_id)
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
if _cl_running:
|
||||||
|
@st.fragment(run_every=3)
|
||||||
|
def _cl_status_fragment():
|
||||||
|
t = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id)
|
||||||
|
if t and t["status"] in ("queued", "running"):
|
||||||
|
lbl = "Queued…" if t["status"] == "queued" else "Generating via LLM…"
|
||||||
|
st.info(f"⏳ {lbl}")
|
||||||
|
else:
|
||||||
|
st.rerun() # full page rerun — reloads cover letter from DB
|
||||||
|
_cl_status_fragment()
|
||||||
|
elif _cl_task and _cl_task["status"] == "failed":
|
||||||
|
st.error(f"Generation failed: {_cl_task.get('error', 'unknown error')}")
|
||||||
|
|
||||||
|
# Refresh session state only when a NEW task has just completed — not on every rerun.
|
||||||
|
# Without this guard, every Save Draft click would overwrite the edited text with the
|
||||||
|
# old DB value before cl_text could be captured.
|
||||||
|
_cl_loaded_key = f"cl_loaded_{selected_id}"
|
||||||
|
if not _cl_running and _cl_task and _cl_task["status"] == "completed":
|
||||||
|
if st.session_state.get(_cl_loaded_key) != _cl_task["id"]:
|
||||||
|
st.session_state[_cl_key] = job.get("cover_letter") or ""
|
||||||
|
st.session_state[_cl_loaded_key] = _cl_task["id"]
|
||||||
|
|
||||||
|
cl_text = st.text_area(
|
||||||
|
"cover_letter_body",
|
||||||
|
key=_cl_key,
|
||||||
|
height=280,
|
||||||
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Copy + Save row
|
||||||
|
c1, c2 = st.columns(2)
|
||||||
|
with c1:
|
||||||
|
if cl_text:
|
||||||
|
_copy_btn(cl_text, label="📋 Copy Letter")
|
||||||
|
with c2:
|
||||||
|
if st.button("💾 Save draft", use_container_width=True):
|
||||||
|
update_cover_letter(DEFAULT_DB, selected_id, cl_text)
|
||||||
|
st.success("Saved!")
|
||||||
|
|
||||||
|
# PDF generation
|
||||||
|
if cl_text:
|
||||||
|
if st.button("📄 Export PDF → JobSearch folder", use_container_width=True, type="primary"):
|
||||||
|
with st.spinner("Generating PDF…"):
|
||||||
|
try:
|
||||||
|
pdf_path = _make_cover_letter_pdf(job, cl_text, DOCS_DIR)
|
||||||
|
update_cover_letter(DEFAULT_DB, selected_id, cl_text)
|
||||||
|
st.success(f"Saved: `{pdf_path.name}`")
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"PDF error: {e}")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# Open listing + Mark Applied
|
||||||
|
c3, c4 = st.columns(2)
|
||||||
|
with c3:
|
||||||
|
if job.get("url"):
|
||||||
|
st.link_button("Open listing ↗", job["url"], use_container_width=True)
|
||||||
|
with c4:
|
||||||
|
if st.button("✅ Mark as Applied", use_container_width=True, type="primary"):
|
||||||
|
if cl_text:
|
||||||
|
update_cover_letter(DEFAULT_DB, selected_id, cl_text)
|
||||||
|
mark_applied(DEFAULT_DB, [selected_id])
|
||||||
|
st.success("Marked as applied!")
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
if st.button("🚫 Reject listing", use_container_width=True):
|
||||||
|
update_job_status(DEFAULT_DB, [selected_id], "rejected")
|
||||||
|
# Advance selectbox to next job so list doesn't snap to first item
|
||||||
|
current_idx = ids.index(selected_id) if selected_id in ids else 0
|
||||||
|
if current_idx + 1 < len(ids):
|
||||||
|
st.session_state["apply_job_id"] = ids[current_idx + 1]
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Resume highlights ─────────────────────────
|
||||||
|
with st.expander("📄 Resume Highlights"):
|
||||||
|
if RESUME_YAML.exists():
|
||||||
|
resume = yaml.safe_load(RESUME_YAML.read_text()) or {}
|
||||||
|
for exp in resume.get("experience_details", []):
|
||||||
|
position = exp.get("position", "")
|
||||||
|
company = exp.get("company", "")
|
||||||
|
period = exp.get("employment_period", "")
|
||||||
|
|
||||||
|
# Parse start / end dates (handles "MM/YYYY - Present" style)
|
||||||
|
if " - " in period:
|
||||||
|
date_start, date_end = [p.strip() for p in period.split(" - ", 1)]
|
||||||
|
else:
|
||||||
|
date_start, date_end = period, ""
|
||||||
|
|
||||||
|
# Flatten bullets
|
||||||
|
bullets = [
|
||||||
|
v
|
||||||
|
for resp_dict in exp.get("key_responsibilities", [])
|
||||||
|
for v in resp_dict.values()
|
||||||
|
]
|
||||||
|
all_duties = "\n".join(f"• {b}" for b in bullets)
|
||||||
|
|
||||||
|
# ── Header ────────────────────────────────────────────────────
|
||||||
|
st.markdown(
|
||||||
|
f"**{position}** · "
|
||||||
|
f"{company} · "
|
||||||
|
f"*{period}*"
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Copy row: title | start | end | all duties ────────────────
|
||||||
|
cp_t, cp_s, cp_e, cp_d = st.columns(4)
|
||||||
|
with cp_t:
|
||||||
|
st.caption("Title")
|
||||||
|
_copy_btn(position, label="📋 Copy", height=34)
|
||||||
|
with cp_s:
|
||||||
|
st.caption("Start")
|
||||||
|
_copy_btn(date_start, label="📋 Copy", height=34)
|
||||||
|
with cp_e:
|
||||||
|
st.caption("End")
|
||||||
|
_copy_btn(date_end or period, label="📋 Copy", height=34)
|
||||||
|
with cp_d:
|
||||||
|
st.caption("All Duties")
|
||||||
|
if bullets:
|
||||||
|
_copy_btn(all_duties, label="📋 Copy", height=34)
|
||||||
|
|
||||||
|
# ── Individual bullets ────────────────────────────────────────
|
||||||
|
for bullet in bullets:
|
||||||
|
b_col, cp_col = st.columns([6, 1])
|
||||||
|
b_col.caption(f"• {bullet}")
|
||||||
|
with cp_col:
|
||||||
|
_copy_btn(bullet, label="📋", done="✅", height=32)
|
||||||
|
|
||||||
|
st.markdown("---")
|
||||||
|
else:
|
||||||
|
st.warning("Resume YAML not found — check that AIHawk is cloned.")
|
||||||
|
|
||||||
|
# ── Application Q&A ───────────────────────────────────────────────────────
|
||||||
|
with st.expander("💬 Answer Application Questions"):
|
||||||
|
st.caption("Paste a question from the application and get an answer in your voice.")
|
||||||
|
|
||||||
|
_qa_key = f"qa_list_{selected_id}"
|
||||||
|
if _qa_key not in st.session_state:
|
||||||
|
st.session_state[_qa_key] = []
|
||||||
|
|
||||||
|
q_input = st.text_area(
|
||||||
|
"Paste question",
|
||||||
|
placeholder="In 200 words or less, explain why you're a strong fit for this role.",
|
||||||
|
height=80,
|
||||||
|
key=f"qa_input_{selected_id}",
|
||||||
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
|
if st.button("✨ Generate Answer", key=f"qa_gen_{selected_id}",
|
||||||
|
use_container_width=True,
|
||||||
|
disabled=not (q_input or "").strip()):
|
||||||
|
with st.spinner("Generating answer…"):
|
||||||
|
_answer = _answer_question(job, q_input.strip())
|
||||||
|
st.session_state[_qa_key].append({"q": q_input.strip(), "a": _answer})
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
for _i, _pair in enumerate(reversed(st.session_state[_qa_key])):
|
||||||
|
_real_idx = len(st.session_state[_qa_key]) - 1 - _i
|
||||||
|
st.markdown(f"**Q:** {_pair['q']}")
|
||||||
|
_a_key = f"qa_ans_{selected_id}_{_real_idx}"
|
||||||
|
if _a_key not in st.session_state:
|
||||||
|
st.session_state[_a_key] = _pair["a"]
|
||||||
|
_answer_text = st.text_area(
|
||||||
|
"answer",
|
||||||
|
key=_a_key,
|
||||||
|
height=120,
|
||||||
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
|
_copy_btn(_answer_text, label="📋 Copy Answer")
|
||||||
|
if _i < len(st.session_state[_qa_key]) - 1:
|
||||||
|
st.markdown("---")
|
||||||
539
app/pages/5_Interviews.py
Normal file
539
app/pages/5_Interviews.py
Normal file
|
|
@ -0,0 +1,539 @@
|
||||||
|
# app/pages/5_Interviews.py
|
||||||
|
"""
|
||||||
|
Interviews — Kanban board for tracking post-application engagement.
|
||||||
|
|
||||||
|
Pipeline: applied → phone_screen → interviewing → offer → hired
|
||||||
|
(or rejected at any stage, with stage captured for analytics)
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Kanban columns for each interview stage
|
||||||
|
- Company research brief auto-generated when advancing to Phone Screen
|
||||||
|
- Contact / email log per job
|
||||||
|
- Email reply drafter via LLM
|
||||||
|
- Interview date tracking with calendar push hint
|
||||||
|
- Rejection analytics
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
from datetime import date, datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from scripts.db import (
|
||||||
|
DEFAULT_DB, init_db,
|
||||||
|
get_interview_jobs, advance_to_stage, reject_at_stage,
|
||||||
|
set_interview_date, add_contact, get_contacts,
|
||||||
|
get_research, get_task_for_job, get_job_by_id,
|
||||||
|
get_unread_stage_signals, dismiss_stage_signal,
|
||||||
|
)
|
||||||
|
from scripts.task_runner import submit_task
|
||||||
|
|
||||||
|
st.title("🎯 Interviews")

init_db(DEFAULT_DB)  # ensure the schema exists before any queries below

# ── Sidebar: Email sync ────────────────────────────────────────────────────────
with st.sidebar:
    st.markdown("### 📧 Email Sync")
    # job_id 0 looks like a sentinel for "task not tied to a specific job";
    # the same ("email_sync", 0) pair is used for submit_task below —
    # TODO confirm against scripts.db.get_task_for_job.
    _email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0)
    _email_running = _email_task and _email_task["status"] in ("queued", "running")

    # Kick off a background sync; the button is disabled while one is in flight.
    if st.button("🔄 Sync Emails", use_container_width=True, type="primary",
                 disabled=bool(_email_running)):
        submit_task(DEFAULT_DB, "email_sync", 0)
        st.rerun()

    if _email_running:
        # Poll the task every 4 s while it runs.  Once the task leaves the
        # queued/running states, st.rerun() escalates to a full-page rerun
        # so the completed/failed branches below get a chance to render.
        @st.fragment(run_every=4)
        def _email_sidebar_status():
            t = get_task_for_job(DEFAULT_DB, "email_sync", 0)
            if t and t["status"] in ("queued", "running"):
                st.info("⏳ Syncing…")
            else:
                st.rerun()
        _email_sidebar_status()
    elif _email_task and _email_task["status"] == "completed":
        # NOTE(review): the "error" field appears to double as a success
        # summary message here — confirm against the task runner's schema.
        st.success(_email_task.get("error", "Done"))
    elif _email_task and _email_task["status"] == "failed":
        msg = _email_task.get("error", "")
        # Special-case the "credentials missing" failure with a pointer to
        # the Settings page; everything else is surfaced verbatim.
        if "not configured" in msg.lower():
            st.error("Email not configured. Go to **Settings → Email**.")
        else:
            st.error(f"Sync failed: {msg}")
|
||||||
|
|
||||||
|
# ── Constants ─────────────────────────────────────────────────────────────────
# Column headers for the three kanban columns rendered further below.
STAGE_LABELS = {
    "phone_screen": "📞 Phone Screen",
    "interviewing": "🎯 Interviewing",
    "offer": "📜 Offer / Hired",
}
# Pipeline transitions: maps a job's current status to the stage the
# "advance" button moves it to.  Both pre-pipeline statuses ("survey" and
# "applied") feed into "phone_screen".
STAGE_NEXT = {
    "survey": "phone_screen",
    "applied": "phone_screen",
    "phone_screen": "interviewing",
    "interviewing": "offer",
    "offer": "hired",
}
# Display label for the advance button, keyed by the *current* stage
# (i.e. the label describes where the job goes next).
STAGE_NEXT_LABEL = {
    "survey": "📞 Phone Screen",
    "applied": "📞 Phone Screen",
    "phone_screen": "🎯 Interviewing",
    "interviewing": "📜 Offer",
    "offer": "🎉 Hired",
}

# ── Data ──────────────────────────────────────────────────────────────────────
# status -> list of job dicts; re-fetched on every script rerun.
jobs_by_stage = get_interview_jobs(DEFAULT_DB)
|
||||||
|
|
||||||
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
def _days_ago(date_str: str | None) -> str:
|
||||||
|
if not date_str:
|
||||||
|
return "—"
|
||||||
|
try:
|
||||||
|
d = date.fromisoformat(date_str[:10])
|
||||||
|
delta = (date.today() - d).days
|
||||||
|
if delta == 0:
|
||||||
|
return "today"
|
||||||
|
if delta == 1:
|
||||||
|
return "yesterday"
|
||||||
|
return f"{delta}d ago"
|
||||||
|
except Exception:
|
||||||
|
return date_str[:10]
|
||||||
|
|
||||||
|
@st.dialog("🔬 Company Research", width="large")
def _research_modal(job: dict) -> None:
    """Full-width dialog showing the company-research brief for *job*.

    Three mutually exclusive states:
      1. a research task is queued/running  → progress label,
      2. a stored brief exists             → render it (with a data-quality
         warning when it was generated without live web results),
      3. nothing yet                        → offer to generate one.
    """
    job_id = job["id"]
    st.caption(f"**{job.get('company')}** — {job.get('title')}")
    research = get_research(DEFAULT_DB, job_id=job_id)
    task = get_task_for_job(DEFAULT_DB, "company_research", job_id)
    running = task and task["status"] in ("queued", "running")

    if running:
        # Show the task's self-reported stage if it published one.
        task_stage = (task.get("stage") or "")
        lbl = "Queued…" if task["status"] == "queued" else (task_stage or "Generating…")
        st.info(f"⏳ {lbl}")
    elif research:
        scrape_used = research.get("scrape_used")
        if not scrape_used:
            # The brief was produced from LLM knowledge alone.  Probe
            # whether SearXNG (assumed to listen on localhost:8888 —
            # TODO confirm against the services config) is reachable now,
            # so we can offer an immediate re-run with live data.
            import socket as _sock
            _searxng_up = False
            try:
                with _sock.create_connection(("127.0.0.1", 8888), timeout=1):
                    _searxng_up = True
            except OSError:
                pass
            if _searxng_up:
                st.warning(
                    "⚠️ This brief was generated without live web data and may contain "
                    "inaccuracies. SearXNG is now available — re-run to get verified facts."
                )
                if st.button("🔄 Re-run with live data", key=f"modal_rescrape_{job_id}", type="primary"):
                    submit_task(DEFAULT_DB, "company_research", job_id)
                    st.rerun()
                st.divider()
            else:
                st.warning(
                    "⚠️ Generated without live web data (SearXNG was offline). "
                    "Key facts like CEO, investors, and founding date may be hallucinated — "
                    "verify before the call. Start SearXNG in Settings → Services to re-run."
                )
                st.divider()
        # Provenance line + the brief itself (markdown from the LLM task).
        st.caption(
            f"Generated {research.get('generated_at', '')} "
            f"{'· web data used ✓' if scrape_used else '· LLM knowledge only'}"
        )
        st.markdown(research["raw_output"])
        # `running` is False on this branch, so the button is enabled;
        # the disabled= guard is kept as written for safety.
        if st.button("🔄 Refresh", key=f"modal_regen_{job_id}", disabled=bool(running)):
            submit_task(DEFAULT_DB, "company_research", job_id)
            st.rerun()
    else:
        st.info("No research brief yet.")
        if task and task["status"] == "failed":
            st.error(f"Last attempt failed: {task.get('error', '')}")
        if st.button("🔬 Generate now", key=f"modal_gen_{job_id}"):
            submit_task(DEFAULT_DB, "company_research", job_id)
            st.rerun()
|
||||||
|
|
||||||
|
|
||||||
|
@st.dialog("📧 Email History", width="large")
def _email_modal(job: dict) -> None:
    """Full-width dialog with the per-job email / contact history.

    Lists logged contacts, offers an LLM-drafted reply to the most recent
    inbound email, and ends with a manual "log a contact" form.
    """
    job_id = job["id"]
    st.caption(f"**{job.get('company')}** — {job.get('title')}")
    contacts = get_contacts(DEFAULT_DB, job_id=job_id)

    if not contacts:
        st.info("No emails logged yet. Use the form below to add one.")
    else:
        # Contact log, in storage order (newest assumed last — the
        # reply-drafting code below relies on inbound[-1] being the most
        # recent inbound message; confirm ordering in get_contacts).
        for c in contacts:
            icon = "📥" if c["direction"] == "inbound" else "📤"
            st.markdown(
                f"{icon} **{c.get('subject') or '(no subject)'}** "
                f"· _{c.get('received_at', '')[:10]}_"
            )
            if c.get("from_addr"):
                st.caption(f"From: {c['from_addr']}")
            if c.get("body"):
                # Truncate long bodies to keep the dialog scannable.
                st.text(c["body"][:500] + ("…" if len(c["body"]) > 500 else ""))
            st.divider()

    # Offer an LLM-drafted reply to the latest inbound email, if any.
    inbound = [c for c in contacts if c["direction"] == "inbound"]
    if inbound:
        last = inbound[-1]
        if st.button("✍️ Draft reply", key=f"modal_draft_{job_id}"):
            with st.spinner("Drafting…"):
                try:
                    # Imported lazily so the page loads even if the LLM
                    # router (and its deps) are unavailable.
                    from scripts.llm_router import complete
                    draft = complete(
                        prompt=(
                            f"Draft a professional, warm reply to this email.\n\n"
                            f"From: {last.get('from_addr', '')}\n"
                            f"Subject: {last.get('subject', '')}\n\n"
                            f"{last.get('body', '')}\n\n"
                            f"Context: Alex Rivera is a Customer Success / "
                            f"Technical Account Manager applying for "
                            f"{job.get('title')} at {job.get('company')}."
                        ),
                        system=(
                            "You are Alex Rivera's professional email assistant. "
                            "Write concise, warm, and professional replies in her voice. "
                            "Keep it to 3–5 sentences unless more is needed."
                        ),
                    )
                    # Stash the draft in session state and rerun so the
                    # editable text area below picks it up.
                    st.session_state[f"modal_draft_text_{job_id}"] = draft
                    st.rerun()
                except Exception as e:
                    # Broad catch is deliberate: any LLM/transport failure
                    # becomes a visible error instead of a crashed dialog.
                    st.error(f"Draft failed: {e}")

        if f"modal_draft_text_{job_id}" in st.session_state:
            st.text_area(
                "Draft (edit before sending)",
                value=st.session_state[f"modal_draft_text_{job_id}"],
                height=160,
                key=f"modal_draft_area_{job_id}",
            )

    # Manual contact logger — clear_on_submit resets the fields after save.
    st.divider()
    st.markdown("**Log a contact**")
    with st.form(key=f"contact_form_modal_{job_id}", clear_on_submit=True):
        col_a, col_b = st.columns(2)
        direction = col_a.radio(
            "Direction", ["inbound", "outbound"],
            horizontal=True, key=f"dir_modal_{job_id}",
        )
        recv_at = col_b.text_input(
            "Date (YYYY-MM-DD)", value=str(date.today()), key=f"recv_modal_{job_id}"
        )
        subject = st.text_input("Subject", key=f"subj_modal_{job_id}")
        from_addr = st.text_input("From", key=f"from_modal_{job_id}")
        body_text = st.text_area("Body / notes", height=80, key=f"body_modal_{job_id}")
        if st.form_submit_button("📧 Save contact"):
            add_contact(
                DEFAULT_DB, job_id=job_id,
                direction=direction, subject=subject,
                from_addr=from_addr, body=body_text, received_at=recv_at,
            )
            st.rerun()
|
||||||
|
|
||||||
|
def _render_card(job: dict, stage: str, compact: bool = False) -> None:
    """Render a single job card appropriate for the given stage.

    Args:
        job:     Job row as a dict (id, company, title, url, dates, …).
        stage:   The kanban stage the card is being rendered in; controls
                 which widgets (date picker, advance target, modals) appear.
        compact: When True, only the header/date summary is shown — the
                 signal hints and action buttons are suppressed.  (No
                 caller in this file passes True; assumed for future use.)

    Side effects: DB writes (advance/reject/date/dismiss) and Streamlit
    reruns are triggered from the buttons inside the card.
    """
    job_id = job["id"]
    contacts = get_contacts(DEFAULT_DB, job_id=job_id)
    last_contact = contacts[-1] if contacts else None

    with st.container(border=True):
        st.markdown(f"**{job.get('company', '?')}**")
        st.caption(job.get("title", ""))

        col_a, col_b = st.columns(2)
        col_a.caption(f"Applied: {_days_ago(job.get('applied_at'))}")
        if last_contact:
            col_b.caption(f"Last contact: {_days_ago(last_contact.get('received_at'))}")

        # Interview date picker (phone_screen / interviewing stages)
        if stage in ("phone_screen", "interviewing"):
            current_idate = job.get("interview_date") or ""
            with st.form(key=f"idate_form_{job_id}"):
                new_date = st.date_input(
                    "Interview date",
                    value=date.fromisoformat(current_idate) if current_idate else None,
                    key=f"idate_{job_id}",
                    format="YYYY-MM-DD",
                )
                if st.form_submit_button("📅 Save date"):
                    set_interview_date(DEFAULT_DB, job_id=job_id, date_str=str(new_date))
                    st.success("Saved!")
                    st.rerun()

        if not compact:
            # ── Email-detected stage-change suggestions ──────────────────
            if stage in ("applied", "phone_screen", "interviewing"):
                signals = get_unread_stage_signals(DEFAULT_DB, job_id=job_id)
                if signals:
                    # Only the newest unread signal is surfaced.
                    sig = signals[-1]
                    # signal name -> (target stage, button label); signals
                    # outside this map (other than "rejected") get no
                    # advance button, only Dismiss.
                    _SIGNAL_TO_STAGE = {
                        "interview_scheduled": ("phone_screen", "📞 Phone Screen"),
                        "positive_response": ("phone_screen", "📞 Phone Screen"),
                        "offer_received": ("offer", "📜 Offer"),
                        "survey_received": ("survey", "📋 Survey"),
                    }
                    target_stage, target_label = _SIGNAL_TO_STAGE.get(
                        sig["stage_signal"], (None, None)
                    )
                    with st.container(border=True):
                        st.caption(
                            f"💡 Email suggests: **{sig['stage_signal'].replace('_', ' ')}** \n"
                            f"_{sig.get('subject', '')}_ · {(sig.get('received_at') or '')[:10]}"
                        )
                        b1, b2 = st.columns(2)
                        if sig["stage_signal"] == "rejected":
                            if b1.button("✗ Reject", key=f"sig_rej_{sig['id']}",
                                         use_container_width=True):
                                reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage)
                                dismiss_stage_signal(DEFAULT_DB, sig["id"])
                                st.rerun(scope="app")
                        elif target_stage and b1.button(
                            f"→ {target_label}", key=f"sig_adv_{sig['id']}",
                            use_container_width=True, type="primary",
                        ):
                            # Advancing from "applied" to phone screen also
                            # queues the company-research brief.
                            if target_stage == "phone_screen" and stage == "applied":
                                advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen")
                                submit_task(DEFAULT_DB, "company_research", job_id)
                            elif target_stage:  # always true on this branch; kept as written
                                advance_to_stage(DEFAULT_DB, job_id=job_id, stage=target_stage)
                            dismiss_stage_signal(DEFAULT_DB, sig["id"])
                            st.rerun(scope="app")
                        if b2.button("Dismiss", key=f"sig_dis_{sig['id']}",
                                     use_container_width=True):
                            dismiss_stage_signal(DEFAULT_DB, sig["id"])
                            st.rerun()

            # Advance / Reject buttons
            next_stage = STAGE_NEXT.get(stage)
            c1, c2 = st.columns(2)
            if next_stage:
                next_label = STAGE_NEXT_LABEL.get(stage, next_stage)
                if c1.button(
                    f"→ {next_label}", key=f"adv_{job_id}",
                    use_container_width=True, type="primary",
                ):
                    advance_to_stage(DEFAULT_DB, job_id=job_id, stage=next_stage)
                    if next_stage == "phone_screen":
                        # Entering the pipeline proper — auto-queue research.
                        submit_task(DEFAULT_DB, "company_research", job_id)
                    st.rerun(scope="app")  # full rerun — card must appear in new column

            if c2.button(
                "✗ Reject", key=f"rej_{job_id}",
                use_container_width=True,
            ):
                reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage)
                st.rerun()  # fragment-scope rerun — card disappears without scroll-to-top

            if job.get("url"):
                st.link_button("Open listing ↗", job["url"], use_container_width=True)

            # Jump to the dedicated prep page with this job pre-selected.
            if stage in ("phone_screen", "interviewing", "offer"):
                if st.button(
                    "📋 Open Prep Sheet", key=f"prep_{job_id}",
                    use_container_width=True,
                    help="Open the Interview Prep page for this job",
                ):
                    st.session_state["prep_job_id"] = job_id
                    st.switch_page("pages/6_Interview_Prep.py")

            # Detail modals — full-width overlays replace narrow inline expanders
            if stage in ("phone_screen", "interviewing", "offer"):
                mc1, mc2 = st.columns(2)
                if mc1.button("🔬 Research", key=f"res_btn_{job_id}", use_container_width=True):
                    _research_modal(job)
                if mc2.button("📧 Emails", key=f"email_btn_{job_id}", use_container_width=True):
                    _email_modal(job)
            else:
                if st.button("📧 Emails", key=f"email_btn_{job_id}", use_container_width=True):
                    _email_modal(job)
|
||||||
|
|
||||||
|
# ── Fragment wrappers — keep scroll position on card actions ─────────────────
|
||||||
|
@st.fragment
def _card_fragment(job_id: int, stage: str) -> None:
    """Fragment wrapper around :func:`_render_card`.

    Re-fetches the job from the DB on every fragment rerun so button
    actions inside the card always see fresh state; renders nothing when
    the job has moved to another stage (or vanished), making the card
    disappear in place without a full-page rerun.
    """
    current = get_job_by_id(DEFAULT_DB, job_id)
    if current is not None and current.get("status") == stage:
        _render_card(current, stage)
|
||||||
|
|
||||||
|
|
||||||
|
@st.fragment
def _pre_kanban_row_fragment(job_id: int) -> None:
    """Pre-kanban compact row for applied and survey-stage jobs.

    Re-fetches the job on each fragment rerun; renders nothing once the
    job leaves the applied/survey statuses.  Offers: email-history modal,
    email-detected stage suggestions, and advance/survey/reject actions.
    """
    job = get_job_by_id(DEFAULT_DB, job_id)
    if job is None or job.get("status") not in ("applied", "survey"):
        return
    stage = job["status"]
    contacts = get_contacts(DEFAULT_DB, job_id=job_id)
    last_contact = contacts[-1] if contacts else None

    with st.container(border=True):
        # Three-column row: identity | contact info | actions.
        left, mid, right = st.columns([3, 2, 2])
        badge = " 📋 **Survey**" if stage == "survey" else ""
        left.markdown(f"**{job.get('company')}** — {job.get('title', '')}{badge}")
        left.caption(f"Applied: {_days_ago(job.get('applied_at'))}")

        with mid:
            if last_contact:
                st.caption(f"Last contact: {_days_ago(last_contact.get('received_at'))}")
            if st.button("📧 Emails", key=f"email_pre_{job_id}", use_container_width=True):
                _email_modal(job)

        # Stage signal hint (email-detected next steps)
        signals = get_unread_stage_signals(DEFAULT_DB, job_id=job_id)
        if signals:
            sig = signals[-1]  # only the newest unread signal is surfaced
            # signal name -> (target stage, button label); unmatched
            # signals get only a Dismiss button.
            _SIGNAL_TO_STAGE = {
                "interview_scheduled": ("phone_screen", "📞 Phone Screen"),
                "positive_response": ("phone_screen", "📞 Phone Screen"),
                "offer_received": ("offer", "📜 Offer"),
                "survey_received": ("survey", "📋 Survey"),
            }
            target_stage, target_label = _SIGNAL_TO_STAGE.get(
                sig["stage_signal"], (None, None)
            )
            with st.container(border=True):
                st.caption(
                    f"💡 **{sig['stage_signal'].replace('_', ' ')}** \n"
                    f"_{sig.get('subject', '')}_ · {(sig.get('received_at') or '')[:10]}"
                )
                s1, s2 = st.columns(2)
                if target_stage and s1.button(
                    f"→ {target_label}", key=f"sig_adv_pre_{sig['id']}",
                    use_container_width=True, type="primary",
                ):
                    # Entering phone_screen also queues the research brief.
                    if target_stage == "phone_screen":
                        advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen")
                        submit_task(DEFAULT_DB, "company_research", job_id)
                    else:
                        advance_to_stage(DEFAULT_DB, job_id=job_id, stage=target_stage)
                    dismiss_stage_signal(DEFAULT_DB, sig["id"])
                    st.rerun(scope="app")
                if s2.button("Dismiss", key=f"sig_dis_pre_{sig['id']}",
                             use_container_width=True):
                    dismiss_stage_signal(DEFAULT_DB, sig["id"])
                    st.rerun()

        with right:
            # Primary action: move the job into the kanban pipeline.
            if st.button(
                "→ 📞 Phone Screen", key=f"adv_pre_{job_id}",
                use_container_width=True, type="primary",
            ):
                advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen")
                submit_task(DEFAULT_DB, "company_research", job_id)
                st.rerun(scope="app")  # full rerun — row must move columns
            col_a, col_b = st.columns(2)
            # Survey button only makes sense while still in "applied".
            if stage == "applied" and col_a.button(
                "📋 Survey", key=f"to_survey_{job_id}", use_container_width=True,
            ):
                advance_to_stage(DEFAULT_DB, job_id=job_id, stage="survey")
                st.rerun(scope="app")
            if col_b.button("✗ Reject", key=f"rej_pre_{job_id}", use_container_width=True):
                reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage)
                st.rerun()  # fragment-scope: row disappears in place
|
||||||
|
|
||||||
|
|
||||||
|
@st.fragment
def _hired_card_fragment(job_id: int) -> None:
    """Minimal read-only card for a job already in the ``hired`` status.

    Shown inside the Offer/Hired column; re-fetches the row on each
    fragment rerun and renders nothing if the status has changed.
    """
    record = get_job_by_id(DEFAULT_DB, job_id)
    if record is not None and record.get("status") == "hired":
        with st.container(border=True):
            st.markdown(f"✅ **{record.get('company', '?')}**")
            st.caption(record.get("title", ""))
            st.caption(f"Hired {_days_ago(record.get('hired_at'))}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── Stats bar ─────────────────────────────────────────────────────────────────
# One metric per pipeline bucket; Offer and Hired are merged into one tile.
c1, c2, c3, c4, c5, c6 = st.columns(6)
c1.metric("Applied", len(jobs_by_stage.get("applied", [])))
c2.metric("Survey", len(jobs_by_stage.get("survey", [])))
c3.metric("Phone Screen", len(jobs_by_stage.get("phone_screen", [])))
c4.metric("Interviewing", len(jobs_by_stage.get("interviewing", [])))
c5.metric("Offer/Hired", len(jobs_by_stage.get("offer", [])) + len(jobs_by_stage.get("hired", [])))
c6.metric("Rejected", len(jobs_by_stage.get("rejected", [])))

st.divider()

# ── Pre-kanban: Applied + Survey ───────────────────────────────────────────────
# Jobs not yet in the interview pipeline, rendered as compact rows above
# the kanban board.
applied_jobs = jobs_by_stage.get("applied", [])
survey_jobs = jobs_by_stage.get("survey", [])
pre_kanban = survey_jobs + applied_jobs  # survey shown first

if pre_kanban:
    st.subheader(f"📋 Pre-pipeline ({len(pre_kanban)})")
    st.caption(
        "Move a job to **Phone Screen** once you receive an outreach. "
        "A company research brief will be auto-generated to help you prepare."
    )
    for job in pre_kanban:
        _pre_kanban_row_fragment(job["id"])
    st.divider()
|
||||||
|
|
||||||
|
# ── Kanban columns ─────────────────────────────────────────────────────────────
# One column per active interview stage; hired jobs share the last column.
kanban_stages = ["phone_screen", "interviewing", "offer"]
cols = st.columns(len(kanban_stages))

for idx, stage in enumerate(kanban_stages):
    with cols[idx]:
        stage_jobs = jobs_by_stage.get(stage, [])
        # The "offer" column also shows already-hired jobs, as compact cards.
        hired_jobs = jobs_by_stage.get("hired", []) if stage == "offer" else []
        all_col_jobs = stage_jobs + hired_jobs

        st.markdown(f"### {STAGE_LABELS[stage]}")
        count = len(all_col_jobs)
        plural = "s" if count != 1 else ""
        st.caption(f"{count} job{plural}")
        st.divider()

        if all_col_jobs:
            for job in stage_jobs:
                _card_fragment(job["id"], stage)
            for job in hired_jobs:
                _hired_card_fragment(job["id"])
        else:
            st.caption("_Empty_")

st.divider()
|
||||||
|
|
||||||
|
# ── Rejected log + analytics ───────────────────────────────────────────────────
# Collapsed expander with per-stage rejection counts, a monthly bar chart,
# and a one-line log entry per rejected job.
rejected_jobs = jobs_by_stage.get("rejected", [])
if rejected_jobs:
    with st.expander(f"❌ Rejected ({len(rejected_jobs)})", expanded=False):
        # Stage breakdown
        stage_counts = Counter(
            j.get("rejection_stage") or "unknown" for j in rejected_jobs
        )
        st.caption(
            "Rejection by stage: "
            + " · ".join(f"**{k}**: {v}" for k, v in stage_counts.most_common())
        )

        # Rejection rate timeline (simple) — bucketed by the YYYY-MM prefix
        # of each job's applied_at date; jobs without one are skipped.
        if len(rejected_jobs) > 1:
            by_month: dict[str, int] = {}
            for j in rejected_jobs:
                mo = (j.get("applied_at") or "")[:7]
                if mo:
                    by_month[mo] = by_month.get(mo, 0) + 1
            if by_month:
                # Lazy import: pandas is only needed when the chart renders.
                import pandas as pd
                chart_data = pd.DataFrame(
                    list(by_month.items()), columns=["Month", "Rejections"]
                ).sort_values("Month")
                st.bar_chart(chart_data.set_index("Month"))

        st.divider()
        # One summary line per rejected job.
        for job in rejected_jobs:
            r_stage = job.get("rejection_stage") or "unknown"
            company = job.get("company") or "?"
            title = job.get("title") or ""
            applied = _days_ago(job.get("applied_at"))
            st.markdown(
                f"**{company}** — {title} "
                f"· rejected at _**{r_stage}**_ · applied {applied}"
            )
|
||||||
371
app/pages/6_Interview_Prep.py
Normal file
371
app/pages/6_Interview_Prep.py
Normal file
|
|
@ -0,0 +1,371 @@
|
||||||
|
# app/pages/6_Interview_Prep.py
|
||||||
|
"""
|
||||||
|
Interview Prep — a clean, glanceable reference you can keep open during a call.
|
||||||
|
|
||||||
|
Left panel : talking points, company brief, CEO info, practice Q&A
|
||||||
|
Right panel : job description, email / contact history, cover letter snippet
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
from datetime import date
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from scripts.db import (
|
||||||
|
DEFAULT_DB, init_db,
|
||||||
|
get_interview_jobs, get_contacts, get_research,
|
||||||
|
get_task_for_job,
|
||||||
|
)
|
||||||
|
from scripts.task_runner import submit_task
|
||||||
|
|
||||||
|
init_db(DEFAULT_DB)
|
||||||
|
|
||||||
|
# ── Job selection ─────────────────────────────────────────────────────────────
|
||||||
|
jobs_by_stage = get_interview_jobs(DEFAULT_DB)
|
||||||
|
active_stages = ["phone_screen", "interviewing", "offer"]
|
||||||
|
active_jobs = [
|
||||||
|
j for stage in active_stages
|
||||||
|
for j in jobs_by_stage.get(stage, [])
|
||||||
|
]
|
||||||
|
|
||||||
|
if not active_jobs:
|
||||||
|
st.title("📋 Interview Prep")
|
||||||
|
st.info(
|
||||||
|
"No active interviews found. "
|
||||||
|
"Move a job to **Phone Screen** on the Interviews page first."
|
||||||
|
)
|
||||||
|
st.stop()
|
||||||
|
|
||||||
|
# Allow pre-selecting via session state (e.g., from Interviews page)
|
||||||
|
preselect_id = st.session_state.pop("prep_job_id", None)
|
||||||
|
job_options = {
|
||||||
|
j["id"]: f"{j['title']} — {j['company']} ({j['status'].replace('_', ' ').title()})"
|
||||||
|
for j in active_jobs
|
||||||
|
}
|
||||||
|
ids = list(job_options.keys())
|
||||||
|
default_idx = ids.index(preselect_id) if preselect_id in ids else 0
|
||||||
|
|
||||||
|
selected_id = st.selectbox(
|
||||||
|
"Job",
|
||||||
|
options=ids,
|
||||||
|
format_func=lambda x: job_options[x],
|
||||||
|
index=default_idx,
|
||||||
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
|
job = next(j for j in active_jobs if j["id"] == selected_id)
|
||||||
|
|
||||||
|
# ── Header bar ────────────────────────────────────────────────────────────────
|
||||||
|
stage_label = job["status"].replace("_", " ").title()
|
||||||
|
idate = job.get("interview_date")
|
||||||
|
countdown = ""
|
||||||
|
if idate:
|
||||||
|
try:
|
||||||
|
delta = (date.fromisoformat(idate) - date.today()).days
|
||||||
|
if delta == 0:
|
||||||
|
countdown = " 🔴 **TODAY**"
|
||||||
|
elif delta == 1:
|
||||||
|
countdown = " 🟡 **TOMORROW**"
|
||||||
|
elif delta > 0:
|
||||||
|
countdown = f" 🟢 in {delta} days"
|
||||||
|
else:
|
||||||
|
countdown = f" (was {abs(delta)}d ago)"
|
||||||
|
except Exception:
|
||||||
|
countdown = ""
|
||||||
|
|
||||||
|
st.title(f"📋 {job.get('company')} — {job.get('title')}")
|
||||||
|
st.caption(
|
||||||
|
f"Stage: **{stage_label}**"
|
||||||
|
+ (f" · Interview: {idate}{countdown}" if idate else "")
|
||||||
|
+ (f" · Applied: {job.get('applied_at', '')[:10]}" if job.get("applied_at") else "")
|
||||||
|
)
|
||||||
|
|
||||||
|
if job.get("url"):
|
||||||
|
st.link_button("Open job listing ↗", job["url"])
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Two-column layout ─────────────────────────────────────────────────────────
|
||||||
|
col_prep, col_context = st.columns([2, 3])
|
||||||
|
|
||||||
|
# ════════════════════════════════════════════════
|
||||||
|
# LEFT — prep materials
|
||||||
|
# ════════════════════════════════════════════════
|
||||||
|
with col_prep:
|
||||||
|
|
||||||
|
research = get_research(DEFAULT_DB, job_id=selected_id)
|
||||||
|
|
||||||
|
# Refresh / generate research
|
||||||
|
_res_task = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
|
||||||
|
_res_running = _res_task and _res_task["status"] in ("queued", "running")
|
||||||
|
|
||||||
|
if not research:
|
||||||
|
if not _res_running:
|
||||||
|
st.warning("No research brief yet for this job.")
|
||||||
|
if _res_task and _res_task["status"] == "failed":
|
||||||
|
st.error(f"Last attempt failed: {_res_task.get('error', '')}")
|
||||||
|
if st.button("🔬 Generate research brief", type="primary", use_container_width=True):
|
||||||
|
submit_task(DEFAULT_DB, "company_research", selected_id)
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
if _res_running:
|
||||||
|
@st.fragment(run_every=3)
|
||||||
|
def _res_status_initial():
|
||||||
|
t = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
|
||||||
|
if t and t["status"] in ("queued", "running"):
|
||||||
|
stage = t.get("stage") or ""
|
||||||
|
lbl = "Queued…" if t["status"] == "queued" else (stage or "Generating… this may take 30–60 seconds")
|
||||||
|
st.info(f"⏳ {lbl}")
|
||||||
|
else:
|
||||||
|
st.rerun()
|
||||||
|
_res_status_initial()
|
||||||
|
|
||||||
|
st.stop()
|
||||||
|
else:
|
||||||
|
generated_at = research.get("generated_at", "")
|
||||||
|
col_ts, col_btn = st.columns([3, 1])
|
||||||
|
col_ts.caption(f"Research generated: {generated_at}")
|
||||||
|
if col_btn.button("🔄 Refresh", use_container_width=True, disabled=bool(_res_running)):
|
||||||
|
submit_task(DEFAULT_DB, "company_research", selected_id)
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
if _res_running:
|
||||||
|
@st.fragment(run_every=3)
|
||||||
|
def _res_status_refresh():
|
||||||
|
t = get_task_for_job(DEFAULT_DB, "company_research", selected_id)
|
||||||
|
if t and t["status"] in ("queued", "running"):
|
||||||
|
stage = t.get("stage") or ""
|
||||||
|
lbl = "Queued…" if t["status"] == "queued" else (stage or "Refreshing research…")
|
||||||
|
st.info(f"⏳ {lbl}")
|
||||||
|
else:
|
||||||
|
st.rerun()
|
||||||
|
_res_status_refresh()
|
||||||
|
elif _res_task and _res_task["status"] == "failed":
|
||||||
|
st.error(f"Refresh failed: {_res_task.get('error', '')}")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Talking points (top — most useful during a call) ──────────────────────
|
||||||
|
st.subheader("🎯 Talking Points")
|
||||||
|
tp = (research.get("talking_points") or "").strip()
|
||||||
|
if tp:
|
||||||
|
st.markdown(tp)
|
||||||
|
else:
|
||||||
|
st.caption("_No talking points extracted — try regenerating._")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Company brief ─────────────────────────────────────────────────────────
|
||||||
|
st.subheader("🏢 Company Overview")
|
||||||
|
st.markdown(research.get("company_brief", "_—_"))
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Leadership brief ──────────────────────────────────────────────────────
|
||||||
|
st.subheader("👤 Leadership & Culture")
|
||||||
|
st.markdown(research.get("ceo_brief", "_—_"))
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Tech Stack & Product ───────────────────────────────────────────────────
|
||||||
|
tech = (research.get("tech_brief") or "").strip()
|
||||||
|
if tech:
|
||||||
|
st.subheader("⚙️ Tech Stack & Product")
|
||||||
|
st.markdown(tech)
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Funding & Market Position ──────────────────────────────────────────────
|
||||||
|
funding = (research.get("funding_brief") or "").strip()
|
||||||
|
if funding:
|
||||||
|
st.subheader("💰 Funding & Market Position")
|
||||||
|
st.markdown(funding)
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Red Flags & Watch-outs ────────────────────────────────────────────────
|
||||||
|
red = (research.get("red_flags") or "").strip()
|
||||||
|
if red and "no significant red flags" not in red.lower():
|
||||||
|
st.subheader("⚠️ Red Flags & Watch-outs")
|
||||||
|
st.warning(red)
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Inclusion & Accessibility ─────────────────────────────────────────────
|
||||||
|
access = (research.get("accessibility_brief") or "").strip()
|
||||||
|
if access:
|
||||||
|
st.subheader("♿ Inclusion & Accessibility")
|
||||||
|
st.caption("For your personal evaluation — not disclosed in any application.")
|
||||||
|
st.markdown(access)
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Practice Q&A (collapsible — use before the call) ─────────────────────
|
||||||
|
with st.expander("🎤 Practice Q&A (pre-call prep)", expanded=False):
|
||||||
|
st.caption(
|
||||||
|
"The LLM will play the interviewer. Type your answers below. "
|
||||||
|
"Use this before the call to warm up."
|
||||||
|
)
|
||||||
|
|
||||||
|
qa_key = f"qa_{selected_id}"
|
||||||
|
if qa_key not in st.session_state:
|
||||||
|
st.session_state[qa_key] = []
|
||||||
|
|
||||||
|
if st.button("🔄 Start / Reset session", key=f"qa_reset_{selected_id}"):
|
||||||
|
st.session_state[qa_key] = []
|
||||||
|
st.rerun()
|
||||||
|
|
||||||
|
# Display history
|
||||||
|
for msg in st.session_state[qa_key]:
|
||||||
|
with st.chat_message(msg["role"]):
|
||||||
|
st.markdown(msg["content"])
|
||||||
|
|
||||||
|
# Initial question if session is empty
|
||||||
|
if not st.session_state[qa_key]:
|
||||||
|
with st.spinner("Setting up your mock interview…"):
|
||||||
|
try:
|
||||||
|
from scripts.llm_router import complete
|
||||||
|
opening = complete(
|
||||||
|
prompt=(
|
||||||
|
f"Start a mock phone screen for the {job.get('title')} "
|
||||||
|
f"role at {job.get('company')}. Ask your first question. "
|
||||||
|
f"Keep it realistic and concise."
|
||||||
|
),
|
||||||
|
system=(
|
||||||
|
f"You are a recruiter at {job.get('company')} conducting "
|
||||||
|
f"a phone screen for the {job.get('title')} role. "
|
||||||
|
f"Ask one question at a time. After Alex answers, give "
|
||||||
|
f"brief feedback (1–2 sentences), then ask your next question. "
|
||||||
|
f"Be professional but warm."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
st.session_state[qa_key] = [{"role": "assistant", "content": opening}]
|
||||||
|
st.rerun()
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"LLM error: {e}")
|
||||||
|
|
||||||
|
# Answer input
|
||||||
|
answer = st.chat_input("Your answer…", key=f"qa_input_{selected_id}")
|
||||||
|
if answer and st.session_state[qa_key]:
|
||||||
|
history = st.session_state[qa_key]
|
||||||
|
history.append({"role": "user", "content": answer})
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "system",
|
||||||
|
"content": (
|
||||||
|
f"You are a recruiter at {job.get('company')} conducting "
|
||||||
|
f"a phone screen for the {job.get('title')} role. "
|
||||||
|
f"Ask one question at a time. After Alex answers, give "
|
||||||
|
f"brief feedback (1–2 sentences), then ask your next question."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
] + history
|
||||||
|
|
||||||
|
with st.spinner("…"):
|
||||||
|
try:
|
||||||
|
from scripts.llm_router import LLMRouter
|
||||||
|
router = LLMRouter()
|
||||||
|
# Build prompt from history for single-turn backends
|
||||||
|
convo = "\n\n".join(
|
||||||
|
f"{'Interviewer' if m['role'] == 'assistant' else 'Alex'}: {m['content']}"
|
||||||
|
for m in history
|
||||||
|
)
|
||||||
|
response = router.complete(
|
||||||
|
prompt=convo + "\n\nInterviewer:",
|
||||||
|
system=messages[0]["content"],
|
||||||
|
)
|
||||||
|
history.append({"role": "assistant", "content": response})
|
||||||
|
st.session_state[qa_key] = history
|
||||||
|
st.rerun()
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"Error: {e}")
|
||||||
|
|
||||||
|
# ════════════════════════════════════════════════
|
||||||
|
# RIGHT — context / reference
|
||||||
|
# ════════════════════════════════════════════════
|
||||||
|
with col_context:
|
||||||
|
|
||||||
|
tab_jd, tab_emails, tab_letter = st.tabs(
|
||||||
|
["📄 Job Description", "📧 Email History", "📝 Cover Letter"]
|
||||||
|
)
|
||||||
|
|
||||||
|
with tab_jd:
|
||||||
|
score = job.get("match_score")
|
||||||
|
if score is not None:
|
||||||
|
badge = (
|
||||||
|
f"🟢 {score:.0f}% match" if score >= 70 else
|
||||||
|
f"🟡 {score:.0f}% match" if score >= 40 else
|
||||||
|
f"🔴 {score:.0f}% match"
|
||||||
|
)
|
||||||
|
st.caption(badge)
|
||||||
|
if job.get("keyword_gaps"):
|
||||||
|
st.caption(f"**Gaps to address:** {job['keyword_gaps']}")
|
||||||
|
st.markdown(job.get("description") or "_No description saved for this listing._")
|
||||||
|
|
||||||
|
with tab_emails:
|
||||||
|
contacts = get_contacts(DEFAULT_DB, job_id=selected_id)
|
||||||
|
if not contacts:
|
||||||
|
st.info("No contacts logged yet. Use the Interviews page to log emails.")
|
||||||
|
else:
|
||||||
|
for c in contacts:
|
||||||
|
icon = "📥" if c["direction"] == "inbound" else "📤"
|
||||||
|
recv = (c.get("received_at") or "")[:10]
|
||||||
|
st.markdown(
|
||||||
|
f"{icon} **{c.get('subject') or '(no subject)'}** · _{recv}_"
|
||||||
|
)
|
||||||
|
if c.get("from_addr"):
|
||||||
|
st.caption(f"From: {c['from_addr']}")
|
||||||
|
if c.get("body"):
|
||||||
|
st.text(c["body"][:500] + ("…" if len(c["body"]) > 500 else ""))
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# Quick draft reply
|
||||||
|
inbound = [c for c in contacts if c["direction"] == "inbound"]
|
||||||
|
if inbound:
|
||||||
|
last = inbound[-1]
|
||||||
|
if st.button("✍️ Draft reply to last email"):
|
||||||
|
with st.spinner("Drafting…"):
|
||||||
|
try:
|
||||||
|
from scripts.llm_router import complete
|
||||||
|
draft = complete(
|
||||||
|
prompt=(
|
||||||
|
f"Draft a professional, warm reply.\n\n"
|
||||||
|
f"From: {last.get('from_addr', '')}\n"
|
||||||
|
f"Subject: {last.get('subject', '')}\n\n"
|
||||||
|
f"{last.get('body', '')}\n\n"
|
||||||
|
f"Context: Alex is a CS/TAM professional applying "
|
||||||
|
f"for {job.get('title')} at {job.get('company')}."
|
||||||
|
),
|
||||||
|
system=(
|
||||||
|
"You are Alex Rivera's professional email assistant. "
|
||||||
|
"Write concise, warm, and professional replies in her voice."
|
||||||
|
),
|
||||||
|
)
|
||||||
|
st.session_state[f"draft_{selected_id}"] = draft
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"Draft failed: {e}")
|
||||||
|
|
||||||
|
if f"draft_{selected_id}" in st.session_state:
|
||||||
|
st.text_area(
|
||||||
|
"Draft (edit before sending)",
|
||||||
|
value=st.session_state[f"draft_{selected_id}"],
|
||||||
|
height=180,
|
||||||
|
)
|
||||||
|
|
||||||
|
with tab_letter:
|
||||||
|
cl = (job.get("cover_letter") or "").strip()
|
||||||
|
if cl:
|
||||||
|
st.markdown(cl)
|
||||||
|
else:
|
||||||
|
st.info("No cover letter saved for this job.")
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
|
||||||
|
# ── Notes (freeform, stored in session only — not persisted to DB) ────────
|
||||||
|
st.subheader("📝 Call Notes")
|
||||||
|
st.caption("Notes are per-session only — copy anything important before navigating away.")
|
||||||
|
st.text_area(
|
||||||
|
"notes",
|
||||||
|
placeholder="Type notes during or after the call…",
|
||||||
|
height=200,
|
||||||
|
key=f"notes_{selected_id}",
|
||||||
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
274
app/pages/7_Survey.py
Normal file
274
app/pages/7_Survey.py
Normal file
|
|
@ -0,0 +1,274 @@
|
||||||
|
# app/pages/7_Survey.py
|
||||||
|
"""
|
||||||
|
Survey Assistant — real-time help with culture-fit surveys.
|
||||||
|
|
||||||
|
Supports text paste and screenshot (via clipboard or file upload).
|
||||||
|
Quick mode: "pick B" + one-liner. Detailed mode: option-by-option breakdown.
|
||||||
|
"""
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import streamlit as st
|
||||||
|
|
||||||
|
from scripts.db import (
|
||||||
|
DEFAULT_DB, init_db,
|
||||||
|
get_interview_jobs, get_job_by_id,
|
||||||
|
insert_survey_response, get_survey_responses,
|
||||||
|
)
|
||||||
|
from scripts.llm_router import LLMRouter
|
||||||
|
|
||||||
|
st.title("📋 Survey Assistant")
|
||||||
|
|
||||||
|
init_db(DEFAULT_DB)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Vision service health check ────────────────────────────────────────────────
|
||||||
|
def _vision_available() -> bool:
|
||||||
|
try:
|
||||||
|
r = requests.get("http://localhost:8002/health", timeout=2)
|
||||||
|
return r.status_code == 200
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
vision_up = _vision_available()
|
||||||
|
|
||||||
|
# ── Job selector ───────────────────────────────────────────────────────────────
|
||||||
|
jobs_by_stage = get_interview_jobs(DEFAULT_DB)
|
||||||
|
survey_jobs = jobs_by_stage.get("survey", [])
|
||||||
|
other_jobs = (
|
||||||
|
jobs_by_stage.get("applied", []) +
|
||||||
|
jobs_by_stage.get("phone_screen", []) +
|
||||||
|
jobs_by_stage.get("interviewing", []) +
|
||||||
|
jobs_by_stage.get("offer", [])
|
||||||
|
)
|
||||||
|
all_jobs = survey_jobs + other_jobs
|
||||||
|
|
||||||
|
if not all_jobs:
|
||||||
|
st.info("No active jobs found. Add jobs in Job Review first.")
|
||||||
|
st.stop()
|
||||||
|
|
||||||
|
job_labels = {j["id"]: f"{j.get('company', '?')} — {j.get('title', '')}" for j in all_jobs}
|
||||||
|
selected_job_id = st.selectbox(
|
||||||
|
"Job",
|
||||||
|
options=[j["id"] for j in all_jobs],
|
||||||
|
format_func=lambda jid: job_labels[jid],
|
||||||
|
index=0,
|
||||||
|
)
|
||||||
|
selected_job = get_job_by_id(DEFAULT_DB, selected_job_id)
|
||||||
|
|
||||||
|
# ── LLM prompt builders ────────────────────────────────────────────────────────
|
||||||
|
_SURVEY_SYSTEM = (
|
||||||
|
"You are a job application advisor helping a candidate answer a culture-fit survey. "
|
||||||
|
"The candidate values collaborative teamwork, clear communication, growth, and impact. "
|
||||||
|
"Choose answers that present them in the best professional light."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_text_prompt(text: str, mode: str) -> str:
|
||||||
|
if mode == "Quick":
|
||||||
|
return (
|
||||||
|
"Answer each survey question below. For each, give ONLY the letter of the best "
|
||||||
|
"option and a single-sentence reason. Format exactly as:\n"
|
||||||
|
"1. B — reason here\n2. A — reason here\n\n"
|
||||||
|
f"Survey:\n{text}"
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
"Analyze each survey question below. For each question:\n"
|
||||||
|
"- Briefly evaluate each option (1 sentence each)\n"
|
||||||
|
"- State your recommendation with reasoning\n\n"
|
||||||
|
f"Survey:\n{text}"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_image_prompt(mode: str) -> str:
|
||||||
|
if mode == "Quick":
|
||||||
|
return (
|
||||||
|
"This is a screenshot of a culture-fit survey. Read all questions and answer each "
|
||||||
|
"with the letter of the best option for a collaborative, growth-oriented candidate. "
|
||||||
|
"Format: '1. B — brief reason' on separate lines."
|
||||||
|
)
|
||||||
|
return (
|
||||||
|
"This is a screenshot of a culture-fit survey. For each question, evaluate each option "
|
||||||
|
"and recommend the best choice for a collaborative, growth-oriented candidate. "
|
||||||
|
"Include a brief breakdown per option and a clear recommendation."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ── Layout ─────────────────────────────────────────────────────────────────────
|
||||||
|
left_col, right_col = st.columns([1, 1], gap="large")
|
||||||
|
|
||||||
|
with left_col:
|
||||||
|
survey_name = st.text_input(
|
||||||
|
"Survey name (optional)",
|
||||||
|
placeholder="e.g. Culture Fit Round 1",
|
||||||
|
key="survey_name",
|
||||||
|
)
|
||||||
|
mode = st.radio("Mode", ["Quick", "Detailed"], horizontal=True, key="survey_mode")
|
||||||
|
st.caption(
|
||||||
|
"**Quick** — best answer + one-liner per question | "
|
||||||
|
"**Detailed** — option-by-option breakdown"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Input tabs
|
||||||
|
if vision_up:
|
||||||
|
tab_text, tab_screenshot = st.tabs(["📝 Paste Text", "🖼️ Screenshot"])
|
||||||
|
else:
|
||||||
|
st.info(
|
||||||
|
"📷 Screenshot input unavailable — vision service not running. \n"
|
||||||
|
"Start it with: `bash scripts/manage-vision.sh start`"
|
||||||
|
)
|
||||||
|
tab_text = st.container()
|
||||||
|
tab_screenshot = None
|
||||||
|
|
||||||
|
image_b64: str | None = None
|
||||||
|
raw_text: str = ""
|
||||||
|
|
||||||
|
with tab_text:
|
||||||
|
raw_text = st.text_area(
|
||||||
|
"Paste survey questions here",
|
||||||
|
height=280,
|
||||||
|
placeholder=(
|
||||||
|
"Q1: Which describes your ideal work environment?\n"
|
||||||
|
"A. Solo focused work\nB. Collaborative team\n"
|
||||||
|
"C. Mix of both\nD. Depends on the task"
|
||||||
|
),
|
||||||
|
key="survey_text",
|
||||||
|
)
|
||||||
|
|
||||||
|
if tab_screenshot is not None:
|
||||||
|
with tab_screenshot:
|
||||||
|
st.caption("Paste from clipboard or upload a screenshot file.")
|
||||||
|
paste_col, upload_col = st.columns(2)
|
||||||
|
|
||||||
|
with paste_col:
|
||||||
|
try:
|
||||||
|
from streamlit_paste_button import paste_image_button
|
||||||
|
paste_result = paste_image_button("📋 Paste from clipboard", key="paste_btn")
|
||||||
|
if paste_result and paste_result.image_data:
|
||||||
|
buf = io.BytesIO()
|
||||||
|
paste_result.image_data.save(buf, format="PNG")
|
||||||
|
image_b64 = base64.b64encode(buf.getvalue()).decode()
|
||||||
|
st.image(
|
||||||
|
paste_result.image_data,
|
||||||
|
caption="Pasted image",
|
||||||
|
use_container_width=True,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
st.warning("streamlit-paste-button not installed. Use file upload.")
|
||||||
|
|
||||||
|
with upload_col:
|
||||||
|
uploaded = st.file_uploader(
|
||||||
|
"Upload screenshot",
|
||||||
|
type=["png", "jpg", "jpeg"],
|
||||||
|
key="survey_upload",
|
||||||
|
label_visibility="collapsed",
|
||||||
|
)
|
||||||
|
if uploaded:
|
||||||
|
image_b64 = base64.b64encode(uploaded.read()).decode()
|
||||||
|
st.image(uploaded, caption="Uploaded image", use_container_width=True)
|
||||||
|
|
||||||
|
# Analyze button
|
||||||
|
has_input = bool(raw_text.strip()) or bool(image_b64)
|
||||||
|
if st.button("🔍 Analyze", type="primary", disabled=not has_input, use_container_width=True):
|
||||||
|
with st.spinner("Analyzing…"):
|
||||||
|
try:
|
||||||
|
router = LLMRouter()
|
||||||
|
if image_b64:
|
||||||
|
prompt = _build_image_prompt(mode)
|
||||||
|
output = router.complete(
|
||||||
|
prompt,
|
||||||
|
images=[image_b64],
|
||||||
|
fallback_order=router.config.get("vision_fallback_order"),
|
||||||
|
)
|
||||||
|
source = "screenshot"
|
||||||
|
else:
|
||||||
|
prompt = _build_text_prompt(raw_text, mode)
|
||||||
|
output = router.complete(
|
||||||
|
prompt,
|
||||||
|
system=_SURVEY_SYSTEM,
|
||||||
|
fallback_order=router.config.get("research_fallback_order"),
|
||||||
|
)
|
||||||
|
source = "text_paste"
|
||||||
|
st.session_state["survey_output"] = output
|
||||||
|
st.session_state["survey_source"] = source
|
||||||
|
st.session_state["survey_image_b64"] = image_b64
|
||||||
|
st.session_state["survey_raw_text"] = raw_text
|
||||||
|
except Exception as e:
|
||||||
|
st.error(f"Analysis failed: {e}")
|
||||||
|
|
||||||
|
with right_col:
|
||||||
|
output = st.session_state.get("survey_output")
|
||||||
|
if output:
|
||||||
|
st.markdown("### Analysis")
|
||||||
|
st.markdown(output)
|
||||||
|
|
||||||
|
st.divider()
|
||||||
|
with st.form("save_survey_form"):
|
||||||
|
reported_score = st.text_input(
|
||||||
|
"Reported score (optional)",
|
||||||
|
placeholder="e.g. 82% or 4.2/5",
|
||||||
|
key="reported_score_input",
|
||||||
|
)
|
||||||
|
if st.form_submit_button("💾 Save to Job"):
|
||||||
|
source = st.session_state.get("survey_source", "text_paste")
|
||||||
|
image_b64_saved = st.session_state.get("survey_image_b64")
|
||||||
|
raw_text_saved = st.session_state.get("survey_raw_text", "")
|
||||||
|
|
||||||
|
image_path = ""
|
||||||
|
if image_b64_saved:
|
||||||
|
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||||
|
save_dir = (
|
||||||
|
Path(__file__).parent.parent.parent
|
||||||
|
/ "data"
|
||||||
|
/ "survey_screenshots"
|
||||||
|
/ str(selected_job_id)
|
||||||
|
)
|
||||||
|
save_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
img_file = save_dir / f"{ts}.png"
|
||||||
|
img_file.write_bytes(base64.b64decode(image_b64_saved))
|
||||||
|
image_path = str(img_file)
|
||||||
|
|
||||||
|
insert_survey_response(
|
||||||
|
DEFAULT_DB,
|
||||||
|
job_id=selected_job_id,
|
||||||
|
survey_name=survey_name,
|
||||||
|
source=source,
|
||||||
|
raw_input=raw_text_saved,
|
||||||
|
image_path=image_path,
|
||||||
|
mode=mode.lower(),
|
||||||
|
llm_output=output,
|
||||||
|
reported_score=reported_score,
|
||||||
|
)
|
||||||
|
st.success("Saved!")
|
||||||
|
del st.session_state["survey_output"]
|
||||||
|
st.rerun()
|
||||||
|
else:
|
||||||
|
st.markdown("### Analysis")
|
||||||
|
st.caption("Results will appear here after analysis.")
|
||||||
|
|
||||||
|
# ── History ────────────────────────────────────────────────────────────────────
|
||||||
|
st.divider()
|
||||||
|
st.subheader("📂 Response History")
|
||||||
|
history = get_survey_responses(DEFAULT_DB, job_id=selected_job_id)
|
||||||
|
|
||||||
|
if not history:
|
||||||
|
st.caption("No saved responses for this job yet.")
|
||||||
|
else:
|
||||||
|
for resp in history:
|
||||||
|
label = resp.get("survey_name") or "Survey response"
|
||||||
|
ts = (resp.get("created_at") or "")[:16]
|
||||||
|
score = resp.get("reported_score")
|
||||||
|
score_str = f" · Score: {score}" if score else ""
|
||||||
|
with st.expander(f"{label} · {ts}{score_str}"):
|
||||||
|
st.caption(f"Mode: {resp.get('mode', '?')} · Source: {resp.get('source', '?')}")
|
||||||
|
if resp.get("raw_input"):
|
||||||
|
with st.expander("Original input"):
|
||||||
|
st.text(resp["raw_input"])
|
||||||
|
st.markdown(resp.get("llm_output", ""))
|
||||||
5
config/adzuna.yaml.example
Normal file
5
config/adzuna.yaml.example
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
# Adzuna Jobs API credentials
|
||||||
|
# Register at https://developer.adzuna.com/admin/applications
|
||||||
|
# Both app_id and app_key are required.
|
||||||
|
app_id: "" # short alphanumeric ID from your developer dashboard
|
||||||
|
app_key: "" # 32-character hex key from your developer dashboard
|
||||||
15
config/blocklist.yaml
Normal file
15
config/blocklist.yaml
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
# Discovery blocklist — entries matching any rule are silently dropped before DB insert.
|
||||||
|
# Applies globally across all search profiles and custom boards.
|
||||||
|
|
||||||
|
# Company name blocklist — partial case-insensitive match on the company field.
|
||||||
|
# e.g. "Amazon" blocks any listing where company contains "amazon".
|
||||||
|
companies: []
|
||||||
|
|
||||||
|
# Industry/content blocklist — blocked if company name OR job description contains any keyword.
|
||||||
|
# Use this for industries you will never work in regardless of company.
|
||||||
|
# e.g. "gambling", "crypto", "tobacco", "defense"
|
||||||
|
industries: []
|
||||||
|
|
||||||
|
# Location blocklist — blocked if the location field contains any of these strings.
|
||||||
|
# e.g. "Dallas", "Austin, TX"
|
||||||
|
locations: []
|
||||||
24
config/craigslist.yaml.example
Normal file
24
config/craigslist.yaml.example
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
# Craigslist metro subdomains to search.
|
||||||
|
# Copy to config/craigslist.yaml and adjust for your markets.
|
||||||
|
# Full subdomain list: https://www.craigslist.org/about/sites
|
||||||
|
metros:
|
||||||
|
- sfbay
|
||||||
|
- newyork
|
||||||
|
- chicago
|
||||||
|
- losangeles
|
||||||
|
- seattle
|
||||||
|
- austin
|
||||||
|
|
||||||
|
# Maps search profile location strings → Craigslist metro subdomain.
|
||||||
|
# Locations not listed here are silently skipped.
|
||||||
|
location_map:
|
||||||
|
"San Francisco Bay Area, CA": sfbay
|
||||||
|
"New York, NY": newyork
|
||||||
|
"Chicago, IL": chicago
|
||||||
|
"Los Angeles, CA": losangeles
|
||||||
|
"Seattle, WA": seattle
|
||||||
|
"Austin, TX": austin
|
||||||
|
|
||||||
|
# Craigslist job category. Defaults to 'jjj' (general jobs) if omitted.
|
||||||
|
# Other options: csr (customer service), mar (marketing), sof (software/qa/dba)
|
||||||
|
# category: jjj
|
||||||
38
config/email.yaml.example
Normal file
38
config/email.yaml.example
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
# config/email.yaml — IMAP email sync configuration
|
||||||
|
# Copy this to config/email.yaml and fill in your credentials.
|
||||||
|
# config/email.yaml is gitignored — never commit real credentials.
|
||||||
|
#
|
||||||
|
# Gmail setup:
|
||||||
|
# 1. Enable IMAP: Gmail Settings → See all settings → Forwarding and POP/IMAP
|
||||||
|
# 2. Create App Password: myaccount.google.com/apppasswords
|
||||||
|
# (requires 2-Step Verification to be enabled)
|
||||||
|
# 3. Use your Gmail address as username, App Password as password.
|
||||||
|
#
|
||||||
|
# Outlook / Office 365:
|
||||||
|
# host: outlook.office365.com
|
||||||
|
# port: 993
|
||||||
|
# use_ssl: true
|
||||||
|
# (Use your regular email + password, or an App Password if MFA is enabled)
|
||||||
|
|
||||||
|
host: imap.gmail.com
|
||||||
|
port: 993
|
||||||
|
use_ssl: true
|
||||||
|
|
||||||
|
# Your full email address
|
||||||
|
username: your.email@gmail.com
|
||||||
|
|
||||||
|
# Gmail: use an App Password (16-char code, no spaces)
|
||||||
|
# Other providers: use your regular password (or App Password if MFA enabled)
|
||||||
|
password: xxxx-xxxx-xxxx-xxxx
|
||||||
|
|
||||||
|
# Sent folder name — leave blank to auto-detect
|
||||||
|
# Gmail: "[Gmail]/Sent Mail" Outlook: "Sent Items" Generic: "Sent"
|
||||||
|
sent_folder: ""
|
||||||
|
|
||||||
|
# How many days back to search (90 = ~3 months)
|
||||||
|
lookback_days: 90
|
||||||
|
|
||||||
|
# Optional: Gmail label to scan for action-needed emails (e.g. "TO DO JOBS").
|
||||||
|
# Emails in this label are matched to pipeline jobs by company name, then
|
||||||
|
# filtered by action keywords in the subject. Leave blank to disable.
|
||||||
|
todo_label: ""
|
||||||
66
config/llm.yaml
Normal file
66
config/llm.yaml
Normal file
|
|
@ -0,0 +1,66 @@
|
||||||
|
backends:
|
||||||
|
anthropic:
|
||||||
|
api_key_env: ANTHROPIC_API_KEY
|
||||||
|
enabled: false
|
||||||
|
model: claude-sonnet-4-6
|
||||||
|
type: anthropic
|
||||||
|
supports_images: true
|
||||||
|
claude_code:
|
||||||
|
api_key: any
|
||||||
|
base_url: http://localhost:3009/v1
|
||||||
|
enabled: false
|
||||||
|
model: claude-code-terminal
|
||||||
|
type: openai_compat
|
||||||
|
supports_images: true
|
||||||
|
github_copilot:
|
||||||
|
api_key: any
|
||||||
|
base_url: http://localhost:3010/v1
|
||||||
|
enabled: false
|
||||||
|
model: gpt-4o
|
||||||
|
type: openai_compat
|
||||||
|
supports_images: false
|
||||||
|
ollama:
|
||||||
|
api_key: ollama
|
||||||
|
base_url: http://localhost:11434/v1
|
||||||
|
enabled: true
|
||||||
|
model: alex-cover-writer:latest
|
||||||
|
type: openai_compat
|
||||||
|
supports_images: false
|
||||||
|
ollama_research:
|
||||||
|
api_key: ollama
|
||||||
|
base_url: http://localhost:11434/v1
|
||||||
|
enabled: true
|
||||||
|
model: llama3.1:8b
|
||||||
|
type: openai_compat
|
||||||
|
supports_images: false
|
||||||
|
vllm:
|
||||||
|
api_key: ''
|
||||||
|
base_url: http://localhost:8000/v1
|
||||||
|
enabled: true
|
||||||
|
model: __auto__
|
||||||
|
type: openai_compat
|
||||||
|
supports_images: false
|
||||||
|
vision_service:
|
||||||
|
base_url: http://localhost:8002
|
||||||
|
enabled: false
|
||||||
|
type: vision_service
|
||||||
|
supports_images: true
|
||||||
|
fallback_order:
|
||||||
|
- ollama
|
||||||
|
- claude_code
|
||||||
|
- vllm
|
||||||
|
- github_copilot
|
||||||
|
- anthropic
|
||||||
|
research_fallback_order:
|
||||||
|
- claude_code
|
||||||
|
- vllm
|
||||||
|
- ollama_research
|
||||||
|
- github_copilot
|
||||||
|
- anthropic
|
||||||
|
vision_fallback_order:
|
||||||
|
- vision_service
|
||||||
|
- claude_code
|
||||||
|
- anthropic
|
||||||
|
# Note: 'ollama' (alex-cover-writer) intentionally excluded — research
|
||||||
|
# must never use the fine-tuned writer model, and this also avoids evicting
|
||||||
|
# the writer from GPU memory while a cover letter task is in flight.
|
||||||
66
config/llm.yaml.example
Normal file
66
config/llm.yaml.example
Normal file
|
|
@ -0,0 +1,66 @@
|
||||||
|
backends:
|
||||||
|
anthropic:
|
||||||
|
api_key_env: ANTHROPIC_API_KEY
|
||||||
|
enabled: false
|
||||||
|
model: claude-sonnet-4-6
|
||||||
|
type: anthropic
|
||||||
|
supports_images: true
|
||||||
|
claude_code:
|
||||||
|
api_key: any
|
||||||
|
base_url: http://localhost:3009/v1
|
||||||
|
enabled: false
|
||||||
|
model: claude-code-terminal
|
||||||
|
type: openai_compat
|
||||||
|
supports_images: true
|
||||||
|
github_copilot:
|
||||||
|
api_key: any
|
||||||
|
base_url: http://localhost:3010/v1
|
||||||
|
enabled: false
|
||||||
|
model: gpt-4o
|
||||||
|
type: openai_compat
|
||||||
|
supports_images: false
|
||||||
|
ollama:
|
||||||
|
api_key: ollama
|
||||||
|
base_url: http://localhost:11434/v1
|
||||||
|
enabled: true
|
||||||
|
model: alex-cover-writer:latest
|
||||||
|
type: openai_compat
|
||||||
|
supports_images: false
|
||||||
|
ollama_research:
|
||||||
|
api_key: ollama
|
||||||
|
base_url: http://localhost:11434/v1
|
||||||
|
enabled: true
|
||||||
|
model: llama3.1:8b
|
||||||
|
type: openai_compat
|
||||||
|
supports_images: false
|
||||||
|
vllm:
|
||||||
|
api_key: ''
|
||||||
|
base_url: http://localhost:8000/v1
|
||||||
|
enabled: true
|
||||||
|
model: __auto__
|
||||||
|
type: openai_compat
|
||||||
|
supports_images: false
|
||||||
|
vision_service:
|
||||||
|
base_url: http://localhost:8002
|
||||||
|
enabled: false
|
||||||
|
type: vision_service
|
||||||
|
supports_images: true
|
||||||
|
fallback_order:
|
||||||
|
- ollama
|
||||||
|
- claude_code
|
||||||
|
- vllm
|
||||||
|
- github_copilot
|
||||||
|
- anthropic
|
||||||
|
research_fallback_order:
|
||||||
|
- claude_code
|
||||||
|
- vllm
|
||||||
|
- ollama_research
|
||||||
|
- github_copilot
|
||||||
|
- anthropic
|
||||||
|
vision_fallback_order:
|
||||||
|
- vision_service
|
||||||
|
- claude_code
|
||||||
|
- anthropic
|
||||||
|
# Note: 'ollama' (alex-cover-writer) intentionally excluded — research
|
||||||
|
# must never use the fine-tuned writer model, and this also avoids evicting
|
||||||
|
# the writer from GPU memory while a cover letter task is in flight.
|
||||||
24
config/notion.yaml.example
Normal file
24
config/notion.yaml.example
Normal file
|
|
@ -0,0 +1,24 @@
|
||||||
|
# Copy to config/notion.yaml and fill in your values.
|
||||||
|
# notion.yaml is gitignored — never commit it.
|
||||||
|
#
|
||||||
|
# Get your integration token from: https://www.notion.so/my-integrations
|
||||||
|
# Then share the "Tracking Job Applications" database with your integration:
|
||||||
|
# Open the DB in Notion → ... menu → Add connections → select your integration
|
||||||
|
#
|
||||||
|
token: "secret_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
|
||||||
|
database_id: "1bd75cff-7708-8007-8c00-f1de36620a0a"
|
||||||
|
|
||||||
|
field_map:
|
||||||
|
title_field: "Salary"
|
||||||
|
job_title: "Job Title"
|
||||||
|
company: "Company Name"
|
||||||
|
url: "Role Link"
|
||||||
|
source: "Job Source"
|
||||||
|
status: "Status of Application"
|
||||||
|
status_new: "Application Submitted"
|
||||||
|
date_found: "Date Found"
|
||||||
|
remote: "Remote"
|
||||||
|
match_score: "Match Score"
|
||||||
|
keyword_gaps: "Keyword Gaps"
|
||||||
|
notes: "Notes"
|
||||||
|
job_description: "Job Description"
|
||||||
23
config/resume_keywords.yaml
Normal file
23
config/resume_keywords.yaml
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
domains:
|
||||||
|
- B2B SaaS
|
||||||
|
- enterprise software
|
||||||
|
- security
|
||||||
|
- compliance
|
||||||
|
- post-sale lifecycle
|
||||||
|
- SaaS metrics
|
||||||
|
- web security
|
||||||
|
keywords:
|
||||||
|
- churn reduction
|
||||||
|
- escalation management
|
||||||
|
- cross-functional
|
||||||
|
- product feedback loop
|
||||||
|
- customer advocacy
|
||||||
|
skills:
|
||||||
|
- Customer Success
|
||||||
|
- Technical Account Management
|
||||||
|
- Revenue Operations
|
||||||
|
- data analysis
|
||||||
|
- stakeholder management
|
||||||
|
- project management
|
||||||
|
- onboarding
|
||||||
|
- renewal management
|
||||||
33
config/resume_keywords.yaml.example
Normal file
33
config/resume_keywords.yaml.example
Normal file
|
|
@ -0,0 +1,33 @@
|
||||||
|
skills:
|
||||||
|
- Customer Success
|
||||||
|
- Technical Account Management
|
||||||
|
- Revenue Operations
|
||||||
|
- Salesforce
|
||||||
|
- Gainsight
|
||||||
|
- data analysis
|
||||||
|
- stakeholder management
|
||||||
|
- project management
|
||||||
|
- onboarding
|
||||||
|
- renewal management
|
||||||
|
|
||||||
|
domains:
|
||||||
|
- B2B SaaS
|
||||||
|
- enterprise software
|
||||||
|
- security
|
||||||
|
- compliance
|
||||||
|
- post-sale lifecycle
|
||||||
|
- SaaS metrics
|
||||||
|
|
||||||
|
keywords:
|
||||||
|
- QBR
|
||||||
|
- churn reduction
|
||||||
|
- NRR
|
||||||
|
- ARR
|
||||||
|
- MRR
|
||||||
|
- executive sponsorship
|
||||||
|
- VOC
|
||||||
|
- health score
|
||||||
|
- escalation management
|
||||||
|
- cross-functional
|
||||||
|
- product feedback loop
|
||||||
|
- customer advocacy
|
||||||
123
config/search_profiles.yaml
Normal file
123
config/search_profiles.yaml
Normal file
|
|
@ -0,0 +1,123 @@
|
||||||
|
profiles:
|
||||||
|
- boards:
|
||||||
|
- linkedin
|
||||||
|
- indeed
|
||||||
|
- glassdoor
|
||||||
|
- zip_recruiter
|
||||||
|
- google
|
||||||
|
custom_boards:
|
||||||
|
- adzuna
|
||||||
|
- theladders
|
||||||
|
- craigslist
|
||||||
|
exclude_keywords:
|
||||||
|
- sales
|
||||||
|
- account executive
|
||||||
|
- sales engineer
|
||||||
|
- SDR
|
||||||
|
- BDR
|
||||||
|
- business development
|
||||||
|
- sales development
|
||||||
|
- sales manager
|
||||||
|
- sales representative
|
||||||
|
- sales rep
|
||||||
|
hours_old: 240
|
||||||
|
locations:
|
||||||
|
- Remote
|
||||||
|
- San Francisco Bay Area, CA
|
||||||
|
name: cs_leadership
|
||||||
|
results_per_board: 75
|
||||||
|
titles:
|
||||||
|
- Customer Success Manager
|
||||||
|
- Customer Engagement Manager
|
||||||
|
- Director of Customer Success
|
||||||
|
- VP Customer Success
|
||||||
|
- Head of Customer Success
|
||||||
|
- Technical Account Manager
|
||||||
|
- TAM
|
||||||
|
- Customer Experience Lead
|
||||||
|
- CSM
|
||||||
|
- CX
|
||||||
|
- Customer Success Consultant
|
||||||
|
- boards:
|
||||||
|
- linkedin
|
||||||
|
- indeed
|
||||||
|
custom_boards:
|
||||||
|
- adzuna
|
||||||
|
- craigslist
|
||||||
|
exclude_keywords:
|
||||||
|
- sales
|
||||||
|
- account executive
|
||||||
|
- SDR
|
||||||
|
- BDR
|
||||||
|
- sales development
|
||||||
|
hours_old: 336
|
||||||
|
locations:
|
||||||
|
- Remote
|
||||||
|
- San Francisco Bay Area, CA
|
||||||
|
mission_tags:
|
||||||
|
- music
|
||||||
|
name: music_industry
|
||||||
|
results_per_board: 50
|
||||||
|
titles:
|
||||||
|
- Customer Success Manager
|
||||||
|
- Partner Success Manager
|
||||||
|
- Artist Success Manager
|
||||||
|
- Creator Success Manager
|
||||||
|
- Technical Account Manager
|
||||||
|
- Community Manager
|
||||||
|
- Account Manager
|
||||||
|
- Label Relations Manager
|
||||||
|
- boards:
|
||||||
|
- linkedin
|
||||||
|
- indeed
|
||||||
|
custom_boards:
|
||||||
|
- adzuna
|
||||||
|
- craigslist
|
||||||
|
exclude_keywords:
|
||||||
|
- sales
|
||||||
|
- account executive
|
||||||
|
- SDR
|
||||||
|
- BDR
|
||||||
|
hours_old: 336
|
||||||
|
locations:
|
||||||
|
- Remote
|
||||||
|
- San Francisco Bay Area, CA
|
||||||
|
mission_tags:
|
||||||
|
- animal_welfare
|
||||||
|
name: animal_welfare
|
||||||
|
results_per_board: 50
|
||||||
|
titles:
|
||||||
|
- Customer Success Manager
|
||||||
|
- Program Manager
|
||||||
|
- Community Engagement Manager
|
||||||
|
- Operations Manager
|
||||||
|
- Partner Success Manager
|
||||||
|
- Account Manager
|
||||||
|
- Development Manager
|
||||||
|
- boards:
|
||||||
|
- linkedin
|
||||||
|
- indeed
|
||||||
|
custom_boards:
|
||||||
|
- adzuna
|
||||||
|
- craigslist
|
||||||
|
exclude_keywords:
|
||||||
|
- sales
|
||||||
|
- account executive
|
||||||
|
- SDR
|
||||||
|
- BDR
|
||||||
|
hours_old: 336
|
||||||
|
locations:
|
||||||
|
- Remote
|
||||||
|
- San Francisco Bay Area, CA
|
||||||
|
mission_tags:
|
||||||
|
- education
|
||||||
|
name: education
|
||||||
|
results_per_board: 50
|
||||||
|
titles:
|
||||||
|
- Customer Success Manager
|
||||||
|
- District Success Manager
|
||||||
|
- Implementation Specialist
|
||||||
|
- Partner Success Manager
|
||||||
|
- Account Manager
|
||||||
|
- School Success Manager
|
||||||
|
- Customer Experience Manager
|
||||||
0
data/survey_screenshots/.gitkeep
Normal file
0
data/survey_screenshots/.gitkeep
Normal file
68
environment.yml
Normal file
68
environment.yml
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
name: job-seeker
|
||||||
|
# Recreate: conda env create -f environment.yml
|
||||||
|
# Update pinned snapshot: conda env export --no-builds > environment.yml
|
||||||
|
channels:
|
||||||
|
- conda-forge
|
||||||
|
- defaults
|
||||||
|
dependencies:
|
||||||
|
- python=3.12
|
||||||
|
- pip
|
||||||
|
- pip:
|
||||||
|
# ── Web UI ────────────────────────────────────────────────────────────────
|
||||||
|
- streamlit>=1.35
|
||||||
|
- watchdog # live reload
|
||||||
|
- reportlab>=4.0 # PDF cover letter export
|
||||||
|
- pandas>=2.0
|
||||||
|
- pyarrow # streamlit data tables
|
||||||
|
- streamlit-paste-button>=0.1.0
|
||||||
|
|
||||||
|
# ── Job scraping ──────────────────────────────────────────────────────────
|
||||||
|
- python-jobspy>=1.1
|
||||||
|
- playwright # browser automation (run: playwright install chromium)
|
||||||
|
- selenium
|
||||||
|
- undetected-chromedriver
|
||||||
|
- webdriver-manager
|
||||||
|
- beautifulsoup4
|
||||||
|
- requests
|
||||||
|
- curl_cffi # Chrome TLS fingerprint — bypasses Cloudflare on The Ladders
|
||||||
|
- fake-useragent # company scraper rotation
|
||||||
|
|
||||||
|
# ── LLM / AI backends ─────────────────────────────────────────────────────
|
||||||
|
- openai>=1.0 # used for OpenAI-compat backends (ollama, vllm, wrappers)
|
||||||
|
- anthropic>=0.80 # direct Anthropic API fallback
|
||||||
|
- ollama # Python client for Ollama management
|
||||||
|
- langchain>=0.2
|
||||||
|
- langchain-openai
|
||||||
|
- langchain-anthropic
|
||||||
|
- langchain-ollama
|
||||||
|
- langchain-community
|
||||||
|
- langchain-google-genai
|
||||||
|
- google-generativeai
|
||||||
|
- tiktoken
|
||||||
|
|
||||||
|
# ── Resume matching ───────────────────────────────────────────────────────
|
||||||
|
- scikit-learn>=1.3
|
||||||
|
- rapidfuzz
|
||||||
|
- lib-resume-builder-aihawk
|
||||||
|
|
||||||
|
# ── Notion integration ────────────────────────────────────────────────────
|
||||||
|
- notion-client>=3.0
|
||||||
|
|
||||||
|
# ── Document handling ─────────────────────────────────────────────────────
|
||||||
|
- pypdf
|
||||||
|
- pdfminer-six
|
||||||
|
- pyyaml>=6.0
|
||||||
|
- python-dotenv
|
||||||
|
|
||||||
|
# ── Utilities ─────────────────────────────────────────────────────────────
|
||||||
|
- sqlalchemy
|
||||||
|
- tqdm
|
||||||
|
- loguru
|
||||||
|
- rich
|
||||||
|
- tenacity
|
||||||
|
- httpx
|
||||||
|
|
||||||
|
# ── Testing ───────────────────────────────────────────────────────────────
|
||||||
|
- pytest>=9.0
|
||||||
|
- pytest-cov
|
||||||
|
- pytest-mock
|
||||||
2
pytest.ini
Normal file
2
pytest.ini
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
[pytest]
|
||||||
|
testpaths = tests
|
||||||
0
scripts/__init__.py
Normal file
0
scripts/__init__.py
Normal file
468
scripts/company_research.py
Normal file
468
scripts/company_research.py
Normal file
|
|
@ -0,0 +1,468 @@
|
||||||
|
# scripts/company_research.py
|
||||||
|
"""
|
||||||
|
Pre-interview company research generator.
|
||||||
|
|
||||||
|
Three-phase approach:
|
||||||
|
1. If SearXNG is available (port 8888), use companyScraper.py to fetch live
|
||||||
|
data: CEO name, HQ address, LinkedIn, contact info.
|
||||||
|
1b. Use Phase 1 data (company name + CEO if found) to query SearXNG for
|
||||||
|
recent news snippets (funding, launches, leadership changes, etc.).
|
||||||
|
2. Feed all real data into an LLM prompt to synthesise a structured brief
|
||||||
|
covering company overview, leadership, recent developments, and talking
|
||||||
|
points tailored to Alex.
|
||||||
|
|
||||||
|
Falls back to pure LLM knowledge when SearXNG is offline.
|
||||||
|
|
||||||
|
Usage (standalone):
|
||||||
|
conda run -n job-seeker python scripts/company_research.py --job-id 42
|
||||||
|
conda run -n job-seeker python scripts/company_research.py --job-id 42 --no-scrape
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
# ── SearXNG scraper integration ───────────────────────────────────────────────
# Optional dependency: a personal scraper project living outside this repo.
# If the directory exists we add it to sys.path and try the import; otherwise
# the module degrades gracefully and _SCRAPER_AVAILABLE stays False.
_SCRAPER_DIR = Path("/Library/Development/scrapers")
_SCRAPER_AVAILABLE = False

if _SCRAPER_DIR.exists():
    sys.path.insert(0, str(_SCRAPER_DIR))
    try:
        from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig
        _SCRAPER_AVAILABLE = True
    except (ImportError, SystemExit):
        # companyScraper calls sys.exit(1) if bs4/fake-useragent aren't installed
        pass
|
||||||
|
|
||||||
|
|
||||||
|
def _searxng_running() -> bool:
|
||||||
|
"""Quick check whether SearXNG is reachable."""
|
||||||
|
try:
|
||||||
|
import requests
|
||||||
|
r = requests.get("http://localhost:8888/", timeout=3)
|
||||||
|
return r.status_code == 200
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_company(company: str) -> dict:
    """
    Use companyScraper in minimal mode to pull live CEO / HQ data.
    Returns a dict with keys: ceo, headquarters, linkedin (may be 'Not found').
    """
    # companyScraper expects an argparse-style namespace; fabricate one.
    opts = SimpleNamespace(
        mode="minimal",
        verbose=False,
        dry_run=False,
        debug=False,
        use_cache=True,
        save_raw=False,
        target_staff=None,
        include_types=None,
        exclude_types=None,
        include_contact=False,
        include_address=False,
        include_social=True,  # grab LinkedIn while we're at it
        timeout=20,
        input_file=None,
        output_file="/dev/null",
        searxng_url="http://localhost:8888/",
    )
    # Override the singleton Config URL
    _ScraperConfig.SEARXNG_URL = "http://localhost:8888/"

    scraper = EnhancedCompanyScraper(opts)
    scraper.companies = [company]

    info: dict = {"ceo": "Not found", "headquarters": "Not found", "linkedin": "Not found"}

    # One search per fact, in the same order the scraper CLI would issue them.
    info["ceo"] = scraper.extract_ceo(scraper.search_company(company, "ceo"), company)
    info["headquarters"] = scraper.extract_address(scraper.search_company(company, "hq"), company)

    # The social extractor returns a ';'-joined list; keep only the LinkedIn entry.
    social = scraper.extract_social(scraper.search_company(company, "social"), company)
    linkedin = next(
        (part.strip() for part in (social or "").split(";") if "linkedin" in part.lower()),
        None,
    )
    if linkedin is not None:
        info["linkedin"] = linkedin

    return info
|
||||||
|
|
||||||
|
|
||||||
|
# SearXNG query templates keyed by search type. Placeholders are substituted
# with str.replace() (not str.format()) downstream, because company names may
# themselves contain curly braces.
_SEARCH_QUERIES = {
    "news": '"{company}" news 2025 2026',
    "funding": '"{company}" funding round investors Series valuation',
    "tech": '"{company}" tech stack engineering technology platform',
    "competitors": '"{company}" competitors alternatives vs market',
    "culture": '"{company}" glassdoor culture reviews employees',
    "accessibility": '"{company}" ADA accessibility disability inclusion accommodation ERG',
    "ceo_press": '"{ceo}" "{company}"',  # only used if ceo is known
}
|
||||||
|
|
||||||
|
|
||||||
|
def _run_search_query(query: str, results: dict, key: str) -> None:
    """Thread target: run one SearXNG JSON query, store up to 4 snippets in results[key]."""
    import requests

    collected: list[str] = []
    visited: set[str] = set()
    try:
        response = requests.get(
            "http://localhost:8888/search",
            params={"q": query, "format": "json", "language": "en-US"},
            timeout=12,
        )
        if response.status_code != 200:
            # Non-OK response: bail without touching results (caller backfills "").
            return
        for hit in response.json().get("results", [])[:4]:
            link = hit.get("url", "")
            if link in visited:
                continue  # de-duplicate by URL
            visited.add(link)
            heading = hit.get("title", "").strip()
            body = hit.get("content", "").strip()
            if heading or body:
                collected.append(f"- **{heading}**\n {body}\n <{link}>")
    except Exception:
        pass  # best-effort: a failed query just yields an empty section
    results[key] = "\n\n".join(collected)
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_search_data(company: str, ceo: str = "") -> dict[str, str]:
    """
    Run all search queries in parallel threads.

    Parameters
    ----------
    company : str
        Company name substituted into each query template.
    ceo : str
        CEO name for the 'ceo_press' query; that query is skipped when the
        name is empty or the scraper's 'Not found' sentinel.

    Returns dict keyed by search type (news, funding, tech, competitors, culture, ceo_press).
    Missing/failed queries produce empty strings.
    """
    import threading

    results: dict[str, str] = {}
    threads = []
    keys: list[str] = []

    for key, pattern in _SEARCH_QUERIES.items():
        # BUGFIX: the original condition
        #   if key == "ceo_press" and not ceo or (ceo or "").lower() == "not found":
        # parsed as (A and B) or C, so a scraped CEO of "Not found" skipped
        # *every* query instead of just ceo_press. Parenthesize so the guard
        # applies only to the ceo_press query.
        if key == "ceo_press" and (not ceo or ceo.lower() == "not found"):
            continue
        # Use replace() not .format() — company names may contain curly braces
        query = pattern.replace("{company}", company).replace("{ceo}", ceo)
        t = threading.Thread(
            target=_run_search_query,
            args=(query, results, key),
            daemon=True,
        )
        threads.append(t)
        keys.append(key)
        t.start()

    for t, key in zip(threads, keys):
        t.join(timeout=15)
        # Thread may still be alive after timeout — pre-populate key so
        # the results dict contract ("missing queries → empty string") holds
        if t.is_alive():
            results.setdefault(key, "")

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_sections(text: str) -> dict[str, str]:
|
||||||
|
"""Split LLM markdown output on ## headers into named sections."""
|
||||||
|
sections: dict[str, str] = {}
|
||||||
|
pattern = re.compile(r"^##\s+(.+)$", re.MULTILINE)
|
||||||
|
matches = list(pattern.finditer(text))
|
||||||
|
for i, match in enumerate(matches):
|
||||||
|
name = match.group(1).strip()
|
||||||
|
start = match.end()
|
||||||
|
end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
|
||||||
|
sections[name] = text[start:end].strip()
|
||||||
|
return sections
|
||||||
|
|
||||||
|
|
||||||
|
# Source files for the candidate's resume content and the keyword/skills
# config used for JD matching (paths are relative to the repo root).
_RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
_KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"

# Companies where Alex has an NDA — reference as generic label unless
# the role is security-focused (score >= 3 matching JD keywords).
_NDA_COMPANIES = {"upguard"}
|
||||||
|
|
||||||
|
|
||||||
|
def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]:
|
||||||
|
"""Score each experience entry by keyword overlap with JD; return sorted descending."""
|
||||||
|
jd_lower = jd.lower()
|
||||||
|
scored = []
|
||||||
|
for exp in experiences:
|
||||||
|
text = " ".join([
|
||||||
|
exp.get("position", ""),
|
||||||
|
exp.get("company", ""),
|
||||||
|
" ".join(
|
||||||
|
v
|
||||||
|
for resp in exp.get("key_responsibilities", [])
|
||||||
|
for v in resp.values()
|
||||||
|
),
|
||||||
|
]).lower()
|
||||||
|
score = sum(1 for kw in keywords if kw.lower() in text and kw.lower() in jd_lower)
|
||||||
|
scored.append({**exp, "score": score})
|
||||||
|
return sorted(scored, key=lambda x: x["score"], reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
    """
    Build the resume section of the LLM context block.
    Top 2 scored experiences included in full detail; rest as one-liners.
    Applies UpGuard NDA rule: reference as 'enterprise security vendor (NDA)'
    unless the role is security-focused (score >= 3).
    """
    entries = resume.get("experience_details", [])
    if not entries:
        return ""

    ranked = _score_experiences(entries, keywords, jd)
    featured, remainder = ranked[:2], ranked[2:]

    def _label(entry: dict) -> str:
        # NDA rule: mask the employer name unless the role scored as security-relevant.
        name = entry.get("company", "")
        if name.lower() in _NDA_COMPANIES and entry.get("score", 0) < 3:
            return "enterprise security vendor (NDA)"
        return name

    def _header(entry: dict) -> str:
        return f"{entry.get('position', '')} @ {_label(entry)} ({entry.get('employment_period', '')})"

    def _bullets(entry: dict) -> str:
        points = [v for resp in entry.get("key_responsibilities", []) for v in resp.values()]
        return "\n".join(f" - {b}" for b in points)

    parts = ["## Alex's Matched Experience"]
    for entry in featured:
        parts.append(f"\n**{_header(entry)}** (match score: {entry['score']})")
        parts.append(_bullets(entry))

    if remainder:
        condensed = ", ".join(_header(e) for e in remainder)
        parts.append(f"\nAlso in Alex's background: {condensed}")

    return "\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_resume_and_keywords() -> tuple[dict, list[str]]:
    """Load resume YAML and keywords config. Returns (resume_dict, all_keywords_list)."""
    import yaml as _yaml

    resume: dict = {}
    if _RESUME_YAML.exists():
        # safe_load returns None for an empty file; normalise to {}.
        resume = _yaml.safe_load(_RESUME_YAML.read_text()) or {}

    keywords: list[str] = []
    if _KEYWORDS_YAML.exists():
        cfg = _yaml.safe_load(_KEYWORDS_YAML.read_text()) or {}
        # Flatten every list-valued section (skills, domains, keywords, ...)
        # into a single keyword pool, preserving file order.
        keywords = [kw for group in cfg.values() if isinstance(group, list) for kw in group]

    return resume, keywords
|
||||||
|
|
||||||
|
|
||||||
|
def research_company(job: dict, use_scraper: bool = True, on_stage=None) -> dict:
    """
    Generate a pre-interview research brief for a job.

    Three phases: (1) optional live scrape of CEO/HQ/LinkedIn via SearXNG,
    (1b) parallel SearXNG news/funding/tech/culture searches, (2) LLM
    synthesis of everything into a structured markdown brief.

    Parameters
    ----------
    job : dict
        Job row from the DB (needs at least 'company', 'title', 'description').
    use_scraper : bool
        Whether to attempt live data via SearXNG before falling back to LLM.
    on_stage : callable, optional
        Progress callback invoked with a human-readable stage message;
        exceptions it raises are swallowed.

    Returns
    -------
    dict with keys: raw_output, company_brief, ceo_brief, tech_brief,
    funding_brief, competitors_brief, red_flags, accessibility_brief,
    talking_points, scrape_used
    """
    from scripts.llm_router import LLMRouter

    router = LLMRouter()
    # Research uses its own fallback chain so the fine-tuned writer model is never picked.
    research_order = router.config.get("research_fallback_order") or router.config["fallback_order"]
    company = job.get("company") or "the company"
    title = job.get("title") or "this role"
    # Cap the JD excerpt to keep the prompt within context limits.
    jd_excerpt = (job.get("description") or "")[:1500]

    resume, keywords = _load_resume_and_keywords()
    matched_keywords = [kw for kw in keywords if kw.lower() in jd_excerpt.lower()]
    resume_context = _build_resume_context(resume, keywords, jd_excerpt)
    keywords_note = (
        f"\n\n## Matched Skills & Keywords\nSkills matching this JD: {', '.join(matched_keywords)}"
        if matched_keywords else ""
    )

    def _stage(msg: str) -> None:
        # Forward progress messages to the optional callback.
        if on_stage:
            try:
                on_stage(msg)
            except Exception:
                pass  # never let stage callbacks break the task

    # ── Phase 1: live scrape (optional) ──────────────────────────────────────
    live_data: dict = {}
    scrape_note = ""
    _stage("Checking for live company data…")
    if use_scraper and _SCRAPER_AVAILABLE and _searxng_running():
        _stage("Scraping CEO & HQ data…")
        try:
            live_data = _scrape_company(company)
            # Only surface facts the scraper actually found.
            parts = []
            if live_data.get("ceo") not in (None, "Not found"):
                parts.append(f"CEO: {live_data['ceo']}")
            if live_data.get("headquarters") not in (None, "Not found"):
                parts.append(f"HQ: {live_data['headquarters']}")
            if live_data.get("linkedin") not in (None, "Not found"):
                parts.append(f"LinkedIn: {live_data['linkedin']}")
            if parts:
                scrape_note = (
                    "\n\n**Live data retrieved via SearXNG:**\n"
                    + "\n".join(f"- {p}" for p in parts)
                    + "\n\nIncorporate these facts where relevant."
                )
        except BaseException as e:
            # NOTE(review): BaseException also swallows KeyboardInterrupt/SystemExit —
            # presumably deliberate for a best-effort phase; confirm before narrowing.
            scrape_note = f"\n\n_(Live scrape attempted but failed: {e})_"

    # ── Phase 1b: parallel search queries ────────────────────────────────────
    search_data: dict[str, str] = {}
    _stage("Running web searches…")
    if use_scraper and _searxng_running():
        _stage("Running web searches (news, funding, tech, culture)…")
        try:
            # CEO name (if Phase 1 found one) feeds the ceo_press query.
            ceo_name = (live_data.get("ceo") or "") if live_data else ""
            search_data = _fetch_search_data(company, ceo=ceo_name)
        except BaseException:
            pass  # best-effort; never fail the whole task

    # Track whether SearXNG actually contributed usable data to this brief.
    scrape_used = 1 if (live_data or any(v.strip() for v in search_data.values())) else 0

    def _section_note(key: str, label: str) -> str:
        # Render one search-result bundle as a markdown section, or nothing.
        text = search_data.get(key, "").strip()
        return f"\n\n## {label} (live web search)\n\n{text}" if text else ""

    news_note = _section_note("news", "News & Press")
    funding_note = _section_note("funding", "Funding & Investors")
    tech_note = _section_note("tech", "Tech Stack")
    competitors_note = _section_note("competitors", "Competitors")
    culture_note = _section_note("culture", "Culture & Employee Signals")
    accessibility_note = _section_note("accessibility", "Accessibility & Disability Inclusion")
    ceo_press_note = _section_note("ceo_press", "CEO in the News")

    # ── Phase 2: LLM synthesis ────────────────────────────────────────────────
    _stage("Generating brief with LLM… (30–90 seconds)")
    prompt = f"""You are preparing Alex Rivera for a job interview.

Role: **{title}** at **{company}**

## Job Description
{jd_excerpt}
{resume_context}{keywords_note}

## Live Company Data
{scrape_note.strip() or "_(scrape unavailable)_"}
{news_note}{funding_note}{tech_note}{competitors_note}{culture_note}{accessibility_note}{ceo_press_note}

---

Produce a structured research brief using **exactly** these eight markdown section headers
(include all eight even if a section has limited data — say so honestly):

## Company Overview
What {company} does, core product/service, business model, size/stage (startup / scale-up / enterprise), market positioning.

## Leadership & Culture
CEO background and leadership style, key execs, mission/values statements, Glassdoor themes.

## Tech Stack & Product
Technologies, platforms, and product direction relevant to the {title} role.

## Funding & Market Position
Funding stage, key investors, recent rounds, burn/growth signals, competitor landscape.

## Recent Developments
News, launches, acquisitions, exec moves, pivots, or press from the past 12–18 months.
Draw on the live snippets above; if none available, note what is publicly known.

## Red Flags & Watch-outs
Culture issues, layoffs, exec departures, financial stress, or Glassdoor concerns worth knowing before the call.
If nothing notable, write "No significant red flags identified."

## Inclusion & Accessibility
Assess {company}'s commitment to disability inclusion and accessibility. Cover:
- ADA accommodation language in job postings or company policy
- Disability Employee Resource Group (ERG) or affinity group
- Product or service accessibility (WCAG compliance, adaptive features, AT integrations)
- Any public disability/accessibility advocacy, partnerships, or certifications
- Glassdoor or press signals about how employees with disabilities experience the company
If no specific signals are found, say so clearly — absence of public commitment is itself signal.
This section is for Alex's personal decision-making only and will not appear in any application.

## Talking Points for Alex
Five specific talking points for the phone screen. Each must:
- Reference a concrete experience from Alex's matched background by name
  (UpGuard NDA rule: say "enterprise security vendor" unless the role has a clear security/compliance focus)
- Connect to a specific signal from the JD or company context above
- Be 1–2 sentences, ready to speak aloud
- Never give generic advice

---
⚠️ This brief combines live web data and LLM training knowledge. Verify key facts before the call.
"""

    raw = router.complete(prompt, fallback_order=research_order)
    # Strip <think>…</think> blocks emitted by reasoning models (e.g. DeepSeek, Qwen-R)
    raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
    sections = _parse_sections(raw)

    return {
        "raw_output": raw,
        "company_brief": sections.get("Company Overview", ""),
        "ceo_brief": sections.get("Leadership & Culture", ""),
        "tech_brief": sections.get("Tech Stack & Product", ""),
        "funding_brief": sections.get("Funding & Market Position", ""),
        "competitors_brief": sections.get("Funding & Market Position", ""),  # competitor landscape is in the funding section
        "red_flags": sections.get("Red Flags & Watch-outs", ""),
        "accessibility_brief": sections.get("Inclusion & Accessibility", ""),
        "talking_points": sections.get("Talking Points for Alex", ""),
        "scrape_used": scrape_used,
    }
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import argparse

    cli = argparse.ArgumentParser(description="Generate company research brief")
    cli.add_argument("--job-id", type=int, required=True, help="Job ID in staging.db")
    cli.add_argument("--no-scrape", action="store_true", help="Skip SearXNG live scrape")
    opts = cli.parse_args()

    from scripts.db import DEFAULT_DB, init_db, save_research
    import sqlite3

    # Look up the target job row in the staging database.
    init_db(DEFAULT_DB)
    db = sqlite3.connect(DEFAULT_DB)
    db.row_factory = sqlite3.Row
    record = db.execute("SELECT * FROM jobs WHERE id = ?", (opts.job_id,)).fetchone()
    db.close()

    if record is None:
        sys.exit(f"Job {opts.job_id} not found in {DEFAULT_DB}")

    job = dict(record)
    print(f"Researching: {job['title']} @ {job['company']} …\n")
    if _SCRAPER_AVAILABLE and not opts.no_scrape:
        print(f"SearXNG available: {_searxng_running()}")

    brief = research_company(job, use_scraper=not opts.no_scrape)
    save_research(DEFAULT_DB, job_id=opts.job_id, **brief)
    print(brief["raw_output"])
    print(f"\n[Saved to company_research for job {opts.job_id}]")
|
||||||
1
scripts/custom_boards/__init__.py
Normal file
1
scripts/custom_boards/__init__.py
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
# Custom job board scrapers — each module exposes scrape(profile, location, results_wanted) -> list[dict]
|
||||||
160
scripts/custom_boards/adzuna.py
Normal file
160
scripts/custom_boards/adzuna.py
Normal file
|
|
@ -0,0 +1,160 @@
|
||||||
|
"""Adzuna Jobs API scraper.
|
||||||
|
|
||||||
|
API docs: https://developer.adzuna.com/docs/search
|
||||||
|
Config: config/adzuna.yaml (gitignored — contains app_id + app_key)
|
||||||
|
|
||||||
|
Each title in the search profile is queried as an exact phrase per location.
|
||||||
|
Returns a list of dicts compatible with scripts.db.insert_job().
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
_CONFIG_PATH = Path(__file__).parent.parent.parent / "config" / "adzuna.yaml"
|
||||||
|
_BASE_URL = "https://api.adzuna.com/v1/api/jobs/us/search"
|
||||||
|
|
||||||
|
|
||||||
|
def _load_config() -> tuple[str, str]:
    """Read Adzuna API credentials from config/adzuna.yaml.

    Returns:
        (app_id, app_key), both whitespace-stripped.

    Raises:
        FileNotFoundError: if the config file is missing.
        ValueError: if either credential is blank or absent.
    """
    if not _CONFIG_PATH.exists():
        raise FileNotFoundError(
            f"Adzuna config not found: {_CONFIG_PATH}\n"
            "Copy config/adzuna.yaml.example → config/adzuna.yaml and fill in credentials."
        )
    cfg = yaml.safe_load(_CONFIG_PATH.read_text())
    creds = tuple((cfg.get(key) or "").strip() for key in ("app_id", "app_key"))
    if not all(creds):
        raise ValueError(
            "config/adzuna.yaml requires both 'app_id' and 'app_key'.\n"
            "Find your App ID at https://developer.adzuna.com/admin/applications"
        )
    return creds
|
||||||
|
|
||||||
|
|
||||||
|
def _salary_str(job: dict) -> str:
|
||||||
|
lo = job.get("salary_min")
|
||||||
|
hi = job.get("salary_max")
|
||||||
|
try:
|
||||||
|
if lo and hi:
|
||||||
|
return f"${int(lo):,} – ${int(hi):,}"
|
||||||
|
if lo:
|
||||||
|
return f"${int(lo):,}+"
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
pass
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _is_remote(location_display: str) -> bool:
|
||||||
|
return "remote" in location_display.lower()
|
||||||
|
|
||||||
|
|
||||||
|
def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]:
    """Fetch jobs from the Adzuna API for a single location.

    Args:
        profile: Search profile dict from search_profiles.yaml.
        location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA").
        results_wanted: Maximum results to return across all titles.

    Returns:
        List of job dicts with keys: title, company, url, source, location,
        is_remote, salary, description.  Returns [] (with a console note)
        when credentials are missing/invalid.
    """
    try:
        app_id, app_key = _load_config()
    except (FileNotFoundError, ValueError) as exc:
        print(f" [adzuna] Skipped — {exc}")
        return []

    titles = profile.get("titles", [])
    hours_old = profile.get("hours_old", 240)
    max_days_old = max(1, hours_old // 24)
    is_remote_search = location.lower() == "remote"

    session = requests.Session()
    session.headers.update({"Accept": "application/json", "User-Agent": "Mozilla/5.0"})

    seen_ids: set[str] = set()
    results: list[dict] = []

    for title in titles:
        if len(results) >= results_wanted:
            break

        page = 1
        while len(results) < results_wanted:
            # Adzuna doesn't support where=remote — it treats it as a city name and
            # returns 0 results. For remote searches, append "remote" to the what param.
            params = {
                "app_id": app_id,
                "app_key": app_key,
                "results_per_page": 50,
                "sort_by": "date",
                "max_days_old": max_days_old,
            }
            if is_remote_search:
                params["what"] = f'"{title}" remote'
            else:
                params["what_phrase"] = title
                params["where"] = location

            try:
                resp = session.get(f"{_BASE_URL}/{page}", params=params, timeout=20)
            except requests.RequestException as exc:
                print(f" [adzuna] Request error ({title}): {exc}")
                break

            if resp.status_code == 401:
                # Bad credentials affect every request — bail out entirely.
                print(" [adzuna] Auth failed — check app_id and app_key in config/adzuna.yaml")
                return results
            if resp.status_code != 200:
                print(f" [adzuna] HTTP {resp.status_code} for '{title}' page {page}")
                break

            try:
                data = resp.json()
            except ValueError:
                # 200 with a non-JSON body (proxy/maintenance page) — skip this title.
                print(f" [adzuna] Invalid JSON for '{title}' page {page}")
                break
            jobs = data.get("results", [])
            if not jobs:
                break

            for job in jobs:
                job_id = str(job.get("id", ""))
                if job_id in seen_ids:
                    continue
                seen_ids.add(job_id)

                # The API may emit "location": null / "company": null; dict.get's
                # default only covers a *missing* key, so guard with `or {}` to
                # avoid AttributeError on None.
                loc_display = (job.get("location") or {}).get("display_name", "")
                redirect_url = job.get("redirect_url", "")
                if not redirect_url:
                    continue

                results.append({
                    "title": job.get("title", ""),
                    "company": (job.get("company") or {}).get("display_name", ""),
                    "url": redirect_url,
                    "source": "adzuna",
                    "location": loc_display,
                    "is_remote": is_remote_search or _is_remote(loc_display),
                    "salary": _salary_str(job),
                    "description": job.get("description", ""),
                })

            total = data.get("count", 0)
            if len(results) >= total or len(jobs) < 50:
                break  # last page

            page += 1
            time.sleep(0.5)  # polite pacing between pages

        time.sleep(0.5)  # between titles

    return results[:results_wanted]
|
||||||
177
scripts/custom_boards/craigslist.py
Normal file
177
scripts/custom_boards/craigslist.py
Normal file
|
|
@ -0,0 +1,177 @@
|
||||||
|
"""Craigslist job scraper — RSS-based.
|
||||||
|
|
||||||
|
Uses Craigslist's native RSS feed endpoint for discovery.
|
||||||
|
Full job description is populated by the scrape_url background task.
|
||||||
|
Company name and salary (not structured in Craigslist listings) are
|
||||||
|
extracted from the description body by the enrich_craigslist task.
|
||||||
|
|
||||||
|
Config: config/craigslist.yaml (gitignored — metro list + location map)
|
||||||
|
config/craigslist.yaml.example (committed template)
|
||||||
|
|
||||||
|
Returns a list of dicts compatible with scripts.db.insert_job().
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import time
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from email.utils import parsedate_to_datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from urllib.parse import quote_plus
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
_CONFIG_PATH = Path(__file__).parent.parent.parent / "config" / "craigslist.yaml"
|
||||||
|
_DEFAULT_CATEGORY = "jjj"
|
||||||
|
_HEADERS = {
|
||||||
|
"User-Agent": (
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
_TIMEOUT = 15
|
||||||
|
_SLEEP = 0.5 # seconds between requests — easy to make configurable later
|
||||||
|
|
||||||
|
|
||||||
|
def _load_config() -> dict:
    """Load and validate config/craigslist.yaml.

    Raises:
        FileNotFoundError: if the config file is missing.
        ValueError: if no metros are configured.
    """
    if not _CONFIG_PATH.exists():
        raise FileNotFoundError(
            f"Craigslist config not found: {_CONFIG_PATH}\n"
            "Copy config/craigslist.yaml.example → config/craigslist.yaml "
            "and configure your target metros."
        )
    cfg = yaml.safe_load(_CONFIG_PATH.read_text()) or {}
    if cfg.get("metros"):
        return cfg
    raise ValueError(
        "config/craigslist.yaml must contain at least one entry under 'metros'."
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _rss_url(metro: str, category: str, query: str) -> str:
|
||||||
|
return (
|
||||||
|
f"https://{metro}.craigslist.org/search/{category}"
|
||||||
|
f"?query={quote_plus(query)}&format=rss&sort=date"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_pubdate(pubdate_str: str) -> datetime | None:
|
||||||
|
"""Parse an RSS pubDate string to a timezone-aware datetime."""
|
||||||
|
try:
|
||||||
|
return parsedate_to_datetime(pubdate_str)
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_rss(url: str) -> list[dict]:
    """Fetch and parse a Craigslist RSS feed. Returns list of raw item dicts.

    Raises:
        requests.RequestException: on HTTP/transport failure (via raise_for_status).
        ValueError: when the response body is not well-formed XML.
    """
    resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    resp.raise_for_status()
    try:
        root = ET.fromstring(resp.content)
    except ET.ParseError as exc:
        raise ValueError(f"Malformed RSS XML: {exc}") from exc

    def _child_text(node, tag: str) -> str:
        # Missing element and empty element both map to "".
        el = node.find(tag)
        if el is None:
            return ""
        return (el.text or "").strip()

    parsed: list[dict] = []
    for node in root.findall(".//item"):
        parsed.append({
            "title": _child_text(node, "title"),
            "link": _child_text(node, "link"),
            "description": _child_text(node, "description"),
            "pubDate": _child_text(node, "pubDate"),
        })
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]:
    """Fetch jobs from Craigslist RSS for a single location.

    Args:
        profile: Search profile dict from search_profiles.yaml.
        location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA").
        results_wanted: Maximum results to return across all metros and titles.

    Returns:
        List of job dicts with keys: title, company, url, source, location,
        is_remote, salary, description.
        company/salary are empty — filled later by enrich_craigslist task.
    """
    # Missing/invalid config is a soft failure: log and return nothing.
    try:
        cfg = _load_config()
    except (FileNotFoundError, ValueError) as exc:
        print(f" [craigslist] Skipped — {exc}")
        return []

    metros_all: list[str] = cfg.get("metros", [])
    location_map: dict[str, str] = cfg.get("location_map", {})
    category: str = cfg.get("category") or _DEFAULT_CATEGORY

    # "Remote" searches fan out across every configured metro; a concrete
    # location must have an explicit mapping to a Craigslist subdomain.
    is_remote_search = location.lower() == "remote"
    if is_remote_search:
        metros = metros_all
    else:
        metro = location_map.get(location)
        if not metro:
            print(f" [craigslist] No metro mapping for '{location}' — skipping")
            return []
        metros = [metro]

    titles: list[str] = profile.get("titles", [])
    hours_old: int = profile.get("hours_old", 240)
    # Unix-timestamp cutoff; items older than this are dropped below.
    cutoff = datetime.now(tz=timezone.utc).timestamp() - (hours_old * 3600)

    seen_urls: set[str] = set()
    results: list[dict] = []

    for metro in metros:
        if len(results) >= results_wanted:
            break

        for title in titles:
            if len(results) >= results_wanted:
                break

            url = _rss_url(metro, category, title)
            # Per-feed errors skip just that metro/title pair, after the
            # usual pacing delay, rather than aborting the whole run.
            try:
                items = _fetch_rss(url)
            except requests.RequestException as exc:
                print(f" [craigslist] HTTP error ({metro}/{title}): {exc}")
                time.sleep(_SLEEP)
                continue
            except ValueError as exc:
                print(f" [craigslist] Parse error ({metro}/{title}): {exc}")
                time.sleep(_SLEEP)
                continue

            for item in items:
                if len(results) >= results_wanted:
                    break

                item_url = item.get("link", "")
                if not item_url or item_url in seen_urls:
                    continue

                # Age filter: unparseable dates (pub is None) are kept.
                pub = _parse_pubdate(item.get("pubDate", ""))
                if pub and pub.timestamp() < cutoff:
                    continue

                seen_urls.add(item_url)
                results.append({
                    "title": item.get("title", ""),
                    "company": "",
                    "url": item_url,
                    "source": "craigslist",
                    "location": f"{metro} (Craigslist)",
                    "is_remote": is_remote_search,
                    "salary": "",
                    "description": "",
                })

            time.sleep(_SLEEP)

    return results[:results_wanted]
|
||||||
179
scripts/custom_boards/theladders.py
Normal file
179
scripts/custom_boards/theladders.py
Normal file
|
|
@ -0,0 +1,179 @@
|
||||||
|
"""The Ladders scraper — Playwright-based (requires chromium installed).
|
||||||
|
|
||||||
|
The Ladders is a client-side React app (no SSR __NEXT_DATA__). We use Playwright
|
||||||
|
to execute JS, wait for job cards to render, then extract from the DOM.
|
||||||
|
|
||||||
|
Company names are hidden from guest (non-logged-in) users, but are encoded in
|
||||||
|
the job URL slug: /job/{title-slug}-{company-slug}-{location-slug}_{id}
|
||||||
|
|
||||||
|
curl_cffi is no longer needed for this scraper; plain Playwright is sufficient.
|
||||||
|
playwright must be installed: `conda run -n job-seeker python -m playwright install chromium`
|
||||||
|
|
||||||
|
Returns a list of dicts compatible with scripts.db.insert_job().
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
_BASE = "https://www.theladders.com"
|
||||||
|
_SEARCH_PATH = "/jobs/searchjobs/{slug}"
|
||||||
|
|
||||||
|
# Location slug in URLs for remote jobs
|
||||||
|
_REMOTE_SLUG = "virtual-travel"
|
||||||
|
|
||||||
|
|
||||||
|
def _company_from_url(href: str, title_slug: str) -> str:
    """
    Extract company name from The Ladders job URL slug.

    URL format: /job/{title-slug}-{company-slug}-{location-slug}_{id}?ir=1
    Example: /job/customer-success-manager-gainsight-virtual-travel_85434789
             → "Gainsight"
    """
    # Isolate the slug: drop the /job/ prefix, query string, and numeric ID.
    slug = href.split("/job/", 1)[-1]
    slug = slug.split("?", 1)[0]
    slug = re.sub(r"_\d+$", "", slug)

    # Remove the leading title portion when present.
    prefix = title_slug + "-"
    if slug.startswith(prefix):
        slug = slug[len(prefix):]

    # Trim a recognised trailing location slug, if any.
    location_suffixes = (
        f"-{_REMOTE_SLUG}", "-new-york", "-los-angeles",
        "-san-francisco", "-chicago", "-austin", "-seattle",
        "-boston", "-atlanta", "-remote",
    )
    for suffix in location_suffixes:
        if slug.endswith(suffix):
            slug = slug[: -len(suffix)]
            break

    # Whatever remains is the kebab-cased company name.
    return slug.replace("-", " ").title() if slug else ""
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_jobs_js() -> str:
    """Return the JS snippet run in page context to extract rendered job cards.

    Each card yields {title, href, salary, location, is_remote}; cards with a
    missing title or href are filtered out on the JS side.  The CSS selectors
    target The Ladders' current markup and will need updating if it changes.
    """
    return """() => {
        const cards = document.querySelectorAll('[class*=job-card-container]');
        return Array.from(cards).map(card => {
            const link = card.querySelector('p.job-link-wrapper a, a.clipped-text');
            const salary = card.querySelector('p.salary, .salary-info p');
            const locEl = card.querySelector('.remote-location-text, .location-info');
            const remoteEl = card.querySelector('.remote-flag-badge-remote');
            return {
                title: link ? link.textContent.trim() : null,
                href: link ? link.getAttribute('href') : null,
                salary: salary ? salary.textContent.replace('*','').trim() : null,
                location: locEl ? locEl.textContent.trim() : null,
                is_remote: !!remoteEl,
            };
        }).filter(j => j.title && j.href);
    }"""
|
||||||
|
|
||||||
|
|
||||||
|
def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]:
    """
    Scrape job listings from The Ladders using Playwright.

    Args:
        profile: Search profile dict (uses 'titles').
        location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA").
        results_wanted: Maximum results to return across all titles.

    Returns:
        List of job dicts with keys: title, company, url, source, location,
        is_remote, salary, description.
    """
    # Playwright is an optional dependency: missing install is a soft skip.
    try:
        from playwright.sync_api import sync_playwright
    except ImportError:
        print(
            " [theladders] playwright not installed.\n"
            " Install: conda run -n job-seeker pip install playwright && "
            "conda run -n job-seeker python -m playwright install chromium"
        )
        return []

    is_remote_search = location.lower() == "remote"
    results: list[dict] = []
    seen_urls: set[str] = set()

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        # Desktop Chrome UA — the site serves different markup to unknown agents.
        ctx = browser.new_context(
            user_agent=(
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )
        )
        page = ctx.new_page()

        for title in profile.get("titles", []):
            if len(results) >= results_wanted:
                break

            slug = title.lower().replace(" ", "-").replace("/", "-")
            title_slug = slug  # used for company extraction from URL

            params: dict[str, str] = {}
            if is_remote_search:
                params["remote"] = "true"
            elif location:
                params["location"] = location

            url = _BASE + _SEARCH_PATH.format(slug=slug)
            if params:
                query = "&".join(f"{k}={v}" for k, v in params.items())
                url = f"{url}?{query}"

            # Client-side React app: wait for network idle so cards have rendered.
            try:
                page.goto(url, timeout=30_000)
                page.wait_for_load_state("networkidle", timeout=20_000)
            except Exception as exc:
                print(f" [theladders] Page load error for '{title}': {exc}")
                continue

            try:
                raw_jobs: list[dict[str, Any]] = page.evaluate(_extract_jobs_js())
            except Exception as exc:
                print(f" [theladders] JS extract error for '{title}': {exc}")
                continue

            if not raw_jobs:
                print(f" [theladders] No cards found for '{title}' — selector may need updating")
                continue

            for job in raw_jobs:
                href = job.get("href", "")
                if not href:
                    continue
                full_url = _BASE + href if href.startswith("/") else href
                if full_url in seen_urls:
                    continue
                seen_urls.add(full_url)

                # Company names are hidden from guests but encoded in the URL slug.
                company = _company_from_url(href, title_slug)
                loc_text = (job.get("location") or "").replace("Remote", "").strip(", ")
                if is_remote_search or job.get("is_remote"):
                    loc_display = "Remote" + (f" — {loc_text}" if loc_text and loc_text != "US-Anywhere" else "")
                else:
                    loc_display = loc_text or location

                results.append({
                    "title": job.get("title", ""),
                    "company": company,
                    "url": full_url,
                    "source": "theladders",
                    "location": loc_display,
                    "is_remote": bool(job.get("is_remote") or is_remote_search),
                    "salary": job.get("salary") or "",
                    "description": "",  # not available in card view; scrape_url will fill in
                })

                if len(results) >= results_wanted:
                    break

            time.sleep(1)  # polite pacing between titles

        browser.close()

    return results[:results_wanted]
|
||||||
728
scripts/db.py
Normal file
728
scripts/db.py
Normal file
|
|
@ -0,0 +1,728 @@
|
||||||
|
"""
|
||||||
|
SQLite staging layer for job listings.
|
||||||
|
Jobs flow: pending → approved/rejected → applied → synced
|
||||||
|
applied → phone_screen → interviewing → offer → hired (or rejected)
|
||||||
|
"""
|
||||||
|
import sqlite3
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
DEFAULT_DB = Path(__file__).parent.parent / "staging.db"
|
||||||
|
|
||||||
|
# --- Schema DDL -------------------------------------------------------------
# Core table: one row per discovered job listing.  `url` is UNIQUE and serves
# as the dedup key across discovery runs; `status` drives the pipeline stages
# described in the module docstring.
CREATE_JOBS = """
CREATE TABLE IF NOT EXISTS jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    title TEXT,
    company TEXT,
    url TEXT UNIQUE,
    source TEXT,
    location TEXT,
    is_remote INTEGER DEFAULT 0,
    salary TEXT,
    description TEXT,
    match_score REAL,
    keyword_gaps TEXT,
    date_found TEXT,
    status TEXT DEFAULT 'pending',
    notion_page_id TEXT,
    cover_letter TEXT,
    applied_at TEXT
);
"""

# Email correspondence attached to a job (inbound/outbound messages).
CREATE_JOB_CONTACTS = """
CREATE TABLE IF NOT EXISTS job_contacts (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id INTEGER NOT NULL,
    direction TEXT DEFAULT 'inbound',
    subject TEXT,
    from_addr TEXT,
    to_addr TEXT,
    body TEXT,
    received_at TEXT,
    is_response_needed INTEGER DEFAULT 0,
    responded_at TEXT,
    message_id TEXT,
    FOREIGN KEY (job_id) REFERENCES jobs(id)
);
"""

# Columns added to job_contacts after the initial release; applied by
# _migrate_db via idempotent ALTER TABLE statements.
_CONTACT_MIGRATIONS = [
    ("message_id", "TEXT"),
    ("stage_signal", "TEXT"),
    ("suggestion_dismissed", "INTEGER DEFAULT 0"),
]

# Columns added to company_research after the initial release.
_RESEARCH_MIGRATIONS = [
    ("tech_brief", "TEXT"),
    ("funding_brief", "TEXT"),
    ("competitors_brief", "TEXT"),
    ("red_flags", "TEXT"),
    ("scrape_used", "INTEGER"),  # 1 = SearXNG contributed data, 0 = LLM-only
    ("accessibility_brief", "TEXT"),  # Inclusion & Accessibility section
]

# One research brief per job (job_id is UNIQUE); sections are stored as text.
CREATE_COMPANY_RESEARCH = """
CREATE TABLE IF NOT EXISTS company_research (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id INTEGER NOT NULL UNIQUE,
    generated_at TEXT,
    company_brief TEXT,
    ceo_brief TEXT,
    talking_points TEXT,
    raw_output TEXT,
    tech_brief TEXT,
    funding_brief TEXT,
    competitors_brief TEXT,
    red_flags TEXT,
    FOREIGN KEY (job_id) REFERENCES jobs(id)
);
"""

# Work queue for async jobs (research, scraping, enrichment); `status` moves
# queued → running → done/failed, with `stage` for progress reporting.
CREATE_BACKGROUND_TASKS = """
CREATE TABLE IF NOT EXISTS background_tasks (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    task_type TEXT NOT NULL,
    job_id INTEGER NOT NULL,
    status TEXT NOT NULL DEFAULT 'queued',
    error TEXT,
    created_at DATETIME DEFAULT (datetime('now')),
    started_at DATETIME,
    finished_at DATETIME,
    stage TEXT,
    updated_at DATETIME
)
"""

# Employer-sent assessment/survey records tied to a job.
CREATE_SURVEY_RESPONSES = """
CREATE TABLE IF NOT EXISTS survey_responses (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    job_id INTEGER NOT NULL REFERENCES jobs(id),
    survey_name TEXT,
    received_at DATETIME,
    source TEXT,
    raw_input TEXT,
    image_path TEXT,
    mode TEXT,
    llm_output TEXT,
    reported_score TEXT,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
);
"""

# Columns added to the jobs table after the initial release (mostly
# per-pipeline-stage timestamps).
_MIGRATIONS = [
    ("cover_letter", "TEXT"),
    ("applied_at", "TEXT"),
    ("interview_date", "TEXT"),
    ("rejection_stage", "TEXT"),
    ("phone_screen_at", "TEXT"),
    ("interviewing_at", "TEXT"),
    ("offer_at", "TEXT"),
    ("hired_at", "TEXT"),
    ("survey_at", "TEXT"),
]
|
||||||
|
|
||||||
|
|
||||||
|
def _migrate_db(db_path: Path) -> None:
    """Add new columns to existing tables without breaking old data.

    Each ALTER is attempted individually; sqlite3.OperationalError (raised
    when the column already exists) is swallowed, which makes the migration
    idempotent across restarts.
    """
    def _add_columns(conn: sqlite3.Connection, table: str, columns) -> None:
        # One ALTER per (column, type) pair; "duplicate column" means the
        # migration was already applied on a previous run.
        for col, coltype in columns:
            try:
                conn.execute(f"ALTER TABLE {table} ADD COLUMN {col} {coltype}")
            except sqlite3.OperationalError:
                pass

    conn = sqlite3.connect(db_path)
    _add_columns(conn, "jobs", _MIGRATIONS)
    _add_columns(conn, "job_contacts", _CONTACT_MIGRATIONS)
    _add_columns(conn, "company_research", _RESEARCH_MIGRATIONS)
    # background_tasks gained progress-tracking columns after initial release;
    # folded into the same data-driven pattern as the other tables instead of
    # two copy-pasted try/except blocks.
    _add_columns(conn, "background_tasks", [("stage", "TEXT"), ("updated_at", "TEXT")])
    conn.commit()
    conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def init_db(db_path: Path = DEFAULT_DB) -> None:
    """Create tables if they don't exist, then run migrations."""
    schema_statements = (
        CREATE_JOBS,
        CREATE_JOB_CONTACTS,
        CREATE_COMPANY_RESEARCH,
        CREATE_BACKGROUND_TASKS,
        CREATE_SURVEY_RESPONSES,
    )
    conn = sqlite3.connect(db_path)
    for ddl in schema_statements:
        conn.execute(ddl)
    conn.commit()
    conn.close()
    # Migrations run after creation so older databases pick up new columns.
    _migrate_db(db_path)
|
||||||
|
|
||||||
|
|
||||||
|
def insert_job(db_path: Path = DEFAULT_DB, job: dict = None) -> Optional[int]:
    """Insert a job. Returns row id, or None if URL already exists."""
    if job is None:
        return None
    row = (
        job.get("title", ""),
        job.get("company", ""),
        job.get("url", ""),
        job.get("source", ""),
        job.get("location", ""),
        int(bool(job.get("is_remote", False))),  # stored as 0/1
        job.get("salary", ""),
        job.get("description", ""),
        job.get("date_found", ""),
    )
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            """INSERT INTO jobs
            (title, company, url, source, location, is_remote, salary, description, date_found)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            row,
        )
        conn.commit()
        return cur.lastrowid
    except sqlite3.IntegrityError:
        # jobs.url carries a UNIQUE constraint — duplicates are expected
        # across discovery runs and reported as None.
        return None
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_job_by_id(db_path: Path = DEFAULT_DB, job_id: int = None) -> Optional[dict]:
    """Return a single job by ID, or None if not found."""
    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
    finally:
        conn.close()
    if row is None:
        return None
    return dict(row)
|
||||||
|
|
||||||
|
|
||||||
|
def get_jobs_by_status(db_path: Path = DEFAULT_DB, status: str = "pending") -> list[dict]:
    """Return all jobs with the given status as a list of dicts."""
    query = "SELECT * FROM jobs WHERE status = ? ORDER BY date_found DESC, id DESC"
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    jobs = [dict(r) for r in conn.execute(query, (status,))]
    conn.close()
    return jobs
|
||||||
|
|
||||||
|
|
||||||
|
def get_email_leads(db_path: Path = DEFAULT_DB) -> list[dict]:
    """Return pending jobs with source='email', newest first."""
    query = (
        "SELECT * FROM jobs WHERE source = 'email' AND status = 'pending' "
        "ORDER BY date_found DESC, id DESC"
    )
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    leads = [dict(r) for r in conn.execute(query).fetchall()]
    conn.close()
    return leads
|
||||||
|
|
||||||
|
|
||||||
|
def get_job_counts(db_path: Path = DEFAULT_DB) -> dict:
    """Return counts per status."""
    conn = sqlite3.connect(db_path)
    # Each row is a (status, count) pair, which dict() consumes directly.
    rows = conn.execute("SELECT status, COUNT(*) as n FROM jobs GROUP BY status").fetchall()
    conn.close()
    return dict(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def update_job_status(db_path: Path = DEFAULT_DB, ids: list[int] = None, status: str = "approved") -> None:
    """Batch-update status for a list of job IDs."""
    if not ids:
        return
    placeholders = ",".join("?" * len(ids))
    conn = sqlite3.connect(db_path)
    conn.execute(
        f"UPDATE jobs SET status = ? WHERE id IN ({placeholders})",
        [status, *ids],
    )
    conn.commit()
    conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_existing_urls(db_path: Path = DEFAULT_DB) -> set[str]:
    """Return all URLs already in staging (any status)."""
    conn = sqlite3.connect(db_path)
    urls = {url for (url,) in conn.execute("SELECT url FROM jobs")}
    conn.close()
    return urls
|
||||||
|
|
||||||
|
|
||||||
|
def write_match_scores(db_path: Path = DEFAULT_DB, job_id: int = None,
                       score: float = 0.0, gaps: str = "") -> None:
    """Write match score and keyword gaps back to a job row."""
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "UPDATE jobs SET match_score = ?, keyword_gaps = ? WHERE id = ?",
            (score, gaps, job_id),
        )
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def update_cover_letter(db_path: Path = DEFAULT_DB, job_id: int = None, text: str = "") -> None:
    """Persist a generated/edited cover letter for a job.

    No-op when *job_id* is None. The connection is closed even if the
    UPDATE raises (it previously leaked on error).
    """
    if job_id is None:
        return
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("UPDATE jobs SET cover_letter = ? WHERE id = ?", (text, job_id))
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
_UPDATABLE_JOB_COLS = {
|
||||||
|
"title", "company", "url", "source", "location", "is_remote",
|
||||||
|
"salary", "description", "match_score", "keyword_gaps",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def update_job_fields(db_path: Path = DEFAULT_DB, job_id: int = None,
                      fields: dict = None) -> None:
    """Update arbitrary job columns. Unknown keys are silently ignored.

    Args:
        db_path: Path to the staging SQLite database.
        job_id: Target job row ID; no-op when None.
        fields: Mapping of column -> new value; filtered against the
            _UPDATABLE_JOB_COLS whitelist (so the interpolated column
            names cannot be attacker-controlled).

    The connection is closed even if the UPDATE raises.
    """
    if job_id is None or not fields:
        return
    safe = {k: v for k, v in fields.items() if k in _UPDATABLE_JOB_COLS}
    if not safe:
        return
    conn = sqlite3.connect(db_path)
    try:
        sets = ", ".join(f"{col} = ?" for col in safe)
        conn.execute(
            f"UPDATE jobs SET {sets} WHERE id = ?",
            (*safe.values(), job_id),
        )
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def mark_applied(db_path: Path = DEFAULT_DB, ids: list[int] = None) -> None:
    """Set status='applied' and record today's date for a list of job IDs.

    No-op when *ids* is empty or None. The connection is closed even if
    the UPDATE raises (it previously leaked on error).
    """
    if not ids:
        return
    today = datetime.now().isoformat()[:10]
    conn = sqlite3.connect(db_path)
    try:
        placeholders = ",".join("?" * len(ids))
        conn.execute(
            f"UPDATE jobs SET status = 'applied', applied_at = ? WHERE id IN ({placeholders})",
            [today] + list(ids),
        )
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def kill_stuck_tasks(db_path: Path = DEFAULT_DB) -> int:
    """Mark all queued/running background tasks as failed. Returns count killed.

    The connection is closed even if the UPDATE raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        count = conn.execute(
            "UPDATE background_tasks SET status='failed', error='Killed by user',"
            " finished_at=datetime('now') WHERE status IN ('queued','running')"
        ).rowcount
        conn.commit()
        return count
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def purge_email_data(db_path: Path = DEFAULT_DB) -> tuple[int, int]:
    """Delete all job_contacts rows and email-sourced jobs.

    Returns (contacts_deleted, jobs_deleted).

    The connection is closed even if a DELETE raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        c1 = conn.execute("DELETE FROM job_contacts").rowcount
        c2 = conn.execute("DELETE FROM jobs WHERE source='email'").rowcount
        conn.commit()
        return c1, c2
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def purge_jobs(db_path: Path = DEFAULT_DB, statuses: list[str] = None) -> int:
    """Delete jobs matching given statuses. Returns number of rows deleted.

    If statuses is None or empty, deletes ALL jobs (full reset).

    The connection is closed even if the DELETE raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        if statuses:
            placeholders = ",".join("?" * len(statuses))
            cur = conn.execute(f"DELETE FROM jobs WHERE status IN ({placeholders})", statuses)
        else:
            cur = conn.execute("DELETE FROM jobs")
        count = cur.rowcount
        conn.commit()
        return count
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def purge_non_remote(db_path: Path = DEFAULT_DB) -> int:
    """Delete non-remote jobs that are not yet in the active pipeline.

    Preserves applied, phone_screen, interviewing, offer, hired, and synced
    records. Returns number of rows deleted.

    The connection is closed even if the DELETE raises.
    """
    # Statuses that must never be purged, even when not remote.
    _safe = ("applied", "phone_screen", "interviewing", "offer", "hired", "synced")
    placeholders = ",".join("?" * len(_safe))
    conn = sqlite3.connect(db_path)
    try:
        count = conn.execute(
            f"DELETE FROM jobs WHERE (is_remote = 0 OR is_remote IS NULL)"
            f" AND status NOT IN ({placeholders})",
            _safe,
        ).rowcount
        conn.commit()
        return count
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def archive_jobs(db_path: Path = DEFAULT_DB, statuses: list[str] = None) -> int:
    """Set status='archived' for jobs matching given statuses.

    Archived jobs stay in the DB (preserving dedup by URL) but are invisible
    to Job Review and other pipeline views.
    Returns number of rows updated; 0 when *statuses* is empty or None.

    The connection is closed even if the UPDATE raises.
    """
    if not statuses:
        return 0
    placeholders = ",".join("?" * len(statuses))
    conn = sqlite3.connect(db_path)
    try:
        count = conn.execute(
            f"UPDATE jobs SET status = 'archived' WHERE status IN ({placeholders})",
            statuses,
        ).rowcount
        conn.commit()
        return count
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Interview pipeline helpers ────────────────────────────────────────────────
|
||||||
|
|
||||||
|
_STAGE_TS_COL = {
|
||||||
|
"phone_screen": "phone_screen_at",
|
||||||
|
"interviewing": "interviewing_at",
|
||||||
|
"offer": "offer_at",
|
||||||
|
"hired": "hired_at",
|
||||||
|
"survey": "survey_at",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def get_interview_jobs(db_path: Path = DEFAULT_DB) -> dict[str, list[dict]]:
    """Return jobs grouped by interview/post-apply stage.

    Keys are the stage names below (always present, possibly empty lists);
    within each stage, jobs are ordered newest-applied first.

    The connection is closed even if a query raises.
    """
    stages = ["applied", "survey", "phone_screen", "interviewing", "offer", "hired", "rejected"]
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        result: dict[str, list[dict]] = {}
        for stage in stages:
            cursor = conn.execute(
                "SELECT * FROM jobs WHERE status = ? ORDER BY applied_at DESC, id DESC",
                (stage,),
            )
            result[stage] = [dict(row) for row in cursor.fetchall()]
        return result
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def advance_to_stage(db_path: Path = DEFAULT_DB, job_id: int = None, stage: str = "") -> None:
    """Move a job to the next interview stage and record a timestamp.

    The per-stage timestamp column comes from _STAGE_TS_COL; stages without
    a mapped column only get their status updated.

    The connection is closed even if the UPDATE raises.
    """
    now = datetime.now().isoformat()[:16]
    ts_col = _STAGE_TS_COL.get(stage)
    conn = sqlite3.connect(db_path)
    try:
        if ts_col:
            conn.execute(
                f"UPDATE jobs SET status = ?, {ts_col} = ? WHERE id = ?",
                (stage, now, job_id),
            )
        else:
            conn.execute("UPDATE jobs SET status = ? WHERE id = ?", (stage, job_id))
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def reject_at_stage(db_path: Path = DEFAULT_DB, job_id: int = None,
                    rejection_stage: str = "") -> None:
    """Mark a job as rejected and record at which stage it was rejected.

    The connection is closed even if the UPDATE raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "UPDATE jobs SET status = 'rejected', rejection_stage = ? WHERE id = ?",
            (rejection_stage, job_id),
        )
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def set_interview_date(db_path: Path = DEFAULT_DB, job_id: int = None,
                       date_str: str = "") -> None:
    """Persist an interview date for a job.

    The connection is closed even if the UPDATE raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("UPDATE jobs SET interview_date = ? WHERE id = ?", (date_str, job_id))
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Contact log helpers ───────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def add_contact(db_path: Path = DEFAULT_DB, job_id: int = None,
                direction: str = "inbound", subject: str = "",
                from_addr: str = "", to_addr: str = "",
                body: str = "", received_at: str = "",
                message_id: str = "",
                stage_signal: str = "") -> int:
    """Log an email contact. Returns the new row id.

    Args:
        db_path: Path to the staging SQLite database.
        job_id: Job the contact belongs to.
        direction: 'inbound' or 'outbound'.
        received_at: Timestamp; defaults to now when empty.
        stage_signal: Detected pipeline signal; stored as NULL when empty.

    The connection is closed even if the INSERT raises.
    """
    ts = received_at or datetime.now().isoformat()[:16]
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            """INSERT INTO job_contacts
               (job_id, direction, subject, from_addr, to_addr, body,
                received_at, message_id, stage_signal)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (job_id, direction, subject, from_addr, to_addr, body,
             ts, message_id, stage_signal or None),
        )
        conn.commit()
        return cur.lastrowid
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_contacts(db_path: Path = DEFAULT_DB, job_id: int = None) -> list[dict]:
    """Return all contact log entries for a job, oldest first.

    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.execute(
            "SELECT * FROM job_contacts WHERE job_id = ? ORDER BY received_at ASC",
            (job_id,),
        )
        return [dict(row) for row in cursor.fetchall()]
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_unread_stage_signals(db_path: Path = DEFAULT_DB,
                             job_id: int = None) -> list[dict]:
    """Return inbound contacts with a non-neutral, non-dismissed stage signal.

    Used to surface "this email suggests a stage change" prompts in the UI.

    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        rows = conn.execute(
            """SELECT * FROM job_contacts
               WHERE job_id = ?
                 AND direction = 'inbound'
                 AND stage_signal IS NOT NULL
                 AND stage_signal != 'neutral'
                 AND (suggestion_dismissed IS NULL OR suggestion_dismissed = 0)
               ORDER BY received_at ASC""",
            (job_id,),
        ).fetchall()
        return [dict(r) for r in rows]
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def dismiss_stage_signal(db_path: Path = DEFAULT_DB,
                         contact_id: int = None) -> None:
    """Mark a stage signal suggestion as dismissed.

    The connection is closed even if the UPDATE raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            "UPDATE job_contacts SET suggestion_dismissed = 1 WHERE id = ?",
            (contact_id,),
        )
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_message_ids(db_path: Path = DEFAULT_DB) -> set[str]:
    """Return all known Message-IDs across all job contacts.

    Used to dedup email ingestion. The connection is closed even if the
    query raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            "SELECT message_id FROM job_contacts WHERE message_id IS NOT NULL AND message_id != ''"
        ).fetchall()
        return {r[0] for r in rows}
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Company research helpers ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def save_research(db_path: Path = DEFAULT_DB, job_id: int = None,
                  company_brief: str = "", ceo_brief: str = "",
                  talking_points: str = "", raw_output: str = "",
                  tech_brief: str = "", funding_brief: str = "",
                  competitors_brief: str = "", red_flags: str = "",
                  accessibility_brief: str = "",
                  scrape_used: int = 0) -> None:
    """Insert or replace a company research record for a job.

    Upserts on job_id via SQLite's ON CONFLICT ... DO UPDATE, refreshing
    every brief column and the generated_at timestamp.

    The connection is closed even if the upsert raises.
    """
    now = datetime.now().isoformat()[:16]
    conn = sqlite3.connect(db_path)
    try:
        conn.execute(
            """INSERT INTO company_research
               (job_id, generated_at, company_brief, ceo_brief, talking_points,
                raw_output, tech_brief, funding_brief, competitors_brief, red_flags,
                accessibility_brief, scrape_used)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
               ON CONFLICT(job_id) DO UPDATE SET
                   generated_at = excluded.generated_at,
                   company_brief = excluded.company_brief,
                   ceo_brief = excluded.ceo_brief,
                   talking_points = excluded.talking_points,
                   raw_output = excluded.raw_output,
                   tech_brief = excluded.tech_brief,
                   funding_brief = excluded.funding_brief,
                   competitors_brief = excluded.competitors_brief,
                   red_flags = excluded.red_flags,
                   accessibility_brief = excluded.accessibility_brief,
                   scrape_used = excluded.scrape_used""",
            (job_id, now, company_brief, ceo_brief, talking_points, raw_output,
             tech_brief, funding_brief, competitors_brief, red_flags,
             accessibility_brief, scrape_used),
        )
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_research(db_path: Path = DEFAULT_DB, job_id: int = None) -> Optional[dict]:
    """Return the company research record for a job, or None if absent.

    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        cursor = conn.execute(
            "SELECT * FROM company_research WHERE job_id = ?", (job_id,)
        )
        row = cursor.fetchone()
        return dict(row) if row else None
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Survey response helpers ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def insert_survey_response(
    db_path: Path = DEFAULT_DB,
    job_id: int = None,
    survey_name: str = "",
    received_at: str = "",
    source: str = "text_paste",
    raw_input: str = "",
    image_path: str = "",
    mode: str = "quick",
    llm_output: str = "",
    reported_score: str = "",
) -> int:
    """Insert a survey response row. Returns the new row id.

    Empty-string optional fields are stored as NULL (via ``x or None``) so
    queries can distinguish "absent" from "empty".

    The connection is closed even if the INSERT raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            """INSERT INTO survey_responses
               (job_id, survey_name, received_at, source, raw_input,
                image_path, mode, llm_output, reported_score)
               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
            (job_id, survey_name or None, received_at or None,
             source, raw_input or None, image_path or None,
             mode, llm_output, reported_score or None),
        )
        conn.commit()
        return cur.lastrowid
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_survey_responses(db_path: Path = DEFAULT_DB, job_id: int = None) -> list[dict]:
    """Return all survey responses for a job, newest first.

    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        rows = conn.execute(
            "SELECT * FROM survey_responses WHERE job_id = ? ORDER BY created_at DESC",
            (job_id,),
        ).fetchall()
        return [dict(r) for r in rows]
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ── Background task helpers ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def insert_task(db_path: Path = DEFAULT_DB, task_type: str = "",
                job_id: int = None) -> tuple[int, bool]:
    """Insert a new background task.

    Returns (task_id, True) if inserted, or (existing_id, False) if a
    queued/running task for the same (task_type, job_id) already exists.

    The connection is closed on every exit path, including exceptions
    (the early-return path previously duplicated the close call).
    """
    conn = sqlite3.connect(db_path)
    try:
        existing = conn.execute(
            "SELECT id FROM background_tasks WHERE task_type=? AND job_id=? AND status IN ('queued','running')",
            (task_type, job_id),
        ).fetchone()
        if existing:
            return existing[0], False
        cur = conn.execute(
            "INSERT INTO background_tasks (task_type, job_id, status) VALUES (?, ?, 'queued')",
            (task_type, job_id),
        )
        task_id = cur.lastrowid
        conn.commit()
        return task_id, True
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def update_task_status(db_path: Path = DEFAULT_DB, task_id: int = None,
                       status: str = "", error: Optional[str] = None) -> None:
    """Update a task's status and set the appropriate timestamp.

    'running' stamps started_at; 'completed'/'failed' stamp finished_at and
    record *error*; any other status only bumps updated_at.

    The connection is closed even if the UPDATE raises.
    """
    now = datetime.now().isoformat()[:16]
    conn = sqlite3.connect(db_path)
    try:
        if status == "running":
            conn.execute(
                "UPDATE background_tasks SET status=?, started_at=?, updated_at=? WHERE id=?",
                (status, now, now, task_id),
            )
        elif status in ("completed", "failed"):
            conn.execute(
                "UPDATE background_tasks SET status=?, finished_at=?, updated_at=?, error=? WHERE id=?",
                (status, now, now, error, task_id),
            )
        else:
            conn.execute(
                "UPDATE background_tasks SET status=?, updated_at=? WHERE id=?",
                (status, now, task_id),
            )
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def update_task_stage(db_path: Path = DEFAULT_DB, task_id: int = None,
                      stage: str = "") -> None:
    """Update the stage label on a running task (for progress display).

    The connection is closed even if the UPDATE raises.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute("UPDATE background_tasks SET stage=? WHERE id=?", (stage, task_id))
        conn.commit()
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_active_tasks(db_path: Path = DEFAULT_DB) -> list[dict]:
    """Return all queued/running tasks with job title and company joined in.

    LEFT JOIN so tasks with no associated job still appear (title/company
    will be None). The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        rows = conn.execute("""
            SELECT bt.*, j.title, j.company
            FROM background_tasks bt
            LEFT JOIN jobs j ON j.id = bt.job_id
            WHERE bt.status IN ('queued', 'running')
            ORDER BY bt.created_at ASC
        """).fetchall()
        return [dict(r) for r in rows]
    finally:
        conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_task_for_job(db_path: Path = DEFAULT_DB, task_type: str = "",
                     job_id: int = None) -> Optional[dict]:
    """Return the most recent task row for a (task_type, job_id) pair, or None.

    The connection is closed even if the query raises.
    """
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        row = conn.execute(
            """SELECT * FROM background_tasks
               WHERE task_type=? AND job_id=?
               ORDER BY id DESC LIMIT 1""",
            (task_type, job_id),
        ).fetchone()
        return dict(row) if row else None
    finally:
        conn.close()
|
||||||
285
scripts/discover.py
Normal file
285
scripts/discover.py
Normal file
|
|
@ -0,0 +1,285 @@
|
||||||
|
# scripts/discover.py
|
||||||
|
"""
|
||||||
|
JobSpy → SQLite staging pipeline (default) or Notion (notion_push=True).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/discover.py
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
from jobspy import scrape_jobs
|
||||||
|
from notion_client import Client
|
||||||
|
|
||||||
|
from scripts.db import DEFAULT_DB, init_db, insert_job, get_existing_urls as db_existing_urls
|
||||||
|
from scripts.custom_boards import adzuna as _adzuna
|
||||||
|
from scripts.custom_boards import theladders as _theladders
|
||||||
|
from scripts.custom_boards import craigslist as _craigslist
|
||||||
|
|
||||||
|
# Configuration files all live under <repo>/config/.
CONFIG_DIR = Path(__file__).parent.parent / "config"
NOTION_CFG = CONFIG_DIR / "notion.yaml"
PROFILES_CFG = CONFIG_DIR / "search_profiles.yaml"
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"

# Registry of custom board scrapers keyed by the name used in
# search_profiles.yaml; each value is a scrape(profile, location, ...) callable.
CUSTOM_SCRAPERS: dict[str, object] = {
    "adzuna": _adzuna.scrape,
    "theladders": _theladders.scrape,
    "craigslist": _craigslist.scrape,
}
|
||||||
|
|
||||||
|
|
||||||
|
def load_config() -> tuple[dict, dict]:
    """Parse the search-profile and Notion YAML configs; return (profiles, notion)."""
    return (
        yaml.safe_load(PROFILES_CFG.read_text()),
        yaml.safe_load(NOTION_CFG.read_text()),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def load_blocklist() -> dict:
    """Load the global blocklist config.

    Returns a dict with lower-cased ``companies``, ``industries`` and
    ``locations`` lists; every list is empty when the file is absent.
    """
    if not BLOCKLIST_CFG.exists():
        return {"companies": [], "industries": [], "locations": []}
    raw = yaml.safe_load(BLOCKLIST_CFG.read_text()) or {}
    cleaned: dict = {}
    for section in ("companies", "industries", "locations"):
        cleaned[section] = [entry.lower() for entry in raw.get(section, []) if entry]
    return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def _is_blocklisted(job_row: dict, blocklist: dict) -> bool:
|
||||||
|
"""Return True if this job matches any global blocklist rule."""
|
||||||
|
company_lower = (job_row.get("company") or "").lower()
|
||||||
|
location_lower = (job_row.get("location") or "").lower()
|
||||||
|
desc_lower = (job_row.get("description") or "").lower()
|
||||||
|
content_lower = f"{company_lower} {desc_lower}"
|
||||||
|
|
||||||
|
if any(bl in company_lower for bl in blocklist["companies"]):
|
||||||
|
return True
|
||||||
|
if any(bl in content_lower for bl in blocklist["industries"]):
|
||||||
|
return True
|
||||||
|
if any(bl in location_lower for bl in blocklist["locations"]):
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def get_existing_urls(notion: Client, db_id: str, url_field: str) -> set[str]:
    """Return the set of all job URLs already tracked in Notion (for notion_push mode).

    Pages through the database 100 rows at a time, following Notion's
    cursor until has_more is False.
    """
    seen: set[str] = set()
    cursor = None
    while True:
        query_args: dict = {"database_id": db_id, "page_size": 100}
        if cursor:
            query_args["start_cursor"] = cursor
        resp = notion.databases.query(**query_args)
        for page in resp["results"]:
            url = page["properties"].get(url_field, {}).get("url")
            if url:
                seen.add(url)
        if not resp.get("has_more", False):
            break
        cursor = resp.get("next_cursor")
    return seen
|
||||||
|
|
||||||
|
|
||||||
|
def push_to_notion(notion: Client, db_id: str, job: dict, fm: dict) -> None:
    """Create a new page in the Notion jobs database for a single listing.

    The title column prefers a formatted salary range, then the raw
    salary_source string, then the job title. ``fm`` maps logical field
    names to the Notion database's property names.
    """
    lo = job.get("min_amount")
    hi = job.get("max_amount")
    if lo and hi and not (pd.isna(lo) or pd.isna(hi)):
        title_content = f"${int(lo):,} – ${int(hi):,}"
    elif job.get("salary_source") and str(job["salary_source"]) not in ("nan", "None", ""):
        title_content = str(job["salary_source"])
    else:
        title_content = str(job.get("title", "Unknown"))

    # Normalize pandas NaN/None string artifacts to an empty URL.
    job_url = str(job.get("job_url", "") or "")
    if job_url in ("nan", "None"):
        job_url = ""

    properties = {
        fm["title_field"]: {"title": [{"text": {"content": title_content}}]},
        fm["job_title"]: {"rich_text": [{"text": {"content": str(job.get("title", "Unknown"))}}]},
        fm["company"]: {"rich_text": [{"text": {"content": str(job.get("company", "") or "")}}]},
        fm["url"]: {"url": job_url or None},
        fm["source"]: {"multi_select": [{"name": str(job.get("site", "unknown")).title()}]},
        fm["status"]: {"select": {"name": fm["status_new"]}},
        fm["remote"]: {"checkbox": bool(job.get("is_remote", False))},
        fm["date_found"]: {"date": {"start": datetime.now().isoformat()[:10]}},
    }
    notion.pages.create(parent={"database_id": db_id}, properties=properties)
|
||||||
|
|
||||||
|
|
||||||
|
def run_discovery(db_path: Path = DEFAULT_DB, notion_push: bool = False) -> int:
    """Scrape all configured boards and stage new listings in SQLite.

    Dedupes by URL and by (title, company) to catch cross-board reposts,
    applies the global blocklist and per-profile exclude keywords, and
    optionally mirrors each new listing to Notion.

    Args:
        db_path: Path to the staging SQLite database.
        notion_push: When True, also create a Notion page per new listing.

    Returns:
        The number of new listings staged.

    Fixes: the return annotation now matches the actual ``int`` return
    value, the (title, company) dedup key is built with ``.strip()[:80]``
    so it agrees with the keys preloaded from the DB, and the ad-hoc dedup
    connection is closed even if its query raises.
    """
    profiles_cfg, notion_cfg = load_config()
    fm = notion_cfg["field_map"]
    blocklist = load_blocklist()

    _bl_summary = {k: len(v) for k, v in blocklist.items() if v}
    if _bl_summary:
        print(f"[discover] Blocklist active: {_bl_summary}")

    # SQLite dedup — by URL and by (title, company) to catch cross-board reposts
    init_db(db_path)
    existing_urls = db_existing_urls(db_path)

    import sqlite3 as _sqlite3
    _conn = _sqlite3.connect(db_path)
    try:
        existing_tc = {
            (r[0].lower().strip()[:80], r[1].lower().strip())
            for r in _conn.execute("SELECT title, company FROM jobs").fetchall()
        }
    finally:
        _conn.close()

    # Notion dedup (only in notion_push mode)
    notion = None
    if notion_push:
        notion = Client(auth=notion_cfg["token"])
        existing_urls |= get_existing_urls(notion, notion_cfg["database_id"], fm["url"])

    print(f"[discover] {len(existing_urls)} existing listings in DB")
    new_count = 0

    def _s(val, default="") -> str:
        """Convert a value to str, treating pandas NaN/None as default."""
        if val is None:
            return default
        s = str(val)
        return default if s in ("nan", "None", "NaN") else s

    def _insert_if_new(job_row: dict, source_label: str) -> bool:
        """Dedup-check, blocklist-check, and insert a job dict. Returns True if inserted."""
        url = job_row.get("url", "")
        if not url or url in existing_urls:
            return False

        # Global blocklist — checked before anything else
        if _is_blocklisted(job_row, blocklist):
            return False

        title_lower = job_row.get("title", "").lower()
        desc_lower = job_row.get("description", "").lower()
        exclude_kw = job_row.get("_exclude_kw", [])
        if any(kw in title_lower or kw in desc_lower for kw in exclude_kw):
            return False

        # Strip before truncating so the key matches those preloaded from the DB.
        tc_key = (title_lower.strip()[:80], job_row.get("company", "").lower().strip())
        if tc_key in existing_tc:
            return False
        existing_tc.add(tc_key)

        insert_job(db_path, {
            "title": job_row.get("title", ""),
            "company": job_row.get("company", ""),
            "url": url,
            "source": job_row.get("source", source_label),
            "location": job_row.get("location", ""),
            "is_remote": bool(job_row.get("is_remote", False)),
            "salary": job_row.get("salary", ""),
            "description": job_row.get("description", ""),
            "date_found": datetime.now().isoformat()[:10],
        })
        existing_urls.add(url)
        return True

    for profile in profiles_cfg["profiles"]:
        print(f"\n[discover] ── Profile: {profile['name']} ──")
        boards = profile.get("boards", [])
        custom_boards = profile.get("custom_boards", [])
        exclude_kw = [kw.lower() for kw in profile.get("exclude_keywords", [])]
        results_per_board = profile.get("results_per_board", 25)

        for location in profile["locations"]:

            # ── JobSpy boards ──────────────────────────────────────────────────
            if boards:
                print(f" [jobspy] {location} — boards: {', '.join(boards)}")
                try:
                    jobs: pd.DataFrame = scrape_jobs(
                        site_name=boards,
                        search_term=" OR ".join(f'"{t}"' for t in profile["titles"]),
                        location=location,
                        results_wanted=results_per_board,
                        hours_old=profile.get("hours_old", 72),
                        linkedin_fetch_description=True,
                    )
                    print(f" [jobspy] {len(jobs)} raw results")
                except Exception as exc:
                    print(f" [jobspy] ERROR: {exc}")
                    jobs = pd.DataFrame()

                jobspy_new = 0
                for _, job in jobs.iterrows():
                    url = str(job.get("job_url", "") or "")
                    if not url or url in ("nan", "None"):
                        continue

                    job_dict = job.to_dict()

                    # Build salary string from JobSpy numeric fields
                    min_amt = job_dict.get("min_amount")
                    max_amt = job_dict.get("max_amount")
                    salary_str = ""
                    if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)):
                        salary_str = f"${int(min_amt):,} – ${int(max_amt):,}"
                    elif job_dict.get("salary_source") and str(job_dict["salary_source"]) not in ("nan", "None", ""):
                        salary_str = str(job_dict["salary_source"])

                    row = {
                        "url": url,
                        "title": _s(job_dict.get("title")),
                        "company": _s(job_dict.get("company")),
                        "source": _s(job_dict.get("site")),
                        "location": _s(job_dict.get("location")),
                        "is_remote": bool(job_dict.get("is_remote", False)),
                        "salary": salary_str,
                        "description": _s(job_dict.get("description")),
                        "_exclude_kw": exclude_kw,
                    }
                    if _insert_if_new(row, _s(job_dict.get("site"))):
                        if notion_push:
                            push_to_notion(notion, notion_cfg["database_id"], job_dict, fm)
                        new_count += 1
                        jobspy_new += 1
                        print(f" + {row['title']} @ {row['company']} [{row['source']}]")

                print(f" [jobspy] {jobspy_new} new listings from {location}")

            # ── Custom boards ──────────────────────────────────────────────────
            for board_name in custom_boards:
                scraper_fn = CUSTOM_SCRAPERS.get(board_name)
                if scraper_fn is None:
                    print(f" [{board_name}] Unknown scraper — skipping (not in CUSTOM_SCRAPERS registry)")
                    continue

                print(f" [{board_name}] {location} — fetching up to {results_per_board} results …")
                try:
                    custom_jobs = scraper_fn(profile, location, results_wanted=results_per_board)
                except Exception as exc:
                    print(f" [{board_name}] ERROR: {exc}")
                    custom_jobs = []

                print(f" [{board_name}] {len(custom_jobs)} raw results")
                board_new = 0
                for job in custom_jobs:
                    row = {**job, "_exclude_kw": exclude_kw}
                    if _insert_if_new(row, board_name):
                        new_count += 1
                        board_new += 1
                        print(f" + {job.get('title')} @ {job.get('company')} [{board_name}]")

                print(f" [{board_name}] {board_new} new listings from {location}")

    print(f"\n[discover] Done — {new_count} new listings staged total.")
    return new_count
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI entry point: run a full discovery pass with default settings.
    run_discovery()
|
||||||
284
scripts/enrich_descriptions.py
Normal file
284
scripts/enrich_descriptions.py
Normal file
|
|
@ -0,0 +1,284 @@
|
||||||
|
# scripts/enrich_descriptions.py
|
||||||
|
"""
|
||||||
|
Post-discovery enrichment: retry Glassdoor job description fetches that
|
||||||
|
returned empty/null during the initial scrape (usually rate-limit 429s or
|
||||||
|
expired listings mid-batch).
|
||||||
|
|
||||||
|
Fetches descriptions one at a time with a configurable delay between
|
||||||
|
requests to stay under Glassdoor's rate limit.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/enrich_descriptions.py
|
||||||
|
conda run -n job-seeker python scripts/enrich_descriptions.py --dry-run
|
||||||
|
conda run -n job-seeker python scripts/enrich_descriptions.py --delay 2.0
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from scripts.db import DEFAULT_DB, init_db
|
||||||
|
|
||||||
|
DELAY_SECS = 1.5 # seconds between description fetches
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_job_id(url: str) -> str | None:
|
||||||
|
"""Pull the Glassdoor listing ID from a job URL (…?jl=1234567890)."""
|
||||||
|
m = re.search(r"jl=(\d+)", url or "")
|
||||||
|
return m.group(1) if m else None
|
||||||
|
|
||||||
|
|
||||||
|
def _setup_scraper():
    """
    Create a Glassdoor scraper instance initialised just enough to call
    _fetch_job_description() — skips the full job-search setup.

    Returns:
        A ``jobspy`` Glassdoor scraper with a retrying HTTP session, a CSRF
        token installed in the shared request headers, and a minimal
        ScraperInput attached.
    """
    from jobspy.glassdoor import Glassdoor
    from jobspy.glassdoor.constant import fallback_token, headers
    from jobspy.model import ScraperInput, Site
    from jobspy.util import create_session

    scraper = Glassdoor()
    scraper.base_url = "https://www.glassdoor.com/"
    scraper.session = create_session(has_retry=True)
    # Fetch a live CSRF token; fall back to the library's baked-in token when
    # the request fails. NOTE(review): this mutates the module-level `headers`
    # dict shared by all jobspy Glassdoor calls in this process.
    token = scraper._get_csrf_token()
    headers["gd-csrf-token"] = token if token else fallback_token
    # Minimal ScraperInput so internal methods that read scraper_input work.
    scraper.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR])
    return scraper
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_glassdoor_descriptions(
    db_path: Path = DEFAULT_DB,
    dry_run: bool = False,
    delay: float = DELAY_SECS,
) -> dict:
    """
    Find Glassdoor jobs with missing descriptions and re-fetch them.

    Args:
        db_path: SQLite staging database to read and update.
        dry_run: When True, fetch descriptions but do not write them back.
        delay: Seconds to sleep between fetches (rate-limit cushion).

    Returns:
        {"attempted": N, "succeeded": N, "failed": N, "errors": [...]}
    """
    init_db(db_path)

    # Collect candidates up front so the read connection is not held open
    # across the slow network fetches below.
    conn = sqlite3.connect(db_path)
    rows = conn.execute(
        """SELECT id, url, company, title FROM jobs
        WHERE source = 'glassdoor'
        AND (description IS NULL OR TRIM(description) = '')
        ORDER BY id ASC"""
    ).fetchall()
    conn.close()

    result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []}

    if not rows:
        print("[enrich] No Glassdoor jobs missing descriptions.")
        return result

    print(f"[enrich] {len(rows)} Glassdoor job(s) missing descriptions — fetching…")

    try:
        scraper = _setup_scraper()
    except Exception as e:
        # Without a scraper nothing can be fetched: mark every row failed.
        msg = f"Glassdoor scraper init failed: {e}"
        result["errors"].append(msg)
        result["failed"] = len(rows)
        print(f"[enrich] ERROR — {msg}")
        return result

    for db_id, url, company, title in rows:
        job_id = _extract_job_id(url)
        if not job_id:
            msg = f"job #{db_id}: cannot extract listing ID from URL: {url}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] SKIP — {msg}")
            continue

        try:
            description = scraper._fetch_job_description(int(job_id))
            if description and description.strip():
                if not dry_run:
                    # Short-lived connection per write keeps the DB unlocked
                    # during the slow fetches.
                    upd = sqlite3.connect(db_path)
                    upd.execute(
                        "UPDATE jobs SET description = ? WHERE id = ?",
                        (description, db_id),
                    )
                    upd.commit()
                    upd.close()
                tag = "[DRY-RUN] " if dry_run else ""
                print(f"[enrich] {tag}{company} — {title}: {len(description)} chars")
                result["succeeded"] += 1
            else:
                # An empty body usually means the listing expired server-side.
                print(f"[enrich] {company} — {title}: empty response (expired listing?)")
                result["failed"] += 1
        except Exception as e:
            msg = f"job #{db_id} ({company}): {e}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] ERROR — {msg}")

        # Sleep between requests to stay under Glassdoor's rate limit.
        if delay > 0:
            time.sleep(delay)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_all_descriptions(
    db_path: Path = DEFAULT_DB,
    dry_run: bool = False,
    delay: float = DELAY_SECS,
) -> dict:
    """
    Find ALL jobs with missing/empty descriptions (any source) and re-fetch them.

    Uses scrape_job_url for every source — it handles LinkedIn, Indeed, Glassdoor,
    Adzuna, The Ladders, and any generic URL via JSON-LD / og: tags.

    Args:
        db_path: SQLite staging database to read and update.
        dry_run: When True, list candidates without calling the scraper.
        delay: Seconds to sleep between jobs (rate-limit cushion).

    Returns:
        {"attempted": N, "succeeded": N, "failed": N, "errors": [...]}
    """
    from scripts.scrape_url import scrape_job_url

    init_db(db_path)

    conn = sqlite3.connect(db_path)
    rows = conn.execute(
        """SELECT id, url, company, title, source FROM jobs
        WHERE (description IS NULL OR TRIM(description) = '')
        AND url IS NOT NULL AND url != ''
        ORDER BY source, id ASC"""
    ).fetchall()
    conn.close()

    result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []}

    if not rows:
        print("[enrich] No jobs with missing descriptions.")
        return result

    print(f"[enrich] {len(rows)} job(s) missing descriptions — fetching…")

    for db_id, url, company, title, source in rows:
        # Defensive: skip non-HTTP URLs that slipped past the SQL filter.
        if not url.startswith("http"):
            result["failed"] += 1
            continue

        tag = "[DRY-RUN] " if dry_run else ""
        try:
            # In dry-run mode nothing is scraped; fields stays empty and the
            # job is reported as a would-be success with 0 chars.
            fields = {} if dry_run else scrape_job_url(db_path, db_id)
            if fields or dry_run:
                desc_len = len(fields.get("description", "") or "")
                print(f"[enrich] {tag}[{source}] {company} — {title}: {desc_len} chars")
                result["succeeded"] += 1
            else:
                print(f"[enrich] [{source}] {company} — {title}: no data returned")
                result["failed"] += 1
        except Exception as e:
            msg = f"job #{db_id} ({company}): {e}"
            result["errors"].append(msg)
            result["failed"] += 1
            print(f"[enrich] ERROR — {msg}")

        # Sleep between requests to stay polite across all job boards.
        if delay > 0:
            time.sleep(delay)

    return result
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_craigslist_fields(
    db_path: Path = DEFAULT_DB,
    job_id: int | None = None,
) -> dict:
    """
    Use LLM to extract company name and salary from a Craigslist job description.

    Called after scrape_url populates the description for a craigslist job.
    Only runs when: source='craigslist', company='', description non-empty.

    Args:
        db_path: SQLite staging database.
        job_id: Row id of the job to enrich. An id that matches no row
            (including the default None) is a silent no-op.

    Returns dict with keys 'company' and/or 'salary' (may be empty strings).
    """
    import json

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    row = conn.execute(
        "SELECT id, description, company, source FROM jobs WHERE id=?", (job_id,)
    ).fetchone()
    conn.close()

    # Guard clauses: only craigslist rows with an empty company and a
    # non-empty description are eligible for enrichment.
    if not row:
        return {}
    if row["source"] != "craigslist":
        return {}
    if row["company"]:  # already populated
        return {}
    if not (row["description"] or "").strip():
        return {}

    from scripts.llm_router import LLMRouter

    # Truncate the posting to keep the prompt within small-model context.
    prompt = (
        "Extract the following from this job posting. "
        "Return JSON only, no commentary.\n\n"
        '{"company": "<company name or empty string>", '
        '"salary": "<salary/compensation or empty string>"}\n\n'
        f"Posting:\n{row['description'][:3000]}"
    )

    try:
        router = LLMRouter()
        raw = router.complete(prompt)
    except Exception as exc:
        print(f"[enrich_craigslist] LLM error for job {job_id}: {exc}")
        return {}

    try:
        # Strip optional ```json fences the model may wrap around the payload.
        clean = re.sub(r"```(?:json)?|```", "", raw).strip()
        fields = json.loads(clean)
    except (json.JSONDecodeError, ValueError):
        print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}")
        return {}

    # Keep only non-empty, whitespace-trimmed values for the two known keys.
    extracted = {
        k: (fields.get(k) or "").strip()
        for k in ("company", "salary")
        if (fields.get(k) or "").strip()
    }

    if extracted:
        from scripts.db import update_job_fields
        update_job_fields(db_path, job_id, extracted)
        print(f"[enrich_craigslist] job {job_id}: "
              f"company={extracted.get('company', '—')} "
              f"salary={extracted.get('salary', '—')}")

    return extracted
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # CLI wrapper: choose the legacy Glassdoor-only pass or the full pass,
    # run it, then print a one-line summary.
    import argparse

    ap = argparse.ArgumentParser(
        description="Re-fetch missing job descriptions (all sources)"
    )
    ap.add_argument("--glassdoor-only", action="store_true",
                    help="Only re-fetch Glassdoor listings (legacy behaviour)")
    ap.add_argument("--dry-run", action="store_true",
                    help="Show what would be fetched without saving")
    ap.add_argument("--delay", type=float, default=DELAY_SECS,
                    help=f"Seconds between requests (default: {DELAY_SECS})")
    opts = ap.parse_args()

    runner = enrich_glassdoor_descriptions if opts.glassdoor_only else enrich_all_descriptions
    stats = runner(dry_run=opts.dry_run, delay=opts.delay)

    suffix = f", {len(stats['errors'])} error(s)" if stats["errors"] else ""
    print(f"\n[enrich] Done — {stats['succeeded']} fetched, {stats['failed']} failed" + suffix)
|
||||||
248
scripts/finetune_local.py
Normal file
248
scripts/finetune_local.py
Normal file
|
|
@ -0,0 +1,248 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
# scripts/finetune_local.py
|
||||||
|
"""
|
||||||
|
Local LoRA fine-tune on Alex's cover letter corpus.
|
||||||
|
No HuggingFace account or internet required after the base model is cached.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n ogma python scripts/finetune_local.py
|
||||||
|
conda run -n ogma python scripts/finetune_local.py --model unsloth/Llama-3.2-3B-Instruct
|
||||||
|
conda run -n ogma python scripts/finetune_local.py --epochs 15 --rank 16
|
||||||
|
|
||||||
|
After training, follow the printed instructions to load the model into Ollama.
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Limit CUDA to GPU 0. device_map={"":0} in FastLanguageModel.from_pretrained
|
||||||
|
# pins every layer to GPU 0, avoiding the accelerate None-device bug that
|
||||||
|
# occurs with device_map="auto" on multi-GPU machines with 4-bit quantisation.
|
||||||
|
# Do NOT set WORLD_SIZE/RANK — that triggers torch.distributed initialisation.
|
||||||
|
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")
|
||||||
|
|
||||||
|
# ── Config ────────────────────────────────────────────────────────────────────
|
||||||
|
DEFAULT_MODEL = "unsloth/Llama-3.2-3B-Instruct" # safe on 8 GB VRAM
|
||||||
|
LETTERS_JSONL = Path("/Library/Documents/JobSearch/training_data/cover_letters.jsonl")
|
||||||
|
OUTPUT_DIR = Path("/Library/Documents/JobSearch/training_data/finetune_output")
|
||||||
|
GGUF_DIR = Path("/Library/Documents/JobSearch/training_data/gguf")
|
||||||
|
OLLAMA_NAME = "alex-cover-writer"
|
||||||
|
|
||||||
|
SYSTEM_PROMPT = (
|
||||||
|
"You are Alex Rivera's personal cover letter writer. "
|
||||||
|
"Write professional, warm, and results-focused cover letters in Alex's voice. "
|
||||||
|
"Draw on her background in customer success, technical account management, "
|
||||||
|
"and revenue operations. Be specific and avoid generic filler."
|
||||||
|
)
|
||||||
|
|
||||||
|
# ── Args ──────────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser()
parser.add_argument("--model", default=DEFAULT_MODEL, help="Base model (HF repo id or local path)")
parser.add_argument("--epochs", type=int, default=10, help="Training epochs (default: 10)")
parser.add_argument("--rank", type=int, default=16, help="LoRA rank (default: 16)")
parser.add_argument("--batch", type=int, default=2, help="Per-device batch size (default: 2)")
parser.add_argument("--no-gguf", action="store_true", help="Skip GGUF export")
parser.add_argument("--max-length", type=int, default=1024, help="Max token length (default: 1024)")
args = parser.parse_args()

# Banner so long training runs are identifiable in terminal scrollback/logs.
print(f"\n{'='*60}")
print(f" Alex Cover Letter Fine-Tuner")
print(f" Base model : {args.model}")
print(f" Epochs : {args.epochs}")
print(f" LoRA rank : {args.rank}")
print(f" Dataset : {LETTERS_JSONL}")
print(f"{'='*60}\n")

# ── Load dataset ──────────────────────────────────────────────────────────────
if not LETTERS_JSONL.exists():
    sys.exit(f"ERROR: Dataset not found at {LETTERS_JSONL}\n"
             "Run: conda run -n job-seeker python scripts/prepare_training_data.py")

# JSONL: one training record per line; blank lines are tolerated.
records = [json.loads(l) for l in LETTERS_JSONL.read_text().splitlines() if l.strip()]
print(f"Loaded {len(records)} training examples.")
|
||||||
|
|
||||||
|
# Convert to chat format expected by SFTTrainer
def to_messages(rec: dict) -> dict:
    """Wrap one {instruction, output} record as a 3-turn chat transcript."""
    turns = [
        ("system", SYSTEM_PROMPT),
        ("user", rec["instruction"]),
        ("assistant", rec["output"]),
    ]
    return {"messages": [{"role": role, "content": content} for role, content in turns]}
|
||||||
|
|
||||||
|
chat_data = [to_messages(r) for r in records]

# ── Load model with unsloth ────────────────────────────────────────────────────
# Prefer unsloth's fused kernels when available; otherwise fall back to plain
# transformers + PEFT so the script still runs (more VRAM, slower).
try:
    from unsloth import FastLanguageModel
    USE_UNSLOTH = True
except ImportError:
    USE_UNSLOTH = False
    print("WARNING: unsloth not found — falling back to standard transformers + PEFT")
    print(" Install: pip install 'unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git'")

import torch

if USE_UNSLOTH:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = args.model,
        max_seq_length = args.max_length,
        load_in_4bit = True,  # QLoRA — fits 7-9B in 8 GB VRAM
        dtype = None,  # auto-detect
        device_map = {"": 0},  # pin everything to GPU 0; avoids accelerate None-device bug
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r = args.rank,
        lora_alpha = args.rank * 2,  # common 2×rank heuristic
        lora_dropout = 0,  # 0 = full unsloth kernel patching (faster)
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
        bias = "none",
        use_gradient_checkpointing = "unsloth",
    )
else:
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
    from peft import LoraConfig, get_peft_model, TaskType

    # 4-bit quantised base model (QLoRA) for the fallback path too.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        quantization_config=bnb_config,
        device_map="auto",
    )
    lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank * 2,
        lora_dropout=0.05,
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()
|
||||||
|
|
||||||
|
# ── Build HF Dataset ──────────────────────────────────────────────────────────
from datasets import Dataset

raw = Dataset.from_list(chat_data)
# Fixed seed so the 90/10 train/eval split is reproducible across runs.
split = raw.train_test_split(test_size=0.1, seed=42)
train_ds = split["train"]
eval_ds = split["test"]
print(f"Train: {len(train_ds)} Eval: {len(eval_ds)}")
|
||||||
|
|
||||||
|
def _apply_template(msgs):
    """Render one message list to a training string via the tokenizer's chat
    template.

    Some chat templates (e.g. Gemma 2) reject a "system" role, so a leading
    system turn is folded into the first user turn before rendering; a system
    turn with no following user turn is simply dropped.
    """
    turns = list(msgs)
    if turns and turns[0]["role"] == "system":
        system_text = turns.pop(0)["content"]
        if turns and turns[0]["role"] == "user":
            merged = f"{system_text}\n\n{turns[0]['content']}"
            turns[0] = {"role": "user", "content": merged}
    return tokenizer.apply_chat_template(turns, tokenize=False, add_generation_prompt=False)
|
||||||
|
|
||||||
|
def formatting_func(example):
    """Return a list of rendered chat strings for *example*.

    SFTTrainer probes this with a single example (``messages`` is one list of
    role dicts) and later calls it with batches (``messages`` is a list of
    such lists) — both shapes must yield a list of strings.
    """
    messages = example["messages"]
    is_single = bool(messages) and isinstance(messages[0], dict)
    if is_single:
        return [_apply_template(messages)]
    return [_apply_template(per_example) for per_example in messages]
|
||||||
|
|
||||||
|
# ── Train ─────────────────────────────────────────────────────────────────────
from trl import SFTTrainer, SFTConfig

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    formatting_func=formatting_func,
    args=SFTConfig(
        output_dir = str(OUTPUT_DIR),
        num_train_epochs = args.epochs,
        per_device_train_batch_size = args.batch,
        # Keep the effective batch size ≈ 8 regardless of --batch.
        gradient_accumulation_steps = max(1, 8 // args.batch),
        learning_rate = 2e-4,
        warmup_ratio = 0.1,
        lr_scheduler_type = "cosine",
        # Prefer bf16 where the GPU supports it; otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 5,
        eval_strategy = "epoch",
        save_strategy = "epoch",
        load_best_model_at_end = True,
        max_length = args.max_length,
        report_to = "none",
        push_to_hub = False,  # local only
    ),
)

print("\nStarting training…")
trainer.train()
print("Training complete.")

# ── Save adapter ──────────────────────────────────────────────────────────────
adapter_path = OUTPUT_DIR / "adapter"
model.save_pretrained(str(adapter_path))
tokenizer.save_pretrained(str(adapter_path))
print(f"\nLoRA adapter saved to: {adapter_path}")

# ── GGUF export ───────────────────────────────────────────────────────────────
# Only unsloth provides save_pretrained_gguf; the PEFT fallback path skips it.
if not args.no_gguf and USE_UNSLOTH:
    GGUF_DIR.mkdir(parents=True, exist_ok=True)
    gguf_path = GGUF_DIR / f"{OLLAMA_NAME}.gguf"
    print(f"\nExporting GGUF → {gguf_path} …")
    model.save_pretrained_gguf(
        str(GGUF_DIR / OLLAMA_NAME),
        tokenizer,
        quantization_method="q4_k_m",
    )
    # unsloth names the file automatically — find it
    gguf_files = list(GGUF_DIR.glob("*.gguf"))
    if gguf_files:
        gguf_path = gguf_files[0]
        print(f"GGUF written: {gguf_path}")
    else:
        print("GGUF export may have succeeded — check GGUF_DIR above.")
else:
    gguf_path = None

# ── Print next steps ──────────────────────────────────────────────────────────
print(f"\n{'='*60}")
print(" DONE — next steps to load into Ollama:")
print(f"{'='*60}")

if gguf_path and gguf_path.exists():
    # Write an Ollama Modelfile pointing at the exported GGUF, embedding the
    # same system prompt used during training.
    modelfile = OUTPUT_DIR / "Modelfile"
    modelfile.write_text(f"""FROM {gguf_path}
SYSTEM \"\"\"
{SYSTEM_PROMPT}
\"\"\"
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER num_ctx 32768
""")
    print(f"\n1. Modelfile written to: {modelfile}")
    print(f"\n2. Create the Ollama model:")
    print(f" ollama create {OLLAMA_NAME} -f {modelfile}")
    print(f"\n3. Test it:")
    print(f" ollama run {OLLAMA_NAME} 'Write a cover letter for a Senior Customer Success Manager position at Acme Corp.'")
    print(f"\n4. Update llm.yaml to use '{OLLAMA_NAME}:latest' as the ollama model,")
    print(f" then pick it in Settings → LLM Backends → Ollama → Model.")
else:
    # Manual conversion instructions for the adapter-only (no GGUF) case.
    print(f"\n Adapter only (no GGUF). To convert manually:")
    print(f" 1. Merge adapter:")
    print(f" conda run -n ogma python -c \"")
    print(f" from peft import AutoPeftModelForCausalLM")
    print(f" m = AutoPeftModelForCausalLM.from_pretrained('{adapter_path}')")
    print(f" m.merge_and_unload().save_pretrained('{OUTPUT_DIR}/merged')\"")
    print(f" 2. Convert to GGUF using textgen env's convert_hf_to_gguf.py")
    print(f" 3. ollama create {OLLAMA_NAME} -f Modelfile")
print()
|
||||||
224
scripts/generate_cover_letter.py
Normal file
224
scripts/generate_cover_letter.py
Normal file
|
|
@ -0,0 +1,224 @@
|
||||||
|
# scripts/generate_cover_letter.py
|
||||||
|
"""
|
||||||
|
Generate a cover letter in Alex's voice using few-shot examples from her corpus.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/generate_cover_letter.py \
|
||||||
|
--title "Director of Customer Success" \
|
||||||
|
--company "Acme Corp" \
|
||||||
|
--description "We are looking for..."
|
||||||
|
|
||||||
|
Or pass a staging DB job ID:
|
||||||
|
conda run -n job-seeker python scripts/generate_cover_letter.py --job-id 42
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
LETTERS_DIR = Path("/Library/Documents/JobSearch")
|
||||||
|
LETTER_GLOB = "*Cover Letter*.md"
|
||||||
|
|
||||||
|
# Background injected into every prompt so the model has Alex's facts
|
||||||
|
SYSTEM_CONTEXT = """You are writing cover letters for Alex Rivera, a customer success leader.
|
||||||
|
|
||||||
|
Background:
|
||||||
|
- 6+ years in customer success, technical account management, and CS leadership
|
||||||
|
- Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), managing enterprise + Fortune 500 accounts, drove NPS consistently above 95
|
||||||
|
- Also founder of M3 Consulting, a CS advisory practice for SaaS startups
|
||||||
|
- Attended Texas State (2 yrs), CSU East Bay (1 yr); completed degree elsewhere
|
||||||
|
- Based in San Francisco Bay Area; open to remote/hybrid
|
||||||
|
- Pronouns: any
|
||||||
|
|
||||||
|
Voice guidelines:
|
||||||
|
- Warm, confident, and specific — never generic
|
||||||
|
- Opens with "I'm delighted/thrilled to apply for [role] at [company]."
|
||||||
|
- 3–4 focused paragraphs, ~250–350 words total
|
||||||
|
- Para 2: concrete experience (cite UpGuard and/or M3 Consulting with a specific metric)
|
||||||
|
- Para 3: genuine connection to THIS company's mission/product
|
||||||
|
- Closes with "Thank you for considering my application." + warm sign-off
|
||||||
|
- Never use: "I am writing to express my interest", "passionate about making a difference",
|
||||||
|
"I look forward to hearing from you", or any hollow filler phrases
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ── Mission-alignment detection ───────────────────────────────────────────────
|
||||||
|
# When a company/JD signals one of these preferred industries, the cover letter
|
||||||
|
# prompt injects a hint so Para 3 can reflect genuine personal connection.
|
||||||
|
# This does NOT disclose any personal disability or family information.
|
||||||
|
|
||||||
|
_MISSION_SIGNALS: dict[str, list[str]] = {
|
||||||
|
"music": [
|
||||||
|
"music", "spotify", "tidal", "soundcloud", "bandcamp", "apple music",
|
||||||
|
"distrokid", "cd baby", "landr", "beatport", "reverb", "vinyl",
|
||||||
|
"streaming", "artist", "label", "live nation", "ticketmaster", "aeg",
|
||||||
|
"songkick", "concert", "venue", "festival", "audio", "podcast",
|
||||||
|
"studio", "record", "musician", "playlist",
|
||||||
|
],
|
||||||
|
"animal_welfare": [
|
||||||
|
"animal", "shelter", "rescue", "humane society", "spca", "aspca",
|
||||||
|
"veterinary", "vet ", "wildlife", "pet ", "adoption", "foster",
|
||||||
|
"dog", "cat", "feline", "canine", "sanctuary", "zoo",
|
||||||
|
],
|
||||||
|
"education": [
|
||||||
|
"education", "school", "learning", "student", "edtech", "classroom",
|
||||||
|
"curriculum", "tutoring", "academic", "university", "kids", "children",
|
||||||
|
"youth", "literacy", "khan academy", "duolingo", "chegg", "coursera",
|
||||||
|
"instructure", "canvas lms", "clever", "district", "teacher",
|
||||||
|
"k-12", "k12", "grade", "pedagogy",
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
_MISSION_NOTES: dict[str, str] = {
|
||||||
|
"music": (
|
||||||
|
"This company is in the music industry, which is one of Alex's genuinely "
|
||||||
|
"ideal work environments — she has a real personal passion for the music scene. "
|
||||||
|
"Para 3 should warmly and specifically reflect this authentic alignment, not as "
|
||||||
|
"a generic fan statement, but as an honest statement of where she'd love to apply "
|
||||||
|
"her CS skills."
|
||||||
|
),
|
||||||
|
"animal_welfare": (
|
||||||
|
"This organization works in animal welfare/rescue — one of Alex's dream-job "
|
||||||
|
"domains and a genuine personal passion. Para 3 should reflect this authentic "
|
||||||
|
"connection warmly and specifically, tying her CS skills to this mission."
|
||||||
|
),
|
||||||
|
"education": (
|
||||||
|
"This company works in children's education or EdTech — one of Alex's ideal "
|
||||||
|
"work domains, reflecting genuine personal values around learning and young people. "
|
||||||
|
"Para 3 should reflect this authentic connection specifically and warmly."
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def detect_mission_alignment(company: str, description: str) -> str | None:
    """Scan the company name and job description for preferred-industry
    keywords; return the matching mission note, or None when nothing hits."""
    haystack = (company + " " + description).lower()
    for industry, keywords in _MISSION_SIGNALS.items():
        first_hit = next((kw for kw in keywords if kw in haystack), None)
        if first_hit is not None:
            return _MISSION_NOTES[industry]
    return None
|
||||||
|
|
||||||
|
|
||||||
|
def load_corpus() -> list[dict]:
    """Read every cover-letter .md file under LETTERS_DIR.

    Returns a list of {"path", "company", "text"} dicts, skipping empty
    files. The company name is derived from the filename stem
    ("Tailscale Cover Letter.md" → "Tailscale").
    """
    letters = []
    for md_path in sorted(LETTERS_DIR.glob(LETTER_GLOB)):
        body = md_path.read_text(encoding="utf-8", errors="ignore").strip()
        if not body:
            continue
        company_name = re.sub(
            r"\s*Cover Letter.*", "", md_path.stem, flags=re.IGNORECASE
        ).strip()
        letters.append({"path": md_path, "company": company_name, "text": body})
    return letters
|
||||||
|
|
||||||
|
|
||||||
|
def find_similar_letters(job_description: str, corpus: list[dict], top_k: int = 3) -> list[dict]:
    """Rank the corpus by TF-IDF cosine similarity to *job_description* and
    return the top_k most similar letter entries (empty corpus → [])."""
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    if not corpus:
        return []

    # Row 0 is the query; rows 1..N are the corpus letters.
    matrix = TfidfVectorizer(stop_words="english", max_features=500).fit_transform(
        [job_description] + [entry["text"] for entry in corpus]
    )
    scores = cosine_similarity(matrix[0:1], matrix[1:])[0]

    order = sorted(range(len(corpus)), key=lambda i: scores[i], reverse=True)
    return [corpus[i] for i in order[:top_k]]
|
||||||
|
|
||||||
|
|
||||||
|
def build_prompt(
    title: str,
    company: str,
    description: str,
    examples: list[dict],
    mission_hint: str | None = None,
) -> str:
    """Assemble the full few-shot generation prompt: persona context, style
    examples, an optional mission-alignment hint, then the target role and
    company plus a trimmed job-description excerpt."""
    lines: list[str] = [SYSTEM_CONTEXT.strip(), ""]

    if examples:
        lines.append("=== STYLE EXAMPLES (Alex's past letters) ===\n")
        for idx, sample in enumerate(examples, 1):
            lines.extend([f"--- Example {idx} ({sample['company']}) ---", sample["text"], ""])
        lines.append("=== END EXAMPLES ===\n")

    if mission_hint:
        lines.append(f"⭐ Mission alignment note (for Para 3): {mission_hint}\n")

    lines.append("Now write a new cover letter for:")
    lines.append(f" Role: {title}")
    lines.append(f" Company: {company}")
    if description:
        lines.append(f"\nJob description excerpt:\n{description[:1500].strip()}")
    lines.append("\nWrite the full cover letter now:")
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def generate(title: str, company: str, description: str = "", _router=None) -> str:
    """Generate a cover letter and return it as a string.

    _router is an optional pre-built LLMRouter (used in tests to avoid real LLM calls).
    """
    letters = load_corpus()
    query_text = description or f"{title} {company}"
    style_examples = find_similar_letters(query_text, letters)

    hint = detect_mission_alignment(company, description)
    if hint:
        print(f"[cover-letter] Mission alignment detected for {company}", file=sys.stderr)

    prompt = build_prompt(title, company, description, style_examples, mission_hint=hint)

    router = _router
    if router is None:
        # Lazy import so tests can inject a stub router without project deps.
        sys.path.insert(0, str(Path(__file__).parent.parent))
        from scripts.llm_router import LLMRouter
        router = LLMRouter()

    print(f"[cover-letter] Generating for: {title} @ {company}", file=sys.stderr)
    print(f"[cover-letter] Style examples: {[e['company'] for e in style_examples]}", file=sys.stderr)

    return router.complete(prompt).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI entry point: resolve title/company/description (from flags or a
    --job-id lookup in staging.db), generate the letter, then print it to
    stdout or save it to --output."""
    parser = argparse.ArgumentParser(description="Generate a cover letter in Alex's voice")
    parser.add_argument("--title", help="Job title")
    parser.add_argument("--company", help="Company name")
    parser.add_argument("--description", default="", help="Job description text")
    parser.add_argument("--job-id", type=int, help="Load job from staging.db by ID")
    parser.add_argument("--output", help="Write output to this file path")
    args = parser.parse_args()

    title, company, description = args.title, args.company, args.description

    if args.job_id is not None:
        from scripts.db import DEFAULT_DB
        import sqlite3
        conn = sqlite3.connect(DEFAULT_DB)
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT * FROM jobs WHERE id = ?", (args.job_id,)).fetchone()
        conn.close()
        if not row:
            print(f"No job with id={args.job_id} in staging.db", file=sys.stderr)
            sys.exit(1)
        job = dict(row)
        # Explicit flags win over DB values so callers can override a stored job.
        title = title or job.get("title", "")
        company = company or job.get("company", "")
        description = description or job.get("description", "")

    if not title or not company:
        parser.error("--title and --company are required (or use --job-id)")

    letter = generate(title, company, description)

    if args.output:
        Path(args.output).write_text(letter)
        print(f"Saved to {args.output}", file=sys.stderr)
    else:
        # Letter goes to stdout so it can be piped; status messages use stderr.
        print(letter)


if __name__ == "__main__":
    main()
|
||||||
906
scripts/imap_sync.py
Normal file
906
scripts/imap_sync.py
Normal file
|
|
@ -0,0 +1,906 @@
|
||||||
|
# scripts/imap_sync.py
|
||||||
|
"""
|
||||||
|
IMAP email sync — associates recruitment emails with job applications.
|
||||||
|
|
||||||
|
Safety / privacy design:
|
||||||
|
- Only imports emails that pass BOTH checks:
|
||||||
|
1. Sender or subject contains the exact company name (or derived domain)
|
||||||
|
2. Subject contains at least one recruitment keyword
|
||||||
|
- Fuzzy / partial company name matches are rejected
|
||||||
|
- Emails between known personal contacts are never imported
|
||||||
|
- Only the INBOX and Sent folders are touched; no other folders
|
||||||
|
- Credentials stored in config/email.yaml (gitignored)
|
||||||
|
|
||||||
|
Config: config/email.yaml (see config/email.yaml.example)
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/imap_sync.py
|
||||||
|
conda run -n job-seeker python scripts/imap_sync.py --job-id 42
|
||||||
|
conda run -n job-seeker python scripts/imap_sync.py --dry-run
|
||||||
|
"""
|
||||||
|
import email
|
||||||
|
import imaplib
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from email.header import decode_header as _raw_decode_header
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from scripts.db import DEFAULT_DB, init_db, get_interview_jobs, add_contact, get_contacts
|
||||||
|
from scripts.llm_router import LLMRouter
|
||||||
|
|
||||||
|
# Module-wide router shared by every LLM call in this file
# (stage classification and lead extraction).  Built once at import time.
_CLASSIFIER_ROUTER = LLMRouter()

# System prompt for classify_stage_signal(); the model must answer with
# exactly one of the names listed in _CLASSIFY_LABELS.
_CLASSIFY_SYSTEM = (
    "You are an email classifier. Classify the recruitment email into exactly ONE of these categories:\n"
    " interview_scheduled, offer_received, rejected, positive_response, survey_received, neutral\n\n"
    "Rules:\n"
    "- interview_scheduled: recruiter wants to book a call/interview\n"
    "- offer_received: job offer is being extended\n"
    "- rejected: explicitly not moving forward\n"
    "- positive_response: interested/impressed but no interview booked yet\n"
    "- survey_received: link or request to complete a survey, assessment, or questionnaire\n"
    "- neutral: auto-confirmation, generic update, no clear signal\n\n"
    "Respond with ONLY the category name. No explanation."
)

# Accepted classifier outputs, checked in this order against the model reply.
_CLASSIFY_LABELS = [
    "interview_scheduled", "offer_received", "rejected",
    "positive_response", "survey_received", "neutral",
]

# IMAP credentials / settings; gitignored (see config/email.yaml.example).
CONFIG_PATH = Path(__file__).parent.parent / "config" / "email.yaml"

# ── Recruitment keyword filter ────────────────────────────────────────────────
# An email must match at least one of these in its subject line to be imported.
RECRUITMENT_KEYWORDS = {
    # Application lifecycle
    "interview", "application", "applicant", "apply", "applied",
    "position", "opportunity", "role", "opening", "vacancy",
    "offer", "offer letter", "schedule", "scheduling",
    "screening", "screen", "phone screen", "video call",
    "assessment", "hiring", "hired", "recruiter", "recruitment",
    "talent", "candidate", "recruiting", "next steps", "follow up", "follow-up",
    "onboarding", "start date", "background check", "reference",
    "congratulations", "unfortunately", "decision", "update",
    # Job board / ATS notifications
    "viewed your profile", "interested in your background",
    "job alert", "new job", "job match", "job opportunity",
    "your application", "application received", "application status",
    "application update", "we received", "thank you for applying",
    "thanks for applying", "moved forward", "moving forward",
    "not moving forward", "decided to", "other candidates",
    "keep your resume", "keep you in mind",
    # Recruiter outreach
    "reaching out", "i came across", "your experience",
    "connect with you", "exciting opportunity", "great fit",
    "perfect fit", "right fit", "strong fit", "ideal candidate",
}
|
||||||
|
|
||||||
|
# ── Rejection / ATS-confirm phrase filter ─────────────────────────────────────
|
||||||
|
# Checked against subject + first 800 chars of body BEFORE calling any LLM.
|
||||||
|
# Covers the cases phi3:mini consistently mis-classifies as "neutral".
|
||||||
|
_REJECTION_PHRASES = [
|
||||||
|
# Explicit rejection — safe to check subject + body
|
||||||
|
"not moving forward", "decided not to move forward",
|
||||||
|
"not selected", "not be moving forward", "will not be moving forward",
|
||||||
|
"unfortunately", "regret to inform", "regret to let you know",
|
||||||
|
"decided to go with other", "decided to pursue other",
|
||||||
|
"other candidates", "other applicants", "position has been filled",
|
||||||
|
"filled the position", "no longer moving forward",
|
||||||
|
"we have decided", "we've decided", "after careful consideration",
|
||||||
|
"at this time we", "at this point we",
|
||||||
|
"we will not", "we won't be", "we are not able",
|
||||||
|
"wish you the best", "best of luck in your",
|
||||||
|
"keep your resume on file",
|
||||||
|
]
|
||||||
|
|
||||||
|
# ATS-confirm phrases — checked against SUBJECT ONLY.
|
||||||
|
# Do NOT check these in the body: recruiters often quote ATS thread history,
|
||||||
|
# so "thank you for applying" can appear in a genuine follow-up body.
|
||||||
|
_ATS_CONFIRM_SUBJECTS = [
|
||||||
|
"application received", "application confirmation",
|
||||||
|
"thanks for applying", "thank you for applying",
|
||||||
|
"thank you for your application",
|
||||||
|
"we received your application",
|
||||||
|
"application has been received",
|
||||||
|
"has received your application",
|
||||||
|
"successfully submitted",
|
||||||
|
"your application for",
|
||||||
|
"you applied to",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Phrases that immediately identify a non-recruitment email (retail, spam, etc.)
|
||||||
|
_SPAM_PHRASES = [
|
||||||
|
# Retail / commerce offers
|
||||||
|
"special offer", "private offer", "exclusive offer", "limited time offer",
|
||||||
|
"limited-time offer", "sent you a special offer", "sent you an offer",
|
||||||
|
"holiday offer", "seasonal offer", "membership offer",
|
||||||
|
"round trip from $", "bonus points",
|
||||||
|
"% off", "% discount", "save up to", "free shipping",
|
||||||
|
"unsubscribe", "view in browser", "view this email in",
|
||||||
|
"update your preferences", "email preferences",
|
||||||
|
# LinkedIn apply confirmations & digests (not new inbound leads)
|
||||||
|
"your application was sent to",
|
||||||
|
"your application was viewed by",
|
||||||
|
"application updates this week",
|
||||||
|
"don't forget to complete your application",
|
||||||
|
"view your application updates",
|
||||||
|
"you have new application updates",
|
||||||
|
# Indeed apply confirmations
|
||||||
|
"indeed application:",
|
||||||
|
# DocuSign / e-signature
|
||||||
|
"requests you to sign",
|
||||||
|
"has sent you a reminder",
|
||||||
|
"please sign",
|
||||||
|
# Security / MFA codes
|
||||||
|
"security code for your application",
|
||||||
|
"verification code",
|
||||||
|
]
|
||||||
|
|
||||||
|
# Subject prefixes that identify non-job emails
|
||||||
|
_SPAM_SUBJECT_PREFIXES = [
|
||||||
|
"@", # "@user sent you a special offer" — Depop / social commerce
|
||||||
|
"re: fw:", # forwarded chains unlikely to be first-contact recruitment
|
||||||
|
"accepted:", # Google Calendar accepted invite
|
||||||
|
"notification:", # Google Calendar notification
|
||||||
|
"[meeting reminder]", # Google Calendar meeting reminder
|
||||||
|
"updated invitation:", # Google Calendar update
|
||||||
|
"[updated]", # Google Calendar update
|
||||||
|
"reminder:", # Generic reminder (AAA digital interview reminders, etc.)
|
||||||
|
"📄", # Newsletter/article emoji prefix
|
||||||
|
"invitation from", # Google Calendar invite forwarded by name
|
||||||
|
]
|
||||||
|
|
||||||
|
# Unicode-safe "don't forget" variants (Gmail renders typographic apostrophes)
|
||||||
|
_DONT_FORGET_VARIANTS = [
|
||||||
|
"don't forget to complete your application", # straight apostrophe
|
||||||
|
"don\u2019t forget to complete your application", # right single quotation mark '
|
||||||
|
"don\u2018t forget to complete your application", # left single quotation mark '
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _has_rejection_or_ats_signal(subject: str, body: str) -> bool:
|
||||||
|
"""Return True if the email is a rejection, ATS auto-confirmation, or non-recruitment spam."""
|
||||||
|
subject_lower = subject.lower().strip()
|
||||||
|
|
||||||
|
# Fast subject-prefix checks (Depop "@user", etc.)
|
||||||
|
if any(subject_lower.startswith(p) for p in _SPAM_SUBJECT_PREFIXES):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Fast subject-only check for ATS confirmations
|
||||||
|
if any(phrase in subject_lower for phrase in _ATS_CONFIRM_SUBJECTS):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check subject + opening body for rejection and spam phrases
|
||||||
|
haystack = subject_lower + " " + body[:1500].lower()
|
||||||
|
if any(phrase in haystack for phrase in _REJECTION_PHRASES + _SPAM_PHRASES):
|
||||||
|
return True
|
||||||
|
# Unicode-safe "don't forget" check (handles straight, right, and left apostrophes)
|
||||||
|
raw = (subject + " " + body[:1500]).lower()
|
||||||
|
return any(phrase in raw for phrase in _DONT_FORGET_VARIANTS)
|
||||||
|
|
||||||
|
|
||||||
|
# Legal entity suffixes to strip when normalising company names.
# Matches an optional comma, the suffix word, an optional trailing period,
# and trailing whitespace — anchored to the end of the string only.
_LEGAL_SUFFIXES = re.compile(
    r",?\s*\b(Inc|LLC|Ltd|Limited|Corp|Corporation|Co|GmbH|AG|plc|PLC|SAS|SA|NV|BV|LP|LLP)\b\.?\s*$",
    re.IGNORECASE,
)

# Job-board SLDs that must never be used as company-match search terms.
# A LinkedIn job URL has domain "linkedin.com" → SLD "linkedin", which would
# incorrectly match every LinkedIn notification email against every LinkedIn job.
_JOB_BOARD_SLDS = {
    "linkedin", "indeed", "glassdoor", "ziprecruiter", "monster",
    "careerbuilder", "dice", "simplyhired", "wellfound", "angellist",
    "greenhouse", "lever", "workday", "taleo", "icims", "smartrecruiters",
    "bamboohr", "ashby", "rippling", "jobvite", "workable", "gusto",
    "paylocity", "paycom", "adp", "breezy", "recruitee", "jazz",
}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Helpers ───────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _decode_str(value: Optional[str]) -> str:
|
||||||
|
"""Decode an RFC2047-encoded header value to a plain Python string."""
|
||||||
|
if not value:
|
||||||
|
return ""
|
||||||
|
parts = _raw_decode_header(value)
|
||||||
|
result = []
|
||||||
|
for part, encoding in parts:
|
||||||
|
if isinstance(part, bytes):
|
||||||
|
result.append(part.decode(encoding or "utf-8", errors="replace"))
|
||||||
|
else:
|
||||||
|
result.append(str(part))
|
||||||
|
return " ".join(result).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_domain(url_or_email: str) -> str:
|
||||||
|
"""
|
||||||
|
Pull the bare domain from a URL (https://company.com/jobs/...) or
|
||||||
|
an email address (recruiter@company.com). Returns '' if none found.
|
||||||
|
"""
|
||||||
|
url_or_email = url_or_email.strip()
|
||||||
|
if "@" in url_or_email:
|
||||||
|
return url_or_email.split("@")[-1].split(">")[0].strip().lower()
|
||||||
|
try:
|
||||||
|
parsed = urlparse(url_or_email)
|
||||||
|
host = parsed.netloc or parsed.path
|
||||||
|
# strip www.
|
||||||
|
return re.sub(r"^www\.", "", host).lower()
|
||||||
|
except Exception:
|
||||||
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _normalise_company(company: str) -> str:
    """Strip trailing legal-entity suffixes (Inc, LLC, …) and whitespace."""
    without_suffix = _LEGAL_SUFFIXES.sub("", company)
    return without_suffix.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _company_search_terms(company: str, job_url: str = "") -> list[str]:
    """Build the exact-match strings used to tie an email to this job.

    Deliberately conservative:
    - the full normalised company name (only when 3+ chars), never just
      the first word;
    - the second-level domain from the job URL, but only when that domain
      belongs to the actual company.  Job-board SLDs (linkedin, indeed, …)
      are excluded — otherwise every board notification email would match
      every job posted on that board.
    """
    terms: list[str] = []
    cleaned = _normalise_company(company)
    if len(cleaned) >= 3:
        terms.append(cleaned.lower())

    domain = _extract_domain(job_url)
    if domain and len(domain) > 4:
        sld = domain.split(".", 1)[0]
        if len(sld) >= 3 and sld not in terms and sld not in _JOB_BOARD_SLDS:
            terms.append(sld)

    return terms
|
||||||
|
|
||||||
|
|
||||||
|
def _has_recruitment_keyword(subject: str) -> bool:
    """True when the subject mentions at least one recruitment keyword."""
    lowered = subject.lower()
    for keyword in RECRUITMENT_KEYWORDS:
        if keyword in lowered:
            return True
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def _email_is_relevant(from_addr: str, subject: str, search_terms: list[str]) -> bool:
    """
    Two-gate filter:
      Gate 1 — from-address OR subject must contain an exact company term
      Gate 2 — subject must contain a recruitment keyword

    Both gates must pass. This prevents importing unrelated emails that
    happen to mention a company name in passing.
    """
    haystack = (from_addr + " " + subject).lower()
    if not any(term in haystack for term in search_terms):
        return False
    return _has_recruitment_keyword(subject)
|
||||||
|
|
||||||
|
|
||||||
|
def _get_existing_message_ids(job_id: int, db_path: Path) -> set[str]:
    """Collect the non-empty Message-IDs already attached to a job's contacts."""
    known: set[str] = set()
    for contact in get_contacts(db_path, job_id=job_id):
        mid = contact.get("message_id", "")
        if mid:
            known.add(mid)
    return known
|
||||||
|
|
||||||
|
|
||||||
|
def classify_stage_signal(subject: str, body: str) -> Optional[str]:
    """Classify an inbound email into a pipeline stage signal.

    Returns one of the _CLASSIFY_LABELS strings ("neutral" when nothing in
    the model reply matches), or None if the LLM call itself fails.
    Routed through llama3.1:8b with ollama_research as fallback.
    """
    try:
        query = f"Subject: {subject}\n\nEmail: {body[:400]}"
        reply = _CLASSIFIER_ROUTER.complete(
            query,
            system=_CLASSIFY_SYSTEM,
            model_override="llama3.1:8b",
            fallback_order=["ollama_research"],
        )
        # Drop <think> blocks (in case a reasoning model slips through)
        cleaned = re.sub(r"<think>.*?</think>", "", reply, flags=re.DOTALL)
        cleaned = cleaned.lower().strip()
        return next(
            (label for label in _CLASSIFY_LABELS
             if cleaned.startswith(label) or label in cleaned),
            "neutral",
        )
    except Exception:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
# System prompt for extract_lead_info(): asks the model for a JSON object
# only for genuine first-contact recruiter outreach, and explicit nulls for
# rejections, ATS confirmations, status updates, digests, or own mail.
_EXTRACT_SYSTEM = (
    "Extract the hiring company name and job title from this recruitment email, "
    "but ONLY if it represents genuine new recruiter outreach — i.e. a recruiter "
    "contacting you about an open role for the first time.\n\n"
    "Return {\"company\": null, \"title\": null} if the email is any of:\n"
    " - A rejection or 'not moving forward' notice\n"
    " - An ATS auto-confirmation ('we received your application')\n"
    " - A status update for an application already in progress\n"
    " - A generic job-alert digest or newsletter\n"
    " - A follow-up you sent, not a reply from a recruiter\n\n"
    "Otherwise respond with ONLY valid JSON: "
    '{"company": "Company Name", "title": "Job Title"}.'
)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_lead_info(subject: str, body: str,
                      from_addr: str) -> tuple[Optional[str], Optional[str]]:
    """Ask the LLM for (company, title) from an unmatched recruitment email.

    Returns (None, None) on any failure, when no JSON object appears in the
    reply, or when the model declines (null fields).
    """
    import json as _json
    try:
        query = (
            f"From: {from_addr}\n"
            f"Subject: {subject}\n\n"
            f"Email excerpt:\n{body[:600]}"
        )
        reply = _CLASSIFIER_ROUTER.complete(
            query,
            system=_EXTRACT_SYSTEM,
            fallback_order=["ollama_research"],
        )
        cleaned = re.sub(r"<think>.*?</think>", "", reply, flags=re.DOTALL).strip()
        json_match = re.search(r'\{.*\}', cleaned, re.DOTALL)
        if json_match is None:
            return None, None
        payload = _json.loads(json_match.group())
        return payload.get("company") or None, payload.get("title") or None
    except Exception:
        return None, None
|
||||||
|
|
||||||
|
|
||||||
|
# Keywords that indicate an email in a curated label needs attention.
|
||||||
|
# Intentionally separate from RECRUITMENT_KEYWORDS — these are action-oriented.
|
||||||
|
_TODO_LABEL_KEYWORDS = {
|
||||||
|
"action needed", "action required",
|
||||||
|
"please complete", "please submit", "please respond", "please reply",
|
||||||
|
"response needed", "response required",
|
||||||
|
"next steps", "next step",
|
||||||
|
"follow up", "follow-up",
|
||||||
|
"deadline", "by end of",
|
||||||
|
"your offer", "offer letter",
|
||||||
|
"background check", "reference check",
|
||||||
|
"onboarding", "start date",
|
||||||
|
"congrats", "congratulations",
|
||||||
|
"we'd like to", "we would like to",
|
||||||
|
"interview", "schedule", "scheduling",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _has_todo_keyword(subject: str) -> bool:
|
||||||
|
"""Return True if the subject contains a TODO-label action keyword."""
|
||||||
|
subject_lower = subject.lower()
|
||||||
|
return any(kw in subject_lower for kw in _TODO_LABEL_KEYWORDS)
|
||||||
|
|
||||||
|
|
||||||
|
_LINKEDIN_ALERT_SENDER = "jobalerts-noreply@linkedin.com"

# Social-proof / nav lines to skip when parsing alert blocks
_ALERT_SKIP_PHRASES = {
    "school alumni", "apply with", "actively hiring", "manage alerts",
    "view all jobs", "your job alert", "new jobs match",
    "unsubscribe", "linkedin corporation",
}


def parse_linkedin_alert(body: str) -> list[dict]:
    """
    Parse the plain-text body of a LinkedIn Job Alert digest email.

    Returns a list of dicts: {title, company, location, url}, where url is
    canonicalized to https://www.linkedin.com/jobs/view/<id>/ with all
    tracking parameters stripped.
    """

    def _canonical_url(card_lines: list[str]) -> Optional[str]:
        # The "View job:" line carries a tracking-laden URL; keep only the
        # numeric job id and rebuild a clean canonical link.
        for card_line in card_lines:
            found = re.search(r"View job:\s*(https?://\S+)", card_line, re.IGNORECASE)
            if not found:
                continue
            id_match = re.search(r"/jobs/view/(\d+)", found.group(1))
            if id_match:
                return f"https://www.linkedin.com/jobs/view/{id_match.group(1)}/"
        return None

    results: list[dict] = []
    # Cards are separated by horizontal rules of 10+ dashes.
    for chunk in re.split(r"\n\s*-{10,}\s*\n", body):
        card_lines = [ln.strip() for ln in chunk.strip().splitlines() if ln.strip()]

        url = _canonical_url(card_lines)
        if url is None:
            continue

        # Drop navigation/social-proof noise, the URL line, and bare links.
        kept = []
        for ln in card_lines:
            lowered = ln.lower()
            if any(p in lowered for p in _ALERT_SKIP_PHRASES):
                continue
            if lowered.startswith("view job:") or ln.startswith("http"):
                continue
            kept.append(ln)
        if len(kept) < 2:
            continue

        # Card layout is positional: title, company, then optional location.
        results.append({
            "title": kept[0],
            "company": kept[1],
            "location": kept[2] if len(kept) > 2 else "",
            "url": url,
        })
    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _scan_todo_label(conn: imaplib.IMAP4, cfg: dict, db_path: Path,
                     active_jobs: list[dict],
                     known_message_ids: set) -> int:
    """Scan the configured Gmail label for action emails, matching them to pipeline jobs.

    Two gates per email:
      1. Company name appears in from-address, subject, or body head
      2. Subject contains a TODO-label action keyword

    Returns count of new contacts attached.
    """
    label = cfg.get("todo_label", "").strip()
    if not label:
        return 0

    lookback = int(cfg.get("lookback_days", 90))
    since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y")

    # Search the label folder for any emails (no keyword pre-filter — it's curated)
    uids = _search_folder(conn, label, "ALL", since)
    if not uids:
        return 0

    # Build a lookup: search_term → [job, ...] for all active jobs
    term_to_jobs: dict[str, list[dict]] = {}
    for job in active_jobs:
        for term in _company_search_terms(job.get("company", ""), job.get("url", "")):
            term_to_jobs.setdefault(term, []).append(job)

    added = 0
    for uid in uids:
        parsed = _parse_message(conn, uid)
        if not parsed:
            continue
        mid = parsed["message_id"]
        if mid in known_message_ids:
            continue

        # Gate 1: company name match — from_addr + subject + first 300 chars of body
        # Body fallback catches ATS emails (e.g. noreply@greenhouse.io) where the
        # company name only appears in the email body, not the sender or subject.
        combined = (
            parsed["from_addr"] + " " +
            parsed["subject"] + " " +
            parsed["body"][:300]
        ).lower()
        matched_jobs = []
        for term, jobs in term_to_jobs.items():
            if term in combined:
                matched_jobs.extend(jobs)
        # Deduplicate by job id, preserving first-match order.
        # (Replaces a side-effecting comprehension — same result, clearer.)
        seen_ids: set[int] = set()
        unique_jobs: list[dict] = []
        for job in matched_jobs:
            if job["id"] not in seen_ids:
                seen_ids.add(job["id"])
                unique_jobs.append(job)
        matched_jobs = unique_jobs
        if not matched_jobs:
            continue

        # Gate 2: action keyword in subject
        if not _has_todo_keyword(parsed["subject"]):
            continue

        # Fix: classify once per email, not once per matched job — the signal
        # depends only on the email, and each call is an LLM round-trip.
        signal = classify_stage_signal(parsed["subject"], parsed["body"])

        for job in matched_jobs:
            contact_id = add_contact(
                db_path, job_id=job["id"], direction="inbound",
                subject=parsed["subject"],
                from_addr=parsed["from_addr"],
                to_addr=parsed["to_addr"],
                body=parsed["body"],
                received_at=parsed["date"][:16] if parsed["date"] else since,
                message_id=mid,
            )
            if signal and signal != "neutral":
                _update_contact_signal(db_path, contact_id, signal)

        known_message_ids.add(mid)
        added += 1
        print(f"[imap] TODO label → {matched_jobs[0].get('company')} — {parsed['subject'][:60]}")

    return added
|
||||||
|
|
||||||
|
|
||||||
|
def _scan_unmatched_leads(conn: imaplib.IMAP4, cfg: dict,
                          db_path: Path,
                          known_message_ids: set) -> int:
    """Scan INBOX for recruitment emails not matched to any pipeline job.

    Calls LLM to extract company/title; inserts qualifying emails as pending jobs.
    LinkedIn Job Alert digests are parsed card-by-card instead.
    Returns the count of new leads inserted.
    """
    from scripts.db import get_existing_urls, insert_job, add_contact as _add_contact
    import hashlib

    lookback = int(cfg.get("lookback_days", 90))
    since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y")

    # Broad subject terms cast a wide net; precise filters run per message.
    broad_terms = ["interview", "opportunity", "offer letter", "job offer", "application", "recruiting"]
    all_uids: set = set()
    for term in broad_terms:
        uids = _search_folder(conn, "INBOX", f'(SUBJECT "{term}")', since)
        all_uids.update(uids)

    existing_urls = get_existing_urls(db_path)
    new_leads = 0

    for uid in all_uids:
        parsed = _parse_message(conn, uid)
        if not parsed:
            continue
        mid = parsed["message_id"]
        if mid in known_message_ids:
            continue

        # ── LinkedIn Job Alert digest — parse each card individually ──────
        if _LINKEDIN_ALERT_SENDER in parsed["from_addr"].lower():
            cards = parse_linkedin_alert(parsed["body"])
            for card in cards:
                if card["url"] in existing_urls:
                    continue
                job_id = insert_job(db_path, {
                    "title": card["title"],
                    "company": card["company"],
                    "url": card["url"],
                    "source": "linkedin",
                    "location": card["location"],
                    "is_remote": 0,
                    "salary": "",
                    "description": "",
                    "date_found": datetime.now().isoformat()[:10],
                })
                if job_id:
                    from scripts.task_runner import submit_task
                    submit_task(db_path, "scrape_url", job_id)
                existing_urls.add(card["url"])
                new_leads += 1
                print(f"[imap] LinkedIn alert → {card['company']} — {card['title']}")
            known_message_ids.add(mid)
            continue  # skip normal LLM extraction path

        if not _has_recruitment_keyword(parsed["subject"]):
            continue

        # Fast phrase-based rejection / ATS-confirm filter (catches what phi3 misses)
        if _has_rejection_or_ats_signal(parsed["subject"], parsed["body"]):
            continue

        # LLM classification as secondary gate — skip on rejection or classifier failure
        signal = classify_stage_signal(parsed["subject"], parsed["body"])
        if signal is None or signal == "rejected":
            continue

        company, title = extract_lead_info(
            parsed["subject"], parsed["body"], parsed["from_addr"]
        )
        if not company:
            continue

        from_domain = _extract_domain(parsed["from_addr"]) or "unknown"
        # Fix: the synthetic URL must be stable across runs for URL-based
        # dedup to work.  Builtin hash() on strings is randomized per process
        # (PYTHONHASHSEED), which made every run mint a fresh URL and
        # re-import the same lead.  Use a deterministic digest instead.
        mid_hash = hashlib.sha1(mid.encode("utf-8")).hexdigest()[:10]
        synthetic_url = f"email://{from_domain}/{mid_hash}"

        if synthetic_url in existing_urls:
            continue

        job_id = insert_job(db_path, {
            "title": title or "(untitled)",
            "company": company,
            "url": synthetic_url,
            "source": "email",
            "location": "",
            "is_remote": 0,
            "salary": "",
            "description": parsed["body"][:2000],
            "date_found": datetime.now().isoformat()[:10],
        })
        if job_id:
            _add_contact(db_path, job_id=job_id, direction="inbound",
                         subject=parsed["subject"],
                         from_addr=parsed["from_addr"],
                         body=parsed["body"],
                         received_at=parsed["date"][:16] if parsed["date"] else "",
                         message_id=mid)
        known_message_ids.add(mid)
        existing_urls.add(synthetic_url)
        new_leads += 1

    return new_leads
|
||||||
|
|
||||||
|
|
||||||
|
# ── IMAP connection ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def load_config() -> dict:
    """Load config/email.yaml; raise FileNotFoundError with a setup hint if absent."""
    if CONFIG_PATH.exists():
        return yaml.safe_load(CONFIG_PATH.read_text()) or {}
    raise FileNotFoundError(
        f"Email config not found: {CONFIG_PATH}\n"
        f"Copy config/email.yaml.example → config/email.yaml and fill it in."
    )
|
||||||
|
|
||||||
|
|
||||||
|
def connect(cfg: dict) -> imaplib.IMAP4:
    """Open and authenticate an IMAP session from the email config dict."""
    use_ssl = cfg.get("use_ssl", True)
    imap_cls = imaplib.IMAP4_SSL if use_ssl else imaplib.IMAP4
    client = imap_cls(cfg.get("host", "imap.gmail.com"), int(cfg.get("port", 993)))
    client.login(cfg["username"], cfg["password"])
    return client
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_sent_folder(conn: imaplib.IMAP4) -> str:
    """Best-effort detection of the account's Sent folder name.

    Falls back to "Sent" when LIST fails or no candidate appears.
    """
    candidates = ["[Gmail]/Sent Mail", "Sent", "Sent Items", "Sent Messages", "INBOX.Sent"]
    try:
        _, folder_list = conn.list()
        listing = " ".join(entry.decode() for entry in (folder_list or [])).lower()
        for candidate in candidates:
            if candidate.lower() in listing:
                return candidate
    except Exception:
        pass  # fall through to the generic default
    return "Sent"
|
||||||
|
|
||||||
|
|
||||||
|
def _quote_folder(name: str) -> str:
|
||||||
|
"""Quote an IMAP folder name if it contains spaces.
|
||||||
|
Escapes internal backslashes and double-quotes per RFC 3501.
|
||||||
|
e.g. 'TO DO JOBS' → '"TO DO JOBS"', 'My "Jobs"' → '"My \\"Jobs\\""'
|
||||||
|
"""
|
||||||
|
if " " in name:
|
||||||
|
escaped = name.replace("\\", "\\\\").replace('"', '\\"')
|
||||||
|
return f'"{escaped}"'
|
||||||
|
return name
|
||||||
|
|
||||||
|
|
||||||
|
def _search_folder(conn: imaplib.IMAP4, folder: str, criteria: str,
                   since: str) -> list[bytes]:
    """SELECT *folder* read-only and return matching UIDs (empty on any error)."""
    try:
        conn.select(_quote_folder(folder), readonly=True)
        _, data = conn.search(None, f'(SINCE "{since}" {criteria})')
    except Exception:
        return []
    if data and data[0]:
        return data[0].split()
    return []
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]:
    """Fetch one message by UID and flatten it into a dict.

    Returns None when the fetch fails, the payload is empty, the message
    cannot be parsed, or it carries no Message-ID (required for dedup).
    The body is the first text/plain part (or the raw payload for
    non-multipart messages), truncated to 4000 characters.
    """
    try:
        _, payload = conn.fetch(uid, "(RFC822)")
        if not payload or not payload[0]:
            return None
        msg = email.message_from_bytes(payload[0][1])

        body = ""
        if msg.is_multipart():
            # Only the first text/plain part is considered, even if its
            # decode fails — mirrors a plain "break after first match".
            plain_part = next(
                (p for p in msg.walk() if p.get_content_type() == "text/plain"),
                None,
            )
            if plain_part is not None:
                try:
                    body = plain_part.get_payload(decode=True).decode("utf-8", errors="replace")
                except Exception:
                    pass
        else:
            try:
                body = msg.get_payload(decode=True).decode("utf-8", errors="replace")
            except Exception:
                pass

        mid = msg.get("Message-ID", "").strip()
        if not mid:
            return None  # No Message-ID → can't dedup; skip to avoid repeat inserts

        return {
            "message_id": mid,
            "subject": _decode_str(msg.get("Subject")),
            "from_addr": _decode_str(msg.get("From")),
            "to_addr": _decode_str(msg.get("To")),
            "date": _decode_str(msg.get("Date")),
            "body": body[:4000],
        }
    except Exception:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
# ── Per-job sync ──────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _update_contact_signal(db_path: Path, contact_id: int, signal: str) -> None:
|
||||||
|
"""Write a stage signal onto an existing contact row."""
|
||||||
|
import sqlite3 as _sqlite3
|
||||||
|
conn = _sqlite3.connect(db_path)
|
||||||
|
conn.execute(
|
||||||
|
"UPDATE job_contacts SET stage_signal = ? WHERE id = ?",
|
||||||
|
(signal, contact_id),
|
||||||
|
)
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def sync_job_emails(job: dict, conn: imaplib.IMAP4, cfg: dict,
                    db_path: Path, dry_run: bool = False) -> tuple[int, int]:
    """
    Sync recruitment emails for one job.

    Searches INBOX for inbound mail (FROM/SUBJECT matching the company's
    search terms) and the Sent folder for outbound mail (TO/SUBJECT),
    deduplicating on Message-ID. New messages are stored via add_contact();
    inbound ones additionally get a stage signal classified and written.

    When dry_run is True, nothing is written but counts are still returned.
    Returns (inbound_added, outbound_added).
    """
    company = (job.get("company") or "").strip()
    if not company:
        return 0, 0

    search_terms = _company_search_terms(company, job.get("url", ""))
    if not search_terms:
        return 0, 0

    lookback = int(cfg.get("lookback_days", 90))
    # IMAP SINCE wants dd-Mon-yyyy, e.g. 01-Jan-2025.
    since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y")
    existing_ids = _get_existing_message_ids(job["id"], db_path)

    # Perf fix: the Sent-folder name is loop-invariant; previously
    # _detect_sent_folder() (an extra IMAP round-trip) ran once per term.
    sent_folder = cfg.get("sent_folder") or _detect_sent_folder(conn)

    inbound = outbound = 0

    for term in search_terms:
        # ── INBOX — inbound ───────────────────────────────────────────────
        uids = _search_folder(
            conn, "INBOX",
            f'(OR FROM "{term}" SUBJECT "{term}")',
            since,
        )
        for uid in uids:
            parsed = _parse_message(conn, uid)
            if not parsed:
                continue
            if parsed["message_id"] in existing_ids:
                continue  # already stored for this job
            if not _email_is_relevant(parsed["from_addr"], parsed["subject"], search_terms):
                continue

            if not dry_run:
                contact_id = add_contact(
                    db_path, job_id=job["id"], direction="inbound",
                    subject=parsed["subject"], from_addr=parsed["from_addr"],
                    to_addr=parsed["to_addr"], body=parsed["body"],
                    received_at=parsed["date"][:16] if parsed["date"] else since,
                    message_id=parsed["message_id"],
                )
                signal = classify_stage_signal(parsed["subject"], parsed["body"])
                if signal and signal != "neutral":
                    _update_contact_signal(db_path, contact_id, signal)
            existing_ids.add(parsed["message_id"])
            inbound += 1

        # ── Sent — outbound ───────────────────────────────────────────────
        uids = _search_folder(
            conn, sent_folder,
            f'(OR TO "{term}" SUBJECT "{term}")',
            since,
        )
        for uid in uids:
            parsed = _parse_message(conn, uid)
            if not parsed:
                continue
            if parsed["message_id"] in existing_ids:
                continue
            # Outbound relevance is judged on the recipient, not the sender.
            if not _email_is_relevant(parsed["to_addr"], parsed["subject"], search_terms):
                continue

            if not dry_run:
                add_contact(
                    db_path, job_id=job["id"], direction="outbound",
                    subject=parsed["subject"], from_addr=parsed["from_addr"],
                    to_addr=parsed["to_addr"], body=parsed["body"],
                    received_at=parsed["date"][:16] if parsed["date"] else since,
                    message_id=parsed["message_id"],
                )
            existing_ids.add(parsed["message_id"])
            outbound += 1

    return inbound, outbound
|
||||||
|
|
||||||
|
|
||||||
|
# ── Main entry ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def sync_all(db_path: Path = DEFAULT_DB,
             dry_run: bool = False,
             job_ids: Optional[list[int]] = None,
             on_stage=None) -> dict:
    """
    Sync emails for all active pipeline jobs (or a specific subset).

    Args:
        db_path: SQLite database to read jobs from and write contacts to.
        dry_run: when True, count matches without persisting anything.
        job_ids: optional whitelist — only these job IDs are synced.
        on_stage: optional callback(str) invoked with progress labels
            ("connecting", "job 2/7", "scanning leads", ...).

    Returns a summary dict:
        {"synced": N, "inbound": N, "outbound": N, "errors": [...]}
    """
    def _stage(msg: str) -> None:
        # Progress reporting is optional; silently no-op without a callback.
        if on_stage:
            on_stage(msg)

    cfg = load_config()
    init_db(db_path)

    # Only jobs in active pipeline stages are worth polling for email.
    jobs_by_stage = get_interview_jobs(db_path)
    active_stages = ["applied", "phone_screen", "interviewing", "offer", "hired"]
    all_active = [j for stage in active_stages for j in jobs_by_stage.get(stage, [])]

    if job_ids:
        all_active = [j for j in all_active if j["id"] in job_ids]

    if not all_active:
        # Nothing to do — return a fully-populated zero summary so callers
        # can read every key unconditionally.
        return {"synced": 0, "inbound": 0, "outbound": 0, "new_leads": 0, "todo_attached": 0, "errors": []}

    _stage("connecting")
    print(f"[imap] Connecting to {cfg.get('host', 'imap.gmail.com')} …")
    conn = connect(cfg)
    summary = {"synced": 0, "inbound": 0, "outbound": 0, "new_leads": 0, "errors": []}

    try:
        for i, job in enumerate(all_active, 1):
            _stage(f"job {i}/{len(all_active)}")
            try:
                inb, out = sync_job_emails(job, conn, cfg, db_path, dry_run=dry_run)
                label = "DRY-RUN " if dry_run else ""
                print(f"[imap] {label}{job.get('company'):30s} +{inb} in +{out} out")
                if inb + out > 0:
                    # "synced" counts jobs with any new mail, not messages.
                    summary["synced"] += 1
                    summary["inbound"] += inb
                    summary["outbound"] += out
            except Exception as e:
                # One failing job must not abort the whole sweep — record
                # and move on.
                msg = f"{job.get('company')}: {e}"
                summary["errors"].append(msg)
                print(f"[imap] ERROR — {msg}")

        _stage("scanning todo label")
        from scripts.db import get_all_message_ids
        known_mids = get_all_message_ids(db_path)
        summary["todo_attached"] = _scan_todo_label(conn, cfg, db_path, all_active, known_mids)

        _stage("scanning leads")
        summary["new_leads"] = _scan_unmatched_leads(conn, cfg, db_path, known_mids)
    finally:
        # Always try to log out, even after an error; logout failures are
        # irrelevant at this point.
        try:
            conn.logout()
        except Exception:
            pass

    return summary
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    import argparse

    # CLI wrapper around sync_all(): optional job-ID whitelist and dry-run.
    ap = argparse.ArgumentParser(description="Sync IMAP emails to job contacts")
    ap.add_argument("--job-id", type=int, nargs="+", help="Sync only these job IDs")
    ap.add_argument("--dry-run", action="store_true", help="Show matches without saving")
    cli = ap.parse_args()

    result = sync_all(dry_run=cli.dry_run, job_ids=cli.job_id)

    error_note = f", {len(result['errors'])} errors" if result["errors"] else ""
    print(f"\n[imap] Done — {result['synced']} jobs updated, "
          f"{result['inbound']} inbound, {result['outbound']} outbound" + error_note)
|
||||||
170
scripts/llm_router.py
Normal file
170
scripts/llm_router.py
Normal file
|
|
@ -0,0 +1,170 @@
|
||||||
|
"""
|
||||||
|
LLM abstraction layer with priority fallback chain.
|
||||||
|
Reads config/llm.yaml. Tries backends in order; falls back on any error.
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import yaml
|
||||||
|
import requests
|
||||||
|
from pathlib import Path
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
CONFIG_PATH = Path(__file__).parent.parent / "config" / "llm.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
class LLMRouter:
    """Priority-ordered LLM dispatcher.

    Loads a backend list from config/llm.yaml and, on each complete() call,
    tries backends in fallback order until one succeeds. Supported backend
    types: "vision_service" (custom /analyze HTTP endpoint),
    "openai_compat" (vLLM/Ollama/OpenAI-style chat API), and "anthropic".
    """

    def __init__(self, config_path: Path = CONFIG_PATH):
        with open(config_path) as f:
            self.config = yaml.safe_load(f)

    def _is_reachable(self, base_url: str) -> bool:
        """Quick health-check ping. Returns True if backend is up."""
        # Health endpoint lives at the service root, so drop a trailing /v1.
        health_url = base_url.rstrip("/").removesuffix("/v1") + "/health"
        try:
            resp = requests.get(health_url, timeout=2)
            # Any non-5xx response counts as "alive" (404 included).
            return resp.status_code < 500
        except Exception:
            return False

    def _resolve_model(self, client: OpenAI, model: str) -> str:
        """Resolve __auto__ to the first model served by vLLM."""
        if model != "__auto__":
            return model
        models = client.models.list()
        return models.data[0].id

    def complete(self, prompt: str, system: str | None = None,
                 model_override: str | None = None,
                 fallback_order: list[str] | None = None,
                 images: list[str] | None = None) -> str:
        """
        Generate a completion. Tries each backend in fallback_order.

        model_override: when set, replaces the configured model for
            openai_compat backends (e.g. pass a research-specific ollama model).
        fallback_order: when set, overrides config fallback_order for this
            call (e.g. pass config["research_fallback_order"] for research tasks).
        images: optional list of base64-encoded PNG/JPG strings. When provided,
            backends without supports_images=true are skipped. vision_service backends
            are only tried when images is provided.
        Raises RuntimeError if all backends are exhausted.
        """
        order = fallback_order if fallback_order is not None else self.config["fallback_order"]
        for name in order:
            backend = self.config["backends"][name]

            if not backend.get("enabled", True):
                print(f"[LLMRouter] {name}: disabled, skipping")
                continue

            supports_images = backend.get("supports_images", False)
            is_vision_service = backend["type"] == "vision_service"

            # vision_service only used when images provided
            if is_vision_service and not images:
                print(f"[LLMRouter] {name}: vision_service skipped (no images)")
                continue

            # non-vision backends skipped when images provided and they don't support it
            if images and not supports_images and not is_vision_service:
                print(f"[LLMRouter] {name}: no image support, skipping")
                continue

            if is_vision_service:
                if not self._is_reachable(backend["base_url"]):
                    print(f"[LLMRouter] {name}: unreachable, skipping")
                    continue
                try:
                    # NOTE: the custom vision service accepts a single image;
                    # only images[0] is forwarded.
                    resp = requests.post(
                        backend["base_url"].rstrip("/") + "/analyze",
                        json={
                            "prompt": prompt,
                            "image_base64": images[0] if images else "",
                        },
                        timeout=60,
                    )
                    resp.raise_for_status()
                    print(f"[LLMRouter] Used backend: {name} (vision_service)")
                    return resp.json()["text"]
                except Exception as e:
                    # Any failure falls through to the next backend.
                    print(f"[LLMRouter] {name}: error — {e}, trying next")
                    continue

            elif backend["type"] == "openai_compat":
                if not self._is_reachable(backend["base_url"]):
                    print(f"[LLMRouter] {name}: unreachable, skipping")
                    continue
                try:
                    client = OpenAI(
                        base_url=backend["base_url"],
                        api_key=backend.get("api_key") or "any",
                    )
                    raw_model = model_override or backend["model"]
                    model = self._resolve_model(client, raw_model)
                    messages = []
                    if system:
                        messages.append({"role": "system", "content": system})
                    if images and supports_images:
                        # OpenAI vision format: text part + data-URL image parts.
                        content = [{"type": "text", "text": prompt}]
                        for img in images:
                            content.append({
                                "type": "image_url",
                                "image_url": {"url": f"data:image/png;base64,{img}"},
                            })
                        messages.append({"role": "user", "content": content})
                    else:
                        messages.append({"role": "user", "content": prompt})

                    resp = client.chat.completions.create(
                        model=model, messages=messages
                    )
                    print(f"[LLMRouter] Used backend: {name} ({model})")
                    return resp.choices[0].message.content

                except Exception as e:
                    print(f"[LLMRouter] {name}: error — {e}, trying next")
                    continue

            elif backend["type"] == "anthropic":
                # API key comes from the environment, never from the YAML.
                api_key = os.environ.get(backend["api_key_env"], "")
                if not api_key:
                    print(f"[LLMRouter] {name}: {backend['api_key_env']} not set, skipping")
                    continue
                try:
                    # Imported lazily so the anthropic package is only
                    # required when this backend is actually used.
                    import anthropic as _anthropic
                    client = _anthropic.Anthropic(api_key=api_key)
                    if images and supports_images:
                        # Anthropic vision format: image blocks first, then text.
                        content = []
                        for img in images:
                            content.append({
                                "type": "image",
                                "source": {"type": "base64", "media_type": "image/png", "data": img},
                            })
                        content.append({"type": "text", "text": prompt})
                    else:
                        content = prompt
                    kwargs: dict = {
                        "model": backend["model"],
                        "max_tokens": 4096,
                        "messages": [{"role": "user", "content": content}],
                    }
                    if system:
                        kwargs["system"] = system
                    msg = client.messages.create(**kwargs)
                    print(f"[LLMRouter] Used backend: {name}")
                    return msg.content[0].text
                except Exception as e:
                    print(f"[LLMRouter] {name}: error — {e}, trying next")
                    continue

        raise RuntimeError("All LLM backends exhausted")
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level singleton for convenience
|
||||||
|
# Lazily-created shared router; built on first call to complete().
_router: LLMRouter | None = None


def complete(prompt: str, system: str | None = None) -> str:
    """Module-level convenience wrapper around a shared LLMRouter.

    The router (and its config-file read) happens once, on first use; the
    same instance handles all subsequent calls.
    """
    global _router
    _router = _router or LLMRouter()
    return _router.complete(prompt, system)
|
||||||
106
scripts/manage-ui.sh
Executable file
106
scripts/manage-ui.sh
Executable file
|
|
@ -0,0 +1,106 @@
|
||||||
|
#!/usr/bin/env bash
# scripts/manage-ui.sh — manage the Streamlit job-seeker web UI
# Usage: bash scripts/manage-ui.sh [start|stop|restart|status|logs]
#
# Keeps a PID file next to the repo so start/stop/status can find the
# detached Streamlit process. Port defaults to 8501; override with
# STREAMLIT_PORT.

set -euo pipefail

REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
# NOTE(review): hard-coded conda env path — assumes this specific machine layout.
STREAMLIT_BIN="/devl/miniconda3/envs/job-seeker/bin/streamlit"
APP_ENTRY="$REPO_DIR/app/app.py"
PID_FILE="$REPO_DIR/.streamlit.pid"
LOG_FILE="$REPO_DIR/.streamlit.log"
PORT="${STREAMLIT_PORT:-8501}"

# Launch Streamlit detached, record its PID, and verify it survived startup.
start() {
    if is_running; then
        echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload."
        return 0
    fi

    echo "Starting Streamlit on http://localhost:$PORT …"
    # headless + no file watcher: suitable for long-running server use.
    "$STREAMLIT_BIN" run "$APP_ENTRY" \
        --server.port "$PORT" \
        --server.headless true \
        --server.fileWatcherType none \
        > "$LOG_FILE" 2>&1 &
    echo $! > "$PID_FILE"
    sleep 2

    if is_running; then
        echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE"
    else
        echo "Failed to start. Check logs: $LOG_FILE"
        tail -20 "$LOG_FILE"
        exit 1
    fi
}

# Graceful TERM first, escalate to KILL if the process lingers.
stop() {
    if ! is_running; then
        echo "Not running."
        rm -f "$PID_FILE"
        return 0
    fi

    PID=$(cat "$PID_FILE")
    echo "Stopping PID $PID …"
    kill "$PID" 2>/dev/null || true
    sleep 1
    if kill -0 "$PID" 2>/dev/null; then
        kill -9 "$PID" 2>/dev/null || true
    fi
    rm -f "$PID_FILE"
    echo "Stopped."
}

restart() {
    stop
    sleep 1
    start
}

status() {
    if is_running; then
        echo "Running (PID $(cat "$PID_FILE")) on http://localhost:$PORT"
    else
        echo "Not running."
    fi
}

logs() {
    if [[ -f "$LOG_FILE" ]]; then
        tail -50 "$LOG_FILE"
    else
        echo "No log file found at $LOG_FILE"
    fi
}

# True when the PID file exists and that PID is alive (kill -0 probe).
is_running() {
    if [[ -f "$PID_FILE" ]]; then
        PID=$(cat "$PID_FILE")
        if kill -0 "$PID" 2>/dev/null; then
            return 0
        fi
    fi
    return 1
}

CMD="${1:-help}"
case "$CMD" in
    start)   start ;;
    stop)    stop ;;
    restart) restart ;;
    status)  status ;;
    logs)    logs ;;
    *)
        echo "Usage: bash scripts/manage-ui.sh [start|stop|restart|status|logs]"
        echo ""
        echo "  start    Start the Streamlit UI (default port: $PORT)"
        echo "  stop     Stop the running UI"
        echo "  restart  Stop then start"
        echo "  status   Show whether it's running"
        echo "  logs     Tail the last 50 lines of the log"
        echo ""
        echo "  STREAMLIT_PORT=8502 bash scripts/manage-ui.sh start   (custom port)"
        ;;
esac
|
||||||
113
scripts/manage-vision.sh
Executable file
113
scripts/manage-vision.sh
Executable file
|
|
@ -0,0 +1,113 @@
|
||||||
|
#!/usr/bin/env bash
# scripts/manage-vision.sh — manage the moondream2 vision service
# Usage: bash scripts/manage-vision.sh start|stop|restart|status|logs
#
# First-time setup:
#   conda env create -f scripts/vision_service/environment.yml
#
# On first start, moondream2 is downloaded from HuggingFace (~1.8GB).
# Model stays resident in memory between requests.

set -euo pipefail

CONDA_ENV="job-seeker-vision"
UVICORN_BIN="/devl/miniconda3/envs/${CONDA_ENV}/bin/uvicorn"
# PID/log live in /tmp — the service is machine-local and disposable.
PID_FILE="/tmp/vision-service.pid"
LOG_FILE="/tmp/vision-service.log"
PORT=8002
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(dirname "$SCRIPT_DIR")"

# True when the PID file exists and that PID is alive.
is_running() {
    if [[ -f "$PID_FILE" ]]; then
        PID=$(cat "$PID_FILE")
        if kill -0 "$PID" 2>/dev/null; then
            return 0
        fi
    fi
    return 1
}

start() {
    if is_running; then
        echo "Already running (PID $(cat "$PID_FILE"))."
        return 0
    fi

    # Missing uvicorn binary means the conda env was never created.
    if [[ ! -f "$UVICORN_BIN" ]]; then
        echo "ERROR: conda env '$CONDA_ENV' not found."
        echo "Install with: conda env create -f scripts/vision_service/environment.yml"
        exit 1
    fi

    echo "Starting vision service (moondream2) on port $PORT…"
    cd "$REPO_ROOT"
    # PYTHONPATH lets uvicorn import the scripts.vision_service package.
    PYTHONPATH="$REPO_ROOT" "$UVICORN_BIN" \
        scripts.vision_service.main:app \
        --host 0.0.0.0 \
        --port "$PORT" \
        > "$LOG_FILE" 2>&1 &
    echo $! > "$PID_FILE"
    sleep 2

    if is_running; then
        echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE"
        echo "Health: http://localhost:$PORT/health"
    else
        echo "Failed to start. Check logs: $LOG_FILE"
        tail -20 "$LOG_FILE"
        rm -f "$PID_FILE"
        exit 1
    fi
}

# Graceful TERM, then KILL after a grace period.
stop() {
    if ! is_running; then
        echo "Not running."
        rm -f "$PID_FILE"
        return 0
    fi
    PID=$(cat "$PID_FILE")
    echo "Stopping PID $PID…"
    kill "$PID" 2>/dev/null || true
    sleep 2
    if kill -0 "$PID" 2>/dev/null; then
        kill -9 "$PID" 2>/dev/null || true
    fi
    rm -f "$PID_FILE"
    echo "Stopped."
}

restart() { stop; sleep 1; start; }

status() {
    if is_running; then
        echo "Running (PID $(cat "$PID_FILE")) — http://localhost:$PORT"
        # Pretty-print the health JSON when the service answers.
        curl -s "http://localhost:$PORT/health" | python3 -m json.tool 2>/dev/null || true
    else
        echo "Not running."
    fi
}

logs() {
    if [[ -f "$LOG_FILE" ]]; then
        tail -50 "$LOG_FILE"
    else
        echo "No log file at $LOG_FILE"
    fi
}

CMD="${1:-help}"
case "$CMD" in
    start)   start ;;
    stop)    stop ;;
    restart) restart ;;
    status)  status ;;
    logs)    logs ;;
    *)
        echo "Usage: bash scripts/manage-vision.sh start|stop|restart|status|logs"
        echo ""
        echo "  Manages the moondream2 vision service on port $PORT."
        echo "  First-time setup: conda env create -f scripts/vision_service/environment.yml"
        ;;
esac
|
||||||
160
scripts/manage-vllm.sh
Executable file
160
scripts/manage-vllm.sh
Executable file
|
|
@ -0,0 +1,160 @@
|
||||||
|
#!/usr/bin/env bash
# scripts/manage-vllm.sh — manage the vLLM inference server
# Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list]
#
# Tracks the server PID and currently-loaded model name in /tmp so status
# and restart can report/swap models.

set -euo pipefail

VLLM_BIN="/devl/miniconda3/envs/vllm/bin/python"
MODEL_DIR="/Library/Assets/LLM/vllm/models"
PID_FILE="/tmp/vllm-server.pid"
LOG_FILE="/tmp/vllm-server.log"
# Records which model the running server was started with.
MODEL_FILE="/tmp/vllm-server.model"
PORT=8000
GPU=1

# Emit one locally-installed model directory name per line, sorted.
_list_model_names() {
    if [[ -d "$MODEL_DIR" ]]; then
        find "$MODEL_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null | sort
    fi
}

# True when the PID file exists and that PID is alive.
is_running() {
    if [[ -f "$PID_FILE" ]]; then
        PID=$(cat "$PID_FILE")
        if kill -0 "$PID" 2>/dev/null; then
            return 0
        fi
    fi
    return 1
}

# start [model] — model may be a bare name under MODEL_DIR or an absolute
# path; defaults to the first model found.
start() {
    local model_name="${1:-}"

    if [[ -z "$model_name" ]]; then
        model_name=$(_list_model_names | head -1)
        if [[ -z "$model_name" ]]; then
            echo "No models found in $MODEL_DIR"
            exit 1
        fi
    fi

    local model_path
    if [[ "$model_name" == /* ]]; then
        # Absolute path given: use it directly, display just the basename.
        model_path="$model_name"
        model_name=$(basename "$model_path")
    else
        model_path="$MODEL_DIR/$model_name"
    fi

    if [[ ! -d "$model_path" ]]; then
        echo "Model not found: $model_path"
        exit 1
    fi

    if is_running; then
        echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload."
        return 0
    fi

    echo "Starting vLLM with model: $model_name (GPU $GPU, port $PORT)…"
    echo "$model_name" > "$MODEL_FILE"

    # Ouro LoopLM uses total_ut_steps=4 which multiplies KV cache by 4x vs a standard
    # transformer. On 8 GiB GPUs: 1.4B models support ~4096 tokens; 2.6B only ~928.
    CUDA_VISIBLE_DEVICES="$GPU" "$VLLM_BIN" -m vllm.entrypoints.openai.api_server \
        --model "$model_path" \
        --trust-remote-code \
        --max-model-len 3072 \
        --gpu-memory-utilization 0.75 \
        --enforce-eager \
        --max-num-seqs 8 \
        --port "$PORT" \
        > "$LOG_FILE" 2>&1 &
    echo $! > "$PID_FILE"
    sleep 3

    if is_running; then
        echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE"
    else
        echo "Failed to start. Check logs: $LOG_FILE"
        tail -20 "$LOG_FILE"
        rm -f "$PID_FILE" "$MODEL_FILE"
        exit 1
    fi
}

# Graceful TERM, escalate to KILL; clears both PID and model markers.
stop() {
    if ! is_running; then
        echo "Not running."
        rm -f "$PID_FILE"
        return 0
    fi

    PID=$(cat "$PID_FILE")
    echo "Stopping PID $PID …"
    kill "$PID" 2>/dev/null || true
    sleep 2
    if kill -0 "$PID" 2>/dev/null; then
        kill -9 "$PID" 2>/dev/null || true
    fi
    rm -f "$PID_FILE" "$MODEL_FILE"
    echo "Stopped."
}

# restart [model] — pass a new model name to swap what gets loaded.
restart() {
    local model_name="${1:-}"
    stop
    sleep 1
    start "$model_name"
}

status() {
    if is_running; then
        local model=""
        if [[ -f "$MODEL_FILE" ]]; then
            model=" — model: $(cat "$MODEL_FILE")"
        fi
        echo "Running (PID $(cat "$PID_FILE")) on http://localhost:$PORT$model"
    else
        echo "Not running."
    fi
}

logs() {
    if [[ -f "$LOG_FILE" ]]; then
        tail -50 "$LOG_FILE"
    else
        echo "No log file found at $LOG_FILE"
    fi
}

list() {
    echo "Available models in $MODEL_DIR:"
    _list_model_names | while read -r name; do
        echo "  - $name"
    done
}

CMD="${1:-help}"
case "$CMD" in
    start)   start "${2:-}" ;;
    stop)    stop ;;
    restart) restart "${2:-}" ;;
    status)  status ;;
    logs)    logs ;;
    list)    list ;;
    *)
        echo "Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list]"
        echo ""
        echo "  start [model]    Start vLLM with the specified model (default: first in $MODEL_DIR)"
        echo "  stop             Stop the running vLLM server"
        echo "  restart [model]  Stop then start (pass a new model name to swap)"
        echo "  status           Show whether it's running and which model is loaded"
        echo "  logs             Tail the last 50 lines of the log"
        echo "  list             List available models"
        echo ""
        echo "  GPU:  $GPU (CUDA_VISIBLE_DEVICES)"
        echo "  Port: $PORT"
        ;;
esac
|
||||||
156
scripts/match.py
Normal file
156
scripts/match.py
Normal file
|
|
@ -0,0 +1,156 @@
|
||||||
|
"""
|
||||||
|
Resume match scoring.
|
||||||
|
|
||||||
|
Two modes:
|
||||||
|
1. SQLite batch — score all unscored pending/approved jobs in staging.db
|
||||||
|
Usage: python scripts/match.py
|
||||||
|
|
||||||
|
2. Notion single — score one Notion page by URL/ID and write results back
|
||||||
|
Usage: python scripts/match.py <notion-page-url-or-id>
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
import requests
|
||||||
|
import yaml
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
from notion_client import Client
|
||||||
|
|
||||||
|
CONFIG_DIR = Path(__file__).parent.parent / "config"
|
||||||
|
RESUME_PATH = Path("/Library/Documents/JobSearch/Alex_Rivera_Resume_02-19-2025.pdf")
|
||||||
|
|
||||||
|
|
||||||
|
def load_notion() -> tuple[Client, dict]:
    """Build an authenticated Notion client plus the property field map
    from config/notion.yaml."""
    cfg = yaml.safe_load((CONFIG_DIR / "notion.yaml").read_text())
    client = Client(auth=cfg["token"])
    return client, cfg["field_map"]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_page_id(url_or_id: str) -> str:
|
||||||
|
"""Extract 32-char Notion page ID from a URL or return as-is."""
|
||||||
|
clean = url_or_id.replace("-", "")
|
||||||
|
match = re.search(r"[0-9a-f]{32}", clean)
|
||||||
|
return match.group(0) if match else url_or_id.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def get_job_url_from_notion(notion: Client, page_id: str, url_field: str) -> str:
    """Read the job-listing URL property from a Notion page.

    Returns "" when the property is empty (Notion stores it as None).
    """
    page = notion.pages.retrieve(page_id)
    url_value = page["properties"][url_field]["url"]
    return url_value or ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_job_description(url: str) -> str:
    """Download *url* and return its human-visible text, whitespace-collapsed.

    Raises requests.HTTPError on a non-2xx response.
    """
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
    response.raise_for_status()
    dom = BeautifulSoup(response.text, "html.parser")
    # Drop page chrome that never contains the job description.
    for noise in dom(["script", "style", "nav", "header", "footer"]):
        noise.decompose()
    words = dom.get_text(separator=" ").split()
    return " ".join(words)
|
||||||
|
|
||||||
|
|
||||||
|
def read_resume_text() -> str:
    """Extract text from the ATS-clean PDF resume at RESUME_PATH."""
    import pypdf  # lazy import: pypdf only needed for this code path
    reader = pypdf.PdfReader(str(RESUME_PATH))
    page_texts = (page.extract_text() or "" for page in reader.pages)
    return " ".join(page_texts)
|
||||||
|
|
||||||
|
|
||||||
|
def match_score(resume_text: str, job_text: str) -> tuple[float, list[str]]:
    """
    Score resume against job description using TF-IDF cosine similarity.
    Returns (score 0–100, list of high-value job keywords missing from resume).
    """
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    # Fit on both documents at once so they share one vocabulary.
    vec = TfidfVectorizer(stop_words="english", max_features=200)
    matrix = vec.fit_transform([resume_text, job_text])
    similarity = cosine_similarity(matrix[0:1], matrix[1:2])[0][0]
    score = float(similarity) * 100

    # Gap analysis: top-weighted job terms absent from the resume's words.
    resume_vocab = set(resume_text.lower().split())
    terms = vec.get_feature_names_out()
    job_weights = matrix[1].toarray()[0]
    ranked = np.argsort(job_weights)[::-1][:30]
    strongest = [terms[i] for i in ranked if job_weights[i] > 0]
    gaps = [t for t in strongest if t not in resume_vocab][:10]

    return round(score, 1), gaps
|
||||||
|
|
||||||
|
|
||||||
|
def write_match_to_notion(notion: Client, page_id: str, score: float, gaps: list[str], fm: dict) -> None:
    """Write the match score and comma-joined keyword gaps back to the page.

    Property names come from the ``fm`` field map (config-driven).
    """
    props = {
        fm["match_score"]: {"number": score},
        fm["keyword_gaps"]: {"rich_text": [{"text": {"content": ", ".join(gaps)}}]},
    }
    notion.pages.update(page_id=page_id, properties=props)
|
||||||
|
|
||||||
|
|
||||||
|
def run_match(page_url_or_id: str) -> None:
    """Score the resume against one Notion job page and write results back."""
    notion, fm = load_notion()
    page_id = extract_page_id(page_url_or_id)
    print(f"[match] Page ID: {page_id}")

    listing_url = get_job_url_from_notion(notion, page_id, fm["url"])
    print(f"[match] Fetching job description from: {listing_url}")

    description = extract_job_description(listing_url)
    resume = read_resume_text()
    score, gaps = match_score(resume, description)

    print(f"[match] Score: {score}/100")
    print(f"[match] Keyword gaps: {', '.join(gaps) or 'none'}")

    write_match_to_notion(notion, page_id, score, gaps, fm)
    print("[match] Written to Notion.")
|
||||||
|
|
||||||
|
|
||||||
|
def score_pending_jobs(db_path: "Path | None" = None) -> int:
    """
    Score all unscored jobs (any status) in SQLite using the description
    already scraped during discovery. Writes match_score + keyword_gaps back.

    Args:
        db_path: SQLite database path; defaults to scripts.db.DEFAULT_DB.
                 (Annotation fixed: the old `Path = None` was an implicit
                 Optional, disallowed by PEP 484.)

    Returns the number of jobs scored.
    """
    from scripts.db import DEFAULT_DB, write_match_scores

    if db_path is None:
        db_path = DEFAULT_DB

    import sqlite3
    from contextlib import closing

    # closing() guarantees the connection is released even if the query
    # raises (the old code leaked the handle on an execute() failure).
    with closing(sqlite3.connect(db_path)) as conn:
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT id, title, company, description FROM jobs "
            "WHERE match_score IS NULL "
            "AND description IS NOT NULL AND description != '' AND description != 'nan'"
        ).fetchall()

    if not rows:
        print("[match] No unscored jobs with descriptions found.")
        return 0

    resume_text = read_resume_text()
    scored = 0
    for row in rows:
        job_id, title, company, description = row["id"], row["title"], row["company"], row["description"]
        try:
            score, gaps = match_score(resume_text, description)
            write_match_scores(db_path, job_id, score, ", ".join(gaps))
            print(f"[match] {title} @ {company}: {score}/100 gaps: {', '.join(gaps) or 'none'}")
            scored += 1
        except Exception as e:
            # Best-effort batch: one bad description must not abort the run.
            print(f"[match] Error scoring job {job_id}: {e}")

    print(f"[match] Done — {scored} jobs scored.")
    return scored
|
||||||
|
|
||||||
|
|
||||||
|
# CLI entry point: with no argument, batch-score every unscored job in the
# SQLite staging DB; with a Notion page URL/ID argument, score that single
# job and write the result back to Notion.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        score_pending_jobs()
    else:
        run_match(sys.argv[1])
|
||||||
134
scripts/prepare_training_data.py
Normal file
134
scripts/prepare_training_data.py
Normal file
|
|
@ -0,0 +1,134 @@
|
||||||
|
# scripts/prepare_training_data.py
|
||||||
|
"""
|
||||||
|
Extract training pairs from Alex's cover letter corpus for LoRA fine-tuning.
|
||||||
|
|
||||||
|
Outputs a JSONL file where each line is:
|
||||||
|
{"instruction": "Write a cover letter for the [role] position at [company].",
|
||||||
|
"output": "<full letter text>"}
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/prepare_training_data.py
|
||||||
|
conda run -n job-seeker python scripts/prepare_training_data.py --output /path/to/out.jsonl
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Root folder of the historical cover-letter corpus (markdown files).
LETTERS_DIR = Path("/Library/Documents/JobSearch")
# Use two globs to handle mixed capitalisation ("Cover Letter" vs "cover letter")
LETTER_GLOBS = ["*Cover Letter*.md", "*cover letter*.md"]
# Default JSONL destination for the extracted training pairs.
DEFAULT_OUTPUT = LETTERS_DIR / "training_data" / "cover_letters.jsonl"

# Patterns that appear in opening sentences to extract role
# (tried in order; group 1 captures the role title).
ROLE_PATTERNS = [
    r"apply for (?:the )?(.+?) (?:position|role|opportunity) at",
    r"apply for (?:the )?(.+?) (?:at|with)\b",
]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_role_from_text(text: str) -> str:
    """Best-effort extraction of the role title from a cover letter's opening.

    Only the first 600 characters are searched (past any greeting line), and
    a candidate is accepted only when it is 1–6 words long; longer matches
    are treated as noise and the next pattern is tried.
    """
    opening = text[:600]
    for pattern in ROLE_PATTERNS:
        found = re.search(pattern, opening, re.IGNORECASE)
        if not found:
            continue
        candidate = found.group(1).strip().rstrip(".")
        if 1 <= len(candidate.split()) <= 6:
            return candidate
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def extract_company_from_filename(stem: str) -> str:
    """Extract the company name from a cover-letter filename stem.

    Everything from the (case-insensitive) words "Cover Letter" onward is
    dropped, along with any whitespace immediately before them.
    """
    company = re.sub(r"\s*Cover Letter.*", "", stem, flags=re.IGNORECASE)
    return company.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def strip_greeting(text: str) -> str:
    """Remove the 'Dear X,' line so the output is just the letter body + sign-off.

    If no greeting line is found the whole text is returned stripped.
    """
    lines = text.splitlines()
    for idx, line in enumerate(lines):
        if not line.strip().lower().startswith("dear "):
            continue
        # Drop the greeting itself and any blank lines directly after it.
        body = lines[idx + 1:]
        while body and body[0].strip() == "":
            body = body[1:]
        return "\n".join(body).strip()
    return text.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def build_records(letters_dir: Path = LETTERS_DIR) -> list[dict]:
    """Parse every cover letter under *letters_dir* into training records.

    Letters shorter than 100 characters are skipped. Each record carries the
    generated instruction, the greeting-stripped body, and the source filename.
    """
    # Set-union of both globs dedupes files matched by either capitalisation.
    unique_paths = {p for pattern in LETTER_GLOBS for p in letters_dir.glob(pattern)}

    records: list[dict] = []
    for path in sorted(unique_paths):
        text = path.read_text(encoding="utf-8", errors="ignore").strip()
        if len(text) < 100:  # empty or trivially short letters carry no signal
            continue

        company = extract_company_from_filename(path.stem)
        role = extract_role_from_text(text)
        body = strip_greeting(text)

        instruction = (
            f"Write a cover letter for the {role} position at {company}."
            if role
            # Generic instruction when role extraction fails.
            else f"Write a cover letter for a position at {company}."
        )
        records.append({
            "instruction": instruction,
            "output": body,
            "source_file": path.name,
        })
    return records
|
||||||
|
|
||||||
|
|
||||||
|
def write_jsonl(records: list[dict], output_path: Path) -> None:
    """Serialise *records* to JSONL (one object per line), creating parent dirs."""
    output_path.parent.mkdir(parents=True, exist_ok=True)
    lines = [json.dumps(record, ensure_ascii=False) + "\n" for record in records]
    with output_path.open("w", encoding="utf-8") as f:
        f.writelines(lines)
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """CLI: build training records from the letter corpus and write JSONL.

    --stats prints corpus statistics instead of writing the output file.
    """
    parser = argparse.ArgumentParser(description="Prepare LoRA training data from cover letter corpus")
    parser.add_argument("--output", default=str(DEFAULT_OUTPUT), help="Output JSONL path")
    parser.add_argument("--letters-dir", default=str(LETTERS_DIR), help="Directory of cover letters")
    parser.add_argument("--stats", action="store_true", help="Print statistics and exit")
    args = parser.parse_args()

    records = build_records(Path(args.letters_dir))

    if args.stats:
        print(f"Total letters: {len(records)}")
        # Records that fell back to the generic instruction had no role extracted.
        with_role = sum(1 for r in records if not r["instruction"].startswith("Write a cover letter for a position"))
        print(f"Role extracted: {with_role}/{len(records)}")
        # max(..., 1) guards against division by zero on an empty corpus.
        avg_len = sum(len(r["output"]) for r in records) / max(len(records), 1)
        print(f"Avg letter length: {avg_len:.0f} chars")
        for r in records:
            print(f"  {r['source_file']!r:55s} → {r['instruction'][:70]}")
        return

    output_path = Path(args.output)
    write_jsonl(records, output_path)
    print(f"Wrote {len(records)} training records to {output_path}")
    print()
    print("Next step for LoRA fine-tuning:")
    print(" 1. Download base model: huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct")
    print(" 2. Fine-tune with TRL: see docs/plans/lora-finetune.md (to be created)")
    print(" 3. Or use HuggingFace Jobs: bash scripts/manage-ui.sh — hugging-face-model-trainer skill")


if __name__ == "__main__":
    main()
|
||||||
228
scripts/scrape_url.py
Normal file
228
scripts/scrape_url.py
Normal file
|
|
@ -0,0 +1,228 @@
|
||||||
|
# scripts/scrape_url.py
|
||||||
|
"""
|
||||||
|
Scrape a job listing from its URL and update the job record.
|
||||||
|
|
||||||
|
Supports:
|
||||||
|
- LinkedIn (guest jobs API — no auth required)
|
||||||
|
- Indeed (HTML parse)
|
||||||
|
- Glassdoor (JobSpy internal scraper, same as enrich_descriptions.py)
|
||||||
|
- Generic (JSON-LD → og:tags fallback)
|
||||||
|
|
||||||
|
Usage (background task — called by task_runner):
|
||||||
|
from scripts.scrape_url import scrape_job_url
|
||||||
|
scrape_job_url(db_path, job_id)
|
||||||
|
"""
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
from urllib.parse import urlparse, urlencode, parse_qsl
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from scripts.db import DEFAULT_DB, update_job_fields
|
||||||
|
|
||||||
|
# Query-string keys stripped by canonicalize_url (analytics / tracking params).
_STRIP_PARAMS = {
    "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term",
    "trk", "trkEmail", "refId", "trackingId", "lipi", "midToken", "midSig",
    "eid", "otpToken", "ssid", "fmid",
}

# Desktop-browser User-Agent sent with every request — some job boards
# refuse or degrade responses to default python-requests agents.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
    )
}
# Per-request timeout in seconds for all outbound HTTP calls in this module.
_TIMEOUT = 12
|
||||||
|
|
||||||
|
|
||||||
|
def _detect_board(url: str) -> str:
    """Return 'linkedin', 'indeed', 'glassdoor', or 'generic' for *url*."""
    lowered = url.lower()
    for domain, board in (
        ("linkedin.com", "linkedin"),
        ("indeed.com", "indeed"),
        ("glassdoor.com", "glassdoor"),
    ):
        if domain in lowered:
            return board
    return "generic"
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_linkedin_job_id(url: str) -> Optional[str]:
    """Return the numeric job ID from a LinkedIn job URL, or None."""
    match = re.search(r"/jobs/view/(\d+)", url)
    if match is None:
        return None
    return match.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
def canonicalize_url(url: str) -> str:
    """
    Return *url* with tracking parameters removed.

    LinkedIn job links collapse to https://www.linkedin.com/jobs/view/<id>/;
    every other URL keeps its query string minus the known tracking keys
    (utm_*, trk, refId, trackingId, ...).
    """
    cleaned = url.strip()
    if "linkedin.com" in cleaned.lower():
        posting_id = _extract_linkedin_job_id(cleaned)
        if posting_id is not None:
            return f"https://www.linkedin.com/jobs/view/{posting_id}/"
    parts = urlparse(cleaned)
    kept = [(key, val) for key, val in parse_qsl(parts.query) if key not in _STRIP_PARAMS]
    return parts._replace(query=urlencode(kept)).geturl()
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_linkedin(url: str) -> dict:
    """Fetch a posting via LinkedIn's guest jobs API (no auth required)."""
    posting_id = _extract_linkedin_job_id(url)
    if not posting_id:
        return {}
    api_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{posting_id}"
    response = requests.get(api_url, headers=_HEADERS, timeout=_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    def first_text(selector, **kwargs):
        node = soup.find(selector, **kwargs)
        return node.get_text(strip=True) if node else ""

    desc_node = soup.find("div", class_="show-more-less-html__markup")
    candidates = {
        "title": first_text("h2", class_="top-card-layout__title"),
        "company": first_text("a", class_="topcard__org-name-link")
        or first_text("span", class_="topcard__org-name-link"),
        "location": first_text("span", class_="topcard__flavor--bullet"),
        "description": desc_node.get_text(separator="\n", strip=True) if desc_node else "",
        "source": "linkedin",
    }
    # Drop empty fields so callers never overwrite good data with blanks.
    return {key: val for key, val in candidates.items() if val}
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_indeed(url: str) -> dict:
    """Fetch an Indeed job page and parse its structured data (JSON-LD / og:)."""
    response = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    response.raise_for_status()
    parsed = _parse_json_ld_or_og(response.text)
    return parsed if parsed else {}
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_glassdoor(url: str) -> dict:
    """Re-use JobSpy's Glassdoor scraper for description fetch.

    Returns {"description": ...} on success, {} when the URL has no `jl=`
    job-listing ID or anything in the JobSpy call chain fails.
    """
    # Glassdoor listing ID is carried in the `jl` query parameter.
    m = re.search(r"jl=(\d+)", url)
    if not m:
        return {}
    try:
        # NOTE(review): this reaches into JobSpy private internals
        # (_get_csrf_token / _fetch_job_description) — may break on upgrade.
        from jobspy.glassdoor import Glassdoor
        from jobspy.glassdoor.constant import fallback_token, headers
        from jobspy.model import ScraperInput, Site
        from jobspy.util import create_session

        scraper = Glassdoor()
        scraper.base_url = "https://www.glassdoor.com/"
        scraper.session = create_session(has_retry=True)
        token = scraper._get_csrf_token()
        # Fall back to JobSpy's bundled token when live CSRF fetch fails.
        headers["gd-csrf-token"] = token if token else fallback_token
        scraper.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR])
        description = scraper._fetch_job_description(int(m.group(1)))
        return {"description": description} if description else {}
    except Exception:
        # Best-effort: any JobSpy failure degrades to "no data", not a crash.
        return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_json_ld_or_og(html: str) -> dict:
    """Extract job fields from JSON-LD structured data, then og: meta tags.

    Tries every <script type="application/ld+json"> block for a JobPosting;
    the first one found wins. If none parses, falls back to og:title /
    og:description meta tags (then the <title> tag). Empty values are
    dropped from the returned dict.
    """
    soup = BeautifulSoup(html, "html.parser")

    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
            # Some pages wrap several LD objects in a list — pick the JobPosting.
            if isinstance(data, list):
                data = next((d for d in data if d.get("@type") == "JobPosting"), {})
            if data.get("@type") == "JobPosting":
                org = data.get("hiringOrganization") or {}
                loc = data.get("jobLocation") or {}
                # jobLocation may itself be a list of places — use the first.
                if isinstance(loc, list):
                    loc = loc[0] if loc else {}
                addr = loc.get("address") or {}
                # Most-specific non-empty component wins: locality > region > country.
                location = (
                    addr.get("addressLocality", "") or
                    addr.get("addressRegion", "") or
                    addr.get("addressCountry", "")
                )
                return {k: v for k, v in {
                    "title": data.get("title", ""),
                    "company": org.get("name", ""),
                    "location": location,
                    "description": data.get("description", ""),
                    "salary": str(data.get("baseSalary", "")) if data.get("baseSalary") else "",
                }.items() if v}
        except Exception:
            # Malformed LD block — keep trying the remaining scripts.
            continue

    def _meta(prop):
        # og: tags use property=, some sites use name= — accept either.
        tag = soup.find("meta", property=prop) or soup.find("meta", attrs={"name": prop})
        return tag.get("content", "") if tag else ""

    title_tag = soup.find("title")
    title = _meta("og:title") or (title_tag.get_text(strip=True) if title_tag else "")
    description = _meta("og:description")
    return {k: v for k, v in {"title": title, "description": description}.items() if v}
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_generic(url: str) -> dict:
    """Fallback scraper: fetch the page and rely on JSON-LD / og: metadata."""
    response = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    response.raise_for_status()
    parsed = _parse_json_ld_or_og(response.text)
    return parsed if parsed else {}
|
||||||
|
|
||||||
|
|
||||||
|
def scrape_job_url(db_path: Path = DEFAULT_DB, job_id: Optional[int] = None) -> dict:
    """
    Fetch the job listing at the stored URL and update the job record.

    Args:
        db_path: SQLite database path.
        job_id:  jobs.id of the row to scrape. (Annotation fixed: the old
                 `int = None` was an implicit Optional, disallowed by PEP 484;
                 Optional is already imported in this module.)

    Returns the dict of fields scraped (may be empty on failure).
    Does not raise — failures are logged and the job row is left as-is.
    """
    if job_id is None:
        return {}

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    row = conn.execute("SELECT url FROM jobs WHERE id=?", (job_id,)).fetchone()
    conn.close()
    if not row:
        return {}

    url = row["url"] or ""
    # Skip placeholder / non-web URLs (e.g. empty or "nan").
    if not url.startswith("http"):
        return {}

    board = _detect_board(url)
    try:
        if board == "linkedin":
            fields = _scrape_linkedin(url)
        elif board == "indeed":
            fields = _scrape_indeed(url)
        elif board == "glassdoor":
            fields = _scrape_glassdoor(url)
        else:
            fields = _scrape_generic(url)
    except requests.RequestException as exc:
        print(f"[scrape_url] HTTP error for job {job_id} ({url}): {exc}")
        return {}
    except Exception as exc:
        print(f"[scrape_url] Error scraping job {job_id} ({url}): {exc}")
        return {}

    if fields:
        # Never let a scraper overwrite the canonical stored URL.
        fields.pop("url", None)
        update_job_fields(db_path, job_id, fields)
        print(f"[scrape_url] job {job_id}: scraped '{fields.get('title', '?')}' @ {fields.get('company', '?')}")

    return fields
|
||||||
97
scripts/sync.py
Normal file
97
scripts/sync.py
Normal file
|
|
@ -0,0 +1,97 @@
|
||||||
|
# scripts/sync.py
|
||||||
|
"""
|
||||||
|
Push approved jobs from SQLite staging to Notion.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/sync.py
|
||||||
|
"""
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from datetime import datetime
|
||||||
|
|
||||||
|
from notion_client import Client
|
||||||
|
|
||||||
|
from scripts.db import DEFAULT_DB, get_jobs_by_status, update_job_status
|
||||||
|
|
||||||
|
CONFIG_DIR = Path(__file__).parent.parent / "config"
|
||||||
|
|
||||||
|
|
||||||
|
def load_notion_config() -> dict:
    """Load and parse config/notion.yaml (token, database id, field map)."""
    config_text = (CONFIG_DIR / "notion.yaml").read_text()
    return yaml.safe_load(config_text)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_properties(job: dict, fm: dict, include_optional: bool = True) -> dict:
    """Build the Notion properties dict for a job. Optional fields (match_score,
    keyword_gaps) are included by default but can be dropped for DBs that don't
    have those columns yet."""
    props = {
        # NOTE(review): the page title deliberately(?) shows the salary when
        # present, falling back to the job title — confirm this matches the
        # intended Notion board layout.
        fm["title_field"]: {"title": [{"text": {"content": job.get("salary") or job.get("title", "")}}]},
        fm["job_title"]: {"rich_text": [{"text": {"content": job.get("title", "")}}]},
        fm["company"]: {"rich_text": [{"text": {"content": job.get("company", "")}}]},
        # Notion rejects empty-string URLs — coerce falsy values to None.
        fm["url"]: {"url": job.get("url") or None},
        fm["source"]: {"multi_select": [{"name": job.get("source", "unknown").title()}]},
        fm["status"]: {"select": {"name": fm["status_new"]}},
        fm["remote"]: {"checkbox": bool(job.get("is_remote", 0))},
        # date_found falls back to today's date (YYYY-MM-DD).
        fm["date_found"]: {"date": {"start": job.get("date_found", datetime.now().isoformat()[:10])}},
    }
    if include_optional:
        score = job.get("match_score")
        # Only set optional properties when the field-map declares a column.
        if score is not None and fm.get("match_score"):
            props[fm["match_score"]] = {"number": score}
        gaps = job.get("keyword_gaps")
        if gaps and fm.get("keyword_gaps"):
            props[fm["keyword_gaps"]] = {"rich_text": [{"text": {"content": gaps}}]}
    return props
|
||||||
|
|
||||||
|
|
||||||
|
def sync_to_notion(db_path: Path = DEFAULT_DB) -> int:
    """Push all approved and applied jobs to Notion. Returns count synced.

    Jobs that sync successfully are moved to status 'synced'; per-job
    failures are logged and do not abort the batch.
    """
    cfg = load_notion_config()
    notion = Client(auth=cfg["token"])
    db_id = cfg["database_id"]
    fm = cfg["field_map"]

    approved = get_jobs_by_status(db_path, "approved")
    applied = get_jobs_by_status(db_path, "applied")
    pending_sync = approved + applied
    if not pending_sync:
        print("[sync] No approved/applied jobs to sync.")
        return 0

    synced_ids = []
    for job in pending_sync:
        try:
            # First attempt: full property set including optional columns.
            notion.pages.create(
                parent={"database_id": db_id},
                properties=_build_properties(job, fm, include_optional=True),
            )
            synced_ids.append(job["id"])
            print(f"[sync] + {job.get('title')} @ {job.get('company')}")
        except Exception as e:
            err = str(e)
            # Notion returns 400 validation_error when a property column doesn't exist yet.
            # Fall back to core fields only and warn the user.
            if "validation_error" in err or "Could not find property" in err:
                try:
                    notion.pages.create(
                        parent={"database_id": db_id},
                        properties=_build_properties(job, fm, include_optional=False),
                    )
                    synced_ids.append(job["id"])
                    print(f"[sync] + {job.get('title')} @ {job.get('company')} "
                          f"(skipped optional fields — add Match Score / Keyword Gaps columns to Notion DB)")
                except Exception as e2:
                    # Retry with core fields also failed — log and continue.
                    print(f"[sync] Error syncing {job.get('url')}: {e2}")
            else:
                print(f"[sync] Error syncing {job.get('url')}: {e}")

    # Mark only the jobs that actually reached Notion.
    update_job_status(db_path, synced_ids, "synced")
    print(f"[sync] Done — {len(synced_ids)} jobs synced to Notion.")
    return len(synced_ids)


if __name__ == "__main__":
    sync_to_notion()
|
||||||
155
scripts/task_runner.py
Normal file
155
scripts/task_runner.py
Normal file
|
|
@ -0,0 +1,155 @@
|
||||||
|
# scripts/task_runner.py
|
||||||
|
"""
|
||||||
|
Background task runner for LLM generation tasks.
|
||||||
|
|
||||||
|
Submitting a task inserts a row in background_tasks and spawns a daemon thread.
|
||||||
|
The thread calls the appropriate generator, writes results to existing tables,
|
||||||
|
and marks the task completed or failed.
|
||||||
|
|
||||||
|
Deduplication: only one queued/running task per (task_type, job_id) is allowed.
|
||||||
|
Different task types for the same job run concurrently (e.g. cover letter + research).
|
||||||
|
"""
|
||||||
|
import sqlite3
|
||||||
|
import threading
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from scripts.db import (
|
||||||
|
DEFAULT_DB,
|
||||||
|
insert_task,
|
||||||
|
update_task_status,
|
||||||
|
update_task_stage,
|
||||||
|
update_cover_letter,
|
||||||
|
save_research,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def submit_task(db_path: Path = DEFAULT_DB, task_type: str = "",
                job_id: "int | None" = None) -> tuple[int, bool]:
    """Submit a background LLM task.

    Inserts (or finds) a row in background_tasks; for a newly queued task a
    daemon thread is spawned to execute it. (Annotation fixed: the old
    `job_id: int = None` was an implicit Optional, disallowed by PEP 484;
    the union is quoted so it stays valid on pre-3.10 interpreters.)

    Args:
        db_path:   SQLite database path.
        task_type: Task name handled by _run_task ("discovery", ...).
        job_id:    Associated jobs.id, or None/0 for a global task.

    Returns (task_id, True) if a new task was queued and a thread spawned.
    Returns (existing_id, False) if an identical task is already in-flight.
    """
    task_id, is_new = insert_task(db_path, task_type, job_id)
    if is_new:
        worker = threading.Thread(
            target=_run_task,
            args=(db_path, task_id, task_type, job_id),
            daemon=True,  # never block interpreter shutdown on a stuck task
        )
        worker.start()
    return task_id, is_new
|
||||||
|
|
||||||
|
|
||||||
|
def _run_task(db_path: Path, task_id: int, task_type: str, job_id: int) -> None:
    """Thread body: run the generator and persist the result.

    Branches that compose their own status message call update_task_status
    and return early; the remaining branches fall through to the plain
    "completed" update at the bottom. Any exception (including SystemExit)
    marks the task failed.
    """
    # job_id == 0 means a global task (e.g. discovery) with no associated job row.
    job: dict = {}
    if job_id:
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
        conn.close()
        if row is None:
            update_task_status(db_path, task_id, "failed", error=f"Job {job_id} not found")
            return
        job = dict(row)

    update_task_status(db_path, task_id, "running")

    try:
        if task_type == "discovery":
            # Full discovery run; summary goes in the task's error/message field.
            from scripts.discover import run_discovery
            new_count = run_discovery(db_path)
            n = new_count or 0
            update_task_status(
                db_path, task_id, "completed",
                error=f"{n} new listing{'s' if n != 1 else ''} added",
            )
            return

        elif task_type == "cover_letter":
            # Generate and store a cover letter for this job.
            from scripts.generate_cover_letter import generate
            result = generate(
                job.get("title", ""),
                job.get("company", ""),
                job.get("description", ""),
            )
            update_cover_letter(db_path, job_id, result)

        elif task_type == "company_research":
            # Multi-stage research; stage callback keeps the UI progress fresh.
            from scripts.company_research import research_company
            result = research_company(
                job,
                on_stage=lambda s: update_task_stage(db_path, task_id, s),
            )
            save_research(db_path, job_id=job_id, **result)

        elif task_type == "enrich_descriptions":
            # Batch description fetch; summarize successes/failures in the message.
            from scripts.enrich_descriptions import enrich_all_descriptions
            r = enrich_all_descriptions(db_path)
            errs = len(r.get("errors", []))
            msg = (
                f"{r['succeeded']} description(s) fetched, {r['failed']} failed"
                + (f", {errs} error(s)" if errs else "")
            )
            update_task_status(db_path, task_id, "completed", error=msg)
            return

        elif task_type == "scrape_url":
            from scripts.scrape_url import scrape_job_url
            fields = scrape_job_url(db_path, job_id)
            title = fields.get("title") or job.get("url", "?")
            company = fields.get("company", "")
            msg = f"{title}" + (f" @ {company}" if company else "")
            update_task_status(db_path, task_id, "completed", error=msg)
            # Auto-enrich company/salary for Craigslist jobs
            conn = sqlite3.connect(db_path)
            conn.row_factory = sqlite3.Row
            job_row = conn.execute(
                "SELECT source, company FROM jobs WHERE id=?", (job_id,)
            ).fetchone()
            conn.close()
            # Re-read the row: the scrape above may have filled company already.
            if job_row and job_row["source"] == "craigslist" and not job_row["company"]:
                submit_task(db_path, "enrich_craigslist", job_id)
            return

        elif task_type == "enrich_craigslist":
            from scripts.enrich_descriptions import enrich_craigslist_fields
            extracted = enrich_craigslist_fields(db_path, job_id)
            company = extracted.get("company", "")
            msg = f"company={company}" if company else "no company found"
            update_task_status(db_path, task_id, "completed", error=msg)
            return

        elif task_type == "email_sync":
            try:
                from scripts.imap_sync import sync_all
                result = sync_all(db_path,
                                  on_stage=lambda s: update_task_stage(db_path, task_id, s))
                leads = result.get("new_leads", 0)
                todo = result.get("todo_attached", 0)
                errs = len(result.get("errors", []))
                msg = (
                    f"{result['synced']} jobs updated, "
                    f"+{result['inbound']} in, +{result['outbound']} out"
                    + (f", {leads} new lead(s)" if leads else "")
                    + (f", {todo} todo attached" if todo else "")
                    + (f", {errs} error(s)" if errs else "")
                )
                update_task_status(db_path, task_id, "completed", error=msg)
                return
            except FileNotFoundError:
                # Raised by load_config when config/email.yaml is missing.
                update_task_status(db_path, task_id, "failed",
                                   error="Email not configured — go to Settings → Email")
                return

        else:
            raise ValueError(f"Unknown task_type: {task_type!r}")

        # Branches without a custom summary land here.
        update_task_status(db_path, task_id, "completed")

    except BaseException as exc:
        # BaseException catches SystemExit (from companyScraper sys.exit calls)
        # in addition to regular exceptions.
        update_task_status(db_path, task_id, "failed", error=str(exc))
|
||||||
159
scripts/test_email_classify.py
Normal file
159
scripts/test_email_classify.py
Normal file
|
|
@ -0,0 +1,159 @@
|
||||||
|
#!/usr/bin/env python
|
||||||
|
"""
|
||||||
|
Compare email classifiers across models on a live sample from IMAP.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
conda run -n job-seeker python scripts/test_email_classify.py
|
||||||
|
conda run -n job-seeker python scripts/test_email_classify.py --limit 30
|
||||||
|
conda run -n job-seeker python scripts/test_email_classify.py --dry-run # phrase filter only, no LLM
|
||||||
|
|
||||||
|
Outputs a table: subject | phrase_blocked | phi3 | llama3.1 | vllm
|
||||||
|
"""
|
||||||
|
import argparse
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from scripts.imap_sync import (
|
||||||
|
load_config, connect, _search_folder, _parse_message,
|
||||||
|
_has_recruitment_keyword, _has_rejection_or_ats_signal,
|
||||||
|
_CLASSIFY_SYSTEM, _CLASSIFY_LABELS,
|
||||||
|
_REJECTION_PHRASES, _SPAM_PHRASES, _ATS_CONFIRM_SUBJECTS, _SPAM_SUBJECT_PREFIXES,
|
||||||
|
)
|
||||||
|
from scripts.llm_router import LLMRouter
|
||||||
|
|
||||||
|
# Shared router instance used for every classification call in this script.
_ROUTER = LLMRouter()

# Model matrix: column label -> (model_override, fallback_order) for _classify.
MODELS = {
    "phi3": ("phi3:mini", ["ollama_research"]),
    "llama3": ("llama3.1:8b", ["ollama_research"]),
    "vllm": ("__auto__", ["vllm"]),
}

# Deliberately broad subject terms so the sample includes borderline emails.
BROAD_TERMS = ["interview", "opportunity", "offer letter", "job offer", "application", "recruiting"]
|
||||||
|
|
||||||
|
|
||||||
|
def _classify(subject: str, body: str, model_override: str, fallback_order: list) -> str:
    """Classify one email via the LLM router; return a label or an error tag."""
    try:
        prompt = f"Subject: {subject}\n\nEmail: {body[:600]}"
        raw = _ROUTER.complete(
            prompt,
            system=_CLASSIFY_SYSTEM,
            model_override=model_override,
            fallback_order=fallback_order,
        )
        # Strip any chain-of-thought tags the model emitted, then normalise.
        text = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).lower().strip()
        matched = next(
            (label for label in _CLASSIFY_LABELS
             if text.startswith(label) or label in text),
            None,
        )
        if matched is not None:
            return matched
        return f"? ({text[:30]})"
    except Exception as e:
        return f"ERR: {e!s:.20}"
|
||||||
|
|
||||||
|
|
||||||
|
def _short(s: str, n: int = 55) -> str:
    """Clip *s* to at most *n* characters, ending with an ellipsis when cut."""
    if len(s) <= n:
        return s
    return s[:n - 1] + "…"
|
||||||
|
|
||||||
|
|
||||||
|
def _explain_block(subject: str, body: str) -> str:
    """Return the first phrase/rule that triggered a block.

    Mirrors the rule order of _has_rejection_or_ats_signal: spam subject
    prefixes, then ATS confirmation subjects, then rejection/spam phrases in
    the subject plus the first 800 chars of the body.
    """
    subj = subject.lower().strip()

    prefix = next((p for p in _SPAM_SUBJECT_PREFIXES if subj.startswith(p)), None)
    if prefix is not None:
        return f"subject prefix: {prefix!r}"

    ats = next((p for p in _ATS_CONFIRM_SUBJECTS if p in subj), None)
    if ats is not None:
        return f"ATS subject: {ats!r}"

    searchable = subj + " " + body[:800].lower()
    hit = next((p for p in _REJECTION_PHRASES + _SPAM_PHRASES if p in searchable), None)
    if hit is not None:
        return f"phrase: {hit!r}"

    return "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """CLI entry point: fetch a sample of inbox emails and compare classifiers.

    Searches INBOX for each BROAD_TERMS subject, dedupes UIDs, then prints a
    table per email: recruitment-keyword hit (RK), phrase-filter verdict, and
    (unless --dry-run) the label each configured model assigns.

    Fix vs. original: removed the unused local `rows = []`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--limit", type=int, default=20, help="Max emails to test")
    parser.add_argument("--days", type=int, default=90)
    parser.add_argument("--dry-run", action="store_true",
                        help="Skip LLM calls — show phrase filter only")
    parser.add_argument("--verbose", action="store_true",
                        help="Show which phrase triggered each BLOCK")
    args = parser.parse_args()

    cfg = load_config()
    # IMAP SINCE wants the dd-Mon-yyyy form.
    since = (datetime.now() - timedelta(days=args.days)).strftime("%d-%b-%Y")

    print(f"Connecting to {cfg.get('host')} …")
    conn = connect(cfg)

    # Collect unique UIDs across broad terms (dict keys preserve first-seen order).
    all_uids: dict[bytes, None] = {}
    for term in BROAD_TERMS:
        for uid in _search_folder(conn, "INBOX", f'(SUBJECT "{term}")', since):
            all_uids[uid] = None

    sample = list(all_uids.keys())[: args.limit]
    print(f"Fetched {len(all_uids)} matching UIDs, testing {len(sample)}\n")

    # Header
    if args.dry_run:
        print(f"{'Subject':<56} {'RK':3} {'Phrase':7}")
        print("-" * 72)
    else:
        print(f"{'Subject':<56} {'RK':3} {'Phrase':7} {'phi3':<20} {'llama3':<20} {'vllm':<20}")
        print("-" * 130)

    passed = skipped = 0

    for uid in sample:
        parsed = _parse_message(conn, uid)
        if not parsed:
            continue
        subj = parsed["subject"]
        body = parsed["body"]

        has_rk = _has_recruitment_keyword(subj)
        phrase_block = _has_rejection_or_ats_signal(subj, body)

        if args.dry_run:
            rk_mark = "✓" if has_rk else "✗"
            pb_mark = "BLOCK" if phrase_block else "pass"
            line = f"{_short(subj):<56} {rk_mark:3} {pb_mark:7}"
            if phrase_block and args.verbose:
                reason = _explain_block(subj, body)
                line += f" [{reason}]"
            print(line)
            continue

        # Emails the phrase filter blocks (or that lack a recruitment keyword)
        # never reach the LLMs; show dashes in the model columns.
        if phrase_block or not has_rk:
            skipped += 1
            rk_mark = "✓" if has_rk else "✗"
            pb_mark = "BLOCK" if phrase_block else "pass"
            print(f"{_short(subj):<56} {rk_mark:3} {pb_mark:7} {'—':<20} {'—':<20} {'—':<20}")
            continue

        passed += 1
        results = {}
        for name, (model, fallback) in MODELS.items():
            results[name] = _classify(subj, body, model, fallback)

        pb_mark = "pass"
        print(f"{_short(subj):<56} {'✓':3} {pb_mark:7} "
              f"{results['phi3']:<20} {results['llama3']:<20} {results['vllm']:<20}")

    if not args.dry_run:
        print(f"\nPhrase-blocked or no-keyword: {skipped} | Reached LLMs: {passed}")

    # Best-effort logout; the session is ending either way.
    try:
        conn.logout()
    except Exception:
        pass
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
17
scripts/vision_service/environment.yml
Normal file
17
scripts/vision_service/environment.yml
Normal file
|
|
@ -0,0 +1,17 @@
|
||||||
|
name: job-seeker-vision
|
||||||
|
channels:
|
||||||
|
- conda-forge
|
||||||
|
- defaults
|
||||||
|
dependencies:
|
||||||
|
- python=3.11
|
||||||
|
- pip
|
||||||
|
- pip:
|
||||||
|
- torch>=2.0.0
|
||||||
|
- torchvision>=0.15.0
|
||||||
|
- transformers>=4.40.0
|
||||||
|
- accelerate>=0.26.0
|
||||||
|
- bitsandbytes>=0.43.0
|
||||||
|
- einops>=0.7.0
|
||||||
|
- Pillow>=10.0.0
|
||||||
|
- fastapi>=0.110.0
|
||||||
|
- "uvicorn[standard]>=0.27.0"
|
||||||
98
scripts/vision_service/main.py
Normal file
98
scripts/vision_service/main.py
Normal file
|
|
@ -0,0 +1,98 @@
|
||||||
|
"""
|
||||||
|
Vision service — moondream2 inference for survey screenshot analysis.
|
||||||
|
|
||||||
|
Start: bash scripts/manage-vision.sh start
|
||||||
|
Or directly: conda run -n job-seeker-vision uvicorn scripts.vision_service.main:app --port 8002
|
||||||
|
|
||||||
|
First run downloads moondream2 from HuggingFace (~1.8GB).
|
||||||
|
Model is loaded lazily on first /analyze request and stays resident.
|
||||||
|
GPU is used if available (CUDA); falls back to CPU.
|
||||||
|
4-bit quantization on GPU keeps VRAM footprint ~1.5GB.
|
||||||
|
"""
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
|
||||||
|
from fastapi import FastAPI, HTTPException
|
||||||
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
app = FastAPI(title="Job Seeker Vision Service")
|
||||||
|
|
||||||
|
# Module-level model state — lazy loaded on first /analyze request
|
||||||
|
_model = None
|
||||||
|
_tokenizer = None
|
||||||
|
_device = "cpu"
|
||||||
|
_loading = False
|
||||||
|
|
||||||
|
|
||||||
|
def _load_model() -> None:
    """Lazily load moondream2 into module globals; no-op if already loaded.

    On CUDA the model is loaded 4-bit quantized with device_map="auto"; on CPU
    it is loaded unquantized and moved explicitly.

    Fix vs. original: `_loading` is now cleared in a finally block, so a failed
    download/initialisation no longer leaves /health reporting "loading" forever.
    """
    global _model, _tokenizer, _device, _loading
    if _model is not None:
        return
    _loading = True
    print("[vision] Loading moondream2…")
    try:
        # Heavy imports kept local so the service starts fast and /health works
        # before the first /analyze request.
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        model_id = "vikhyatk/moondream2"
        revision = "2025-01-09"  # pinned so trust_remote_code pulls known code
        _device = "cuda" if torch.cuda.is_available() else "cpu"

        if _device == "cuda":
            from transformers import BitsAndBytesConfig
            bnb = BitsAndBytesConfig(load_in_4bit=True)
            _model = AutoModelForCausalLM.from_pretrained(
                model_id, revision=revision,
                quantization_config=bnb,
                trust_remote_code=True,
                device_map="auto",
            )
        else:
            _model = AutoModelForCausalLM.from_pretrained(
                model_id, revision=revision,
                trust_remote_code=True,
            )
            _model.to(_device)

        _tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    finally:
        _loading = False
    print(f"[vision] moondream2 ready on {_device}")
|
||||||
|
|
||||||
|
|
||||||
|
class AnalyzeRequest(BaseModel):
    """Request payload for POST /analyze."""

    # Natural-language question to ask about the image.
    prompt: str
    # Image bytes encoded as a base64 string; decoded server-side in analyze().
    image_base64: str
|
||||||
|
|
||||||
|
|
||||||
|
class AnalyzeResponse(BaseModel):
    """Response payload for POST /analyze."""

    # The model's answer to the prompt.
    text: str
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/health")
def health():
    """Report service status without forcing the model to load."""
    import torch

    status = "loading" if _loading else "ok"
    payload = {
        "status": status,
        "model": "moondream2",
        "gpu": torch.cuda.is_available(),
        "loaded": _model is not None,
    }
    return payload
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/analyze", response_model=AnalyzeResponse)
def analyze(req: AnalyzeRequest):
    """Answer req.prompt about the base64-encoded image in req.image_base64.

    Loads the model on first call (see _load_model), then runs moondream2's
    encode_image/answer_question under torch.no_grad().

    Raises:
        HTTPException 400: if the payload is not decodable/openable as an image.

    Fix vs. original: the HTTPException is now raised `from e` so the original
    decode/parse error is preserved in the exception chain (PEP 3134 / B904).
    """
    from PIL import Image
    import torch

    _load_model()

    try:
        image_data = base64.b64decode(req.image_base64)
        image = Image.open(io.BytesIO(image_data)).convert("RGB")
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid image: {e}") from e

    # Inference only — no gradients needed.
    with torch.no_grad():
        enc_image = _model.encode_image(image)
        answer = _model.answer_question(enc_image, req.prompt, _tokenizer)

    return AnalyzeResponse(text=answer)
|
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
84
tests/test_company_research.py
Normal file
84
tests/test_company_research.py
Normal file
|
|
@ -0,0 +1,84 @@
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from scripts.company_research import _score_experiences, _build_resume_context, _load_resume_and_keywords
|
||||||
|
|
||||||
|
|
||||||
|
# Fixture resume: three experiences with deliberately different keyword density
# so the scoring tests have a clear winner (UpGuard), a runner-up, and a filler.
RESUME = {
    "experience_details": [
        {
            "position": "Lead Technical Account Manager",
            "company": "UpGuard",
            "employment_period": "10/2022 - 05/2023",
            "key_responsibilities": [
                {"r1": "Managed enterprise security accounts worth $2M ARR"},
                {"r2": "Led QBR cadence with C-suite stakeholders"},
            ],
        },
        {
            "position": "Founder and Principal Consultant",
            "company": "M3 Consulting Services",
            "employment_period": "07/2023 - Present",
            "key_responsibilities": [
                {"r1": "Revenue operations consulting for SaaS clients"},
                {"r2": "Built customer success frameworks"},
            ],
        },
        {
            "position": "Customer Success Manager",
            "company": "Generic Co",
            "employment_period": "01/2020 - 09/2022",
            "key_responsibilities": [
                {"r1": "Managed SMB portfolio"},
            ],
        },
    ]
}

# Keywords and a job description that overlap heavily with the UpGuard entry.
KEYWORDS = ["ARR", "QBR", "enterprise", "security", "stakeholder"]
JD = "Looking for a TAM with enterprise ARR experience and QBR facilitation skills."


def test_score_experiences_returns_sorted():
    """UpGuard entry should score highest — most keywords present in text and JD."""
    scored = _score_experiences(RESUME["experience_details"], KEYWORDS, JD)
    assert scored[0]["company"] == "UpGuard"


def test_score_experiences_adds_score_key():
    """Each returned entry has a 'score' integer key."""
    scored = _score_experiences(RESUME["experience_details"], KEYWORDS, JD)
    for e in scored:
        assert isinstance(e["score"], int)


def test_build_resume_context_top2_in_full():
    """Top 2 experiences appear with full bullet detail."""
    ctx = _build_resume_context(RESUME, KEYWORDS, JD)
    assert "Lead Technical Account Manager" in ctx
    assert "Managed enterprise security accounts" in ctx
    assert "Founder and Principal Consultant" in ctx


def test_build_resume_context_rest_condensed():
    """Remaining experiences appear as condensed one-liners, not full bullets."""
    ctx = _build_resume_context(RESUME, KEYWORDS, JD)
    # NOTE(review): "Also in Alex" looks like a truncated or anonymization-damaged
    # expectation for the condensed-section header — verify against the header
    # string _build_resume_context actually emits.
    assert "Also in Alex" in ctx
    assert "Generic Co" in ctx
    # Generic Co bullets should NOT appear in full
    assert "Managed SMB portfolio" not in ctx


def test_upguard_nda_low_score():
    """UpGuard name replaced with 'enterprise security vendor' when score < 3."""
    # Keywords/JD chosen to share nothing with the UpGuard bullets.
    ctx = _build_resume_context(RESUME, ["python", "kubernetes"], "python kubernetes devops")
    assert "enterprise security vendor" in ctx


def test_load_resume_and_keywords_returns_lists():
    """_load_resume_and_keywords returns a tuple of (dict, list[str])."""
    resume, keywords = _load_resume_and_keywords()
    assert isinstance(resume, dict)
    assert isinstance(keywords, list)
    assert all(isinstance(k, str) for k in keywords)
|
||||||
120
tests/test_cover_letter.py
Normal file
120
tests/test_cover_letter.py
Normal file
|
|
@ -0,0 +1,120 @@
|
||||||
|
# tests/test_cover_letter.py
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
|
|
||||||
|
# ── prepare_training_data tests ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Imports of the code under test are done inside each test so one broken module
# doesn't prevent the rest of the suite from collecting.
def test_extract_role_from_text():
    """extract_role_from_text pulls the role title from the opening sentence."""
    from scripts.prepare_training_data import extract_role_from_text

    text = "Dear Tailscale Hiring Team,\n\nI'm delighted to apply for the Customer Support Manager position at Tailscale."
    assert extract_role_from_text(text) == "Customer Support Manager"


def test_extract_role_handles_missing():
    """extract_role_from_text returns empty string if no role found."""
    from scripts.prepare_training_data import extract_role_from_text

    assert extract_role_from_text("Dear Team,\n\nHello there.") == ""


def test_extract_company_from_filename():
    """extract_company_from_filename strips 'Cover Letter' suffix."""
    from scripts.prepare_training_data import extract_company_from_filename

    assert extract_company_from_filename("Tailscale Cover Letter") == "Tailscale"
    assert extract_company_from_filename("Dagster Labs Cover Letter.md") == "Dagster Labs"


def test_strip_greeting():
    """strip_greeting removes the 'Dear X,' line and returns the body."""
    from scripts.prepare_training_data import strip_greeting

    text = "Dear Hiring Team,\n\nI'm delighted to apply for the CSM role.\n\nBest regards,\nAlex"
    result = strip_greeting(text)
    assert result.startswith("I'm delighted")
    assert "Dear" not in result


def test_build_records_from_tmp_corpus(tmp_path):
    """build_records parses a small corpus directory into training records."""
    from scripts.prepare_training_data import build_records

    letter = tmp_path / "Acme Corp Cover Letter.md"
    letter.write_text(
        "Dear Acme Hiring Team,\n\n"
        "I'm delighted to apply for the Director of Customer Success position at Acme Corp. "
        "With six years of experience, I bring strong skills.\n\n"
        "Best regards,\nAlex Rivera"
    )

    records = build_records(tmp_path)
    assert len(records) == 1
    assert "Acme Corp" in records[0]["instruction"]
    assert "Director of Customer Success" in records[0]["instruction"]
    assert records[0]["output"].startswith("I'm delighted")


def test_build_records_skips_empty_files(tmp_path):
    """build_records ignores empty or very short files."""
    from scripts.prepare_training_data import build_records

    (tmp_path / "Empty Cover Letter.md").write_text("")
    (tmp_path / "Tiny Cover Letter.md").write_text("Hi")

    records = build_records(tmp_path)
    assert len(records) == 0


# ── generate_cover_letter tests ───────────────────────────────────────────────

def test_find_similar_letters_returns_top_k():
    """find_similar_letters returns at most top_k entries."""
    from scripts.generate_cover_letter import find_similar_letters

    corpus = [
        {"company": "Acme", "text": "customer success technical account management SaaS"},
        {"company": "Beta", "text": "software engineering backend python"},
        {"company": "Gamma", "text": "customer onboarding enterprise NPS"},
        {"company": "Delta", "text": "customer success manager renewal QBR"},
    ]
    results = find_similar_letters("customer success manager enterprise SaaS", corpus, top_k=2)
    assert len(results) == 2
    # Should prefer customer success companies over software engineering
    companies = [r["company"] for r in results]
    assert "Beta" not in companies


def test_load_corpus_returns_list():
    """load_corpus returns a list (may be empty if LETTERS_DIR absent, must not crash)."""
    from scripts.generate_cover_letter import load_corpus, LETTERS_DIR

    if LETTERS_DIR.exists():
        corpus = load_corpus()
        assert isinstance(corpus, list)
        if corpus:
            assert "company" in corpus[0]
            assert "text" in corpus[0]
    else:
        pytest.skip("LETTERS_DIR not present in this environment")


def test_generate_calls_llm_router():
    """generate() calls the router's complete() and returns its output."""
    from scripts.generate_cover_letter import generate

    fake_corpus = [
        {"company": "Acme", "text": "I'm delighted to apply for the CSM role at Acme."},
    ]
    mock_router = MagicMock()
    mock_router.complete.return_value = "Dear Hiring Team,\n\nI'm delighted to apply.\n\nWarm regards,\nAlex Rivera"

    # Patch the corpus loader so the test never touches the real letters dir.
    with patch("scripts.generate_cover_letter.load_corpus", return_value=fake_corpus):
        result = generate("Customer Success Manager", "TestCo", "Looking for a CSM",
                          _router=mock_router)

    mock_router.complete.assert_called_once()
    assert "Alex Rivera" in result
|
||||||
211
tests/test_craigslist.py
Normal file
211
tests/test_craigslist.py
Normal file
|
|
@ -0,0 +1,211 @@
|
||||||
|
"""Tests for Craigslist RSS scraper."""
|
||||||
|
from datetime import datetime, timezone, timedelta
|
||||||
|
from email.utils import format_datetime
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
# ── RSS fixture helpers ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _make_rss(items: list[dict]) -> bytes:
|
||||||
|
"""Build minimal Craigslist-style RSS XML from a list of item dicts."""
|
||||||
|
channel = ET.Element("channel")
|
||||||
|
for item_data in items:
|
||||||
|
item = ET.SubElement(channel, "item")
|
||||||
|
for tag, value in item_data.items():
|
||||||
|
el = ET.SubElement(item, tag)
|
||||||
|
el.text = value
|
||||||
|
rss = ET.Element("rss")
|
||||||
|
rss.append(channel)
|
||||||
|
return ET.tostring(rss, encoding="utf-8", xml_declaration=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _pubdate(hours_ago: float = 1.0) -> str:
|
||||||
|
"""Return an RFC 2822 pubDate string for N hours ago."""
|
||||||
|
dt = datetime.now(tz=timezone.utc) - timedelta(hours=hours_ago)
|
||||||
|
return format_datetime(dt)
|
||||||
|
|
||||||
|
|
||||||
|
def _mock_resp(content: bytes, status_code: int = 200) -> MagicMock:
    """Build a fake requests.Response-like mock carrying *content*.

    For status codes >= 400, raise_for_status() raises requests.HTTPError,
    mimicking the real library's behaviour.
    """
    mock = MagicMock()
    mock.status_code = status_code
    mock.content = content
    mock.raise_for_status = MagicMock()
    if status_code >= 400:
        mock.raise_for_status.side_effect = requests.HTTPError(f"HTTP {status_code}")
    return mock
|
||||||
|
|
||||||
|
|
||||||
|
# ── Fixtures ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Single fresh item — the happy-path feed.
_SAMPLE_RSS = _make_rss([{
    "title": "Customer Success Manager",
    "link": "https://sfbay.craigslist.org/jjj/d/csm-role/1234567890.html",
    "description": "Great CSM role at Acme Corp. Salary $120k.",
    "pubDate": _pubdate(1),
}])

# Two distinct items — used to exercise the results_wanted cap.
_TWO_ITEM_RSS = _make_rss([
    {
        "title": "Customer Success Manager",
        "link": "https://sfbay.craigslist.org/jjj/d/csm-role/1111111111.html",
        "description": "CSM role 1.",
        "pubDate": _pubdate(1),
    },
    {
        "title": "Account Manager",
        "link": "https://sfbay.craigslist.org/jjj/d/am-role/2222222222.html",
        "description": "AM role.",
        "pubDate": _pubdate(2),
    },
])

# One stale item (~500h old) — should be dropped by the hours_old filter.
_OLD_ITEM_RSS = _make_rss([{
    "title": "Old Job",
    "link": "https://sfbay.craigslist.org/jjj/d/old-job/9999999999.html",
    "description": "Very old posting.",
    "pubDate": _pubdate(hours_ago=500),
}])

# Config fixtures mirroring config/craigslist.yaml's shape.
_TWO_METRO_CONFIG = {
    "metros": ["sfbay", "newyork"],
    "location_map": {
        "San Francisco Bay Area, CA": "sfbay",
        "New York, NY": "newyork",
    },
    "category": "jjj",
}

_SINGLE_METRO_CONFIG = {
    "metros": ["sfbay"],
    "location_map": {"San Francisco Bay Area, CA": "sfbay"},
}

# Minimal search profile passed to craigslist.scrape().
_PROFILE = {"titles": ["Customer Success Manager"], "hours_old": 240}
|
||||||
|
|
||||||
|
|
||||||
|
# ── Tests ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# All tests patch _load_config and requests.get inside the craigslist module,
# so no network or config file is ever touched.
def test_scrape_returns_empty_on_missing_config():
    """Missing craigslist.yaml → returns [] without raising."""
    from scripts.custom_boards import craigslist
    with patch("scripts.custom_boards.craigslist._load_config",
               side_effect=FileNotFoundError("config not found")):
        result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")
        assert result == []


def test_scrape_remote_hits_all_metros():
    """location='Remote' triggers one RSS fetch per configured metro."""
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_TWO_METRO_CONFIG):
        with patch("scripts.custom_boards.craigslist.requests.get",
                   return_value=_mock_resp(_SAMPLE_RSS)) as mock_get:
            from scripts.custom_boards import craigslist
            result = craigslist.scrape(_PROFILE, "Remote")

    assert mock_get.call_count == 2
    fetched_urls = [call.args[0] for call in mock_get.call_args_list]
    assert any("sfbay" in u for u in fetched_urls)
    assert any("newyork" in u for u in fetched_urls)
    assert all(r["is_remote"] for r in result)


def test_scrape_location_map_resolves():
    """Known location string maps to exactly one metro."""
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_TWO_METRO_CONFIG):
        with patch("scripts.custom_boards.craigslist.requests.get",
                   return_value=_mock_resp(_SAMPLE_RSS)) as mock_get:
            from scripts.custom_boards import craigslist
            result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")

    assert mock_get.call_count == 1
    assert "sfbay" in mock_get.call_args.args[0]
    assert len(result) == 1
    assert result[0]["is_remote"] is False


def test_scrape_location_not_in_map_returns_empty():
    """Location not in location_map → [] without raising."""
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_SINGLE_METRO_CONFIG):
        with patch("scripts.custom_boards.craigslist.requests.get") as mock_get:
            from scripts.custom_boards import craigslist
            result = craigslist.scrape(_PROFILE, "Portland, OR")

    assert result == []
    mock_get.assert_not_called()


def test_hours_old_filter():
    """Items older than hours_old are excluded."""
    profile = {"titles": ["Customer Success Manager"], "hours_old": 48}
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_SINGLE_METRO_CONFIG):
        with patch("scripts.custom_boards.craigslist.requests.get",
                   return_value=_mock_resp(_OLD_ITEM_RSS)):
            from scripts.custom_boards import craigslist
            result = craigslist.scrape(profile, "San Francisco Bay Area, CA")

    assert result == []


def test_dedup_within_run():
    """Same URL from two different metros is only returned once."""
    same_url_rss = _make_rss([{
        "title": "CSM Role",
        "link": "https://sfbay.craigslist.org/jjj/d/csm/1234.html",
        "description": "Same job.",
        "pubDate": _pubdate(1),
    }])
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_TWO_METRO_CONFIG):
        with patch("scripts.custom_boards.craigslist.requests.get",
                   return_value=_mock_resp(same_url_rss)):
            from scripts.custom_boards import craigslist
            result = craigslist.scrape(_PROFILE, "Remote")

    urls = [r["url"] for r in result]
    assert len(urls) == len(set(urls))


def test_http_error_graceful():
    """HTTP error → [] without raising."""
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_SINGLE_METRO_CONFIG):
        with patch("scripts.custom_boards.craigslist.requests.get",
                   side_effect=requests.RequestException("timeout")):
            from scripts.custom_boards import craigslist
            result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")

    assert result == []


def test_malformed_xml_graceful():
    """Malformed RSS XML → [] without raising."""
    bad_resp = MagicMock()
    bad_resp.content = b"this is not xml <<<<"
    bad_resp.raise_for_status = MagicMock()
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_SINGLE_METRO_CONFIG):
        with patch("scripts.custom_boards.craigslist.requests.get",
                   return_value=bad_resp):
            from scripts.custom_boards import craigslist
            result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")
            assert result == []


def test_results_wanted_cap():
    """Never returns more than results_wanted items."""
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_TWO_METRO_CONFIG):
        with patch("scripts.custom_boards.craigslist.requests.get",
                   return_value=_mock_resp(_TWO_ITEM_RSS)):
            from scripts.custom_boards import craigslist
            result = craigslist.scrape(_PROFILE, "Remote", results_wanted=1)

    assert len(result) <= 1
|
||||||
560
tests/test_db.py
Normal file
560
tests/test_db.py
Normal file
|
|
@ -0,0 +1,560 @@
|
||||||
|
import pytest
|
||||||
|
import sqlite3
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
|
||||||
|
|
||||||
|
def test_init_db_creates_jobs_table(tmp_path):
|
||||||
|
"""init_db creates a jobs table with correct schema."""
|
||||||
|
from scripts.db import init_db
|
||||||
|
db_path = tmp_path / "test.db"
|
||||||
|
init_db(db_path)
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
cursor = conn.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='jobs'")
|
||||||
|
assert cursor.fetchone() is not None
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def test_insert_job_returns_id(tmp_path):
|
||||||
|
"""insert_job inserts a row and returns its id."""
|
||||||
|
from scripts.db import init_db, insert_job
|
||||||
|
db_path = tmp_path / "test.db"
|
||||||
|
init_db(db_path)
|
||||||
|
job = {
|
||||||
|
"title": "CSM", "company": "Acme", "url": "https://example.com/1",
|
||||||
|
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||||
|
"salary": "$100k", "description": "Great role", "date_found": "2026-02-20",
|
||||||
|
}
|
||||||
|
row_id = insert_job(db_path, job)
|
||||||
|
assert isinstance(row_id, int)
|
||||||
|
assert row_id > 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_insert_job_skips_duplicate_url(tmp_path):
|
||||||
|
"""insert_job returns None if URL already exists."""
|
||||||
|
from scripts.db import init_db, insert_job
|
||||||
|
db_path = tmp_path / "test.db"
|
||||||
|
init_db(db_path)
|
||||||
|
job = {"title": "CSM", "company": "Acme", "url": "https://example.com/1",
|
||||||
|
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||||
|
"salary": "", "description": "", "date_found": "2026-02-20"}
|
||||||
|
insert_job(db_path, job)
|
||||||
|
result = insert_job(db_path, job)
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_jobs_by_status(tmp_path):
|
||||||
|
"""get_jobs_by_status returns only jobs with matching status."""
|
||||||
|
from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status
|
||||||
|
db_path = tmp_path / "test.db"
|
||||||
|
init_db(db_path)
|
||||||
|
job = {"title": "CSM", "company": "Acme", "url": "https://example.com/1",
|
||||||
|
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||||
|
"salary": "", "description": "", "date_found": "2026-02-20"}
|
||||||
|
row_id = insert_job(db_path, job)
|
||||||
|
update_job_status(db_path, [row_id], "approved")
|
||||||
|
approved = get_jobs_by_status(db_path, "approved")
|
||||||
|
pending = get_jobs_by_status(db_path, "pending")
|
||||||
|
assert len(approved) == 1
|
||||||
|
assert len(pending) == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_job_status_batch(tmp_path):
|
||||||
|
"""update_job_status updates multiple rows at once."""
|
||||||
|
from scripts.db import init_db, insert_job, update_job_status, get_jobs_by_status
|
||||||
|
db_path = tmp_path / "test.db"
|
||||||
|
init_db(db_path)
|
||||||
|
ids = []
|
||||||
|
for i in range(3):
|
||||||
|
job = {"title": f"Job {i}", "company": "Co", "url": f"https://example.com/{i}",
|
||||||
|
"source": "indeed", "location": "Remote", "is_remote": True,
|
||||||
|
"salary": "", "description": "", "date_found": "2026-02-20"}
|
||||||
|
ids.append(insert_job(db_path, job))
|
||||||
|
update_job_status(db_path, ids, "rejected")
|
||||||
|
assert len(get_jobs_by_status(db_path, "rejected")) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_migrate_db_adds_columns_to_existing_db(tmp_path):
|
||||||
|
"""_migrate_db adds cover_letter and applied_at to a db created without them."""
|
||||||
|
import sqlite3
|
||||||
|
from scripts.db import _migrate_db
|
||||||
|
db_path = tmp_path / "legacy.db"
|
||||||
|
# Create old-style table without the new columns
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
conn.execute("""CREATE TABLE jobs (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
title TEXT, company TEXT, url TEXT UNIQUE, status TEXT DEFAULT 'pending'
|
||||||
|
)""")
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
_migrate_db(db_path)
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
cols = {row[1] for row in conn.execute("PRAGMA table_info(jobs)").fetchall()}
|
||||||
|
conn.close()
|
||||||
|
assert "cover_letter" in cols
|
||||||
|
assert "applied_at" in cols
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_cover_letter(tmp_path):
|
||||||
|
"""update_cover_letter persists text to the DB."""
|
||||||
|
from scripts.db import init_db, insert_job, update_cover_letter, get_jobs_by_status
|
||||||
|
db_path = tmp_path / "test.db"
|
||||||
|
init_db(db_path)
|
||||||
|
job_id = insert_job(db_path, {
|
||||||
|
"title": "CSM", "company": "Acme", "url": "https://ex.com/1",
|
||||||
|
"source": "linkedin", "location": "Remote", "is_remote": True,
|
||||||
|
"salary": "", "description": "", "date_found": "2026-02-20",
|
||||||
|
})
|
||||||
|
update_cover_letter(db_path, job_id, "Dear Hiring Manager,\nGreat role!")
|
||||||
|
rows = get_jobs_by_status(db_path, "pending")
|
||||||
|
assert rows[0]["cover_letter"] == "Dear Hiring Manager,\nGreat role!"
|
||||||
|
|
||||||
|
|
||||||
|
def test_mark_applied_sets_status_and_date(tmp_path):
    """mark_applied flips status to 'applied' and stamps applied_at."""
    from scripts.db import init_db, insert_job, mark_applied, get_jobs_by_status

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db, seed)

    mark_applied(db, [job_id])

    rows = get_jobs_by_status(db, "applied")
    assert len(rows) == 1
    row = rows[0]
    assert row["status"] == "applied"
    assert row["applied_at"] is not None
|
||||||
|
|
||||||
|
|
||||||
|
# ── background_tasks tests ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_init_db_creates_background_tasks_table(tmp_path):
    """After init_db, the schema contains a background_tasks table."""
    import sqlite3
    from scripts.db import init_db

    db = tmp_path / "test.db"
    init_db(db)
    conn = sqlite3.connect(db)
    found = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='background_tasks'"
    ).fetchone()
    conn.close()
    assert found is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_insert_task_returns_id_and_true(tmp_path):
    """A brand-new task yields a positive integer id and is_new=True."""
    from scripts.db import init_db, insert_job, insert_task

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db, seed)

    task_id, is_new = insert_task(db, "cover_letter", job_id)

    assert isinstance(task_id, int)
    assert task_id > 0
    assert is_new is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_insert_task_deduplicates_active_task(tmp_path):
    """A second insert while the first task is still active reuses the same row."""
    from scripts.db import init_db, insert_job, insert_task

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db, seed)

    original_id, _ = insert_task(db, "cover_letter", job_id)
    duplicate_id, is_new = insert_task(db, "cover_letter", job_id)

    assert duplicate_id == original_id
    assert is_new is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_insert_task_allows_different_types_same_job(tmp_path):
    """Distinct task types (cover_letter vs company_research) may coexist on one job."""
    from scripts.db import init_db, insert_job, insert_task

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db, seed)

    # Both inserts should be treated as new — dedup only applies per task type.
    _, letter_is_new = insert_task(db, "cover_letter", job_id)
    _, research_is_new = insert_task(db, "company_research", job_id)

    assert letter_is_new is True
    assert research_is_new is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_task_status_running(tmp_path):
    """Marking a task 'running' records a started_at timestamp."""
    import sqlite3
    from scripts.db import init_db, insert_job, insert_task, update_task_status

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db, seed)
    task_id, _ = insert_task(db, "cover_letter", job_id)

    update_task_status(db, task_id, "running")

    conn = sqlite3.connect(db)
    status, started_at = conn.execute(
        "SELECT status, started_at FROM background_tasks WHERE id=?", (task_id,)
    ).fetchone()
    conn.close()
    assert status == "running"
    assert started_at is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_task_status_completed(tmp_path):
    """Marking a task 'completed' records a finished_at timestamp."""
    import sqlite3
    from scripts.db import init_db, insert_job, insert_task, update_task_status

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db, seed)
    task_id, _ = insert_task(db, "cover_letter", job_id)

    update_task_status(db, task_id, "completed")

    conn = sqlite3.connect(db)
    status, finished_at = conn.execute(
        "SELECT status, finished_at FROM background_tasks WHERE id=?", (task_id,)
    ).fetchone()
    conn.close()
    assert status == "completed"
    assert finished_at is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_task_status_failed_stores_error(tmp_path):
    """A failed task keeps the error text and still gets a finished_at stamp."""
    import sqlite3
    from scripts.db import init_db, insert_job, insert_task, update_task_status

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db, seed)
    task_id, _ = insert_task(db, "cover_letter", job_id)

    update_task_status(db, task_id, "failed", error="LLM timeout")

    conn = sqlite3.connect(db)
    status, error, finished_at = conn.execute(
        "SELECT status, error, finished_at FROM background_tasks WHERE id=?", (task_id,)
    ).fetchone()
    conn.close()
    assert status == "failed"
    assert error == "LLM timeout"
    assert finished_at is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_active_tasks_returns_only_active(tmp_path):
    """Completed tasks are filtered out; active rows carry joined job info."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status, get_active_tasks

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db, seed)

    # One task stays queued, the other is completed and must disappear.
    still_active, _ = insert_task(db, "cover_letter", job_id)
    finished, _ = insert_task(db, "company_research", job_id)
    update_task_status(db, finished, "completed")

    remaining = get_active_tasks(db)
    assert len(remaining) == 1
    only = remaining[0]
    assert only["id"] == still_active
    assert only["company"] == "Acme"
    assert only["title"] == "CSM"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_task_for_job_returns_latest(tmp_path):
    """When two tasks of the same type exist, the newest one is returned."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status, get_task_for_job

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db, seed)

    older, _ = insert_task(db, "cover_letter", job_id)
    update_task_status(db, older, "completed")
    # A fresh task is allowed now that the older one has finished.
    newer, _ = insert_task(db, "cover_letter", job_id)

    latest = get_task_for_job(db, "cover_letter", job_id)
    assert latest is not None
    assert latest["id"] == newer
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_task_for_job_returns_none_when_absent(tmp_path):
    """Asking for a task that was never created yields None."""
    from scripts.db import init_db, insert_job, get_task_for_job

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job_id = insert_job(db, seed)

    assert get_task_for_job(db, "cover_letter", job_id) is None
|
||||||
|
|
||||||
|
|
||||||
|
# ── company_research new-column tests ─────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_company_research_has_new_columns(tmp_path):
    """company_research is created with all four extended brief columns."""
    from scripts.db import init_db

    db = tmp_path / "test.db"
    init_db(db)
    conn = sqlite3.connect(db)
    columns = [row[1] for row in conn.execute("PRAGMA table_info(company_research)").fetchall()]
    conn.close()
    for expected in ("tech_brief", "funding_brief", "competitors_brief", "red_flags"):
        assert expected in columns
|
||||||
|
|
||||||
|
def test_save_and_get_research_new_fields(tmp_path):
    """The four extended brief fields round-trip through save_research/get_research."""
    from scripts.db import init_db, insert_job, save_research, get_research

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "TAM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-21",
    }
    job_id = insert_job(db, seed)

    save_research(
        db, job_id=job_id,
        company_brief="overview", ceo_brief="ceo",
        talking_points="points", raw_output="raw",
        tech_brief="tech stack", funding_brief="series B",
        competitors_brief="vs competitors", red_flags="none",
    )

    record = get_research(db, job_id=job_id)
    assert record["tech_brief"] == "tech stack"
    assert record["funding_brief"] == "series B"
    assert record["competitors_brief"] == "vs competitors"
    assert record["red_flags"] == "none"
|
||||||
|
|
||||||
|
|
||||||
|
# ── stage_signal / suggestion_dismissed tests ─────────────────────────────────
|
||||||
|
|
||||||
|
def test_stage_signal_columns_exist(tmp_path):
    """job_contacts gains stage_signal and suggestion_dismissed columns on init."""
    from scripts.db import init_db

    db = tmp_path / "test.db"
    init_db(db)
    conn = sqlite3.connect(db)
    columns = {r[1] for r in conn.execute("PRAGMA table_info(job_contacts)").fetchall()}
    conn.close()
    for expected in ("stage_signal", "suggestion_dismissed"):
        assert expected in columns
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_contact_with_stage_signal(tmp_path):
    """A stage_signal passed to add_contact is persisted on the contact row."""
    from scripts.db import init_db, insert_job, add_contact, get_contacts

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-21",
    }
    job_id = insert_job(db, seed)

    add_contact(db, job_id=job_id, direction="inbound",
                subject="Interview invite", stage_signal="interview_scheduled")

    saved = get_contacts(db, job_id=job_id)
    assert saved[0]["stage_signal"] == "interview_scheduled"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_unread_stage_signals(tmp_path):
    """Neutral signals are excluded, and dismissing a signal removes it from the feed."""
    from scripts.db import (init_db, insert_job, add_contact,
                            get_unread_stage_signals, dismiss_stage_signal)

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-21",
    }
    job_id = insert_job(db, seed)

    interview_contact = add_contact(db, job_id=job_id, direction="inbound",
                                    subject="Interview invite", stage_signal="interview_scheduled")
    # Neutral signals should never surface as unread.
    add_contact(db, job_id=job_id, direction="inbound",
                subject="Auto-confirm", stage_signal="neutral")

    unread = get_unread_stage_signals(db, job_id)
    assert len(unread) == 1
    assert unread[0]["stage_signal"] == "interview_scheduled"

    dismiss_stage_signal(db, interview_contact)
    assert get_unread_stage_signals(db, job_id) == []
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_email_leads(tmp_path):
    """Only pending jobs whose source is 'email' count as email leads."""
    from scripts.db import init_db, insert_job, get_email_leads

    db = tmp_path / "test.db"
    init_db(db)
    board_job = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-21",
    }
    email_job = {
        "title": "TAM", "company": "Wiz", "url": "email://wiz.com/abc123",
        "source": "email", "location": "", "is_remote": 0,
        "salary": "", "description": "Hi Alex…", "date_found": "2026-02-21",
    }
    insert_job(db, board_job)
    insert_job(db, email_job)

    leads = get_email_leads(db)
    assert len(leads) == 1
    lead = leads[0]
    assert lead["company"] == "Wiz"
    assert lead["source"] == "email"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_all_message_ids(tmp_path):
    """Every stored contact message ID appears in get_all_message_ids."""
    from scripts.db import init_db, insert_job, add_contact, get_all_message_ids

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-21",
    }
    job_id = insert_job(db, seed)

    expected = ["<msg-001@acme.com>", "<msg-002@acme.com>"]
    for mid in expected:
        add_contact(db, job_id=job_id, message_id=mid)

    stored = get_all_message_ids(db)
    for mid in expected:
        assert mid in stored
|
||||||
|
|
||||||
|
|
||||||
|
# ── survey_responses tests ────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def test_survey_responses_table_created(tmp_path):
    """After init_db, the schema contains a survey_responses table."""
    import sqlite3
    from scripts.db import init_db

    db = tmp_path / "test.db"
    init_db(db)
    conn = sqlite3.connect(db)
    found = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='survey_responses'"
    ).fetchone()
    conn.close()
    assert found is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_survey_at_column_exists(tmp_path):
    """init_db provisions the jobs.survey_at column."""
    import sqlite3
    from scripts.db import init_db

    db = tmp_path / "test.db"
    init_db(db)
    conn = sqlite3.connect(db)
    columns = [r[1] for r in conn.execute("PRAGMA table_info(jobs)").fetchall()]
    conn.close()
    assert "survey_at" in columns
|
||||||
|
|
||||||
|
|
||||||
|
def test_insert_and_get_survey_response(tmp_path):
    """A survey response round-trips through insert_survey_response/get_survey_responses."""
    from scripts.db import init_db, insert_job, insert_survey_response, get_survey_responses

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-23",
    }
    job_id = insert_job(db, seed)

    new_row = insert_survey_response(
        db, job_id=job_id, survey_name="Culture Fit",
        source="text_paste", raw_input="Q1: A B C", mode="quick",
        llm_output="1. B — collaborative", reported_score="82%",
    )
    assert isinstance(new_row, int)

    stored = get_survey_responses(db, job_id=job_id)
    assert len(stored) == 1
    assert stored[0]["survey_name"] == "Culture Fit"
    assert stored[0]["reported_score"] == "82%"
|
||||||
|
|
||||||
|
|
||||||
|
def test_get_interview_jobs_includes_survey(tmp_path):
    """Jobs moved to the 'survey' status show up under the survey bucket."""
    from scripts.db import init_db, insert_job, update_job_status, get_interview_jobs

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/2",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-23",
    }
    job_id = insert_job(db, seed)
    update_job_status(db, [job_id], "survey")

    buckets = get_interview_jobs(db)
    survey_ids = [j["id"] for j in buckets.get("survey", [])]
    assert job_id in survey_ids
|
||||||
|
|
||||||
|
|
||||||
|
def test_advance_to_survey_sets_survey_at(tmp_path):
    """advance_to_stage('survey') updates status and stamps survey_at."""
    from scripts.db import init_db, insert_job, update_job_status, advance_to_stage, get_job_by_id

    db = tmp_path / "test.db"
    init_db(db)
    seed = {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/3",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-23",
    }
    job_id = insert_job(db, seed)
    update_job_status(db, [job_id], "applied")

    advance_to_stage(db, job_id=job_id, stage="survey")

    advanced = get_job_by_id(db, job_id=job_id)
    assert advanced["status"] == "survey"
    assert advanced["survey_at"] is not None
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_job_fields(tmp_path):
    """update_job_fields overwrites the provided columns on the job row."""
    import sqlite3
    from scripts.db import init_db, insert_job, update_job_fields

    db = tmp_path / "test.db"
    init_db(db)
    placeholder = {
        "title": "Importing…", "company": "", "url": "https://example.com/job/1",
        "source": "manual", "location": "", "description": "", "date_found": "2026-02-24",
    }
    job_id = insert_job(db, placeholder)

    update_job_fields(db, job_id, {
        "title": "Customer Success Manager",
        "company": "Acme Corp",
        "location": "San Francisco, CA",
        "description": "Great role.",
        "salary": "$120k",
        "is_remote": 1,
    })

    conn = sqlite3.connect(db)
    conn.row_factory = sqlite3.Row
    updated = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone())
    conn.close()
    assert updated["title"] == "Customer Success Manager"
    assert updated["company"] == "Acme Corp"
    assert updated["description"] == "Great role."
    assert updated["is_remote"] == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_update_job_fields_ignores_unknown_columns(tmp_path):
    """Unknown column names are silently dropped instead of raising."""
    import sqlite3
    from scripts.db import init_db, insert_job, update_job_fields

    db = tmp_path / "test.db"
    init_db(db)
    placeholder = {
        "title": "Importing…", "company": "", "url": "https://example.com/job/2",
        "source": "manual", "location": "", "description": "", "date_found": "2026-02-24",
    }
    job_id = insert_job(db, placeholder)

    # Must not raise despite the bogus column name.
    update_job_fields(db, job_id, {"title": "Real Title", "nonexistent_col": "ignored"})

    conn = sqlite3.connect(db)
    conn.row_factory = sqlite3.Row
    updated = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone())
    conn.close()
    assert updated["title"] == "Real Title"
|
||||||
185
tests/test_discover.py
Normal file
185
tests/test_discover.py
Normal file
|
|
@ -0,0 +1,185 @@
|
||||||
|
# tests/test_discover.py
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
import pandas as pd
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# One scraped posting in the shape jobspy's scrape_jobs DataFrame rows use
# (job_url/site/min_amount/max_amount keys rather than the DB's url/source/salary).
SAMPLE_JOB = {
    "title": "Customer Success Manager",
    "company": "Acme Corp",
    "location": "Remote",
    "is_remote": True,
    "job_url": "https://linkedin.com/jobs/view/123456",
    "site": "linkedin",
    "min_amount": 90000,
    "max_amount": 120000,
    "salary_source": "$90,000 - $120,000",
    "description": "Great CS role",
}

# Notion field map: internal key -> Notion property name used by push_to_notion.
SAMPLE_FM = {
    "title_field": "Salary", "job_title": "Job Title", "company": "Company Name",
    "url": "Role Link", "source": "Job Source", "status": "Status of Application",
    "status_new": "Application Submitted", "date_found": "Date Found",
    "remote": "Remote", "match_score": "Match Score",
    "keyword_gaps": "Keyword Gaps", "notes": "Notes", "job_description": "Job Description",
}

# Minimal Notion config as load_config returns it (token/database_id/field_map).
SAMPLE_NOTION_CFG = {"token": "secret_test", "database_id": "fake-db-id", "field_map": SAMPLE_FM}
# Single search profile driving run_discovery against one board.
SAMPLE_PROFILES_CFG = {
    "profiles": [{"name": "cs", "titles": ["Customer Success Manager"],
                  "locations": ["Remote"], "boards": ["linkedin"],
                  "results_per_board": 5, "hours_old": 72}]
}
|
||||||
|
|
||||||
|
|
||||||
|
def make_jobs_df(jobs=None):
    """Build a scrape_jobs-style DataFrame; defaults to a single SAMPLE_JOB row."""
    rows = jobs if jobs else [SAMPLE_JOB]
    return pd.DataFrame(rows)
|
||||||
|
|
||||||
|
|
||||||
|
def test_discover_writes_to_sqlite(tmp_path):
    """run_discovery stages scraped jobs into the SQLite db as 'pending'."""
    from scripts.discover import run_discovery
    from scripts.db import get_jobs_by_status

    staging = tmp_path / "test.db"
    with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \
         patch("scripts.discover.Client"):
        run_discovery(db_path=staging)

    pending = get_jobs_by_status(staging, "pending")
    assert len(pending) == 1
    assert pending[0]["title"] == "Customer Success Manager"
|
||||||
|
|
||||||
|
|
||||||
|
def test_discover_skips_duplicate_urls(tmp_path):
    """A scraped job whose URL already exists in SQLite is not inserted twice."""
    from scripts.discover import run_discovery
    from scripts.db import init_db, insert_job, get_jobs_by_status

    staging = tmp_path / "test.db"
    init_db(staging)
    # Pre-seed the exact URL the mocked scraper will return.
    insert_job(staging, {
        "title": "Old", "company": "X", "url": "https://linkedin.com/jobs/view/123456",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-01-01",
    })

    with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \
         patch("scripts.discover.Client"):
        run_discovery(db_path=staging)

    pending = get_jobs_by_status(staging, "pending")
    # Only the pre-existing row remains — no duplicate was added.
    assert len(pending) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_discover_pushes_new_jobs(tmp_path):
    """Legacy path: notion_push=True triggers exactly one push_to_notion call."""
    from scripts.discover import run_discovery

    staging = tmp_path / "test.db"
    with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \
         patch("scripts.discover.push_to_notion") as mock_push, \
         patch("scripts.discover.get_existing_urls", return_value=set()), \
         patch("scripts.discover.Client"):
        run_discovery(db_path=staging, notion_push=True)

    assert mock_push.call_count == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_push_to_notion_sets_status_new():
    """Every pushed page gets the configured status_new value as its Status."""
    from scripts.discover import push_to_notion

    fake_client = MagicMock()
    push_to_notion(fake_client, "fake-db-id", SAMPLE_JOB, SAMPLE_FM)

    kwargs = fake_client.pages.create.call_args[1]
    applied_status = kwargs["properties"]["Status of Application"]["select"]["name"]
    assert applied_status == "Application Submitted"
|
||||||
|
|
||||||
|
|
||||||
|
# ── Custom boards integration ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Profile that uses only a custom (non-jobspy) board scraper, "adzuna".
_PROFILE_WITH_CUSTOM = {
    "profiles": [{
        "name": "cs", "titles": ["Customer Success Manager"],
        "locations": ["Remote"], "boards": [],
        "custom_boards": ["adzuna"],
        "results_per_board": 5, "hours_old": 72,
    }]
}

# A job dict as a custom scraper returns it — already in the DB's own
# url/source/salary shape (unlike SAMPLE_JOB's jobspy shape).
_ADZUNA_JOB = {
    "title": "Customer Success Manager",
    "company": "TestCo",
    "url": "https://www.adzuna.com/jobs/details/999",
    "source": "adzuna",
    "location": "Remote",
    "is_remote": True,
    "salary": "$90,000 – $120,000",
    "description": "Great remote CSM role",
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_discover_custom_board_inserts_jobs(tmp_path):
    """Custom-board scrapers registered in CUSTOM_SCRAPERS feed jobs into the db."""
    from scripts.discover import run_discovery
    from scripts.db import get_jobs_by_status

    staging = tmp_path / "test.db"
    with patch("scripts.discover.load_config", return_value=(_PROFILE_WITH_CUSTOM, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame()), \
         patch("scripts.discover.CUSTOM_SCRAPERS", {"adzuna": lambda *a, **kw: [_ADZUNA_JOB]}), \
         patch("scripts.discover.Client"):
        inserted = run_discovery(db_path=staging)

    assert inserted == 1
    staged = get_jobs_by_status(staging, "pending")
    assert staged[0]["title"] == "Customer Success Manager"
    assert staged[0]["source"] == "adzuna"
|
||||||
|
|
||||||
|
|
||||||
|
def test_discover_custom_board_skips_unknown(tmp_path, capsys):
    """An unregistered custom board name is logged and skipped, not fatal."""
    from scripts.discover import run_discovery

    cfg_with_bad_board = {
        "profiles": [{
            "name": "cs", "titles": ["CSM"], "locations": ["Remote"],
            "boards": [], "custom_boards": ["nonexistent_board"],
            "results_per_board": 5, "hours_old": 72,
        }]
    }
    staging = tmp_path / "test.db"
    with patch("scripts.discover.load_config", return_value=(cfg_with_bad_board, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame()), \
         patch("scripts.discover.Client"):
        run_discovery(db_path=staging)

    printed = capsys.readouterr().out
    assert "nonexistent_board" in printed
    assert "Unknown scraper" in printed
|
||||||
|
|
||||||
|
|
||||||
|
def test_discover_custom_board_deduplicates(tmp_path):
    """Custom-board results with a URL already in the db are not re-inserted."""
    from scripts.discover import run_discovery
    from scripts.db import init_db, insert_job, get_jobs_by_status

    staging = tmp_path / "test.db"
    init_db(staging)
    # Pre-seed the URL that the fake adzuna scraper will emit.
    insert_job(staging, {
        "title": "CSM", "company": "TestCo",
        "url": "https://www.adzuna.com/jobs/details/999",
        "source": "adzuna", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-01-01",
    })

    with patch("scripts.discover.load_config", return_value=(_PROFILE_WITH_CUSTOM, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame()), \
         patch("scripts.discover.CUSTOM_SCRAPERS", {"adzuna": lambda *a, **kw: [_ADZUNA_JOB]}), \
         patch("scripts.discover.Client"):
        inserted = run_discovery(db_path=staging)

    # The duplicate was skipped: nothing new inserted, one row total.
    assert inserted == 0
    assert len(get_jobs_by_status(staging, "pending")) == 1
|
||||||
96
tests/test_enrich_descriptions.py
Normal file
96
tests/test_enrich_descriptions.py
Normal file
|
|
@ -0,0 +1,96 @@
|
||||||
|
# tests/test_enrich_descriptions.py
|
||||||
|
"""Tests for scripts/enrich_descriptions.py — enrich_craigslist_fields()."""
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_craigslist_fields_skips_non_craigslist(tmp_path):
    """Jobs from other sources are returned untouched and the LLM is never built."""
    from scripts.db import init_db, insert_job
    from scripts.enrich_descriptions import enrich_craigslist_fields

    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "CSM", "company": "", "url": "https://example.com/1",
        "source": "linkedin", "location": "", "description": "Some company here.",
        "date_found": "2026-02-24",
    })

    with patch("scripts.llm_router.LLMRouter") as router_cls:
        outcome = enrich_craigslist_fields(db, job_id)

    assert outcome == {}
    router_cls.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_craigslist_fields_skips_populated_company(tmp_path):
    """A craigslist job that already has a company set is not enriched."""
    from scripts.db import init_db, insert_job
    from scripts.enrich_descriptions import enrich_craigslist_fields

    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "CSM", "company": "Acme Corp", "url": "https://sfbay.craigslist.org/jjj/d/1.html",
        "source": "craigslist", "location": "", "description": "Join Acme Corp today.",
        "date_found": "2026-02-24",
    })

    with patch("scripts.llm_router.LLMRouter") as router_cls:
        outcome = enrich_craigslist_fields(db, job_id)

    assert outcome == {}
    router_cls.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_craigslist_fields_skips_empty_description(tmp_path):
|
||||||
|
"""Empty description → returns {} without calling LLM."""
|
||||||
|
from scripts.db import init_db, insert_job
|
||||||
|
from scripts.enrich_descriptions import enrich_craigslist_fields
|
||||||
|
db = tmp_path / "test.db"
|
||||||
|
init_db(db)
|
||||||
|
job_id = insert_job(db, {
|
||||||
|
"title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/2.html",
|
||||||
|
"source": "craigslist", "location": "", "description": "",
|
||||||
|
"date_found": "2026-02-24",
|
||||||
|
})
|
||||||
|
with patch("scripts.llm_router.LLMRouter") as mock_llm:
|
||||||
|
result = enrich_craigslist_fields(db, job_id)
|
||||||
|
assert result == {}
|
||||||
|
mock_llm.assert_not_called()
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_craigslist_fields_extracts_and_updates(tmp_path):
|
||||||
|
"""Valid LLM response → updates company/salary in DB, returns extracted dict."""
|
||||||
|
from scripts.db import init_db, insert_job
|
||||||
|
from scripts.enrich_descriptions import enrich_craigslist_fields
|
||||||
|
db = tmp_path / "test.db"
|
||||||
|
init_db(db)
|
||||||
|
job_id = insert_job(db, {
|
||||||
|
"title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/3.html",
|
||||||
|
"source": "craigslist", "location": "", "description": "Join Acme Corp. Pay: $120k/yr.",
|
||||||
|
"date_found": "2026-02-24",
|
||||||
|
})
|
||||||
|
mock_router = MagicMock()
|
||||||
|
mock_router.complete.return_value = '{"company": "Acme Corp", "salary": "$120k/yr"}'
|
||||||
|
with patch("scripts.llm_router.LLMRouter", return_value=mock_router):
|
||||||
|
result = enrich_craigslist_fields(db, job_id)
|
||||||
|
assert result == {"company": "Acme Corp", "salary": "$120k/yr"}
|
||||||
|
conn = sqlite3.connect(db)
|
||||||
|
row = conn.execute("SELECT company, salary FROM jobs WHERE id=?", (job_id,)).fetchone()
|
||||||
|
conn.close()
|
||||||
|
assert row[0] == "Acme Corp"
|
||||||
|
assert row[1] == "$120k/yr"
|
||||||
|
|
||||||
|
|
||||||
|
def test_enrich_craigslist_fields_handles_bad_llm_json(tmp_path):
|
||||||
|
"""Unparseable LLM response → returns {} without raising."""
|
||||||
|
from scripts.db import init_db, insert_job
|
||||||
|
from scripts.enrich_descriptions import enrich_craigslist_fields
|
||||||
|
db = tmp_path / "test.db"
|
||||||
|
init_db(db)
|
||||||
|
job_id = insert_job(db, {
|
||||||
|
"title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/4.html",
|
||||||
|
"source": "craigslist", "location": "", "description": "Great opportunity.",
|
||||||
|
"date_found": "2026-02-24",
|
||||||
|
})
|
||||||
|
mock_router = MagicMock()
|
||||||
|
mock_router.complete.return_value = "Sorry, I cannot extract that."
|
||||||
|
with patch("scripts.llm_router.LLMRouter", return_value=mock_router):
|
||||||
|
result = enrich_craigslist_fields(db, job_id)
|
||||||
|
assert result == {}
|
||||||
330
tests/test_imap_sync.py
Normal file
330
tests/test_imap_sync.py
Normal file
|
|
@ -0,0 +1,330 @@
|
||||||
|
"""Tests for imap_sync helpers (no live IMAP connection required)."""
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_stage_signal_interview():
|
||||||
|
"""classify_stage_signal returns interview_scheduled for a call-scheduling email."""
|
||||||
|
from scripts.imap_sync import classify_stage_signal
|
||||||
|
with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router:
|
||||||
|
mock_router.complete.return_value = "interview_scheduled"
|
||||||
|
result = classify_stage_signal(
|
||||||
|
"Let's schedule a call",
|
||||||
|
"Hi Alex, we'd love to book a 30-min phone screen with you.",
|
||||||
|
)
|
||||||
|
assert result == "interview_scheduled"
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_stage_signal_returns_none_on_error():
|
||||||
|
"""classify_stage_signal returns None when LLM call raises."""
|
||||||
|
from scripts.imap_sync import classify_stage_signal
|
||||||
|
with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router:
|
||||||
|
mock_router.complete.side_effect = RuntimeError("model not loaded")
|
||||||
|
result = classify_stage_signal("subject", "body")
|
||||||
|
assert result is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_stage_signal_strips_think_tags():
|
||||||
|
"""classify_stage_signal strips <think>...</think> blocks before parsing."""
|
||||||
|
from scripts.imap_sync import classify_stage_signal
|
||||||
|
with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router:
|
||||||
|
mock_router.complete.return_value = "<think>Let me think...</think>\nrejected"
|
||||||
|
result = classify_stage_signal("Update on your application", "We went with another candidate.")
|
||||||
|
assert result == "rejected"
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalise_company():
|
||||||
|
"""_normalise_company strips legal suffixes."""
|
||||||
|
from scripts.imap_sync import _normalise_company
|
||||||
|
assert _normalise_company("DataStax, Inc.") == "DataStax"
|
||||||
|
assert _normalise_company("Wiz Ltd") == "Wiz"
|
||||||
|
assert _normalise_company("Crusoe Energy") == "Crusoe Energy"
|
||||||
|
|
||||||
|
|
||||||
|
def test_company_search_terms_excludes_job_board_sld():
|
||||||
|
"""Job-board domains like linkedin.com are never used as match terms."""
|
||||||
|
from scripts.imap_sync import _company_search_terms
|
||||||
|
# LinkedIn-sourced job: SLD "linkedin" must not appear in the terms
|
||||||
|
terms = _company_search_terms("Bamboo Health", "https://www.linkedin.com/jobs/view/123")
|
||||||
|
assert "linkedin" not in terms
|
||||||
|
assert "bamboo health" in terms
|
||||||
|
|
||||||
|
# Company with its own domain: SLD should be included
|
||||||
|
terms = _company_search_terms("Crusoe Energy", "https://crusoe.ai/jobs/456")
|
||||||
|
assert "crusoe" in terms
|
||||||
|
|
||||||
|
# Indeed-sourced job: "indeed" excluded
|
||||||
|
terms = _company_search_terms("DoorDash", "https://www.indeed.com/viewjob?jk=abc")
|
||||||
|
assert "indeed" not in terms
|
||||||
|
assert "doordash" in terms
|
||||||
|
|
||||||
|
|
||||||
|
def test_has_recruitment_keyword():
|
||||||
|
"""_has_recruitment_keyword matches known keywords."""
|
||||||
|
from scripts.imap_sync import _has_recruitment_keyword
|
||||||
|
assert _has_recruitment_keyword("Interview Invitation — Senior TAM")
|
||||||
|
assert _has_recruitment_keyword("Your application with DataStax")
|
||||||
|
assert not _has_recruitment_keyword("Team lunch tomorrow")
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_lead_info_returns_company_and_title():
|
||||||
|
"""extract_lead_info parses LLM JSON response into (company, title)."""
|
||||||
|
from scripts.imap_sync import extract_lead_info
|
||||||
|
with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router:
|
||||||
|
mock_router.complete.return_value = '{"company": "Wiz", "title": "Senior TAM"}'
|
||||||
|
result = extract_lead_info("Senior TAM at Wiz", "Hi Alex, we have a role…", "recruiter@wiz.com")
|
||||||
|
assert result == ("Wiz", "Senior TAM")
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_lead_info_returns_none_on_bad_json():
|
||||||
|
"""extract_lead_info returns (None, None) when LLM returns unparseable output."""
|
||||||
|
from scripts.imap_sync import extract_lead_info
|
||||||
|
with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router:
|
||||||
|
mock_router.complete.return_value = "I cannot determine the company."
|
||||||
|
result = extract_lead_info("Job opportunity", "blah", "noreply@example.com")
|
||||||
|
assert result == (None, None)
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_labels_includes_survey_received():
|
||||||
|
"""_CLASSIFY_LABELS includes survey_received."""
|
||||||
|
from scripts.imap_sync import _CLASSIFY_LABELS
|
||||||
|
assert "survey_received" in _CLASSIFY_LABELS
|
||||||
|
|
||||||
|
|
||||||
|
def test_classify_stage_signal_returns_survey_received():
|
||||||
|
"""classify_stage_signal returns 'survey_received' when LLM outputs that label."""
|
||||||
|
from unittest.mock import patch
|
||||||
|
from scripts.imap_sync import classify_stage_signal
|
||||||
|
|
||||||
|
with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router:
|
||||||
|
mock_router.complete.return_value = "survey_received"
|
||||||
|
result = classify_stage_signal("Complete our culture survey", "Please fill out this form")
|
||||||
|
assert result == "survey_received"
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync_job_emails_classifies_inbound(tmp_path):
|
||||||
|
"""sync_job_emails classifies inbound emails and stores the stage_signal."""
|
||||||
|
from scripts.db import init_db, insert_job, get_contacts
|
||||||
|
from scripts.imap_sync import sync_job_emails
|
||||||
|
|
||||||
|
db_path = tmp_path / "test.db"
|
||||||
|
init_db(db_path)
|
||||||
|
job_id = insert_job(db_path, {
|
||||||
|
"title": "CSM", "company": "Acme",
|
||||||
|
"url": "https://acme.com/jobs/1",
|
||||||
|
"source": "linkedin", "location": "Remote",
|
||||||
|
"is_remote": True, "salary": "", "description": "",
|
||||||
|
"date_found": "2026-02-21",
|
||||||
|
})
|
||||||
|
job = {"id": job_id, "company": "Acme", "url": "https://acme.com/jobs/1"}
|
||||||
|
|
||||||
|
fake_msg_bytes = (
|
||||||
|
b"From: recruiter@acme.com\r\n"
|
||||||
|
b"To: alex@example.com\r\n"
|
||||||
|
b"Subject: Interview Invitation\r\n"
|
||||||
|
b"Message-ID: <unique-001@acme.com>\r\n"
|
||||||
|
b"\r\n"
|
||||||
|
b"Hi Alex, we'd like to schedule a phone screen."
|
||||||
|
)
|
||||||
|
|
||||||
|
conn_mock = MagicMock()
|
||||||
|
conn_mock.select.return_value = ("OK", [b"1"])
|
||||||
|
conn_mock.search.return_value = ("OK", [b"1"])
|
||||||
|
conn_mock.fetch.return_value = ("OK", [(b"1 (RFC822 {123})", fake_msg_bytes)])
|
||||||
|
|
||||||
|
with patch("scripts.imap_sync.classify_stage_signal", return_value="interview_scheduled"):
|
||||||
|
inb, out = sync_job_emails(job, conn_mock, {"lookback_days": 90}, db_path)
|
||||||
|
|
||||||
|
assert inb == 1
|
||||||
|
contacts = get_contacts(db_path, job_id=job_id)
|
||||||
|
assert contacts[0]["stage_signal"] == "interview_scheduled"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_alert_extracts_jobs():
|
||||||
|
from scripts.imap_sync import parse_linkedin_alert
|
||||||
|
body = """\
|
||||||
|
Your job alert for customer success manager in United States
|
||||||
|
New jobs match your preferences.
|
||||||
|
Manage alerts: https://www.linkedin.com/comm/jobs/alerts?...
|
||||||
|
|
||||||
|
Customer Success Manager
|
||||||
|
Reflow
|
||||||
|
California, United States
|
||||||
|
View job: https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc%3D%3D&refId=xyz
|
||||||
|
|
||||||
|
---------------------------------------------------------
|
||||||
|
|
||||||
|
Customer Engagement Manager
|
||||||
|
Bitwarden
|
||||||
|
United States
|
||||||
|
|
||||||
|
2 school alumni
|
||||||
|
Apply with resume & profile
|
||||||
|
View job: https://www.linkedin.com/comm/jobs/view/4359824983/?trackingId=def%3D%3D
|
||||||
|
|
||||||
|
---------------------------------------------------------
|
||||||
|
|
||||||
|
"""
|
||||||
|
jobs = parse_linkedin_alert(body)
|
||||||
|
assert len(jobs) == 2
|
||||||
|
assert jobs[0]["title"] == "Customer Success Manager"
|
||||||
|
assert jobs[0]["company"] == "Reflow"
|
||||||
|
assert jobs[0]["location"] == "California, United States"
|
||||||
|
assert jobs[0]["url"] == "https://www.linkedin.com/jobs/view/4376518925/"
|
||||||
|
assert jobs[1]["title"] == "Customer Engagement Manager"
|
||||||
|
assert jobs[1]["company"] == "Bitwarden"
|
||||||
|
assert jobs[1]["url"] == "https://www.linkedin.com/jobs/view/4359824983/"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_alert_skips_blocks_without_view_job():
|
||||||
|
from scripts.imap_sync import parse_linkedin_alert
|
||||||
|
body = """\
|
||||||
|
Customer Success Manager
|
||||||
|
Some Company
|
||||||
|
United States
|
||||||
|
|
||||||
|
---------------------------------------------------------
|
||||||
|
|
||||||
|
Valid Job Title
|
||||||
|
Valid Company
|
||||||
|
Remote
|
||||||
|
View job: https://www.linkedin.com/comm/jobs/view/1111111/?x=y
|
||||||
|
|
||||||
|
---------------------------------------------------------
|
||||||
|
"""
|
||||||
|
jobs = parse_linkedin_alert(body)
|
||||||
|
assert len(jobs) == 1
|
||||||
|
assert jobs[0]["title"] == "Valid Job Title"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_linkedin_alert_empty_body():
|
||||||
|
from scripts.imap_sync import parse_linkedin_alert
|
||||||
|
assert parse_linkedin_alert("") == []
|
||||||
|
assert parse_linkedin_alert("No jobs here.") == []
|
||||||
|
|
||||||
|
|
||||||
|
# ── _scan_unmatched_leads integration ─────────────────────────────────────────
|
||||||
|
|
||||||
|
_ALERT_BODY = """\
|
||||||
|
Your job alert for customer success manager in United States
|
||||||
|
New jobs match your preferences.
|
||||||
|
|
||||||
|
Customer Success Manager
|
||||||
|
Acme Corp
|
||||||
|
California, United States
|
||||||
|
View job: https://www.linkedin.com/comm/jobs/view/9999001/?trackingId=abc
|
||||||
|
|
||||||
|
---------------------------------------------------------
|
||||||
|
|
||||||
|
Director of Customer Success
|
||||||
|
Beta Inc
|
||||||
|
Remote
|
||||||
|
View job: https://www.linkedin.com/comm/jobs/view/9999002/?trackingId=def
|
||||||
|
|
||||||
|
---------------------------------------------------------
|
||||||
|
"""
|
||||||
|
|
||||||
|
_ALERT_EMAIL = {
|
||||||
|
"message_id": "<alert-001@linkedin.com>",
|
||||||
|
"from_addr": "jobalerts-noreply@linkedin.com",
|
||||||
|
"to_addr": "alex@example.com",
|
||||||
|
"subject": "2 new jobs for customer success manager",
|
||||||
|
"body": _ALERT_BODY,
|
||||||
|
"date": "2026-02-24 12:00:00",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_unmatched_leads_linkedin_alert_inserts_jobs(tmp_path):
|
||||||
|
"""_scan_unmatched_leads detects a LinkedIn alert and inserts each job card."""
|
||||||
|
import sqlite3
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
from scripts.db import init_db
|
||||||
|
|
||||||
|
db_path = tmp_path / "test.db"
|
||||||
|
init_db(db_path)
|
||||||
|
|
||||||
|
conn_mock = MagicMock()
|
||||||
|
|
||||||
|
with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \
|
||||||
|
patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \
|
||||||
|
patch("scripts.task_runner.submit_task") as mock_submit:
|
||||||
|
|
||||||
|
from scripts.imap_sync import _scan_unmatched_leads
|
||||||
|
known_ids: set = set()
|
||||||
|
new_leads = _scan_unmatched_leads(conn_mock, {"lookback_days": 90}, db_path, known_ids)
|
||||||
|
|
||||||
|
assert new_leads == 2
|
||||||
|
|
||||||
|
# Message ID added so it won't be reprocessed
|
||||||
|
assert "<alert-001@linkedin.com>" in known_ids
|
||||||
|
|
||||||
|
# Both jobs inserted with correct fields
|
||||||
|
conn = sqlite3.connect(db_path)
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
jobs = conn.execute("SELECT * FROM jobs ORDER BY id").fetchall()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
assert len(jobs) == 2
|
||||||
|
assert jobs[0]["title"] == "Customer Success Manager"
|
||||||
|
assert jobs[0]["company"] == "Acme Corp"
|
||||||
|
assert jobs[0]["url"] == "https://www.linkedin.com/jobs/view/9999001/"
|
||||||
|
assert jobs[0]["source"] == "linkedin"
|
||||||
|
assert jobs[1]["title"] == "Director of Customer Success"
|
||||||
|
assert jobs[1]["url"] == "https://www.linkedin.com/jobs/view/9999002/"
|
||||||
|
|
||||||
|
# scrape_url task submitted for each inserted job
|
||||||
|
assert mock_submit.call_count == 2
|
||||||
|
task_types = [call.args[1] for call in mock_submit.call_args_list]
|
||||||
|
assert task_types == ["scrape_url", "scrape_url"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_unmatched_leads_linkedin_alert_skips_duplicates(tmp_path):
|
||||||
|
"""URLs already in the DB are not re-inserted."""
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
from scripts.db import init_db, insert_job
|
||||||
|
|
||||||
|
db_path = tmp_path / "test.db"
|
||||||
|
init_db(db_path)
|
||||||
|
|
||||||
|
# Pre-insert one of the two URLs
|
||||||
|
insert_job(db_path, {
|
||||||
|
"title": "Customer Success Manager", "company": "Acme Corp",
|
||||||
|
"url": "https://www.linkedin.com/jobs/view/9999001/",
|
||||||
|
"source": "linkedin", "location": "", "is_remote": 0,
|
||||||
|
"salary": "", "description": "", "date_found": "2026-02-24",
|
||||||
|
})
|
||||||
|
|
||||||
|
conn_mock = MagicMock()
|
||||||
|
|
||||||
|
with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \
|
||||||
|
patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \
|
||||||
|
patch("scripts.task_runner.submit_task") as mock_submit:
|
||||||
|
|
||||||
|
from scripts.imap_sync import _scan_unmatched_leads
|
||||||
|
new_leads = _scan_unmatched_leads(conn_mock, {"lookback_days": 90}, db_path, set())
|
||||||
|
|
||||||
|
# Only one new job (the duplicate was skipped)
|
||||||
|
assert new_leads == 1
|
||||||
|
assert mock_submit.call_count == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_scan_unmatched_leads_linkedin_alert_skips_llm_path(tmp_path):
|
||||||
|
"""After a LinkedIn alert email, the LLM extraction path is never reached."""
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
from scripts.db import init_db
|
||||||
|
|
||||||
|
db_path = tmp_path / "test.db"
|
||||||
|
init_db(db_path)
|
||||||
|
|
||||||
|
conn_mock = MagicMock()
|
||||||
|
|
||||||
|
with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \
|
||||||
|
patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \
|
||||||
|
patch("scripts.task_runner.submit_task"), \
|
||||||
|
patch("scripts.imap_sync.extract_lead_info") as mock_llm:
|
||||||
|
|
||||||
|
from scripts.imap_sync import _scan_unmatched_leads
|
||||||
|
_scan_unmatched_leads(conn_mock, {"lookback_days": 90}, db_path, set())
|
||||||
|
|
||||||
|
# LLM extraction must never be called for alert emails
|
||||||
|
mock_llm.assert_not_called()
|
||||||
135
tests/test_llm_router.py
Normal file
135
tests/test_llm_router.py
Normal file
|
|
@ -0,0 +1,135 @@
|
||||||
|
import pytest
from unittest.mock import patch, MagicMock
from pathlib import Path
import yaml

# Router config lives alongside the repo, one level above tests/.
CONFIG_PATH = Path(__file__).parent.parent / "config" / "llm.yaml"


def test_config_loads():
    """Config file is valid YAML with required keys."""
    cfg = yaml.safe_load(CONFIG_PATH.read_text())
    assert "fallback_order" in cfg
    assert "backends" in cfg
    assert len(cfg["fallback_order"]) >= 1


def test_router_uses_first_reachable_backend():
    """Router skips unreachable backends and uses the first that responds."""
    from scripts.llm_router import LLMRouter

    router = LLMRouter(CONFIG_PATH)

    fake_response = MagicMock()
    fake_response.choices[0].message.content = "hello"

    # First backend unreachable, the rest healthy.
    with patch.object(router, "_is_reachable", side_effect=[False, True, True, True, True]), \
         patch("scripts.llm_router.OpenAI") as MockOpenAI:
        client = MockOpenAI.return_value
        client.chat.completions.create.return_value = fake_response
        fake_model = MagicMock()
        fake_model.id = "test-model"
        client.models.list.return_value.data = [fake_model]

        result = router.complete("say hello")

    assert result == "hello"


def test_router_raises_when_all_backends_fail():
    """Router raises RuntimeError when every backend is unreachable or errors."""
    from scripts.llm_router import LLMRouter

    router = LLMRouter(CONFIG_PATH)

    with patch.object(router, "_is_reachable", return_value=False):
        with pytest.raises(RuntimeError, match="All LLM backends exhausted"):
            router.complete("say hello")


def test_is_reachable_returns_false_on_connection_error():
    """_is_reachable returns False when the health endpoint is unreachable."""
    from scripts.llm_router import LLMRouter
    import requests

    router = LLMRouter(CONFIG_PATH)

    with patch("scripts.llm_router.requests.get", side_effect=requests.ConnectionError):
        result = router._is_reachable("http://localhost:9999/v1")

    assert result is False


def test_complete_skips_backend_without_image_support(tmp_path):
    """When images= is passed, backends without supports_images are skipped."""
    from scripts.llm_router import LLMRouter

    cfg = {
        "fallback_order": ["ollama", "vision_service"],
        "backends": {
            "ollama": {
                "type": "openai_compat",
                "base_url": "http://localhost:11434/v1",
                "model": "llava",
                "api_key": "ollama",
                "enabled": True,
                "supports_images": False,
            },
            "vision_service": {
                "type": "vision_service",
                "base_url": "http://localhost:8002",
                "enabled": True,
                "supports_images": True,
            },
        },
    }
    cfg_file = tmp_path / "llm.yaml"
    cfg_file.write_text(yaml.dump(cfg))

    analyze_resp = MagicMock()
    analyze_resp.status_code = 200
    analyze_resp.json.return_value = {"text": "B — collaborative"}

    with patch("scripts.llm_router.requests.get") as mock_get, \
         patch("scripts.llm_router.requests.post") as mock_post:
        # health check returns ok for vision_service
        mock_get.return_value = MagicMock(status_code=200)
        mock_post.return_value = analyze_resp

        router = LLMRouter(config_path=cfg_file)
        result = router.complete("Which option?", images=["base64data"])

    assert result == "B — collaborative"
    # vision_service POST /analyze should have been called
    assert mock_post.called


def test_complete_without_images_skips_vision_service(tmp_path):
    """When images=None, vision_service backend is skipped."""
    from scripts.llm_router import LLMRouter

    cfg = {
        "fallback_order": ["vision_service"],
        "backends": {
            "vision_service": {
                "type": "vision_service",
                "base_url": "http://localhost:8002",
                "enabled": True,
                "supports_images": True,
            },
        },
    }
    cfg_file = tmp_path / "llm.yaml"
    cfg_file.write_text(yaml.dump(cfg))

    router = LLMRouter(config_path=cfg_file)
    with patch("scripts.llm_router.requests.post") as mock_post:
        try:
            router.complete("text only prompt")
        except RuntimeError:
            pass  # all backends exhausted is expected
    assert not mock_post.called
|
||||||
47
tests/test_match.py
Normal file
47
tests/test_match.py
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
import pytest
from unittest.mock import patch, MagicMock


def test_extract_job_description_from_url():
    """extract_job_description fetches and returns visible text from a URL."""
    from scripts.match import extract_job_description

    with patch("scripts.match.requests.get") as mock_get:
        mock_get.return_value.text = (
            "<html><body><p>We need a CSM with Salesforce.</p></body></html>"
        )
        mock_get.return_value.raise_for_status = MagicMock()
        text = extract_job_description("https://example.com/job/123")

    assert "CSM" in text
    assert "Salesforce" in text


def test_score_is_between_0_and_100():
    """match_score returns a float in [0, 100] and a list of keyword gaps."""
    from scripts.match import match_score

    score, gaps = match_score(
        resume_text="Customer Success Manager with Salesforce experience",
        job_text="Looking for a Customer Success Manager who knows Salesforce and Gainsight",
    )
    assert 0 <= score <= 100
    assert isinstance(gaps, list)


def test_write_score_to_notion():
    """write_match_to_notion updates the Notion page with score and gaps."""
    from scripts.match import write_match_to_notion

    notion = MagicMock()

    field_map = {
        "match_score": "Match Score",
        "keyword_gaps": "Keyword Gaps",
    }

    write_match_to_notion(notion, "page-id-abc", 85.5, ["Gainsight", "Churnzero"], field_map)

    notion.pages.update.assert_called_once()
    update_kwargs = notion.pages.update.call_args[1]
    assert update_kwargs["page_id"] == "page-id-abc"
    assert update_kwargs["properties"]["Match Score"]["number"] == 85.5
|
||||||
135
tests/test_scrape_url.py
Normal file
135
tests/test_scrape_url.py
Normal file
|
|
@ -0,0 +1,135 @@
|
||||||
|
"""Tests for URL-based job scraping."""
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
|
||||||
|
|
||||||
|
def _make_db(tmp_path, url="https://www.linkedin.com/jobs/view/99999/"):
|
||||||
|
from scripts.db import init_db, insert_job
|
||||||
|
db = tmp_path / "test.db"
|
||||||
|
init_db(db)
|
||||||
|
job_id = insert_job(db, {
|
||||||
|
"title": "Importing…", "company": "", "url": url,
|
||||||
|
"source": "manual", "location": "", "description": "", "date_found": "2026-02-24",
|
||||||
|
})
|
||||||
|
return db, job_id
|
||||||
|
|
||||||
|
|
||||||
|
def test_canonicalize_url_linkedin():
|
||||||
|
from scripts.scrape_url import canonicalize_url
|
||||||
|
messy = (
|
||||||
|
"https://www.linkedin.com/jobs/view/4376518925/"
|
||||||
|
"?trk=eml-email_job_alert&refId=abc%3D%3D&trackingId=xyz"
|
||||||
|
)
|
||||||
|
assert canonicalize_url(messy) == "https://www.linkedin.com/jobs/view/4376518925/"
|
||||||
|
|
||||||
|
|
||||||
|
def test_canonicalize_url_linkedin_comm():
|
||||||
|
from scripts.scrape_url import canonicalize_url
|
||||||
|
comm = "https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc"
|
||||||
|
assert canonicalize_url(comm) == "https://www.linkedin.com/jobs/view/4376518925/"
|
||||||
|
|
||||||
|
|
||||||
|
def test_canonicalize_url_generic_strips_utm():
|
||||||
|
from scripts.scrape_url import canonicalize_url
|
||||||
|
url = "https://jobs.example.com/post/42?utm_source=linkedin&utm_medium=email&jk=real_param"
|
||||||
|
result = canonicalize_url(url)
|
||||||
|
assert "utm_source" not in result
|
||||||
|
assert "real_param" in result
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_board_linkedin():
|
||||||
|
from scripts.scrape_url import _detect_board
|
||||||
|
assert _detect_board("https://www.linkedin.com/jobs/view/12345/") == "linkedin"
|
||||||
|
assert _detect_board("https://linkedin.com/jobs/view/12345/?tracking=abc") == "linkedin"
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_board_indeed():
|
||||||
|
from scripts.scrape_url import _detect_board
|
||||||
|
assert _detect_board("https://www.indeed.com/viewjob?jk=abc123") == "indeed"
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_board_glassdoor():
|
||||||
|
from scripts.scrape_url import _detect_board
|
||||||
|
assert _detect_board("https://www.glassdoor.com/job-listing/foo-bar-123.htm") == "glassdoor"
|
||||||
|
|
||||||
|
|
||||||
|
def test_detect_board_generic():
|
||||||
|
from scripts.scrape_url import _detect_board
|
||||||
|
assert _detect_board("https://jobs.example.com/posting/42") == "generic"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extract_linkedin_job_id():
|
||||||
|
from scripts.scrape_url import _extract_linkedin_job_id
|
||||||
|
assert _extract_linkedin_job_id("https://www.linkedin.com/jobs/view/4376518925/") == "4376518925"
|
||||||
|
assert _extract_linkedin_job_id("https://www.linkedin.com/comm/jobs/view/4376518925/?tracking=x") == "4376518925"
|
||||||
|
assert _extract_linkedin_job_id("https://example.com/no-id") is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_scrape_linkedin_updates_job(tmp_path):
|
||||||
|
db, job_id = _make_db(tmp_path)
|
||||||
|
|
||||||
|
linkedin_html = """<html><head></head><body>
|
||||||
|
<h2 class="top-card-layout__title">Customer Success Manager</h2>
|
||||||
|
<a class="topcard__org-name-link">Acme Corp</a>
|
||||||
|
<span class="topcard__flavor--bullet">San Francisco, CA</span>
|
||||||
|
<div class="show-more-less-html__markup">Exciting CSM role with great benefits.</div>
|
||||||
|
</body></html>"""
|
||||||
|
|
||||||
|
mock_resp = MagicMock()
|
||||||
|
mock_resp.text = linkedin_html
|
||||||
|
mock_resp.raise_for_status = MagicMock()
|
||||||
|
|
||||||
|
with patch("scripts.scrape_url.requests.get", return_value=mock_resp):
|
||||||
|
from scripts.scrape_url import scrape_job_url
|
||||||
|
result = scrape_job_url(db, job_id)
|
||||||
|
|
||||||
|
assert result.get("title") == "Customer Success Manager"
|
||||||
|
assert result.get("company") == "Acme Corp"
|
||||||
|
assert "CSM role" in result.get("description", "")
|
||||||
|
|
||||||
|
import sqlite3
|
||||||
|
conn = sqlite3.connect(db)
|
||||||
|
conn.row_factory = sqlite3.Row
|
||||||
|
row = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone())
|
||||||
|
conn.close()
|
||||||
|
assert row["title"] == "Customer Success Manager"
|
||||||
|
assert row["company"] == "Acme Corp"
|
||||||
|
|
||||||
|
|
||||||
|
def test_scrape_url_generic_json_ld(tmp_path):
|
||||||
|
db, job_id = _make_db(tmp_path, url="https://jobs.example.com/post/42")
|
||||||
|
|
||||||
|
json_ld_html = """<html><head>
|
||||||
|
<script type="application/ld+json">
|
||||||
|
{"@type": "JobPosting", "title": "TAM Role", "description": "Tech account mgmt.",
|
||||||
|
"hiringOrganization": {"name": "TechCo"},
|
||||||
|
"jobLocation": {"address": {"addressLocality": "Austin, TX"}}}
|
||||||
|
</script>
|
||||||
|
</head><body></body></html>"""
|
||||||
|
|
||||||
|
mock_resp = MagicMock()
|
||||||
|
mock_resp.text = json_ld_html
|
||||||
|
mock_resp.raise_for_status = MagicMock()
|
||||||
|
|
||||||
|
with patch("scripts.scrape_url.requests.get", return_value=mock_resp):
|
||||||
|
from scripts.scrape_url import scrape_job_url
|
||||||
|
result = scrape_job_url(db, job_id)
|
||||||
|
|
||||||
|
assert result.get("title") == "TAM Role"
|
||||||
|
assert result.get("company") == "TechCo"
|
||||||
|
|
||||||
|
|
||||||
|
def test_scrape_url_graceful_on_http_error(tmp_path):
    """A network failure must not raise, and the job row must survive."""
    db, job_id = _make_db(tmp_path)
    import requests as req

    with patch("scripts.scrape_url.requests.get", side_effect=req.RequestException("timeout")):
        from scripts.scrape_url import scrape_job_url
        result = scrape_job_url(db, job_id)

    # The scraper swallows the error and hands back a dict (possibly empty).
    assert isinstance(result, dict)

    # Confirm the job row is still present in the database.
    import sqlite3
    connection = sqlite3.connect(db)
    try:
        surviving = connection.execute(
            "SELECT id FROM jobs WHERE id=?", (job_id,)
        ).fetchone()
    finally:
        connection.close()
    assert surviving is not None
|
||||||
88
tests/test_sync.py
Normal file
88
tests/test_sync.py
Normal file
|
|
@ -0,0 +1,88 @@
|
||||||
|
# tests/test_sync.py
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import patch, MagicMock
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
# Field-name mapping mirroring config/notion.yaml, shared by the sync tests below.
SAMPLE_FM = {
    "title_field": "Salary",
    "job_title": "Job Title",
    "company": "Company Name",
    "url": "Role Link",
    "source": "Job Source",
    "status": "Status of Application",
    "status_new": "Application Submitted",
    "date_found": "Date Found",
    "remote": "Remote",
    "match_score": "Match Score",
    "keyword_gaps": "Keyword Gaps",
    "notes": "Notes",
    "job_description": "Job Description",
}

# Minimal Notion config shaped like what load_notion_config() would return.
SAMPLE_NOTION_CFG = {
    "token": "secret_test",
    "database_id": "fake-db-id",
    "field_map": SAMPLE_FM,
}
||||||
|
|
||||||
|
|
||||||
|
def test_sync_pushes_approved_jobs(tmp_path):
    """Approved jobs are pushed to Notion and flipped to status 'synced'."""
    from scripts.sync import sync_to_notion
    from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status

    db_path = tmp_path / "test.db"
    init_db(db_path)
    row_id = insert_job(db_path, {
        "title": "CSM",
        "company": "Acme",
        "url": "https://example.com/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "$100k",
        "description": "Good role",
        "date_found": "2026-02-20",
    })
    update_job_status(db_path, [row_id], "approved")

    notion_client = MagicMock()
    notion_client.pages.create.return_value = {"id": "notion-page-abc"}

    with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \
         patch("scripts.sync.Client", return_value=notion_client):
        count = sync_to_notion(db_path=db_path)

    assert count == 1
    notion_client.pages.create.assert_called_once()
    # The job's status should have advanced from approved to synced.
    assert len(get_jobs_by_status(db_path, "synced")) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync_falls_back_to_core_fields_on_validation_error(tmp_path):
    """When Notion rejects an optional property, sync retries with core fields only."""
    from scripts.sync import sync_to_notion
    from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status

    db_path = tmp_path / "test.db"
    init_db(db_path)
    row_id = insert_job(db_path, {
        "title": "CSM",
        "company": "Acme",
        "url": "https://example.com/2",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "",
        "date_found": "2026-02-20",
    })
    update_job_status(db_path, [row_id], "approved")

    notion_client = MagicMock()
    # First create() raises a validation error; the fallback attempt succeeds.
    notion_client.pages.create.side_effect = [
        Exception("validation_error: Could not find property with name: Match Score"),
        {"id": "notion-page-fallback"},
    ]

    with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \
         patch("scripts.sync.Client", return_value=notion_client):
        count = sync_to_notion(db_path=db_path)

    assert count == 1
    # Two create calls: the failed full push plus the core-fields retry.
    assert notion_client.pages.create.call_count == 2
    assert len(get_jobs_by_status(db_path, "synced")) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_sync_returns_zero_when_nothing_approved(tmp_path):
    """With no approved jobs in the DB, sync_to_notion is a no-op returning 0."""
    from scripts.sync import sync_to_notion
    from scripts.db import init_db

    db_path = tmp_path / "test.db"
    init_db(db_path)

    with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \
         patch("scripts.sync.Client"):
        count = sync_to_notion(db_path=db_path)

    assert count == 0
|
||||||
210
tests/test_task_runner.py
Normal file
210
tests/test_task_runner.py
Normal file
|
|
@ -0,0 +1,210 @@
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import patch
|
||||||
|
import sqlite3
|
||||||
|
|
||||||
|
|
||||||
|
def _make_db(tmp_path):
    """Create a fresh test DB seeded with one job; return (db_path, job_id)."""
    from scripts.db import init_db, insert_job

    db = tmp_path / "test.db"
    init_db(db)
    seed_job = {
        "title": "CSM",
        "company": "Acme",
        "url": "https://ex.com/1",
        "source": "linkedin",
        "location": "Remote",
        "is_remote": True,
        "salary": "",
        "description": "Great role.",
        "date_found": "2026-02-20",
    }
    return db, insert_job(db, seed_job)
|
||||||
|
|
||||||
|
|
||||||
|
def test_submit_task_returns_id_and_true(tmp_path):
    """A fresh submission yields a positive task id and is_new=True."""
    db, job_id = _make_db(tmp_path)
    with patch("scripts.task_runner._run_task"):  # stub the worker: no real LLM call
        from scripts.task_runner import submit_task
        task_id, is_new = submit_task(db, "cover_letter", job_id)
        assert isinstance(task_id, int) and task_id > 0
        assert is_new is True
|
||||||
|
|
||||||
|
|
||||||
|
def test_submit_task_deduplicates(tmp_path):
    """Re-submitting an in-flight task returns the original id with is_new=False."""
    db, job_id = _make_db(tmp_path)
    with patch("scripts.task_runner._run_task"):
        from scripts.task_runner import submit_task
        first_id, _ = submit_task(db, "cover_letter", job_id)
        second_id, is_new = submit_task(db, "cover_letter", job_id)
        # The duplicate must be collapsed onto the existing task.
        assert second_id == first_id
        assert is_new is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_task_cover_letter_success(tmp_path):
    """_run_task transitions the task to 'completed' and persists the letter."""
    db, job_id = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job

    task_id, _ = insert_task(db, "cover_letter", job_id)

    letter = "Dear Hiring Manager,\nGreat fit!"
    with patch("scripts.generate_cover_letter.generate", return_value=letter):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "cover_letter", job_id)

    task = get_task_for_job(db, "cover_letter", job_id)
    assert task["status"] == "completed"
    assert task["error"] is None

    # The generated letter must have been written back onto the job row.
    conn = sqlite3.connect(db)
    try:
        stored = conn.execute(
            "SELECT cover_letter FROM jobs WHERE id=?", (job_id,)
        ).fetchone()
    finally:
        conn.close()
    assert stored[0] == letter
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_task_company_research_success(tmp_path):
    """_run_task stores company research output and marks the task completed."""
    db, job_id = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job, get_research

    task_id, _ = insert_task(db, "company_research", job_id)
    fake_result = {
        "raw_output": "raw",
        "company_brief": "brief",
        "ceo_brief": "ceo",
        "talking_points": "points",
    }
    with patch("scripts.company_research.research_company", return_value=fake_result):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "company_research", job_id)

    finished = get_task_for_job(db, "company_research", job_id)
    assert finished["status"] == "completed"

    # The research payload must be retrievable for this job.
    research = get_research(db, job_id=job_id)
    assert research["company_brief"] == "brief"
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_task_marks_failed_on_exception(tmp_path):
    """A raising generator leaves the task failed with the error message recorded."""
    db, job_id = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job

    task_id, _ = insert_task(db, "cover_letter", job_id)

    with patch("scripts.generate_cover_letter.generate", side_effect=RuntimeError("LLM timeout")):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "cover_letter", job_id)

    failed = get_task_for_job(db, "cover_letter", job_id)
    assert failed["status"] == "failed"
    assert "LLM timeout" in failed["error"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_task_discovery_success(tmp_path):
    """A discovery task calls run_discovery and records the count in the error field."""
    from scripts.db import init_db, insert_task, get_task_for_job

    db = tmp_path / "test.db"
    init_db(db)
    task_id, _ = insert_task(db, "discovery", 0)

    with patch("scripts.discover.run_discovery", return_value=7):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "discovery", 0)

    finished = get_task_for_job(db, "discovery", 0)
    assert finished["status"] == "completed"
    # NOTE: the runner reuses the error column to carry the success summary.
    assert "7 new listings" in finished["error"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_task_email_sync_success(tmp_path):
    """email_sync task calls sync_all and completes with a human-readable summary."""
    db, _ = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job

    task_id, _ = insert_task(db, "email_sync", 0)

    summary = {"synced": 3, "inbound": 5, "outbound": 2, "new_leads": 1, "errors": []}
    with patch("scripts.imap_sync.sync_all", return_value=summary):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "email_sync", 0)

    finished = get_task_for_job(db, "email_sync", 0)
    assert finished["status"] == "completed"
    # The summary (stored in the error column) should mention the synced job count.
    assert "3 jobs" in finished["error"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_task_email_sync_file_not_found(tmp_path):
    """Missing email config marks the task failed with an email-related message."""
    db, _ = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job

    task_id, _ = insert_task(db, "email_sync", 0)

    with patch("scripts.imap_sync.sync_all", side_effect=FileNotFoundError("config/email.yaml")):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "email_sync", 0)

    finished = get_task_for_job(db, "email_sync", 0)
    assert finished["status"] == "failed"
    assert "email" in finished["error"].lower()
|
||||||
|
|
||||||
|
|
||||||
|
def test_submit_task_actually_completes(tmp_path):
    """Integration: submit_task spawns a worker thread that finishes on its own."""
    db, job_id = _make_db(tmp_path)
    from scripts.db import get_task_for_job

    with patch("scripts.generate_cover_letter.generate", return_value="Cover letter text"):
        from scripts.task_runner import submit_task
        submit_task(db, "cover_letter", job_id)
        # Poll for up to ~5 seconds while the background thread runs.
        for _ in range(50):
            task = get_task_for_job(db, "cover_letter", job_id)
            if task and task["status"] in ("completed", "failed"):
                break
            time.sleep(0.1)

    final = get_task_for_job(db, "cover_letter", job_id)
    assert final["status"] == "completed"
|
||||||
|
|
||||||
|
|
||||||
|
def test_run_task_enrich_craigslist_success(tmp_path):
    """enrich_craigslist task calls enrich_craigslist_fields and marks completed.

    Fix: dropped the local ``from unittest.mock import MagicMock`` — MagicMock
    was never used in this test (the mock comes from ``patch(...) as mock_enrich``).
    """
    from scripts.db import init_db, insert_job, insert_task, get_task_for_job

    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/9.html",
        "source": "craigslist", "location": "", "description": "Join Acme Corp. Pay: $100k.",
        "date_found": "2026-02-24",
    })
    task_id, _ = insert_task(db, "enrich_craigslist", job_id)

    with patch("scripts.enrich_descriptions.enrich_craigslist_fields",
               return_value={"company": "Acme Corp", "salary": "$100k"}) as mock_enrich:
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "enrich_craigslist", job_id)

    # The enrichment helper must be invoked exactly once, with this db and job.
    mock_enrich.assert_called_once_with(db, job_id)
    task = get_task_for_job(db, "enrich_craigslist", job_id)
    assert task["status"] == "completed"
|
||||||
|
|
||||||
|
|
||||||
|
def test_scrape_url_submits_enrich_craigslist_for_craigslist_job(tmp_path):
    """After scrape_url completes for a craigslist job with empty company, enrich_craigslist is queued.

    Fixes: removed ``wraps=None`` from the ``patch`` call (``None`` is already the
    default, so it did nothing — a plain patch fully replaces submit_task, which
    is what prevents a real thread from spawning). Also dropped the unused
    ``get_task_for_job`` import.
    """
    from scripts.db import init_db, insert_job, insert_task

    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/10.html",
        "source": "craigslist", "location": "", "description": "",
        "date_found": "2026-02-24",
    })
    task_id, _ = insert_task(db, "scrape_url", job_id)

    with patch("scripts.scrape_url.scrape_job_url", return_value={"title": "CSM", "company": ""}):
        # Replacing submit_task entirely captures calls without spawning threads.
        with patch("scripts.task_runner.submit_task") as mock_submit:
            mock_submit.return_value = (99, True)
            from scripts.task_runner import _run_task
            _run_task(db, task_id, "scrape_url", job_id)

    # submit_task should have been asked to queue the enrichment follow-up.
    assert mock_submit.called
    call_args = mock_submit.call_args
    assert call_args[0][1] == "enrich_craigslist"
    assert call_args[0][2] == job_id
|
||||||
Loading…
Reference in a new issue