From 1dc1ca89d77267aaadc924e70538ab1413c7f5ca Mon Sep 17 00:00:00 2001 From: pyr0ball Date: Tue, 24 Feb 2026 18:25:39 -0800 Subject: [PATCH] chore: seed Peregrine from personal job-seeker (pre-generalization) App: Peregrine Company: Circuit Forge LLC Source: github.com/pyr0ball/job-seeker (personal fork, not linked) --- .gitignore | 20 + app/.streamlit/config.toml | 7 + app/Home.py | 475 +++++++++++++ app/app.py | 119 ++++ app/pages/1_Job_Review.py | 203 ++++++ app/pages/2_Settings.py | 842 +++++++++++++++++++++++ app/pages/3_Resume_Editor.py | 191 ++++++ app/pages/4_Apply.py | 388 +++++++++++ app/pages/5_Interviews.py | 539 +++++++++++++++ app/pages/6_Interview_Prep.py | 371 ++++++++++ app/pages/7_Survey.py | 274 ++++++++ config/adzuna.yaml.example | 5 + config/blocklist.yaml | 15 + config/craigslist.yaml.example | 24 + config/email.yaml.example | 38 ++ config/llm.yaml | 66 ++ config/llm.yaml.example | 66 ++ config/notion.yaml.example | 24 + config/resume_keywords.yaml | 23 + config/resume_keywords.yaml.example | 33 + config/search_profiles.yaml | 123 ++++ data/survey_screenshots/.gitkeep | 0 environment.yml | 68 ++ pytest.ini | 2 + scripts/__init__.py | 0 scripts/company_research.py | 468 +++++++++++++ scripts/custom_boards/__init__.py | 1 + scripts/custom_boards/adzuna.py | 160 +++++ scripts/custom_boards/craigslist.py | 177 +++++ scripts/custom_boards/theladders.py | 179 +++++ scripts/db.py | 728 ++++++++++++++++++++ scripts/discover.py | 285 ++++++++ scripts/enrich_descriptions.py | 284 ++++++++ scripts/finetune_local.py | 248 +++++++ scripts/generate_cover_letter.py | 224 ++++++ scripts/imap_sync.py | 906 +++++++++++++++++++++++++ scripts/llm_router.py | 170 +++++ scripts/manage-ui.sh | 106 +++ scripts/manage-vision.sh | 113 +++ scripts/manage-vllm.sh | 160 +++++ scripts/match.py | 156 +++++ scripts/prepare_training_data.py | 134 ++++ scripts/scrape_url.py | 228 +++++++ scripts/sync.py | 97 +++ scripts/task_runner.py | 155 +++++ scripts/test_email_classify.py 
| 159 +++++ scripts/vision_service/environment.yml | 17 + scripts/vision_service/main.py | 98 +++ tests/__init__.py | 0 tests/test_company_research.py | 84 +++ tests/test_cover_letter.py | 120 ++++ tests/test_craigslist.py | 211 ++++++ tests/test_db.py | 560 +++++++++++++++ tests/test_discover.py | 185 +++++ tests/test_enrich_descriptions.py | 96 +++ tests/test_imap_sync.py | 330 +++++++++ tests/test_llm_router.py | 135 ++++ tests/test_match.py | 47 ++ tests/test_scrape_url.py | 135 ++++ tests/test_sync.py | 88 +++ tests/test_task_runner.py | 210 ++++++ 61 files changed, 11370 insertions(+) create mode 100644 .gitignore create mode 100644 app/.streamlit/config.toml create mode 100644 app/Home.py create mode 100644 app/app.py create mode 100644 app/pages/1_Job_Review.py create mode 100644 app/pages/2_Settings.py create mode 100644 app/pages/3_Resume_Editor.py create mode 100644 app/pages/4_Apply.py create mode 100644 app/pages/5_Interviews.py create mode 100644 app/pages/6_Interview_Prep.py create mode 100644 app/pages/7_Survey.py create mode 100644 config/adzuna.yaml.example create mode 100644 config/blocklist.yaml create mode 100644 config/craigslist.yaml.example create mode 100644 config/email.yaml.example create mode 100644 config/llm.yaml create mode 100644 config/llm.yaml.example create mode 100644 config/notion.yaml.example create mode 100644 config/resume_keywords.yaml create mode 100644 config/resume_keywords.yaml.example create mode 100644 config/search_profiles.yaml create mode 100644 data/survey_screenshots/.gitkeep create mode 100644 environment.yml create mode 100644 pytest.ini create mode 100644 scripts/__init__.py create mode 100644 scripts/company_research.py create mode 100644 scripts/custom_boards/__init__.py create mode 100644 scripts/custom_boards/adzuna.py create mode 100644 scripts/custom_boards/craigslist.py create mode 100644 scripts/custom_boards/theladders.py create mode 100644 scripts/db.py create mode 100644 scripts/discover.py create 
mode 100644 scripts/enrich_descriptions.py create mode 100644 scripts/finetune_local.py create mode 100644 scripts/generate_cover_letter.py create mode 100644 scripts/imap_sync.py create mode 100644 scripts/llm_router.py create mode 100755 scripts/manage-ui.sh create mode 100755 scripts/manage-vision.sh create mode 100755 scripts/manage-vllm.sh create mode 100644 scripts/match.py create mode 100644 scripts/prepare_training_data.py create mode 100644 scripts/scrape_url.py create mode 100644 scripts/sync.py create mode 100644 scripts/task_runner.py create mode 100644 scripts/test_email_classify.py create mode 100644 scripts/vision_service/environment.yml create mode 100644 scripts/vision_service/main.py create mode 100644 tests/__init__.py create mode 100644 tests/test_company_research.py create mode 100644 tests/test_cover_letter.py create mode 100644 tests/test_craigslist.py create mode 100644 tests/test_db.py create mode 100644 tests/test_discover.py create mode 100644 tests/test_enrich_descriptions.py create mode 100644 tests/test_imap_sync.py create mode 100644 tests/test_llm_router.py create mode 100644 tests/test_match.py create mode 100644 tests/test_scrape_url.py create mode 100644 tests/test_sync.py create mode 100644 tests/test_task_runner.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..75174d4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,20 @@ +.env +config/notion.yaml +config/tokens.yaml +config/email.yaml +config/adzuna.yaml +config/craigslist.yaml +__pycache__/ +*.pyc +.pytest_cache/ +output/ +aihawk/ +resume_matcher/ +staging.db +.streamlit.log +.streamlit.pid +.coverage +log/ +unsloth_compiled_cache/ +data/survey_screenshots/* +!data/survey_screenshots/.gitkeep diff --git a/app/.streamlit/config.toml b/app/.streamlit/config.toml new file mode 100644 index 0000000..218fba5 --- /dev/null +++ b/app/.streamlit/config.toml @@ -0,0 +1,7 @@ +[theme] +base = "dark" +primaryColor = "#2DD4BF" +backgroundColor = "#0F172A" 
+secondaryBackgroundColor = "#1E293B" +textColor = "#F1F5F9" +font = "sans serif" diff --git a/app/Home.py b/app/Home.py new file mode 100644 index 0000000..c516250 --- /dev/null +++ b/app/Home.py @@ -0,0 +1,475 @@ +# app/Home.py +""" +Job Seeker Dashboard β€” Home page. +Shows counts, Run Discovery button, and Sync to Notion button. +""" +import subprocess +import sys +from pathlib import Path + +import streamlit as st + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.db import DEFAULT_DB, init_db, get_job_counts, purge_jobs, purge_email_data, \ + purge_non_remote, archive_jobs, kill_stuck_tasks, get_task_for_job, get_active_tasks, \ + insert_job, get_existing_urls +from scripts.task_runner import submit_task + +init_db(DEFAULT_DB) + + +def _dismissible(key: str, status: str, msg: str) -> None: + """Render a dismissible success/error message. key must be unique per task result.""" + if st.session_state.get(f"dismissed_{key}"): + return + col_msg, col_x = st.columns([10, 1]) + with col_msg: + if status == "completed": + st.success(msg) + else: + st.error(msg) + with col_x: + st.write("") + if st.button("βœ•", key=f"dismiss_{key}", help="Dismiss"): + st.session_state[f"dismissed_{key}"] = True + st.rerun() + + +def _queue_url_imports(db_path: Path, urls: list) -> int: + """Insert each URL as a pending manual job and queue a scrape_url task. 
+ Returns count of newly queued jobs.""" + from datetime import datetime + from scripts.scrape_url import canonicalize_url + existing = get_existing_urls(db_path) + queued = 0 + for url in urls: + url = canonicalize_url(url.strip()) + if not url.startswith("http"): + continue + if url in existing: + continue + job_id = insert_job(db_path, { + "title": "Importing…", + "company": "", + "url": url, + "source": "manual", + "location": "", + "description": "", + "date_found": datetime.now().isoformat()[:10], + }) + if job_id: + submit_task(db_path, "scrape_url", job_id) + queued += 1 + return queued + + +st.title("πŸ” Alex's Job Search") +st.caption("Discover β†’ Review β†’ Sync to Notion") + +st.divider() + + +@st.fragment(run_every=10) +def _live_counts(): + counts = get_job_counts(DEFAULT_DB) + col1, col2, col3, col4, col5 = st.columns(5) + col1.metric("Pending Review", counts.get("pending", 0)) + col2.metric("Approved", counts.get("approved", 0)) + col3.metric("Applied", counts.get("applied", 0)) + col4.metric("Synced to Notion", counts.get("synced", 0)) + col5.metric("Rejected", counts.get("rejected", 0)) + + +_live_counts() + +st.divider() + +left, enrich_col, mid, right = st.columns(4) + +with left: + st.subheader("Find New Jobs") + st.caption("Scrapes all configured boards and adds new listings to your review queue.") + + _disc_task = get_task_for_job(DEFAULT_DB, "discovery", 0) + _disc_running = _disc_task and _disc_task["status"] in ("queued", "running") + + if st.button("πŸš€ Run Discovery", use_container_width=True, type="primary", + disabled=bool(_disc_running)): + submit_task(DEFAULT_DB, "discovery", 0) + st.rerun() + + if _disc_running: + @st.fragment(run_every=4) + def _disc_status(): + t = get_task_for_job(DEFAULT_DB, "discovery", 0) + if t and t["status"] in ("queued", "running"): + lbl = "Queued…" if t["status"] == "queued" else "Scraping job boards… this may take a minute" + st.info(f"⏳ {lbl}") + else: + st.rerun() + _disc_status() + elif _disc_task 
and _disc_task["status"] == "completed": + _dismissible(f"disc_{_disc_task['id']}", "completed", + f"βœ… Discovery complete β€” {_disc_task.get('error', '')}. Head to Job Review.") + elif _disc_task and _disc_task["status"] == "failed": + _dismissible(f"disc_{_disc_task['id']}", "failed", + f"Discovery failed: {_disc_task.get('error', '')}") + +with enrich_col: + st.subheader("Enrich Descriptions") + st.caption("Re-fetch missing descriptions for any listing (LinkedIn, Indeed, Glassdoor, Adzuna, The Ladders, generic).") + + _enrich_task = get_task_for_job(DEFAULT_DB, "enrich_descriptions", 0) + _enrich_running = _enrich_task and _enrich_task["status"] in ("queued", "running") + + if st.button("πŸ” Fill Missing Descriptions", use_container_width=True, type="primary", + disabled=bool(_enrich_running)): + submit_task(DEFAULT_DB, "enrich_descriptions", 0) + st.rerun() + + if _enrich_running: + @st.fragment(run_every=4) + def _enrich_status(): + t = get_task_for_job(DEFAULT_DB, "enrich_descriptions", 0) + if t and t["status"] in ("queued", "running"): + st.info("⏳ Fetching descriptions…") + else: + st.rerun() + _enrich_status() + elif _enrich_task and _enrich_task["status"] == "completed": + _dismissible(f"enrich_{_enrich_task['id']}", "completed", + f"βœ… {_enrich_task.get('error', 'Done')}") + elif _enrich_task and _enrich_task["status"] == "failed": + _dismissible(f"enrich_{_enrich_task['id']}", "failed", + f"Enrich failed: {_enrich_task.get('error', '')}") + +with mid: + unscored = sum(1 for j in __import__("scripts.db", fromlist=["get_jobs_by_status"]) + .get_jobs_by_status(DEFAULT_DB, "pending") + if j.get("match_score") is None and j.get("description")) + st.subheader("Score Listings") + st.caption(f"Run TF-IDF match scoring against Alex's resume. 
{unscored} pending job{'s' if unscored != 1 else ''} unscored.") + if st.button("πŸ“Š Score All Unscored Jobs", use_container_width=True, type="primary", + disabled=unscored == 0): + with st.spinner("Scoring…"): + result = subprocess.run( + ["conda", "run", "-n", "job-seeker", "python", "scripts/match.py"], + capture_output=True, text=True, + cwd=str(Path(__file__).parent.parent), + ) + if result.returncode == 0: + st.success("Scoring complete!") + st.code(result.stdout) + else: + st.error("Scoring failed.") + st.code(result.stderr) + st.rerun() + +with right: + approved_count = get_job_counts(DEFAULT_DB).get("approved", 0) + st.subheader("Send to Notion") + st.caption("Push all approved jobs to your Notion tracking database.") + if approved_count == 0: + st.info("No approved jobs yet. Review and approve some listings first.") + else: + if st.button( + f"πŸ“€ Sync {approved_count} approved job{'s' if approved_count != 1 else ''} β†’ Notion", + use_container_width=True, type="primary", + ): + with st.spinner("Syncing to Notion…"): + from scripts.sync import sync_to_notion + count = sync_to_notion(DEFAULT_DB) + st.success(f"Synced {count} job{'s' if count != 1 else ''} to Notion!") + st.rerun() + +st.divider() + +# ── Email Sync ──────────────────────────────────────────────────────────────── +email_left, email_right = st.columns([3, 1]) + +with email_left: + st.subheader("Sync Emails") + st.caption("Pull inbound recruiter emails and match them to active applications. 
" + "New recruiter outreach is added to your Job Review queue.") + +with email_right: + _email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0) + _email_running = _email_task and _email_task["status"] in ("queued", "running") + + if st.button("πŸ“§ Sync Emails", use_container_width=True, type="primary", + disabled=bool(_email_running)): + submit_task(DEFAULT_DB, "email_sync", 0) + st.rerun() + + if _email_running: + @st.fragment(run_every=4) + def _email_status(): + t = get_task_for_job(DEFAULT_DB, "email_sync", 0) + if t and t["status"] in ("queued", "running"): + st.info("⏳ Syncing emails…") + else: + st.rerun() + _email_status() + elif _email_task and _email_task["status"] == "completed": + _dismissible(f"email_{_email_task['id']}", "completed", + f"βœ… {_email_task.get('error', 'Done')}") + elif _email_task and _email_task["status"] == "failed": + _dismissible(f"email_{_email_task['id']}", "failed", + f"Sync failed: {_email_task.get('error', '')}") + +st.divider() + +# ── Add Jobs by URL ─────────────────────────────────────────────────────────── +add_left, _add_right = st.columns([3, 1]) +with add_left: + st.subheader("Add Jobs by URL") + st.caption("Paste job listing URLs to import and scrape in the background. " + "Supports LinkedIn, Indeed, Glassdoor, and most job boards.") + +url_tab, csv_tab = st.tabs(["Paste URLs", "Upload CSV"]) + +with url_tab: + url_text = st.text_area( + "urls", + placeholder="https://www.linkedin.com/jobs/view/1234567/\nhttps://www.indeed.com/viewjob?jk=abc", + height=100, + label_visibility="collapsed", + ) + if st.button("πŸ“₯ Add Jobs", key="add_urls_btn", use_container_width=True, + disabled=not (url_text or "").strip()): + _urls = [u.strip() for u in url_text.strip().splitlines() if u.strip().startswith("http")] + if _urls: + _n = _queue_url_imports(DEFAULT_DB, _urls) + if _n: + st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import. 
Check Job Review shortly.") + else: + st.info("All URLs already in the database.") + st.rerun() + +with csv_tab: + csv_file = st.file_uploader("CSV with a URL column", type=["csv"], + label_visibility="collapsed") + if csv_file: + import csv as _csv + import io as _io + reader = _csv.DictReader(_io.StringIO(csv_file.read().decode("utf-8", errors="replace"))) + _csv_urls = [] + for row in reader: + for val in row.values(): + if val and val.strip().startswith("http"): + _csv_urls.append(val.strip()) + break + if _csv_urls: + st.caption(f"Found {len(_csv_urls)} URL(s) in CSV.") + if st.button("πŸ“₯ Import CSV Jobs", key="add_csv_btn", use_container_width=True): + _n = _queue_url_imports(DEFAULT_DB, _csv_urls) + st.success(f"Queued {_n} job{'s' if _n != 1 else ''} for import.") + st.rerun() + else: + st.warning("No URLs found β€” CSV must have a column whose values start with http.") + + +@st.fragment(run_every=3) +def _scrape_status(): + import sqlite3 as _sq + conn = _sq.connect(DEFAULT_DB) + conn.row_factory = _sq.Row + rows = conn.execute( + """SELECT bt.status, bt.error, j.title, j.company, j.url + FROM background_tasks bt + JOIN jobs j ON j.id = bt.job_id + WHERE bt.task_type = 'scrape_url' + AND bt.updated_at >= datetime('now', '-5 minutes') + ORDER BY bt.updated_at DESC LIMIT 20""" + ).fetchall() + conn.close() + if not rows: + return + st.caption("Recent URL imports:") + for r in rows: + if r["status"] == "running": + st.info(f"⏳ Scraping {r['url']}") + elif r["status"] == "completed": + label = r["title"] + (f" @ {r['company']}" if r["company"] else "") + st.success(f"βœ… {label}") + elif r["status"] == "failed": + st.error(f"❌ {r['url']} β€” {r['error'] or 'scrape failed'}") + + +_scrape_status() + +st.divider() + +# ── Danger zone: purge + re-scrape ──────────────────────────────────────────── +with st.expander("⚠️ Danger Zone", expanded=False): + st.caption( + "**Purge** permanently deletes jobs from the local database. 
" + "Applied and synced jobs are never touched." + ) + + purge_col, rescrape_col, email_col, tasks_col = st.columns(4) + + with purge_col: + st.markdown("**Purge pending & rejected**") + st.caption("Removes all _pending_ and _rejected_ listings so the next discovery starts fresh.") + if st.button("πŸ—‘ Purge Pending + Rejected", use_container_width=True): + st.session_state["confirm_purge"] = "partial" + + if st.session_state.get("confirm_purge") == "partial": + st.warning("Are you sure? This cannot be undone.") + c1, c2 = st.columns(2) + if c1.button("Yes, purge", type="primary", use_container_width=True): + deleted = purge_jobs(DEFAULT_DB, statuses=["pending", "rejected"]) + st.success(f"Purged {deleted} jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + with email_col: + st.markdown("**Purge email data**") + st.caption("Clears all email thread logs and email-sourced pending jobs so the next sync starts fresh.") + if st.button("πŸ“§ Purge Email Data", use_container_width=True): + st.session_state["confirm_purge"] = "email" + + if st.session_state.get("confirm_purge") == "email": + st.warning("This deletes all email contacts and email-sourced jobs. Cannot be undone.") + c1, c2 = st.columns(2) + if c1.button("Yes, purge emails", type="primary", use_container_width=True): + contacts, jobs = purge_email_data(DEFAULT_DB) + st.success(f"Purged {contacts} email contacts, {jobs} email jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + with tasks_col: + _active = get_active_tasks(DEFAULT_DB) + st.markdown("**Kill stuck tasks**") + st.caption(f"Force-fail all queued/running background tasks. 
Currently **{len(_active)}** active.") + if st.button("⏹ Kill All Tasks", use_container_width=True, disabled=len(_active) == 0): + killed = kill_stuck_tasks(DEFAULT_DB) + st.success(f"Killed {killed} task(s).") + st.rerun() + + with rescrape_col: + st.markdown("**Purge all & re-scrape**") + st.caption("Wipes _all_ non-applied, non-synced jobs then immediately runs a fresh discovery.") + if st.button("πŸ”„ Purge All + Re-scrape", use_container_width=True): + st.session_state["confirm_purge"] = "full" + + if st.session_state.get("confirm_purge") == "full": + st.warning("This will delete ALL pending, approved, and rejected jobs, then re-scrape. Applied and synced records are kept.") + c1, c2 = st.columns(2) + if c1.button("Yes, wipe + scrape", type="primary", use_container_width=True): + purge_jobs(DEFAULT_DB, statuses=["pending", "approved", "rejected"]) + submit_task(DEFAULT_DB, "discovery", 0) + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + st.divider() + + pending_col, nonremote_col, approved_col, _ = st.columns(4) + + with pending_col: + st.markdown("**Purge pending review**") + st.caption("Removes only _pending_ listings, keeping your rejected history intact.") + if st.button("πŸ—‘ Purge Pending Only", use_container_width=True): + st.session_state["confirm_purge"] = "pending_only" + + if st.session_state.get("confirm_purge") == "pending_only": + st.warning("Deletes all pending jobs. Rejected jobs are kept. 
Cannot be undone.") + c1, c2 = st.columns(2) + if c1.button("Yes, purge pending", type="primary", use_container_width=True): + deleted = purge_jobs(DEFAULT_DB, statuses=["pending"]) + st.success(f"Purged {deleted} pending jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + with nonremote_col: + st.markdown("**Purge non-remote**") + st.caption("Removes pending/approved/rejected jobs where remote is not set. Keeps anything already in the pipeline.") + if st.button("🏒 Purge On-site Jobs", use_container_width=True): + st.session_state["confirm_purge"] = "non_remote" + + if st.session_state.get("confirm_purge") == "non_remote": + st.warning("Deletes all non-remote jobs not yet applied to. Cannot be undone.") + c1, c2 = st.columns(2) + if c1.button("Yes, purge on-site", type="primary", use_container_width=True): + deleted = purge_non_remote(DEFAULT_DB) + st.success(f"Purged {deleted} non-remote jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + with approved_col: + st.markdown("**Purge approved (unapplied)**") + st.caption("Removes _approved_ jobs you haven't applied to yet β€” e.g. to reset after a review pass.") + if st.button("πŸ—‘ Purge Approved", use_container_width=True): + st.session_state["confirm_purge"] = "approved_only" + + if st.session_state.get("confirm_purge") == "approved_only": + st.warning("Deletes all approved-but-not-applied jobs. 
Cannot be undone.") + c1, c2 = st.columns(2) + if c1.button("Yes, purge approved", type="primary", use_container_width=True): + deleted = purge_jobs(DEFAULT_DB, statuses=["approved"]) + st.success(f"Purged {deleted} approved jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + st.divider() + + archive_col1, archive_col2, _, _ = st.columns(4) + + with archive_col1: + st.markdown("**Archive remaining**") + st.caption( + "Move all _pending_ and _rejected_ jobs to archived status. " + "Archived jobs stay in the DB for dedup β€” they just won't appear in Job Review." + ) + if st.button("πŸ“¦ Archive Pending + Rejected", use_container_width=True): + st.session_state["confirm_purge"] = "archive_remaining" + + if st.session_state.get("confirm_purge") == "archive_remaining": + st.info("Jobs will be archived (not deleted) β€” URLs are kept for dedup.") + c1, c2 = st.columns(2) + if c1.button("Yes, archive", type="primary", use_container_width=True): + archived = archive_jobs(DEFAULT_DB, statuses=["pending", "rejected"]) + st.success(f"Archived {archived} jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() + + with archive_col2: + st.markdown("**Archive approved (unapplied)**") + st.caption("Archive _approved_ listings you decided to skip β€” keeps history without cluttering the apply queue.") + if st.button("πŸ“¦ Archive Approved", use_container_width=True): + st.session_state["confirm_purge"] = "archive_approved" + + if st.session_state.get("confirm_purge") == "archive_approved": + st.info("Approved jobs will be archived (not deleted).") + c1, c2 = st.columns(2) + if c1.button("Yes, archive approved", type="primary", use_container_width=True): + archived = archive_jobs(DEFAULT_DB, statuses=["approved"]) + 
st.success(f"Archived {archived} approved jobs.") + st.session_state.pop("confirm_purge", None) + st.rerun() + if c2.button("Cancel ", use_container_width=True): + st.session_state.pop("confirm_purge", None) + st.rerun() diff --git a/app/app.py b/app/app.py new file mode 100644 index 0000000..5f29348 --- /dev/null +++ b/app/app.py @@ -0,0 +1,119 @@ +# app/app.py +""" +Streamlit entry point β€” uses st.navigation() to control the sidebar. +Main workflow pages are listed at the top; Settings is separated into +a "System" section so it doesn't crowd the navigation. + +Run: streamlit run app/app.py + bash scripts/manage-ui.sh start +""" +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import streamlit as st +from scripts.db import DEFAULT_DB, init_db, get_active_tasks +import sqlite3 + +st.set_page_config( + page_title="Job Seeker", + page_icon="πŸ’Ό", + layout="wide", +) + +init_db(DEFAULT_DB) + +# ── Startup cleanup β€” runs once per server process via cache_resource ────────── +@st.cache_resource +def _startup() -> None: + """Runs exactly once per server lifetime (st.cache_resource). + 1. Marks zombie tasks as failed. + 2. Auto-queues re-runs for any research generated without SearXNG data, + if SearXNG is now reachable. 
+ """ + conn = sqlite3.connect(DEFAULT_DB) + conn.execute( + "UPDATE background_tasks SET status='failed', error='Interrupted by server restart'," + " finished_at=datetime('now') WHERE status IN ('queued','running')" + ) + conn.commit() + + # Auto-recovery: re-run LLM-only research when SearXNG is available + try: + import requests as _req + if _req.get("http://localhost:8888/", timeout=3).status_code == 200: + from scripts.task_runner import submit_task + _ACTIVE_STAGES = ("phone_screen", "interviewing", "offer", "hired") + rows = conn.execute( + """SELECT cr.job_id FROM company_research cr + JOIN jobs j ON j.id = cr.job_id + WHERE (cr.scrape_used IS NULL OR cr.scrape_used = 0) + AND j.status IN ({})""".format(",".join("?" * len(_ACTIVE_STAGES))), + _ACTIVE_STAGES, + ).fetchall() + for (job_id,) in rows: + submit_task(str(DEFAULT_DB), "company_research", job_id) + except Exception: + pass # never block startup + + conn.close() + +_startup() + +# ── Navigation ───────────────────────────────────────────────────────────────── +# st.navigation() must be called before any sidebar writes so it can establish +# the navigation structure first; sidebar additions come after. +pages = { + "": [ + st.Page("Home.py", title="Home", icon="🏠"), + st.Page("pages/1_Job_Review.py", title="Job Review", icon="πŸ“‹"), + st.Page("pages/4_Apply.py", title="Apply Workspace", icon="πŸš€"), + st.Page("pages/5_Interviews.py", title="Interviews", icon="🎯"), + st.Page("pages/6_Interview_Prep.py", title="Interview Prep", icon="πŸ“ž"), + st.Page("pages/7_Survey.py", title="Survey Assistant", icon="πŸ“‹"), + ], + "System": [ + st.Page("pages/2_Settings.py", title="Settings", icon="βš™οΈ"), + ], +} + +pg = st.navigation(pages) + +# ── Background task sidebar indicator ───────────────────────────────────────── +# Fragment polls every 3s so stage labels update live without a full page reload. +# The sidebar context WRAPS the fragment call β€” do not write to st.sidebar inside it. 
+@st.fragment(run_every=3) +def _task_indicator(): + tasks = get_active_tasks(DEFAULT_DB) + if not tasks: + return + st.divider() + st.markdown(f"**⏳ {len(tasks)} task(s) running**") + for t in tasks: + icon = "⏳" if t["status"] == "running" else "πŸ•" + task_type = t["task_type"] + if task_type == "cover_letter": + label = "Cover letter" + elif task_type == "company_research": + label = "Research" + elif task_type == "email_sync": + label = "Email sync" + elif task_type == "discovery": + label = "Discovery" + elif task_type == "enrich_descriptions": + label = "Enriching" + elif task_type == "scrape_url": + label = "Scraping URL" + elif task_type == "enrich_craigslist": + label = "Enriching listing" + else: + label = task_type.replace("_", " ").title() + stage = t.get("stage") or "" + detail = f" Β· {stage}" if stage else (f" β€” {t.get('company')}" if t.get("company") else "") + st.caption(f"{icon} {label}{detail}") + +with st.sidebar: + _task_indicator() + +pg.run() diff --git a/app/pages/1_Job_Review.py b/app/pages/1_Job_Review.py new file mode 100644 index 0000000..8f2c397 --- /dev/null +++ b/app/pages/1_Job_Review.py @@ -0,0 +1,203 @@ +# app/pages/1_Job_Review.py +""" +Job Review β€” browse listings, approve/reject inline, generate cover letters, +and mark approved jobs as applied. 
+""" +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import streamlit as st +from scripts.db import ( + DEFAULT_DB, init_db, get_jobs_by_status, update_job_status, + update_cover_letter, mark_applied, get_email_leads, +) + +st.title("πŸ“‹ Job Review") + +init_db(DEFAULT_DB) + +_email_leads = get_email_leads(DEFAULT_DB) + +# ── Sidebar filters ──────────────────────────────────────────────────────────── +with st.sidebar: + st.header("Filters") + show_status = st.selectbox( + "Show", + ["pending", "approved", "applied", "rejected", "synced"], + index=0, + ) + remote_only = st.checkbox("Remote only", value=False) + min_score = st.slider("Min match score", 0, 100, 0) + + st.header("Sort") + sort_by = st.selectbox( + "Sort by", + ["Date Found (newest)", "Date Found (oldest)", "Match Score (highβ†’low)", "Match Score (lowβ†’high)", "Company A–Z", "Title A–Z"], + index=0, + ) + +jobs = get_jobs_by_status(DEFAULT_DB, show_status) + +if remote_only: + jobs = [j for j in jobs if j.get("is_remote")] +if min_score > 0: + jobs = [j for j in jobs if (j.get("match_score") or 0) >= min_score] + +# Apply sort +if sort_by == "Date Found (newest)": + jobs = sorted(jobs, key=lambda j: j.get("date_found") or "", reverse=True) +elif sort_by == "Date Found (oldest)": + jobs = sorted(jobs, key=lambda j: j.get("date_found") or "") +elif sort_by == "Match Score (highβ†’low)": + jobs = sorted(jobs, key=lambda j: j.get("match_score") or 0, reverse=True) +elif sort_by == "Match Score (lowβ†’high)": + jobs = sorted(jobs, key=lambda j: j.get("match_score") or 0) +elif sort_by == "Company A–Z": + jobs = sorted(jobs, key=lambda j: (j.get("company") or "").lower()) +elif sort_by == "Title A–Z": + jobs = sorted(jobs, key=lambda j: (j.get("title") or "").lower()) + +if not jobs: + st.info(f"No {show_status} jobs matching your filters.") + st.stop() + +st.caption(f"Showing {len(jobs)} {show_status} job{'s' if len(jobs) != 1 else ''}") 
+st.divider() + +if show_status == "pending" and _email_leads: + st.subheader(f"πŸ“§ Email Leads ({len(_email_leads)})") + st.caption( + "Inbound recruiter emails not yet matched to a scraped listing. " + "Approve to add to Job Review; Reject to dismiss." + ) + for lead in _email_leads: + lead_id = lead["id"] + with st.container(border=True): + left_l, right_l = st.columns([7, 3]) + with left_l: + st.markdown(f"**{lead['title']}** β€” {lead['company']}") + badge_cols = st.columns(4) + badge_cols[0].caption("πŸ“§ Email Lead") + badge_cols[1].caption(f"πŸ“… {lead.get('date_found', '')}") + if lead.get("description"): + with st.expander("πŸ“„ Email excerpt", expanded=False): + st.text(lead["description"][:500]) + with right_l: + if st.button("βœ… Approve", key=f"el_approve_{lead_id}", + type="primary", use_container_width=True): + update_job_status(DEFAULT_DB, [lead_id], "approved") + st.rerun() + if st.button("❌ Reject", key=f"el_reject_{lead_id}", + use_container_width=True): + update_job_status(DEFAULT_DB, [lead_id], "rejected") + st.rerun() + st.divider() + +# Filter email leads out of the main pending list (already shown above) +if show_status == "pending": + jobs = [j for j in jobs if j.get("source") != "email"] + +# ── Job cards ────────────────────────────────────────────────────────────────── +for job in jobs: + job_id = job["id"] + + score = job.get("match_score") + if score is None: + score_badge = "⬜ No score" + elif score >= 70: + score_badge = f"🟒 {score:.0f}%" + elif score >= 40: + score_badge = f"🟑 {score:.0f}%" + else: + score_badge = f"πŸ”΄ {score:.0f}%" + + remote_badge = "🌐 Remote" if job.get("is_remote") else "🏒 On-site" + src = (job.get("source") or "").lower() + source_badge = f"πŸ€– {src.title()}" if src == "linkedin" else f"πŸ‘€ {src.title() or 'Manual'}" + + with st.container(border=True): + left, right = st.columns([7, 3]) + + # ── Left: job info ───────────────────────────────────────────────────── + with left: + 
st.markdown(f"**{job['title']}** β€” {job['company']}") + + badge_cols = st.columns(4) + badge_cols[0].caption(remote_badge) + badge_cols[1].caption(source_badge) + badge_cols[2].caption(score_badge) + badge_cols[3].caption(f"πŸ“… {job.get('date_found', '')}") + + if job.get("keyword_gaps"): + st.caption(f"**Keyword gaps:** {job['keyword_gaps']}") + + # Cover letter expander (approved view) + if show_status == "approved": + _cl_key = f"cl_{job_id}" + if _cl_key not in st.session_state: + st.session_state[_cl_key] = job.get("cover_letter") or "" + + cl_exists = bool(st.session_state[_cl_key]) + with st.expander("πŸ“ Cover Letter", expanded=cl_exists): + gen_label = "Regenerate" if cl_exists else "Generate Cover Letter" + if st.button(gen_label, key=f"gen_{job_id}"): + with st.spinner("Generating via LLM…"): + try: + from scripts.generate_cover_letter import generate as _gen + st.session_state[_cl_key] = _gen( + job.get("title", ""), + job.get("company", ""), + job.get("description", ""), + ) + st.rerun() + except Exception as e: + st.error(f"Generation failed: {e}") + + st.text_area( + "cover_letter_edit", + key=_cl_key, + height=300, + label_visibility="collapsed", + ) + save_col, _ = st.columns([2, 5]) + if save_col.button("πŸ’Ύ Save draft", key=f"save_cl_{job_id}"): + update_cover_letter(DEFAULT_DB, job_id, st.session_state[_cl_key]) + st.success("Saved!") + + # Applied date + cover letter preview (applied/synced) + if show_status in ("applied", "synced") and job.get("applied_at"): + st.caption(f"βœ… Applied: {job['applied_at']}") + if show_status in ("applied", "synced") and job.get("cover_letter"): + with st.expander("πŸ“ Cover Letter (sent)"): + st.text(job["cover_letter"]) + + # ── Right: actions ───────────────────────────────────────────────────── + with right: + if job.get("url"): + st.link_button("View listing β†’", job["url"], use_container_width=True) + if job.get("salary"): + st.caption(f"πŸ’° {job['salary']}") + + if show_status == "pending": + if 
st.button("βœ… Approve", key=f"approve_{job_id}", + type="primary", use_container_width=True): + update_job_status(DEFAULT_DB, [job_id], "approved") + st.rerun() + if st.button("❌ Reject", key=f"reject_{job_id}", + use_container_width=True): + update_job_status(DEFAULT_DB, [job_id], "rejected") + st.rerun() + + elif show_status == "approved": + if st.button("πŸš€ Apply β†’", key=f"apply_page_{job_id}", + type="primary", use_container_width=True): + st.session_state["apply_job_id"] = job_id + st.switch_page("pages/4_Apply.py") + if st.button("βœ… Mark Applied", key=f"applied_{job_id}", + use_container_width=True): + cl_text = st.session_state.get(f"cl_{job_id}", "") + if cl_text: + update_cover_letter(DEFAULT_DB, job_id, cl_text) + mark_applied(DEFAULT_DB, [job_id]) + st.rerun() diff --git a/app/pages/2_Settings.py b/app/pages/2_Settings.py new file mode 100644 index 0000000..9e37a04 --- /dev/null +++ b/app/pages/2_Settings.py @@ -0,0 +1,842 @@ +# app/pages/2_Settings.py +""" +Settings β€” edit search profiles, LLM backends, Notion connection, services, +and resume profile (paste-able bullets used in Apply Workspace). 
"""
import sys
from pathlib import Path
# Make the repo root importable so `scripts.*` resolves when Streamlit runs this page.
sys.path.insert(0, str(Path(__file__).parent.parent.parent))

import streamlit as st
import yaml

st.title("βš™οΈ Settings")

# Config file locations (repo-root relative).
CONFIG_DIR = Path(__file__).parent.parent.parent / "config"
SEARCH_CFG = CONFIG_DIR / "search_profiles.yaml"
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"
LLM_CFG = CONFIG_DIR / "llm.yaml"
NOTION_CFG = CONFIG_DIR / "notion.yaml"
RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
KEYWORDS_CFG = CONFIG_DIR / "resume_keywords.yaml"

def load_yaml(path: Path) -> dict:
    """Read a YAML file into a dict; return {} when missing or empty."""
    if path.exists():
        return yaml.safe_load(path.read_text()) or {}
    return {}

def save_yaml(path: Path, data: dict) -> None:
    """Write *data* back as block-style YAML (unicode preserved)."""
    path.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True))


def _suggest_search_terms(current_titles: list[str], resume_path: Path) -> dict:
    """Call LLM to suggest additional job titles and exclude keywords.

    Returns {"suggested_titles": [...], "suggested_excludes": [...]}; both
    lists are empty when the model's reply contains no parseable JSON.
    """
    import json
    import re
    from scripts.llm_router import LLMRouter

    # Summarize the three most recent roles from the AIHawk resume YAML
    # to give the model grounding context.
    resume_context = ""
    if resume_path.exists():
        resume = load_yaml(resume_path)
        lines = []
        for exp in (resume.get("experience_details") or [])[:3]:
            pos = exp.get("position", "")
            co = exp.get("company", "")
            skills = ", ".join((exp.get("skills_acquired") or [])[:5])
            lines.append(f"- {pos} at {co}: {skills}")
        resume_context = "\n".join(lines)

    titles_str = "\n".join(f"- {t}" for t in current_titles)
    prompt = f"""You are helping a job seeker optimize their search criteria.

Their background (from resume):
{resume_context or "Customer success and technical account management leader"}

Current job titles being searched:
{titles_str}

Suggest:
1. 5-8 additional job titles they might be missing (alternative names, adjacent roles, senior variants)
2. 3-5 keywords to add to the exclusion filter (to screen out irrelevant postings)

Return ONLY valid JSON in this exact format:
{{"suggested_titles": ["Title 1", "Title 2"], "suggested_excludes": ["keyword 1", "keyword 2"]}}"""

    result = LLMRouter().complete(prompt).strip()
    # Grab the outermost {...} span β€” models often wrap JSON in prose/fences.
    m = re.search(r"\{.*\}", result, re.DOTALL)
    if m:
        try:
            return json.loads(m.group())
        except Exception:
            pass
    # Fall through on any parse failure: caller treats empty lists as "no suggestions".
    return {"suggested_titles": [], "suggested_excludes": []}

tab_search, tab_llm, tab_notion, tab_services, tab_resume, tab_email, tab_skills = st.tabs(
    ["πŸ”Ž Search", "πŸ€– LLM Backends", "πŸ“š Notion", "πŸ”Œ Services", "πŸ“ Resume Profile", "πŸ“§ Email", "🏷️ Skills"]
)

# ── Search tab ───────────────────────────────────────────────────────────────
with tab_search:
    cfg = load_yaml(SEARCH_CFG)
    profiles = cfg.get("profiles", [{}])
    # Only the first profile is editable from this UI.
    p = profiles[0] if profiles else {}

    # Seed session state from config on first load (or when config changes after save).
    # The hash is just the stringified lists β€” cheap change detection, not cryptographic.
    _sp_hash = str(p.get("titles", [])) + str(p.get("exclude_keywords", []))
    if st.session_state.get("_sp_hash") != _sp_hash:
        st.session_state["_sp_titles"] = "\n".join(p.get("titles", []))
        st.session_state["_sp_excludes"] = "\n".join(p.get("exclude_keywords", []))
        st.session_state["_sp_hash"] = _sp_hash

    # ── Titles ──────────────────────────────────────────────────────────────
    title_row, suggest_btn_col = st.columns([4, 1])
    with title_row:
        st.subheader("Job Titles to Search")
    with suggest_btn_col:
        st.write("")  # vertical align
        _run_suggest = st.button("✨ Suggest", key="sp_suggest_btn",
                                 help="Ask the LLM to suggest additional titles and exclude keywords based on your resume")

    titles_text = st.text_area(
        "One title per line",
        key="_sp_titles",
        height=150,
        help="JobSpy will search for any of these titles across all configured boards.",
        label_visibility="visible",
    )

    # ── LLM suggestions panel ───────────────────────────────────────────────
    if _run_suggest:
        current
= [t.strip() for t in titles_text.splitlines() if t.strip()] + with st.spinner("Asking LLM for suggestions…"): + suggestions = _suggest_search_terms(current, RESUME_PATH) + st.session_state["_sp_suggestions"] = suggestions + + if st.session_state.get("_sp_suggestions"): + sugg = st.session_state["_sp_suggestions"] + s_titles = sugg.get("suggested_titles", []) + s_excl = sugg.get("suggested_excludes", []) + + existing_titles = {t.lower() for t in titles_text.splitlines() if t.strip()} + existing_excl = {e.lower() for e in st.session_state.get("_sp_excludes", "").splitlines() if e.strip()} + + if s_titles: + st.caption("**Suggested titles** β€” click to add:") + cols = st.columns(min(len(s_titles), 4)) + for i, title in enumerate(s_titles): + with cols[i % 4]: + if title.lower() not in existing_titles: + if st.button(f"+ {title}", key=f"sp_add_title_{i}"): + st.session_state["_sp_titles"] = ( + st.session_state.get("_sp_titles", "").rstrip("\n") + f"\n{title}" + ) + st.rerun() + else: + st.caption(f"βœ“ {title}") + + if s_excl: + st.caption("**Suggested exclusions** β€” click to add:") + cols2 = st.columns(min(len(s_excl), 4)) + for i, kw in enumerate(s_excl): + with cols2[i % 4]: + if kw.lower() not in existing_excl: + if st.button(f"+ {kw}", key=f"sp_add_excl_{i}"): + st.session_state["_sp_excludes"] = ( + st.session_state.get("_sp_excludes", "").rstrip("\n") + f"\n{kw}" + ) + st.rerun() + else: + st.caption(f"βœ“ {kw}") + + if st.button("βœ• Clear suggestions", key="sp_clear_sugg"): + st.session_state.pop("_sp_suggestions", None) + st.rerun() + + st.subheader("Locations") + locations_text = st.text_area( + "One location per line", + value="\n".join(p.get("locations", [])), + height=100, + ) + + st.subheader("Exclude Keywords") + st.caption("Jobs whose **title or description** contain any of these words are silently dropped before entering the queue. 
Case-insensitive.") + exclude_text = st.text_area( + "One keyword or phrase per line", + key="_sp_excludes", + height=150, + help="e.g. 'sales', 'account executive', 'SDR'", + ) + + st.subheader("Job Boards") + board_options = ["linkedin", "indeed", "glassdoor", "zip_recruiter", "google"] + selected_boards = st.multiselect( + "Standard boards (via JobSpy)", board_options, + default=[b for b in p.get("boards", board_options) if b in board_options], + help="Google Jobs aggregates listings from many sources and often finds roles the other boards miss.", + ) + + _custom_board_options = ["adzuna", "theladders"] + _custom_board_labels = { + "adzuna": "Adzuna (free API β€” requires app_id + app_key in config/adzuna.yaml)", + "theladders": "The Ladders (curl_cffi scraper β€” $100K+ roles, requires curl_cffi)", + } + st.caption("**Custom boards** β€” scrapers built into this app, not part of JobSpy.") + selected_custom = st.multiselect( + "Custom boards", + options=_custom_board_options, + default=[b for b in p.get("custom_boards", []) if b in _custom_board_options], + format_func=lambda b: _custom_board_labels.get(b, b), + ) + + col1, col2 = st.columns(2) + results_per = col1.slider("Results per board", 5, 100, p.get("results_per_board", 25)) + hours_old = col2.slider("How far back to look (hours)", 24, 720, p.get("hours_old", 72)) + + if st.button("πŸ’Ύ Save search settings", type="primary"): + profiles[0] = { + **p, + "titles": [t.strip() for t in titles_text.splitlines() if t.strip()], + "locations": [loc.strip() for loc in locations_text.splitlines() if loc.strip()], + "boards": selected_boards, + "custom_boards": selected_custom, + "results_per_board": results_per, + "hours_old": hours_old, + "exclude_keywords": [k.strip() for k in exclude_text.splitlines() if k.strip()], + } + save_yaml(SEARCH_CFG, {"profiles": profiles}) + st.session_state["_sp_hash"] = "" # force re-seed on next load + st.session_state.pop("_sp_suggestions", None) + st.success("Search settings 
saved!") + + st.divider() + + # ── Blocklist ────────────────────────────────────────────────────────────── + with st.expander("🚫 Blocklist β€” companies, industries, and locations I will never work at", expanded=False): + st.caption( + "Listings matching any rule below are **silently dropped before entering the review queue**, " + "across all search profiles and custom boards. Changes take effect on the next discovery run." + ) + bl = load_yaml(BLOCKLIST_CFG) + + bl_companies = st.text_area( + "Company names (partial match, one per line)", + value="\n".join(bl.get("companies", [])), + height=120, + help="e.g. 'Amazon' blocks any listing where the company name contains 'amazon' (case-insensitive).", + key="bl_companies", + ) + bl_industries = st.text_area( + "Industry / content keywords (one per line)", + value="\n".join(bl.get("industries", [])), + height=100, + help="Blocked if the keyword appears in the company name OR job description. " + "e.g. 'gambling', 'crypto', 'tobacco', 'defense contractor'.", + key="bl_industries", + ) + bl_locations = st.text_area( + "Location strings to exclude (one per line)", + value="\n".join(bl.get("locations", [])), + height=80, + help="e.g. 
'Dallas' blocks any listing whose location contains 'dallas'.", + key="bl_locations", + ) + + if st.button("πŸ’Ύ Save blocklist", type="primary", key="save_blocklist"): + save_yaml(BLOCKLIST_CFG, { + "companies": [c.strip() for c in bl_companies.splitlines() if c.strip()], + "industries": [i.strip() for i in bl_industries.splitlines() if i.strip()], + "locations": [loc.strip() for loc in bl_locations.splitlines() if loc.strip()], + }) + st.success("Blocklist saved β€” takes effect on next discovery run.") + +# ── LLM Backends tab ───────────────────────────────────────────────────────── +with tab_llm: + import requests as _req + + def _ollama_models(base_url: str) -> list[str]: + """Fetch installed model names from the Ollama /api/tags endpoint.""" + try: + r = _req.get(base_url.rstrip("/v1").rstrip("/") + "/api/tags", timeout=2) + if r.ok: + return [m["name"] for m in r.json().get("models", [])] + except Exception: + pass + return [] + + cfg = load_yaml(LLM_CFG) + backends = cfg.get("backends", {}) + fallback_order = cfg.get("fallback_order", list(backends.keys())) + + # Persist reordering across reruns triggered by ↑↓ buttons. + # Reset to config order whenever the config file is fresher than the session key. + _cfg_key = str(fallback_order) + if st.session_state.get("_llm_order_cfg_key") != _cfg_key: + st.session_state["_llm_order"] = list(fallback_order) + st.session_state["_llm_order_cfg_key"] = _cfg_key + new_order: list[str] = st.session_state["_llm_order"] + + # All known backends (in current order first, then any extras) + all_names = list(new_order) + [n for n in backends if n not in new_order] + + st.caption("Enable/disable backends and drag their priority with the ↑ ↓ buttons. 
" + "First enabled + reachable backend wins on each call.") + + updated_backends = {} + + for name in all_names: + b = backends.get(name, {}) + enabled = b.get("enabled", True) + label = name.replace("_", " ").title() + pos = new_order.index(name) + 1 if name in new_order else "β€”" + header = f"{'🟒' if enabled else '⚫'} **{pos}. {label}**" + + with st.expander(header, expanded=False): + col_tog, col_up, col_dn, col_spacer = st.columns([2, 1, 1, 4]) + + new_enabled = col_tog.checkbox("Enabled", value=enabled, key=f"{name}_enabled") + + # Up / Down only apply to backends currently in the order + if name in new_order: + idx = new_order.index(name) + if col_up.button("↑", key=f"{name}_up", disabled=idx == 0): + new_order[idx], new_order[idx - 1] = new_order[idx - 1], new_order[idx] + st.session_state["_llm_order"] = new_order + st.rerun() + if col_dn.button("↓", key=f"{name}_dn", disabled=idx == len(new_order) - 1): + new_order[idx], new_order[idx + 1] = new_order[idx + 1], new_order[idx] + st.session_state["_llm_order"] = new_order + st.rerun() + + if b.get("type") == "openai_compat": + url = st.text_input("URL", value=b.get("base_url", ""), key=f"{name}_url") + + # Ollama gets a live model picker; other backends get a text input + if name == "ollama": + ollama_models = _ollama_models(b.get("base_url", "http://localhost:11434")) + current_model = b.get("model", "") + if ollama_models: + options = ollama_models + idx_default = options.index(current_model) if current_model in options else 0 + model = st.selectbox( + "Model", + options, + index=idx_default, + key=f"{name}_model", + help="Lists models currently installed in Ollama. 
Pull new ones with `ollama pull `.", + ) + else: + st.caption("_Ollama not reachable β€” enter model name manually_") + model = st.text_input("Model", value=current_model, key=f"{name}_model") + else: + model = st.text_input("Model", value=b.get("model", ""), key=f"{name}_model") + + updated_backends[name] = {**b, "base_url": url, "model": model, "enabled": new_enabled} + elif b.get("type") == "anthropic": + model = st.text_input("Model", value=b.get("model", ""), key=f"{name}_model") + updated_backends[name] = {**b, "model": model, "enabled": new_enabled} + else: + updated_backends[name] = {**b, "enabled": new_enabled} + + if b.get("type") == "openai_compat": + if st.button(f"Test connection", key=f"test_{name}"): + with st.spinner("Testing…"): + try: + from scripts.llm_router import LLMRouter + r = LLMRouter() + reachable = r._is_reachable(b.get("base_url", "")) + if reachable: + st.success("Reachable βœ“") + else: + st.warning("Not reachable βœ—") + except Exception as e: + st.error(f"Error: {e}") + + st.divider() + st.caption("Current priority: " + " β†’ ".join( + f"{'βœ“' if backends.get(n, {}).get('enabled', True) else 'βœ—'} {n}" + for n in new_order + )) + + if st.button("πŸ’Ύ Save LLM settings", type="primary"): + save_yaml(LLM_CFG, {**cfg, "backends": updated_backends, "fallback_order": new_order}) + st.session_state.pop("_llm_order", None) + st.session_state.pop("_llm_order_cfg_key", None) + st.success("LLM settings saved!") + +# ── Notion tab ──────────────────────────────────────────────────────────────── +with tab_notion: + cfg = load_yaml(NOTION_CFG) if NOTION_CFG.exists() else {} + + st.subheader("Notion Connection") + token = st.text_input( + "Integration Token", + value=cfg.get("token", ""), + type="password", + help="Find this at notion.so/my-integrations β†’ your integration β†’ Internal Integration Token", + ) + db_id = st.text_input( + "Database ID", + value=cfg.get("database_id", ""), + help="The 32-character ID from your Notion database 
URL", + ) + + col_save, col_test = st.columns(2) + if col_save.button("πŸ’Ύ Save Notion settings", type="primary"): + save_yaml(NOTION_CFG, {**cfg, "token": token, "database_id": db_id}) + st.success("Notion settings saved!") + + if col_test.button("πŸ”Œ Test connection"): + with st.spinner("Connecting…"): + try: + from notion_client import Client + n = Client(auth=token) + db = n.databases.retrieve(db_id) + st.success(f"Connected to: **{db['title'][0]['plain_text']}**") + except Exception as e: + st.error(f"Connection failed: {e}") + +# ── Services tab ─────────────────────────────────────────────────────────────── +with tab_services: + import socket + import subprocess as _sp + + TOKENS_CFG = CONFIG_DIR / "tokens.yaml" + PFP_DIR = Path("/Library/Documents/Post Fight Processing") + + # Service definitions: (display_name, port, start_cmd, stop_cmd, notes) + SERVICES = [ + { + "name": "Streamlit UI", + "port": 8501, + "start": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-ui.sh"), "start"], + "stop": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-ui.sh"), "stop"], + "cwd": str(Path(__file__).parent.parent.parent), + "note": "Job Seeker web interface", + }, + { + "name": "Ollama (local LLM)", + "port": 11434, + "start": ["sudo", "systemctl", "start", "ollama"], + "stop": ["sudo", "systemctl", "stop", "ollama"], + "cwd": "/", + "note": "Local inference engine β€” systemd service", + }, + { + "name": "Claude Code Wrapper", + "port": 3009, + "start": ["bash", str(PFP_DIR / "manage-services.sh"), "start"], + "stop": ["bash", str(PFP_DIR / "manage-services.sh"), "stop"], + "cwd": str(PFP_DIR), + "note": "OpenAI-compat proxy β†’ Claude Code (port 3009)", + }, + { + "name": "GitHub Copilot Wrapper", + "port": 3010, + "start": ["bash", str(PFP_DIR / "manage-copilot.sh"), "start"], + "stop": ["bash", str(PFP_DIR / "manage-copilot.sh"), "stop"], + "cwd": str(PFP_DIR), + "note": "OpenAI-compat proxy β†’ GitHub Copilot (port 3010)", + }, 
        {
            "name": "vLLM Server",
            "port": 8000,
            "start": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-vllm.sh"), "start"],
            "stop": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-vllm.sh"), "stop"],
            "cwd": str(Path(__file__).parent.parent.parent),
            # Presence of "model_dir" enables the model picker in the card UI below.
            "model_dir": "/Library/Assets/LLM/vllm/models",
            "note": "Local vLLM inference β€” Ouro model family (port 8000, GPU 1)",
        },
        {
            "name": "Vision Service (moondream2)",
            "port": 8002,
            "start": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-vision.sh"), "start"],
            "stop": ["bash", str(Path(__file__).parent.parent.parent / "scripts/manage-vision.sh"), "stop"],
            "cwd": str(Path(__file__).parent.parent.parent),
            "note": "Survey screenshot analysis β€” moondream2 (port 8002, optional)",
        },
        {
            "name": "SearXNG (company scraper)",
            "port": 8888,
            # Runs out of its own compose project directory ("cwd" below).
            "start": ["docker", "compose", "up", "-d"],
            "stop": ["docker", "compose", "down"],
            "cwd": str(Path("/Library/Development/scrapers/SearXNG")),
            "note": "Privacy-respecting meta-search used for company research (port 8888)",
        },
    ]

    def _port_open(port: int) -> bool:
        """Return True when something is listening on 127.0.0.1:*port* (1s timeout)."""
        try:
            with socket.create_connection(("127.0.0.1", port), timeout=1):
                return True
        except OSError:
            return False

    st.caption("Monitor and control the LLM backend services. Status is checked live on each page load.")

    # One bordered card per service: status badge, notes, optional model
    # picker, and a Start/Stop button wired to the service's manage script.
    for svc in SERVICES:
        up = _port_open(svc["port"])
        badge = "🟒 Running" if up else "πŸ”΄ Stopped"
        header = f"**{svc['name']}** β€” {badge}"

        with st.container(border=True):
            left_col, right_col = st.columns([3, 1])
            with left_col:
                st.markdown(header)
                st.caption(f"Port {svc['port']} Β· {svc['note']}")

                # Model selector for services backed by a local model directory (e.g.
vLLM) + if "model_dir" in svc: + _mdir = Path(svc["model_dir"]) + _models = ( + sorted(d.name for d in _mdir.iterdir() if d.is_dir()) + if _mdir.exists() else [] + ) + _mk = f"svc_model_{svc['port']}" + _loaded_file = Path("/tmp/vllm-server.model") + _loaded = _loaded_file.read_text().strip() if (_loaded_file.exists()) else "" + if _models: + _default = _models.index(_loaded) if _loaded in _models else 0 + st.selectbox( + "Model", + _models, + index=_default, + key=_mk, + disabled=up, + help="Model to load on start. Stop then Start to swap models.", + ) + else: + st.caption(f"_No models found in {svc['model_dir']}_") + + with right_col: + if svc["start"] is None: + st.caption("_Manual start only_") + elif up: + if st.button("⏹ Stop", key=f"svc_stop_{svc['port']}", use_container_width=True): + with st.spinner(f"Stopping {svc['name']}…"): + r = _sp.run(svc["stop"], capture_output=True, text=True, cwd=svc["cwd"]) + if r.returncode == 0: + st.success("Stopped.") + else: + st.error(f"Error: {r.stderr or r.stdout}") + st.rerun() + else: + # Build start command, appending selected model for services with model_dir + _start_cmd = list(svc["start"]) + if "model_dir" in svc: + _sel = st.session_state.get(f"svc_model_{svc['port']}") + if _sel: + _start_cmd.append(_sel) + if st.button("β–Ά Start", key=f"svc_start_{svc['port']}", use_container_width=True, type="primary"): + with st.spinner(f"Starting {svc['name']}…"): + r = _sp.run(_start_cmd, capture_output=True, text=True, cwd=svc["cwd"]) + if r.returncode == 0: + st.success("Started!") + else: + st.error(f"Error: {r.stderr or r.stdout}") + st.rerun() + + st.divider() + st.subheader("πŸ€— Hugging Face") + st.caption( + "Used for uploading training data and running fine-tune jobs on HF infrastructure. " + "Token is stored in `config/tokens.yaml` (git-ignored). " + "Create a **write-permission** token at huggingface.co/settings/tokens." 
+ ) + + tok_cfg = load_yaml(TOKENS_CFG) if TOKENS_CFG.exists() else {} + hf_token = st.text_input( + "HF Token", + value=tok_cfg.get("hf_token", ""), + type="password", + placeholder="hf_…", + ) + + col_save_hf, col_test_hf = st.columns(2) + if col_save_hf.button("πŸ’Ύ Save HF token", type="primary"): + save_yaml(TOKENS_CFG, {**tok_cfg, "hf_token": hf_token}) + TOKENS_CFG.chmod(0o600) + st.success("Saved!") + + if col_test_hf.button("πŸ”Œ Test HF token"): + with st.spinner("Checking…"): + try: + import requests as _r + resp = _r.get( + "https://huggingface.co/api/whoami", + headers={"Authorization": f"Bearer {hf_token}"}, + timeout=5, + ) + if resp.ok: + info = resp.json() + name = info.get("name") or info.get("fullname") or "unknown" + auth = info.get("auth", {}) + perm = auth.get("accessToken", {}).get("role", "read") + st.success(f"Logged in as **{name}** Β· permission: `{perm}`") + if perm == "read": + st.warning("Token is read-only β€” create a **write** token to upload datasets and run Jobs.") + else: + st.error(f"Invalid token ({resp.status_code})") + except Exception as e: + st.error(f"Error: {e}") + +# ── Resume Profile tab ──────────────────────────────────────────────────────── +with tab_resume: + st.caption( + "Edit Alex's application profile. " + "Bullets are used as paste-able shortcuts in the Apply Workspace." + ) + + if not RESUME_PATH.exists(): + st.error(f"Resume YAML not found at `{RESUME_PATH}`. Is AIHawk cloned?") + st.stop() + + _data = yaml.safe_load(RESUME_PATH.read_text()) or {} + + def _field(label: str, value: str, key: str, help: str = "", password: bool = False) -> str: + needs_attention = str(value).startswith("FILL_IN") or value == "" + if needs_attention: + st.markdown( + '

⚠️ Needs attention

', + unsafe_allow_html=True, + ) + return st.text_input(label, value=value or "", key=key, help=help, + type="password" if password else "default") + + # ── Personal Info ───────────────────────────────────────────────────────── + with st.expander("πŸ‘€ Personal Information", expanded=True): + _info = _data.get("personal_information", {}) + _c1, _c2 = st.columns(2) + with _c1: + _name = _field("First Name", _info.get("name", ""), "rp_name") + _email = _field("Email", _info.get("email", ""), "rp_email") + _phone = _field("Phone", _info.get("phone", ""), "rp_phone") + _city = _field("City", _info.get("city", ""), "rp_city") + with _c2: + _surname = _field("Last Name", _info.get("surname", ""), "rp_surname") + _linkedin = _field("LinkedIn URL", _info.get("linkedin", ""), "rp_linkedin") + _zip_code = _field("Zip Code", _info.get("zip_code", ""), "rp_zip") + _dob = _field("Date of Birth", _info.get("date_of_birth", ""), "rp_dob", + help="MM/DD/YYYY") + + # ── Experience ──────────────────────────────────────────────────────────── + with st.expander("πŸ’Ό Work Experience"): + _exp_list = _data.get("experience_details", [{}]) + if "rp_exp_count" not in st.session_state: + st.session_state.rp_exp_count = len(_exp_list) + if st.button("+ Add Experience Entry", key="rp_add_exp"): + st.session_state.rp_exp_count += 1 + _exp_list.append({}) + + _updated_exp = [] + for _i in range(st.session_state.rp_exp_count): + _exp = _exp_list[_i] if _i < len(_exp_list) else {} + st.markdown(f"**Position {_i + 1}**") + _ec1, _ec2 = st.columns(2) + with _ec1: + _pos = _field("Job Title", _exp.get("position", ""), f"rp_pos_{_i}") + _co = _field("Company", _exp.get("company", ""), f"rp_co_{_i}") + _period = _field("Period", _exp.get("employment_period", ""), f"rp_period_{_i}", + help="e.g. 
01/2022 - Present") + with _ec2: + _loc = st.text_input("Location", _exp.get("location", ""), key=f"rp_loc_{_i}") + _ind = st.text_input("Industry", _exp.get("industry", ""), key=f"rp_ind_{_i}") + _resp_raw = st.text_area( + "Key Responsibilities (one per line)", + value="\n".join( + r.get(f"responsibility_{j+1}", "") if isinstance(r, dict) else str(r) + for j, r in enumerate(_exp.get("key_responsibilities", [])) + ), + key=f"rp_resp_{_i}", height=100, + ) + _skills_raw = st.text_input( + "Skills (comma-separated)", + value=", ".join(_exp.get("skills_acquired", [])), + key=f"rp_skills_{_i}", + ) + _updated_exp.append({ + "position": _pos, "company": _co, "employment_period": _period, + "location": _loc, "industry": _ind, + "key_responsibilities": [{"responsibility_1": r.strip()} for r in _resp_raw.splitlines() if r.strip()], + "skills_acquired": [s.strip() for s in _skills_raw.split(",") if s.strip()], + }) + st.divider() + + # ── Preferences ─────────────────────────────────────────────────────────── + with st.expander("βš™οΈ Preferences & Availability"): + _wp = _data.get("work_preferences", {}) + _sal = _data.get("salary_expectations", {}) + _avail = _data.get("availability", {}) + _pc1, _pc2 = st.columns(2) + with _pc1: + _salary_range = st.text_input("Salary Range (USD)", _sal.get("salary_range_usd", ""), + key="rp_salary", help="e.g. 
120000 - 180000") + _notice = st.text_input("Notice Period", _avail.get("notice_period", "2 weeks"), key="rp_notice") + with _pc2: + _remote = st.checkbox("Open to Remote", value=_wp.get("remote_work", "Yes") == "Yes", key="rp_remote") + _reloc = st.checkbox("Open to Relocation", value=_wp.get("open_to_relocation", "No") == "Yes", key="rp_reloc") + _assessments = st.checkbox("Willing to complete assessments", + value=_wp.get("willing_to_complete_assessments", "Yes") == "Yes", key="rp_assess") + _bg = st.checkbox("Willing to undergo background checks", + value=_wp.get("willing_to_undergo_background_checks", "Yes") == "Yes", key="rp_bg") + + # ── Self-ID ─────────────────────────────────────────────────────────────── + with st.expander("πŸ³οΈβ€πŸŒˆ Self-Identification (optional)"): + _sid = _data.get("self_identification", {}) + _sc1, _sc2 = st.columns(2) + with _sc1: + _gender = st.text_input("Gender identity", _sid.get("gender", "Non-binary"), key="rp_gender") + _pronouns = st.text_input("Pronouns", _sid.get("pronouns", "Any"), key="rp_pronouns") + _ethnicity = _field("Ethnicity", _sid.get("ethnicity", ""), "rp_ethnicity") + with _sc2: + _vet_opts = ["No", "Yes", "Prefer not to say"] + _veteran = st.selectbox("Veteran status", _vet_opts, + index=_vet_opts.index(_sid.get("veteran", "No")), key="rp_vet") + _dis_opts = ["Prefer not to say", "No", "Yes"] + _disability = st.selectbox("Disability disclosure", _dis_opts, + index=_dis_opts.index(_sid.get("disability", "Prefer not to say")), + key="rp_dis") + + st.divider() + if st.button("πŸ’Ύ Save Resume Profile", type="primary", use_container_width=True, key="rp_save"): + _data["personal_information"] = { + **_data.get("personal_information", {}), + "name": _name, "surname": _surname, "email": _email, "phone": _phone, + "city": _city, "zip_code": _zip_code, "linkedin": _linkedin, "date_of_birth": _dob, + } + _data["experience_details"] = _updated_exp + _data["salary_expectations"] = {"salary_range_usd": _salary_range} 
+ _data["availability"] = {"notice_period": _notice} + _data["work_preferences"] = { + **_data.get("work_preferences", {}), + "remote_work": "Yes" if _remote else "No", + "open_to_relocation": "Yes" if _reloc else "No", + "willing_to_complete_assessments": "Yes" if _assessments else "No", + "willing_to_undergo_background_checks": "Yes" if _bg else "No", + } + _data["self_identification"] = { + "gender": _gender, "pronouns": _pronouns, "veteran": _veteran, + "disability": _disability, "ethnicity": _ethnicity, + } + RESUME_PATH.write_text(yaml.dump(_data, default_flow_style=False, allow_unicode=True)) + st.success("βœ… Resume profile saved!") + st.balloons() + +# ── Email tab ───────────────────────────────────────────────────────────────── +with tab_email: + EMAIL_CFG = CONFIG_DIR / "email.yaml" + EMAIL_EXAMPLE = CONFIG_DIR / "email.yaml.example" + + st.caption( + "Connect Alex's email via IMAP to automatically associate recruitment " + "emails with job applications. Only emails that mention the company name " + "AND contain a recruitment keyword are ever imported β€” no personal emails " + "are touched." 
+ ) + + if not EMAIL_CFG.exists(): + st.info("No email config found β€” fill in your credentials below and click **Save** to create it.") + + em_cfg = load_yaml(EMAIL_CFG) if EMAIL_CFG.exists() else {} + + col_a, col_b = st.columns(2) + with col_a: + em_host = st.text_input("IMAP Host", em_cfg.get("host", "imap.gmail.com"), key="em_host") + em_port = st.number_input("Port", value=int(em_cfg.get("port", 993)), + min_value=1, max_value=65535, key="em_port") + em_ssl = st.checkbox("Use SSL", value=em_cfg.get("use_ssl", True), key="em_ssl") + with col_b: + em_user = st.text_input("Username (email address)", em_cfg.get("username", ""), key="em_user") + em_pass = st.text_input("Password / App Password", em_cfg.get("password", ""), + type="password", key="em_pass") + em_sent = st.text_input("Sent folder (blank = auto-detect)", + em_cfg.get("sent_folder", ""), key="em_sent", + placeholder='e.g. "[Gmail]/Sent Mail"') + + em_days = st.slider("Look-back window (days)", 14, 365, + int(em_cfg.get("lookback_days", 90)), key="em_days") + + st.caption( + "**Gmail users:** create an App Password at " + "myaccount.google.com/apppasswords (requires 2-Step Verification). " + "Enable IMAP at Gmail Settings β†’ Forwarding and POP/IMAP." 
+ ) + + col_save, col_test = st.columns(2) + + if col_save.button("πŸ’Ύ Save email settings", type="primary", key="em_save"): + save_yaml(EMAIL_CFG, { + "host": em_host, "port": int(em_port), "use_ssl": em_ssl, + "username": em_user, "password": em_pass, + "sent_folder": em_sent, "lookback_days": int(em_days), + }) + EMAIL_CFG.chmod(0o600) + st.success("Saved!") + + if col_test.button("πŸ”Œ Test connection", key="em_test"): + with st.spinner("Connecting…"): + try: + import imaplib as _imap + _conn = (_imap.IMAP4_SSL if em_ssl else _imap.IMAP4)(em_host, int(em_port)) + _conn.login(em_user, em_pass) + _, _caps = _conn.capability() + _conn.logout() + st.success(f"Connected successfully to {em_host}") + except Exception as e: + st.error(f"Connection failed: {e}") + +# ── Skills & Keywords tab ───────────────────────────────────────────────────── +with tab_skills: + st.subheader("🏷️ Skills & Keywords") + st.caption( + "These are matched against job descriptions to select Alex's most relevant " + "experience and highlight keyword overlap in the research brief." 
+ ) + + if not KEYWORDS_CFG.exists(): + st.warning("resume_keywords.yaml not found β€” create it at config/resume_keywords.yaml") + else: + kw_data = load_yaml(KEYWORDS_CFG) + + changed = False + for category in ["skills", "domains", "keywords"]: + st.markdown(f"**{category.title()}**") + tags: list[str] = kw_data.get(category, []) + + if not tags: + st.caption("No tags yet β€” add one below.") + + # Render existing tags as removable chips (value-based keys for stability) + n_cols = min(max(len(tags), 1), 6) + cols = st.columns(n_cols) + to_remove = None + for i, tag in enumerate(tags): + with cols[i % n_cols]: + if st.button(f"Γ— {tag}", key=f"rm_{category}_{tag}", use_container_width=True): + to_remove = tag + if to_remove: + tags.remove(to_remove) + kw_data[category] = tags + changed = True + + # Add new tag + new_col, btn_col = st.columns([4, 1]) + new_tag = new_col.text_input( + "Add", + key=f"new_{category}", + label_visibility="collapsed", + placeholder=f"Add {category[:-1] if category.endswith('s') else category}…", + ) + if btn_col.button("οΌ‹ Add", key=f"add_{category}"): + tag = new_tag.strip() + if tag and tag not in tags: + tags.append(tag) + kw_data[category] = tags + changed = True + + st.markdown("---") + + if changed: + save_yaml(KEYWORDS_CFG, kw_data) + st.success("Saved.") + st.rerun() diff --git a/app/pages/3_Resume_Editor.py b/app/pages/3_Resume_Editor.py new file mode 100644 index 0000000..092c2a3 --- /dev/null +++ b/app/pages/3_Resume_Editor.py @@ -0,0 +1,191 @@ +# app/pages/3_Resume_Editor.py +""" +Resume Editor β€” form-based editor for Alex's AIHawk profile YAML. +FILL_IN fields highlighted in amber. 
+""" +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import streamlit as st +import yaml + +st.set_page_config(page_title="Resume Editor", page_icon="πŸ“", layout="wide") +st.title("πŸ“ Resume Editor") +st.caption("Edit Alex's application profile used by AIHawk for LinkedIn Easy Apply.") + +RESUME_PATH = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" + +if not RESUME_PATH.exists(): + st.error(f"Resume file not found at `{RESUME_PATH}`. Is AIHawk cloned?") + st.stop() + +data = yaml.safe_load(RESUME_PATH.read_text()) or {} + + +def field(label: str, value: str, key: str, help: str = "", password: bool = False) -> str: + """Render a text input, highlighted amber if value is FILL_IN or empty.""" + needs_attention = str(value).startswith("FILL_IN") or value == "" + if needs_attention: + st.markdown( + '

⚠️ Needs your attention

', + unsafe_allow_html=True, + ) + return st.text_input(label, value=value or "", key=key, help=help, + type="password" if password else "default") + + +st.divider() + +# ── Personal Info ───────────────────────────────────────────────────────────── +with st.expander("πŸ‘€ Personal Information", expanded=True): + info = data.get("personal_information", {}) + col1, col2 = st.columns(2) + with col1: + name = field("First Name", info.get("name", ""), "pi_name") + email = field("Email", info.get("email", ""), "pi_email") + phone = field("Phone", info.get("phone", ""), "pi_phone") + city = field("City", info.get("city", ""), "pi_city") + with col2: + surname = field("Last Name", info.get("surname", ""), "pi_surname") + linkedin = field("LinkedIn URL", info.get("linkedin", ""), "pi_linkedin") + zip_code = field("Zip Code", info.get("zip_code", ""), "pi_zip") + dob = field("Date of Birth", info.get("date_of_birth", ""), "pi_dob", + help="Format: MM/DD/YYYY") + +# ── Education ───────────────────────────────────────────────────────────────── +with st.expander("πŸŽ“ Education"): + edu_list = data.get("education_details", [{}]) + updated_edu = [] + degree_options = ["Bachelor's Degree", "Master's Degree", "Some College", + "Associate's Degree", "High School", "Other"] + for i, edu in enumerate(edu_list): + st.markdown(f"**Entry {i+1}**") + col1, col2 = st.columns(2) + with col1: + inst = field("Institution", edu.get("institution", ""), f"edu_inst_{i}") + field_study = st.text_input("Field of Study", edu.get("field_of_study", ""), key=f"edu_field_{i}") + start = st.text_input("Start Year", edu.get("start_date", ""), key=f"edu_start_{i}") + with col2: + current_level = edu.get("education_level", "Some College") + level_idx = degree_options.index(current_level) if current_level in degree_options else 2 + level = st.selectbox("Degree Level", degree_options, index=level_idx, key=f"edu_level_{i}") + end = st.text_input("Completion Year", edu.get("year_of_completion", ""), 
key=f"edu_end_{i}") + updated_edu.append({ + "education_level": level, "institution": inst, "field_of_study": field_study, + "start_date": start, "year_of_completion": end, "final_evaluation_grade": "", "exam": {}, + }) + st.divider() + +# ── Experience ──────────────────────────────────────────────────────────────── +with st.expander("πŸ’Ό Work Experience"): + exp_list = data.get("experience_details", [{}]) + if "exp_count" not in st.session_state: + st.session_state.exp_count = len(exp_list) + if st.button("+ Add Experience Entry"): + st.session_state.exp_count += 1 + exp_list.append({}) + + updated_exp = [] + for i in range(st.session_state.exp_count): + exp = exp_list[i] if i < len(exp_list) else {} + st.markdown(f"**Position {i+1}**") + col1, col2 = st.columns(2) + with col1: + pos = field("Job Title", exp.get("position", ""), f"exp_pos_{i}") + company = field("Company", exp.get("company", ""), f"exp_co_{i}") + period = field("Employment Period", exp.get("employment_period", ""), f"exp_period_{i}", + help="e.g. 
01/2022 - Present")
+        with col2:
+            location = st.text_input("Location", exp.get("location", ""), key=f"exp_loc_{i}")
+            industry = st.text_input("Industry", exp.get("industry", ""), key=f"exp_ind_{i}")
+
+        responsibilities = st.text_area(
+            "Key Responsibilities (one per line)",
+            value="\n".join(
+                next(iter(r.values()), "") if isinstance(r, dict) else str(r)  # key names vary (save below always writes "responsibility_1"); take the value, matching 4_Apply.py's flattening
+                for r in exp.get("key_responsibilities", [])
+            ),
+            key=f"exp_resp_{i}", height=100,
+        )
+        skills = st.text_input(
+            "Skills (comma-separated)",
+            value=", ".join(exp.get("skills_acquired", [])),
+            key=f"exp_skills_{i}",
+        )
+        resp_list = [{"responsibility_1": r.strip()} for r in responsibilities.splitlines() if r.strip()]
+        skill_list = [s.strip() for s in skills.split(",") if s.strip()]
+        updated_exp.append({
+            "position": pos, "company": company, "employment_period": period,
+            "location": location, "industry": industry,
+            "key_responsibilities": resp_list, "skills_acquired": skill_list,
+        })
+        st.divider()
+
+# ── Preferences ───────────────────────────────────────────────────────────────
+with st.expander("βš™οΈ Preferences & Availability"):
+    wp = data.get("work_preferences", {})
+    sal = data.get("salary_expectations", {})
+    avail = data.get("availability", {})
+    col1, col2 = st.columns(2)
+    with col1:
+        salary_range = st.text_input("Salary Range (USD)", sal.get("salary_range_usd", ""),
+                                     key="pref_salary", help="e.g. 
120000 - 180000")
+        notice = st.text_input("Notice Period", avail.get("notice_period", "2 weeks"), key="pref_notice")
+    with col2:
+        remote_work = st.checkbox("Open to Remote", value=wp.get("remote_work", "Yes") == "Yes", key="pref_remote")
+        relocation = st.checkbox("Open to Relocation", value=wp.get("open_to_relocation", "No") == "Yes", key="pref_reloc")
+        assessments = st.checkbox("Willing to complete assessments",
+                                  value=wp.get("willing_to_complete_assessments", "Yes") == "Yes", key="pref_assess")
+        bg_checks = st.checkbox("Willing to undergo background checks",
+                                value=wp.get("willing_to_undergo_background_checks", "Yes") == "Yes", key="pref_bg")
+        drug_tests = st.checkbox("Willing to undergo drug tests",
+                                 value=wp.get("willing_to_undergo_drug_tests", "No") == "Yes", key="pref_drug")
+
+# ── Self-ID ─────────────────────────────────────────────────────────────────
+with st.expander("πŸ³οΈβ€πŸŒˆ Self-Identification (optional)"):
+    sid = data.get("self_identification", {})
+    col1, col2 = st.columns(2)
+    with col1:
+        gender = st.text_input("Gender identity", sid.get("gender", "Non-binary"), key="sid_gender",
+                               help="Select 'Non-binary' or 'Prefer not to say' when options allow")
+        pronouns = st.text_input("Pronouns", sid.get("pronouns", "Any"), key="sid_pronouns")
+        ethnicity = field("Ethnicity", sid.get("ethnicity", ""), "sid_ethnicity",
+                          help="'Prefer not to say' is always an option")
+    with col2:
+        vet_options = ["No", "Yes", "Prefer not to say"]
+        veteran = st.selectbox("Veteran status", vet_options,
+                               index=vet_options.index(sid.get("veteran", "No")) if sid.get("veteran", "No") in vet_options else 0, key="sid_vet")  # guard: raw YAML may hold any string — same pattern as the Education level guard
+        dis_options = ["Prefer not to say", "No", "Yes"]
+        disability = st.selectbox("Disability disclosure", dis_options,
+                                  index=dis_options.index(sid.get("disability", "Prefer not to say")) if sid.get("disability", "Prefer not to say") in dis_options else 0,
+                                  key="sid_dis")
+
+st.divider()
+
+# ── Save ────────────────────────────────────────────────────────────────────
+if st.button("πŸ’Ύ Save Resume Profile", type="primary", use_container_width=True):
+    
data["personal_information"] = { + **data.get("personal_information", {}), + "name": name, "surname": surname, "email": email, "phone": phone, + "city": city, "zip_code": zip_code, "linkedin": linkedin, "date_of_birth": dob, + } + data["education_details"] = updated_edu + data["experience_details"] = updated_exp + data["salary_expectations"] = {"salary_range_usd": salary_range} + data["availability"] = {"notice_period": notice} + data["work_preferences"] = { + **data.get("work_preferences", {}), + "remote_work": "Yes" if remote_work else "No", + "open_to_relocation": "Yes" if relocation else "No", + "willing_to_complete_assessments": "Yes" if assessments else "No", + "willing_to_undergo_background_checks": "Yes" if bg_checks else "No", + "willing_to_undergo_drug_tests": "Yes" if drug_tests else "No", + } + data["self_identification"] = { + "gender": gender, "pronouns": pronouns, "veteran": veteran, + "disability": disability, "ethnicity": ethnicity, + } + RESUME_PATH.write_text(yaml.dump(data, default_flow_style=False, allow_unicode=True)) + st.success("βœ… Profile saved!") + st.balloons() diff --git a/app/pages/4_Apply.py b/app/pages/4_Apply.py new file mode 100644 index 0000000..123f1f4 --- /dev/null +++ b/app/pages/4_Apply.py @@ -0,0 +1,388 @@ +# app/pages/4_Apply.py +""" +Apply Workspace β€” side-by-side cover letter tools and job description. +Generates a PDF cover letter saved to the JobSearch docs folder. 
+""" +import re +import sys +from datetime import datetime +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import streamlit as st +import streamlit.components.v1 as components +import yaml + +from scripts.db import ( + DEFAULT_DB, init_db, get_jobs_by_status, + update_cover_letter, mark_applied, update_job_status, + get_task_for_job, +) +from scripts.task_runner import submit_task + +DOCS_DIR = Path("/Library/Documents/JobSearch") +RESUME_YAML = Path(__file__).parent.parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml" + +st.title("πŸš€ Apply Workspace") + +init_db(DEFAULT_DB) + +# ── PDF generation ───────────────────────────────────────────────────────────── +def _make_cover_letter_pdf(job: dict, cover_letter: str, output_dir: Path) -> Path: + from reportlab.lib.pagesizes import letter + from reportlab.lib.units import inch + from reportlab.lib.colors import HexColor + from reportlab.lib.styles import ParagraphStyle + from reportlab.lib.enums import TA_LEFT + from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, HRFlowable + + output_dir.mkdir(parents=True, exist_ok=True) + company_safe = re.sub(r"[^a-zA-Z0-9]", "", job.get("company", "Company")) + date_str = datetime.now().strftime("%Y-%m-%d") + out_path = output_dir / f"CoverLetter_{company_safe}_{date_str}.pdf" + + doc = SimpleDocTemplate( + str(out_path), + pagesize=letter, + leftMargin=inch, rightMargin=inch, + topMargin=inch, bottomMargin=inch, + ) + + teal = HexColor("#2DD4BF") + dark = HexColor("#0F172A") + slate = HexColor("#64748B") + + name_style = ParagraphStyle( + "Name", fontName="Helvetica-Bold", fontSize=22, + textColor=teal, spaceAfter=6, + ) + contact_style = ParagraphStyle( + "Contact", fontName="Helvetica", fontSize=9, + textColor=slate, spaceAfter=4, + ) + date_style = ParagraphStyle( + "Date", fontName="Helvetica", fontSize=11, + textColor=dark, spaceBefore=16, spaceAfter=14, + ) + body_style = ParagraphStyle( + 
"Body", fontName="Helvetica", fontSize=11, + textColor=dark, leading=16, spaceAfter=12, alignment=TA_LEFT, + ) + + story = [ + Paragraph("ALEX RIVERA", name_style), + Paragraph( + "alex@example.com Β· (555) 867-5309 Β· " + "linkedin.com/in/AlexMcCann Β· hirealexmccann.site", + contact_style, + ), + HRFlowable(width="100%", thickness=1, color=teal, spaceBefore=8, spaceAfter=0), + Paragraph(datetime.now().strftime("%B %d, %Y"), date_style), + ] + + for para in cover_letter.strip().split("\n\n"): + para = para.strip() + if para: + story.append(Paragraph(para.replace("\n", "
"), body_style)) + + story += [ + Spacer(1, 6), + Paragraph("Warm regards,

Alex Rivera", body_style), + ] + + doc.build(story) + return out_path + +# ── Application Q&A helper ───────────────────────────────────────────────────── +def _answer_question(job: dict, question: str) -> str: + """Call the LLM to answer an application question in Alex's voice. + + Uses research_fallback_order (claude_code β†’ vllm β†’ ollama_research) + rather than the default cover-letter order β€” the fine-tuned cover letter + model is not suited for answering general application questions. + """ + from scripts.llm_router import LLMRouter + router = LLMRouter() + fallback = router.config.get("research_fallback_order") or router.config.get("fallback_order") + description_snippet = (job.get("description") or "")[:1200].strip() + prompt = f"""You are answering job application questions for Alex Rivera, a customer success leader. + +Background: +- 6+ years in customer success, technical account management, and CS leadership +- Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), NPS consistently β‰₯95 +- Also founder of M3 Consulting, a CS advisory practice for SaaS startups +- Based in SF Bay Area; open to remote/hybrid; pronouns: any + +Role she's applying to: {job.get("title", "")} at {job.get("company", "")} +{f"Job description excerpt:{chr(10)}{description_snippet}" if description_snippet else ""} + +Application Question: +{question} + +Answer in Alex's voice β€” specific, warm, and confident. If the question specifies a word or character limit, respect it. Answer only the question with no preamble or sign-off.""" + return router.complete(prompt, fallback_order=fallback).strip() + + +# ── Copy-to-clipboard button ─────────────────────────────────────────────────── +def _copy_btn(text: str, label: str = "πŸ“‹ Copy", done: str = "βœ… Copied!", height: int = 44) -> None: + import json + # Each components.html call renders in its own sandboxed iframe, so a fixed + # element id is fine. 
json.dumps handles all special chars (quotes, newlines, + # backslashes, etc.) β€” avoids the fragile inline-onclick escaping approach. + components.html( + f""" + """, + height=height, + ) + +# ── Job selection ────────────────────────────────────────────────────────────── +approved = get_jobs_by_status(DEFAULT_DB, "approved") +if not approved: + st.info("No approved jobs β€” head to Job Review to approve some listings first.") + st.stop() + +preselect_id = st.session_state.pop("apply_job_id", None) +job_options = {j["id"]: f"{j['title']} β€” {j['company']}" for j in approved} +ids = list(job_options.keys()) +default_idx = ids.index(preselect_id) if preselect_id in ids else 0 + +selected_id = st.selectbox( + "Job", + options=ids, + format_func=lambda x: job_options[x], + index=default_idx, + label_visibility="collapsed", +) +job = next(j for j in approved if j["id"] == selected_id) + +st.divider() + +# ── Two-column workspace ─────────────────────────────────────────────────────── +col_tools, col_jd = st.columns([2, 3]) + +# ════════════════════════════════════════════════ +# RIGHT β€” job description +# ════════════════════════════════════════════════ +with col_jd: + score = job.get("match_score") + score_badge = ( + "⬜ No score" if score is None else + f"🟒 {score:.0f}%" if score >= 70 else + f"🟑 {score:.0f}%" if score >= 40 else f"πŸ”΄ {score:.0f}%" + ) + remote_badge = "🌐 Remote" if job.get("is_remote") else "🏒 On-site" + src = (job.get("source") or "").lower() + source_badge = f"πŸ€– {src.title()}" if src == "linkedin" else f"πŸ‘€ {src.title() or 'Manual'}" + + st.subheader(job["title"]) + st.caption( + f"**{job['company']}** Β· {job.get('location', '')} Β· " + f"{remote_badge} Β· {source_badge} Β· {score_badge}" + ) + if job.get("salary"): + st.caption(f"πŸ’° {job['salary']}") + if job.get("keyword_gaps"): + st.caption(f"**Gaps to address in letter:** {job['keyword_gaps']}") + + st.divider() + st.markdown(job.get("description") or "_No description scraped for 
this listing._") + +# ════════════════════════════════════════════════ +# LEFT β€” copy tools +# ════════════════════════════════════════════════ +with col_tools: + + # ── Cover letter ────────────────────────────── + st.subheader("πŸ“ Cover Letter") + + _cl_key = f"cl_{selected_id}" + if _cl_key not in st.session_state: + st.session_state[_cl_key] = job.get("cover_letter") or "" + + _cl_task = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id) + _cl_running = _cl_task and _cl_task["status"] in ("queued", "running") + + if st.button("✨ Generate / Regenerate", use_container_width=True, disabled=bool(_cl_running)): + submit_task(DEFAULT_DB, "cover_letter", selected_id) + st.rerun() + + if _cl_running: + @st.fragment(run_every=3) + def _cl_status_fragment(): + t = get_task_for_job(DEFAULT_DB, "cover_letter", selected_id) + if t and t["status"] in ("queued", "running"): + lbl = "Queued…" if t["status"] == "queued" else "Generating via LLM…" + st.info(f"⏳ {lbl}") + else: + st.rerun() # full page rerun β€” reloads cover letter from DB + _cl_status_fragment() + elif _cl_task and _cl_task["status"] == "failed": + st.error(f"Generation failed: {_cl_task.get('error', 'unknown error')}") + + # Refresh session state only when a NEW task has just completed β€” not on every rerun. + # Without this guard, every Save Draft click would overwrite the edited text with the + # old DB value before cl_text could be captured. 
+ _cl_loaded_key = f"cl_loaded_{selected_id}" + if not _cl_running and _cl_task and _cl_task["status"] == "completed": + if st.session_state.get(_cl_loaded_key) != _cl_task["id"]: + st.session_state[_cl_key] = job.get("cover_letter") or "" + st.session_state[_cl_loaded_key] = _cl_task["id"] + + cl_text = st.text_area( + "cover_letter_body", + key=_cl_key, + height=280, + label_visibility="collapsed", + ) + + # Copy + Save row + c1, c2 = st.columns(2) + with c1: + if cl_text: + _copy_btn(cl_text, label="πŸ“‹ Copy Letter") + with c2: + if st.button("πŸ’Ύ Save draft", use_container_width=True): + update_cover_letter(DEFAULT_DB, selected_id, cl_text) + st.success("Saved!") + + # PDF generation + if cl_text: + if st.button("πŸ“„ Export PDF β†’ JobSearch folder", use_container_width=True, type="primary"): + with st.spinner("Generating PDF…"): + try: + pdf_path = _make_cover_letter_pdf(job, cl_text, DOCS_DIR) + update_cover_letter(DEFAULT_DB, selected_id, cl_text) + st.success(f"Saved: `{pdf_path.name}`") + except Exception as e: + st.error(f"PDF error: {e}") + + st.divider() + + # Open listing + Mark Applied + c3, c4 = st.columns(2) + with c3: + if job.get("url"): + st.link_button("Open listing β†—", job["url"], use_container_width=True) + with c4: + if st.button("βœ… Mark as Applied", use_container_width=True, type="primary"): + if cl_text: + update_cover_letter(DEFAULT_DB, selected_id, cl_text) + mark_applied(DEFAULT_DB, [selected_id]) + st.success("Marked as applied!") + st.rerun() + + if st.button("🚫 Reject listing", use_container_width=True): + update_job_status(DEFAULT_DB, [selected_id], "rejected") + # Advance selectbox to next job so list doesn't snap to first item + current_idx = ids.index(selected_id) if selected_id in ids else 0 + if current_idx + 1 < len(ids): + st.session_state["apply_job_id"] = ids[current_idx + 1] + st.rerun() + + st.divider() + + # ── Resume highlights ───────────────────────── + with st.expander("πŸ“„ Resume Highlights"): + if 
RESUME_YAML.exists(): + resume = yaml.safe_load(RESUME_YAML.read_text()) or {} + for exp in resume.get("experience_details", []): + position = exp.get("position", "") + company = exp.get("company", "") + period = exp.get("employment_period", "") + + # Parse start / end dates (handles "MM/YYYY - Present" style) + if " - " in period: + date_start, date_end = [p.strip() for p in period.split(" - ", 1)] + else: + date_start, date_end = period, "" + + # Flatten bullets + bullets = [ + v + for resp_dict in exp.get("key_responsibilities", []) + for v in resp_dict.values() + ] + all_duties = "\n".join(f"β€’ {b}" for b in bullets) + + # ── Header ──────────────────────────────────────────────────── + st.markdown( + f"**{position}**  Β·  " + f"{company}  Β·  " + f"*{period}*" + ) + + # ── Copy row: title | start | end | all duties ──────────────── + cp_t, cp_s, cp_e, cp_d = st.columns(4) + with cp_t: + st.caption("Title") + _copy_btn(position, label="πŸ“‹ Copy", height=34) + with cp_s: + st.caption("Start") + _copy_btn(date_start, label="πŸ“‹ Copy", height=34) + with cp_e: + st.caption("End") + _copy_btn(date_end or period, label="πŸ“‹ Copy", height=34) + with cp_d: + st.caption("All Duties") + if bullets: + _copy_btn(all_duties, label="πŸ“‹ Copy", height=34) + + # ── Individual bullets ──────────────────────────────────────── + for bullet in bullets: + b_col, cp_col = st.columns([6, 1]) + b_col.caption(f"β€’ {bullet}") + with cp_col: + _copy_btn(bullet, label="πŸ“‹", done="βœ…", height=32) + + st.markdown("---") + else: + st.warning("Resume YAML not found β€” check that AIHawk is cloned.") + + # ── Application Q&A ─────────────────────────────────────────────────────── + with st.expander("πŸ’¬ Answer Application Questions"): + st.caption("Paste a question from the application and get an answer in your voice.") + + _qa_key = f"qa_list_{selected_id}" + if _qa_key not in st.session_state: + st.session_state[_qa_key] = [] + + q_input = st.text_area( + "Paste question", + 
placeholder="In 200 words or less, explain why you're a strong fit for this role.", + height=80, + key=f"qa_input_{selected_id}", + label_visibility="collapsed", + ) + if st.button("✨ Generate Answer", key=f"qa_gen_{selected_id}", + use_container_width=True, + disabled=not (q_input or "").strip()): + with st.spinner("Generating answer…"): + _answer = _answer_question(job, q_input.strip()) + st.session_state[_qa_key].append({"q": q_input.strip(), "a": _answer}) + st.rerun() + + for _i, _pair in enumerate(reversed(st.session_state[_qa_key])): + _real_idx = len(st.session_state[_qa_key]) - 1 - _i + st.markdown(f"**Q:** {_pair['q']}") + _a_key = f"qa_ans_{selected_id}_{_real_idx}" + if _a_key not in st.session_state: + st.session_state[_a_key] = _pair["a"] + _answer_text = st.text_area( + "answer", + key=_a_key, + height=120, + label_visibility="collapsed", + ) + _copy_btn(_answer_text, label="πŸ“‹ Copy Answer") + if _i < len(st.session_state[_qa_key]) - 1: + st.markdown("---") diff --git a/app/pages/5_Interviews.py b/app/pages/5_Interviews.py new file mode 100644 index 0000000..7d624e3 --- /dev/null +++ b/app/pages/5_Interviews.py @@ -0,0 +1,539 @@ +# app/pages/5_Interviews.py +""" +Interviews β€” Kanban board for tracking post-application engagement. 
+ +Pipeline: applied β†’ phone_screen β†’ interviewing β†’ offer β†’ hired + (or rejected at any stage, with stage captured for analytics) + +Features: + - Kanban columns for each interview stage + - Company research brief auto-generated when advancing to Phone Screen + - Contact / email log per job + - Email reply drafter via LLM + - Interview date tracking with calendar push hint + - Rejection analytics +""" +import sys +from collections import Counter +from datetime import date, datetime +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import streamlit as st + +from scripts.db import ( + DEFAULT_DB, init_db, + get_interview_jobs, advance_to_stage, reject_at_stage, + set_interview_date, add_contact, get_contacts, + get_research, get_task_for_job, get_job_by_id, + get_unread_stage_signals, dismiss_stage_signal, +) +from scripts.task_runner import submit_task + +st.title("🎯 Interviews") + +init_db(DEFAULT_DB) + +# ── Sidebar: Email sync ──────────────────────────────────────────────────────── +with st.sidebar: + st.markdown("### πŸ“§ Email Sync") + _email_task = get_task_for_job(DEFAULT_DB, "email_sync", 0) + _email_running = _email_task and _email_task["status"] in ("queued", "running") + + if st.button("πŸ”„ Sync Emails", use_container_width=True, type="primary", + disabled=bool(_email_running)): + submit_task(DEFAULT_DB, "email_sync", 0) + st.rerun() + + if _email_running: + @st.fragment(run_every=4) + def _email_sidebar_status(): + t = get_task_for_job(DEFAULT_DB, "email_sync", 0) + if t and t["status"] in ("queued", "running"): + st.info("⏳ Syncing…") + else: + st.rerun() + _email_sidebar_status() + elif _email_task and _email_task["status"] == "completed": + st.success(_email_task.get("error", "Done")) + elif _email_task and _email_task["status"] == "failed": + msg = _email_task.get("error", "") + if "not configured" in msg.lower(): + st.error("Email not configured. 
Go to **Settings β†’ Email**.") + else: + st.error(f"Sync failed: {msg}") + +# ── Constants ───────────────────────────────────────────────────────────────── +STAGE_LABELS = { + "phone_screen": "πŸ“ž Phone Screen", + "interviewing": "🎯 Interviewing", + "offer": "πŸ“œ Offer / Hired", +} +STAGE_NEXT = { + "survey": "phone_screen", + "applied": "phone_screen", + "phone_screen": "interviewing", + "interviewing": "offer", + "offer": "hired", +} +STAGE_NEXT_LABEL = { + "survey": "πŸ“ž Phone Screen", + "applied": "πŸ“ž Phone Screen", + "phone_screen": "🎯 Interviewing", + "interviewing": "πŸ“œ Offer", + "offer": "πŸŽ‰ Hired", +} + +# ── Data ────────────────────────────────────────────────────────────────────── +jobs_by_stage = get_interview_jobs(DEFAULT_DB) + +# ── Helpers ─────────────────────────────────────────────────────────────────── +def _days_ago(date_str: str | None) -> str: + if not date_str: + return "β€”" + try: + d = date.fromisoformat(date_str[:10]) + delta = (date.today() - d).days + if delta == 0: + return "today" + if delta == 1: + return "yesterday" + return f"{delta}d ago" + except Exception: + return date_str[:10] + +@st.dialog("πŸ”¬ Company Research", width="large") +def _research_modal(job: dict) -> None: + job_id = job["id"] + st.caption(f"**{job.get('company')}** β€” {job.get('title')}") + research = get_research(DEFAULT_DB, job_id=job_id) + task = get_task_for_job(DEFAULT_DB, "company_research", job_id) + running = task and task["status"] in ("queued", "running") + + if running: + task_stage = (task.get("stage") or "") + lbl = "Queued…" if task["status"] == "queued" else (task_stage or "Generating…") + st.info(f"⏳ {lbl}") + elif research: + scrape_used = research.get("scrape_used") + if not scrape_used: + import socket as _sock + _searxng_up = False + try: + with _sock.create_connection(("127.0.0.1", 8888), timeout=1): + _searxng_up = True + except OSError: + pass + if _searxng_up: + st.warning( + "⚠️ This brief was generated without live web data 
and may contain " + "inaccuracies. SearXNG is now available β€” re-run to get verified facts." + ) + if st.button("πŸ”„ Re-run with live data", key=f"modal_rescrape_{job_id}", type="primary"): + submit_task(DEFAULT_DB, "company_research", job_id) + st.rerun() + st.divider() + else: + st.warning( + "⚠️ Generated without live web data (SearXNG was offline). " + "Key facts like CEO, investors, and founding date may be hallucinated β€” " + "verify before the call. Start SearXNG in Settings β†’ Services to re-run." + ) + st.divider() + st.caption( + f"Generated {research.get('generated_at', '')} " + f"{'Β· web data used βœ“' if scrape_used else 'Β· LLM knowledge only'}" + ) + st.markdown(research["raw_output"]) + if st.button("πŸ”„ Refresh", key=f"modal_regen_{job_id}", disabled=bool(running)): + submit_task(DEFAULT_DB, "company_research", job_id) + st.rerun() + else: + st.info("No research brief yet.") + if task and task["status"] == "failed": + st.error(f"Last attempt failed: {task.get('error', '')}") + if st.button("πŸ”¬ Generate now", key=f"modal_gen_{job_id}"): + submit_task(DEFAULT_DB, "company_research", job_id) + st.rerun() + + +@st.dialog("πŸ“§ Email History", width="large") +def _email_modal(job: dict) -> None: + job_id = job["id"] + st.caption(f"**{job.get('company')}** β€” {job.get('title')}") + contacts = get_contacts(DEFAULT_DB, job_id=job_id) + + if not contacts: + st.info("No emails logged yet. 
Use the form below to add one.") + else: + for c in contacts: + icon = "πŸ“₯" if c["direction"] == "inbound" else "πŸ“€" + st.markdown( + f"{icon} **{c.get('subject') or '(no subject)'}** " + f"Β· _{c.get('received_at', '')[:10]}_" + ) + if c.get("from_addr"): + st.caption(f"From: {c['from_addr']}") + if c.get("body"): + st.text(c["body"][:500] + ("…" if len(c["body"]) > 500 else "")) + st.divider() + + inbound = [c for c in contacts if c["direction"] == "inbound"] + if inbound: + last = inbound[-1] + if st.button("✍️ Draft reply", key=f"modal_draft_{job_id}"): + with st.spinner("Drafting…"): + try: + from scripts.llm_router import complete + draft = complete( + prompt=( + f"Draft a professional, warm reply to this email.\n\n" + f"From: {last.get('from_addr', '')}\n" + f"Subject: {last.get('subject', '')}\n\n" + f"{last.get('body', '')}\n\n" + f"Context: Alex Rivera is a Customer Success / " + f"Technical Account Manager applying for " + f"{job.get('title')} at {job.get('company')}." + ), + system=( + "You are Alex Rivera's professional email assistant. " + "Write concise, warm, and professional replies in her voice. " + "Keep it to 3–5 sentences unless more is needed." 
+ ), + ) + st.session_state[f"modal_draft_text_{job_id}"] = draft + st.rerun() + except Exception as e: + st.error(f"Draft failed: {e}") + + if f"modal_draft_text_{job_id}" in st.session_state: + st.text_area( + "Draft (edit before sending)", + value=st.session_state[f"modal_draft_text_{job_id}"], + height=160, + key=f"modal_draft_area_{job_id}", + ) + + st.divider() + st.markdown("**Log a contact**") + with st.form(key=f"contact_form_modal_{job_id}", clear_on_submit=True): + col_a, col_b = st.columns(2) + direction = col_a.radio( + "Direction", ["inbound", "outbound"], + horizontal=True, key=f"dir_modal_{job_id}", + ) + recv_at = col_b.text_input( + "Date (YYYY-MM-DD)", value=str(date.today()), key=f"recv_modal_{job_id}" + ) + subject = st.text_input("Subject", key=f"subj_modal_{job_id}") + from_addr = st.text_input("From", key=f"from_modal_{job_id}") + body_text = st.text_area("Body / notes", height=80, key=f"body_modal_{job_id}") + if st.form_submit_button("πŸ“§ Save contact"): + add_contact( + DEFAULT_DB, job_id=job_id, + direction=direction, subject=subject, + from_addr=from_addr, body=body_text, received_at=recv_at, + ) + st.rerun() + +def _render_card(job: dict, stage: str, compact: bool = False) -> None: + """Render a single job card appropriate for the given stage.""" + job_id = job["id"] + contacts = get_contacts(DEFAULT_DB, job_id=job_id) + last_contact = contacts[-1] if contacts else None + + with st.container(border=True): + st.markdown(f"**{job.get('company', '?')}**") + st.caption(job.get("title", "")) + + col_a, col_b = st.columns(2) + col_a.caption(f"Applied: {_days_ago(job.get('applied_at'))}") + if last_contact: + col_b.caption(f"Last contact: {_days_ago(last_contact.get('received_at'))}") + + # Interview date picker (phone_screen / interviewing stages) + if stage in ("phone_screen", "interviewing"): + current_idate = job.get("interview_date") or "" + with st.form(key=f"idate_form_{job_id}"): + new_date = st.date_input( + "Interview date", + 
value=date.fromisoformat(current_idate) if current_idate else None, + key=f"idate_{job_id}", + format="YYYY-MM-DD", + ) + if st.form_submit_button("πŸ“… Save date"): + set_interview_date(DEFAULT_DB, job_id=job_id, date_str=str(new_date)) + st.success("Saved!") + st.rerun() + + if not compact: + if stage in ("applied", "phone_screen", "interviewing"): + signals = get_unread_stage_signals(DEFAULT_DB, job_id=job_id) + if signals: + sig = signals[-1] + _SIGNAL_TO_STAGE = { + "interview_scheduled": ("phone_screen", "πŸ“ž Phone Screen"), + "positive_response": ("phone_screen", "πŸ“ž Phone Screen"), + "offer_received": ("offer", "πŸ“œ Offer"), + "survey_received": ("survey", "πŸ“‹ Survey"), + } + target_stage, target_label = _SIGNAL_TO_STAGE.get( + sig["stage_signal"], (None, None) + ) + with st.container(border=True): + st.caption( + f"πŸ’‘ Email suggests: **{sig['stage_signal'].replace('_', ' ')}** \n" + f"_{sig.get('subject', '')}_ Β· {(sig.get('received_at') or '')[:10]}" + ) + b1, b2 = st.columns(2) + if sig["stage_signal"] == "rejected": + if b1.button("βœ— Reject", key=f"sig_rej_{sig['id']}", + use_container_width=True): + reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage) + dismiss_stage_signal(DEFAULT_DB, sig["id"]) + st.rerun(scope="app") + elif target_stage and b1.button( + f"β†’ {target_label}", key=f"sig_adv_{sig['id']}", + use_container_width=True, type="primary", + ): + if target_stage == "phone_screen" and stage == "applied": + advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen") + submit_task(DEFAULT_DB, "company_research", job_id) + elif target_stage: + advance_to_stage(DEFAULT_DB, job_id=job_id, stage=target_stage) + dismiss_stage_signal(DEFAULT_DB, sig["id"]) + st.rerun(scope="app") + if b2.button("Dismiss", key=f"sig_dis_{sig['id']}", + use_container_width=True): + dismiss_stage_signal(DEFAULT_DB, sig["id"]) + st.rerun() + + # Advance / Reject buttons + next_stage = STAGE_NEXT.get(stage) + c1, c2 = st.columns(2) + if 
next_stage: + next_label = STAGE_NEXT_LABEL.get(stage, next_stage) + if c1.button( + f"β†’ {next_label}", key=f"adv_{job_id}", + use_container_width=True, type="primary", + ): + advance_to_stage(DEFAULT_DB, job_id=job_id, stage=next_stage) + if next_stage == "phone_screen": + submit_task(DEFAULT_DB, "company_research", job_id) + st.rerun(scope="app") # full rerun β€” card must appear in new column + + if c2.button( + "βœ— Reject", key=f"rej_{job_id}", + use_container_width=True, + ): + reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage) + st.rerun() # fragment-scope rerun β€” card disappears without scroll-to-top + + if job.get("url"): + st.link_button("Open listing β†—", job["url"], use_container_width=True) + + if stage in ("phone_screen", "interviewing", "offer"): + if st.button( + "πŸ“‹ Open Prep Sheet", key=f"prep_{job_id}", + use_container_width=True, + help="Open the Interview Prep page for this job", + ): + st.session_state["prep_job_id"] = job_id + st.switch_page("pages/6_Interview_Prep.py") + + # Detail modals β€” full-width overlays replace narrow inline expanders + if stage in ("phone_screen", "interviewing", "offer"): + mc1, mc2 = st.columns(2) + if mc1.button("πŸ”¬ Research", key=f"res_btn_{job_id}", use_container_width=True): + _research_modal(job) + if mc2.button("πŸ“§ Emails", key=f"email_btn_{job_id}", use_container_width=True): + _email_modal(job) + else: + if st.button("πŸ“§ Emails", key=f"email_btn_{job_id}", use_container_width=True): + _email_modal(job) + +# ── Fragment wrappers β€” keep scroll position on card actions ───────────────── +@st.fragment +def _card_fragment(job_id: int, stage: str) -> None: + """Re-fetches the job on each fragment rerun; renders nothing if moved/rejected.""" + job = get_job_by_id(DEFAULT_DB, job_id) + if job is None or job.get("status") != stage: + return + _render_card(job, stage) + + +@st.fragment +def _pre_kanban_row_fragment(job_id: int) -> None: + """Pre-kanban compact row for applied and 
survey-stage jobs.""" + job = get_job_by_id(DEFAULT_DB, job_id) + if job is None or job.get("status") not in ("applied", "survey"): + return + stage = job["status"] + contacts = get_contacts(DEFAULT_DB, job_id=job_id) + last_contact = contacts[-1] if contacts else None + + with st.container(border=True): + left, mid, right = st.columns([3, 2, 2]) + badge = " πŸ“‹ **Survey**" if stage == "survey" else "" + left.markdown(f"**{job.get('company')}** β€” {job.get('title', '')}{badge}") + left.caption(f"Applied: {_days_ago(job.get('applied_at'))}") + + with mid: + if last_contact: + st.caption(f"Last contact: {_days_ago(last_contact.get('received_at'))}") + if st.button("πŸ“§ Emails", key=f"email_pre_{job_id}", use_container_width=True): + _email_modal(job) + + # Stage signal hint (email-detected next steps) + signals = get_unread_stage_signals(DEFAULT_DB, job_id=job_id) + if signals: + sig = signals[-1] + _SIGNAL_TO_STAGE = { + "interview_scheduled": ("phone_screen", "πŸ“ž Phone Screen"), + "positive_response": ("phone_screen", "πŸ“ž Phone Screen"), + "offer_received": ("offer", "πŸ“œ Offer"), + "survey_received": ("survey", "πŸ“‹ Survey"), + } + target_stage, target_label = _SIGNAL_TO_STAGE.get( + sig["stage_signal"], (None, None) + ) + with st.container(border=True): + st.caption( + f"πŸ’‘ **{sig['stage_signal'].replace('_', ' ')}** \n" + f"_{sig.get('subject', '')}_ Β· {(sig.get('received_at') or '')[:10]}" + ) + s1, s2 = st.columns(2) + if target_stage and s1.button( + f"β†’ {target_label}", key=f"sig_adv_pre_{sig['id']}", + use_container_width=True, type="primary", + ): + if target_stage == "phone_screen": + advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen") + submit_task(DEFAULT_DB, "company_research", job_id) + else: + advance_to_stage(DEFAULT_DB, job_id=job_id, stage=target_stage) + dismiss_stage_signal(DEFAULT_DB, sig["id"]) + st.rerun(scope="app") + if s2.button("Dismiss", key=f"sig_dis_pre_{sig['id']}", + use_container_width=True): + 
dismiss_stage_signal(DEFAULT_DB, sig["id"]) + st.rerun() + + with right: + if st.button( + "β†’ πŸ“ž Phone Screen", key=f"adv_pre_{job_id}", + use_container_width=True, type="primary", + ): + advance_to_stage(DEFAULT_DB, job_id=job_id, stage="phone_screen") + submit_task(DEFAULT_DB, "company_research", job_id) + st.rerun(scope="app") + col_a, col_b = st.columns(2) + if stage == "applied" and col_a.button( + "πŸ“‹ Survey", key=f"to_survey_{job_id}", use_container_width=True, + ): + advance_to_stage(DEFAULT_DB, job_id=job_id, stage="survey") + st.rerun(scope="app") + if col_b.button("βœ— Reject", key=f"rej_pre_{job_id}", use_container_width=True): + reject_at_stage(DEFAULT_DB, job_id=job_id, rejection_stage=stage) + st.rerun() + + +@st.fragment +def _hired_card_fragment(job_id: int) -> None: + """Compact hired job card β€” shown in the Offer/Hired column.""" + job = get_job_by_id(DEFAULT_DB, job_id) + if job is None or job.get("status") != "hired": + return + with st.container(border=True): + st.markdown(f"βœ… **{job.get('company', '?')}**") + st.caption(job.get("title", "")) + st.caption(f"Hired {_days_ago(job.get('hired_at'))}") + + +# ── Stats bar ───────────────────────────────────────────────────────────────── +c1, c2, c3, c4, c5, c6 = st.columns(6) +c1.metric("Applied", len(jobs_by_stage.get("applied", []))) +c2.metric("Survey", len(jobs_by_stage.get("survey", []))) +c3.metric("Phone Screen", len(jobs_by_stage.get("phone_screen", []))) +c4.metric("Interviewing", len(jobs_by_stage.get("interviewing", []))) +c5.metric("Offer/Hired", len(jobs_by_stage.get("offer", [])) + len(jobs_by_stage.get("hired", []))) +c6.metric("Rejected", len(jobs_by_stage.get("rejected", []))) + +st.divider() + +# ── Pre-kanban: Applied + Survey ─────────────────────────────────────────────── +applied_jobs = jobs_by_stage.get("applied", []) +survey_jobs = jobs_by_stage.get("survey", []) +pre_kanban = survey_jobs + applied_jobs # survey shown first + +if pre_kanban: + st.subheader(f"πŸ“‹ 
Pre-pipeline ({len(pre_kanban)})") + st.caption( + "Move a job to **Phone Screen** once you receive an outreach. " + "A company research brief will be auto-generated to help you prepare." + ) + for job in pre_kanban: + _pre_kanban_row_fragment(job["id"]) + st.divider() + +# ── Kanban columns ───────────────────────────────────────────────────────────── +kanban_stages = ["phone_screen", "interviewing", "offer"] +cols = st.columns(len(kanban_stages)) + +for col, stage in zip(cols, kanban_stages): + with col: + stage_jobs = jobs_by_stage.get(stage, []) + hired_jobs = jobs_by_stage.get("hired", []) if stage == "offer" else [] + all_col_jobs = stage_jobs + hired_jobs + st.markdown(f"### {STAGE_LABELS[stage]}") + st.caption(f"{len(all_col_jobs)} job{'s' if len(all_col_jobs) != 1 else ''}") + st.divider() + + if not all_col_jobs: + st.caption("_Empty_") + else: + for job in stage_jobs: + _card_fragment(job["id"], stage) + for job in hired_jobs: + _hired_card_fragment(job["id"]) + +st.divider() + +# ── Rejected log + analytics ─────────────────────────────────────────────────── +rejected_jobs = jobs_by_stage.get("rejected", []) +if rejected_jobs: + with st.expander(f"❌ Rejected ({len(rejected_jobs)})", expanded=False): + # Stage breakdown + stage_counts = Counter( + j.get("rejection_stage") or "unknown" for j in rejected_jobs + ) + st.caption( + "Rejection by stage: " + + " Β· ".join(f"**{k}**: {v}" for k, v in stage_counts.most_common()) + ) + + # Rejection rate timeline (simple) + if len(rejected_jobs) > 1: + by_month: dict[str, int] = {} + for j in rejected_jobs: + mo = (j.get("applied_at") or "")[:7] + if mo: + by_month[mo] = by_month.get(mo, 0) + 1 + if by_month: + import pandas as pd + chart_data = pd.DataFrame( + list(by_month.items()), columns=["Month", "Rejections"] + ).sort_values("Month") + st.bar_chart(chart_data.set_index("Month")) + + st.divider() + for job in rejected_jobs: + r_stage = job.get("rejection_stage") or "unknown" + company = job.get("company") or 
"?" + title = job.get("title") or "" + applied = _days_ago(job.get("applied_at")) + st.markdown( + f"**{company}** β€” {title} " + f"Β· rejected at _**{r_stage}**_ Β· applied {applied}" + ) diff --git a/app/pages/6_Interview_Prep.py b/app/pages/6_Interview_Prep.py new file mode 100644 index 0000000..533a111 --- /dev/null +++ b/app/pages/6_Interview_Prep.py @@ -0,0 +1,371 @@ +# app/pages/6_Interview_Prep.py +""" +Interview Prep β€” a clean, glanceable reference you can keep open during a call. + +Left panel : talking points, company brief, CEO info, practice Q&A +Right panel : job description, email / contact history, cover letter snippet +""" +import sys +from datetime import date +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import streamlit as st + +from scripts.db import ( + DEFAULT_DB, init_db, + get_interview_jobs, get_contacts, get_research, + get_task_for_job, +) +from scripts.task_runner import submit_task + +init_db(DEFAULT_DB) + +# ── Job selection ───────────────────────────────────────────────────────────── +jobs_by_stage = get_interview_jobs(DEFAULT_DB) +active_stages = ["phone_screen", "interviewing", "offer"] +active_jobs = [ + j for stage in active_stages + for j in jobs_by_stage.get(stage, []) +] + +if not active_jobs: + st.title("πŸ“‹ Interview Prep") + st.info( + "No active interviews found. " + "Move a job to **Phone Screen** on the Interviews page first." 
+ ) + st.stop() + +# Allow pre-selecting via session state (e.g., from Interviews page) +preselect_id = st.session_state.pop("prep_job_id", None) +job_options = { + j["id"]: f"{j['title']} β€” {j['company']} ({j['status'].replace('_', ' ').title()})" + for j in active_jobs +} +ids = list(job_options.keys()) +default_idx = ids.index(preselect_id) if preselect_id in ids else 0 + +selected_id = st.selectbox( + "Job", + options=ids, + format_func=lambda x: job_options[x], + index=default_idx, + label_visibility="collapsed", +) +job = next(j for j in active_jobs if j["id"] == selected_id) + +# ── Header bar ──────────────────────────────────────────────────────────────── +stage_label = job["status"].replace("_", " ").title() +idate = job.get("interview_date") +countdown = "" +if idate: + try: + delta = (date.fromisoformat(idate) - date.today()).days + if delta == 0: + countdown = " πŸ”΄ **TODAY**" + elif delta == 1: + countdown = " 🟑 **TOMORROW**" + elif delta > 0: + countdown = f" 🟒 in {delta} days" + else: + countdown = f" (was {abs(delta)}d ago)" + except Exception: + countdown = "" + +st.title(f"πŸ“‹ {job.get('company')} β€” {job.get('title')}") +st.caption( + f"Stage: **{stage_label}**" + + (f" Β· Interview: {idate}{countdown}" if idate else "") + + (f" Β· Applied: {job.get('applied_at', '')[:10]}" if job.get("applied_at") else "") +) + +if job.get("url"): + st.link_button("Open job listing β†—", job["url"]) + +st.divider() + +# ── Two-column layout ───────────────────────────────────────────────────────── +col_prep, col_context = st.columns([2, 3]) + +# ════════════════════════════════════════════════ +# LEFT β€” prep materials +# ════════════════════════════════════════════════ +with col_prep: + + research = get_research(DEFAULT_DB, job_id=selected_id) + + # Refresh / generate research + _res_task = get_task_for_job(DEFAULT_DB, "company_research", selected_id) + _res_running = _res_task and _res_task["status"] in ("queued", "running") + + if not research: + if 
not _res_running: + st.warning("No research brief yet for this job.") + if _res_task and _res_task["status"] == "failed": + st.error(f"Last attempt failed: {_res_task.get('error', '')}") + if st.button("πŸ”¬ Generate research brief", type="primary", use_container_width=True): + submit_task(DEFAULT_DB, "company_research", selected_id) + st.rerun() + + if _res_running: + @st.fragment(run_every=3) + def _res_status_initial(): + t = get_task_for_job(DEFAULT_DB, "company_research", selected_id) + if t and t["status"] in ("queued", "running"): + stage = t.get("stage") or "" + lbl = "Queued…" if t["status"] == "queued" else (stage or "Generating… this may take 30–60 seconds") + st.info(f"⏳ {lbl}") + else: + st.rerun() + _res_status_initial() + + st.stop() + else: + generated_at = research.get("generated_at", "") + col_ts, col_btn = st.columns([3, 1]) + col_ts.caption(f"Research generated: {generated_at}") + if col_btn.button("πŸ”„ Refresh", use_container_width=True, disabled=bool(_res_running)): + submit_task(DEFAULT_DB, "company_research", selected_id) + st.rerun() + + if _res_running: + @st.fragment(run_every=3) + def _res_status_refresh(): + t = get_task_for_job(DEFAULT_DB, "company_research", selected_id) + if t and t["status"] in ("queued", "running"): + stage = t.get("stage") or "" + lbl = "Queued…" if t["status"] == "queued" else (stage or "Refreshing research…") + st.info(f"⏳ {lbl}") + else: + st.rerun() + _res_status_refresh() + elif _res_task and _res_task["status"] == "failed": + st.error(f"Refresh failed: {_res_task.get('error', '')}") + + st.divider() + + # ── Talking points (top β€” most useful during a call) ────────────────────── + st.subheader("🎯 Talking Points") + tp = (research.get("talking_points") or "").strip() + if tp: + st.markdown(tp) + else: + st.caption("_No talking points extracted β€” try regenerating._") + + st.divider() + + # ── Company brief ───────────────────────────────────────────────────────── + st.subheader("🏒 Company Overview") + 
st.markdown(research.get("company_brief", "_β€”_")) + + st.divider() + + # ── Leadership brief ────────────────────────────────────────────────────── + st.subheader("πŸ‘€ Leadership & Culture") + st.markdown(research.get("ceo_brief", "_β€”_")) + + st.divider() + + # ── Tech Stack & Product ─────────────────────────────────────────────────── + tech = (research.get("tech_brief") or "").strip() + if tech: + st.subheader("βš™οΈ Tech Stack & Product") + st.markdown(tech) + st.divider() + + # ── Funding & Market Position ────────────────────────────────────────────── + funding = (research.get("funding_brief") or "").strip() + if funding: + st.subheader("πŸ’° Funding & Market Position") + st.markdown(funding) + st.divider() + + # ── Red Flags & Watch-outs ──────────────────────────────────────────────── + red = (research.get("red_flags") or "").strip() + if red and "no significant red flags" not in red.lower(): + st.subheader("⚠️ Red Flags & Watch-outs") + st.warning(red) + st.divider() + + # ── Inclusion & Accessibility ───────────────────────────────────────────── + access = (research.get("accessibility_brief") or "").strip() + if access: + st.subheader("β™Ώ Inclusion & Accessibility") + st.caption("For your personal evaluation β€” not disclosed in any application.") + st.markdown(access) + st.divider() + + # ── Practice Q&A (collapsible β€” use before the call) ───────────────────── + with st.expander("🎀 Practice Q&A (pre-call prep)", expanded=False): + st.caption( + "The LLM will play the interviewer. Type your answers below. " + "Use this before the call to warm up." 
+ ) + + qa_key = f"qa_{selected_id}" + if qa_key not in st.session_state: + st.session_state[qa_key] = [] + + if st.button("πŸ”„ Start / Reset session", key=f"qa_reset_{selected_id}"): + st.session_state[qa_key] = [] + st.rerun() + + # Display history + for msg in st.session_state[qa_key]: + with st.chat_message(msg["role"]): + st.markdown(msg["content"]) + + # Initial question if session is empty + if not st.session_state[qa_key]: + with st.spinner("Setting up your mock interview…"): + try: + from scripts.llm_router import complete + opening = complete( + prompt=( + f"Start a mock phone screen for the {job.get('title')} " + f"role at {job.get('company')}. Ask your first question. " + f"Keep it realistic and concise." + ), + system=( + f"You are a recruiter at {job.get('company')} conducting " + f"a phone screen for the {job.get('title')} role. " + f"Ask one question at a time. After Alex answers, give " + f"brief feedback (1–2 sentences), then ask your next question. " + f"Be professional but warm." + ), + ) + st.session_state[qa_key] = [{"role": "assistant", "content": opening}] + st.rerun() + except Exception as e: + st.error(f"LLM error: {e}") + + # Answer input + answer = st.chat_input("Your answer…", key=f"qa_input_{selected_id}") + if answer and st.session_state[qa_key]: + history = st.session_state[qa_key] + history.append({"role": "user", "content": answer}) + + messages = [ + { + "role": "system", + "content": ( + f"You are a recruiter at {job.get('company')} conducting " + f"a phone screen for the {job.get('title')} role. " + f"Ask one question at a time. After Alex answers, give " + f"brief feedback (1–2 sentences), then ask your next question." 
+ ), + } + ] + history + + with st.spinner("…"): + try: + from scripts.llm_router import LLMRouter + router = LLMRouter() + # Build prompt from history for single-turn backends + convo = "\n\n".join( + f"{'Interviewer' if m['role'] == 'assistant' else 'Alex'}: {m['content']}" + for m in history + ) + response = router.complete( + prompt=convo + "\n\nInterviewer:", + system=messages[0]["content"], + ) + history.append({"role": "assistant", "content": response}) + st.session_state[qa_key] = history + st.rerun() + except Exception as e: + st.error(f"Error: {e}") + +# ════════════════════════════════════════════════ +# RIGHT β€” context / reference +# ════════════════════════════════════════════════ +with col_context: + + tab_jd, tab_emails, tab_letter = st.tabs( + ["πŸ“„ Job Description", "πŸ“§ Email History", "πŸ“ Cover Letter"] + ) + + with tab_jd: + score = job.get("match_score") + if score is not None: + badge = ( + f"🟒 {score:.0f}% match" if score >= 70 else + f"🟑 {score:.0f}% match" if score >= 40 else + f"πŸ”΄ {score:.0f}% match" + ) + st.caption(badge) + if job.get("keyword_gaps"): + st.caption(f"**Gaps to address:** {job['keyword_gaps']}") + st.markdown(job.get("description") or "_No description saved for this listing._") + + with tab_emails: + contacts = get_contacts(DEFAULT_DB, job_id=selected_id) + if not contacts: + st.info("No contacts logged yet. 
Use the Interviews page to log emails.") + else: + for c in contacts: + icon = "πŸ“₯" if c["direction"] == "inbound" else "πŸ“€" + recv = (c.get("received_at") or "")[:10] + st.markdown( + f"{icon} **{c.get('subject') or '(no subject)'}** Β· _{recv}_" + ) + if c.get("from_addr"): + st.caption(f"From: {c['from_addr']}") + if c.get("body"): + st.text(c["body"][:500] + ("…" if len(c["body"]) > 500 else "")) + st.divider() + + # Quick draft reply + inbound = [c for c in contacts if c["direction"] == "inbound"] + if inbound: + last = inbound[-1] + if st.button("✍️ Draft reply to last email"): + with st.spinner("Drafting…"): + try: + from scripts.llm_router import complete + draft = complete( + prompt=( + f"Draft a professional, warm reply.\n\n" + f"From: {last.get('from_addr', '')}\n" + f"Subject: {last.get('subject', '')}\n\n" + f"{last.get('body', '')}\n\n" + f"Context: Alex is a CS/TAM professional applying " + f"for {job.get('title')} at {job.get('company')}." + ), + system=( + "You are Alex Rivera's professional email assistant. " + "Write concise, warm, and professional replies in her voice." 
+ ), + ) + st.session_state[f"draft_{selected_id}"] = draft + except Exception as e: + st.error(f"Draft failed: {e}") + + if f"draft_{selected_id}" in st.session_state: + st.text_area( + "Draft (edit before sending)", + value=st.session_state[f"draft_{selected_id}"], + height=180, + ) + + with tab_letter: + cl = (job.get("cover_letter") or "").strip() + if cl: + st.markdown(cl) + else: + st.info("No cover letter saved for this job.") + + st.divider() + + # ── Notes (freeform, stored in session only β€” not persisted to DB) ──────── + st.subheader("πŸ“ Call Notes") + st.caption("Notes are per-session only β€” copy anything important before navigating away.") + st.text_area( + "notes", + placeholder="Type notes during or after the call…", + height=200, + key=f"notes_{selected_id}", + label_visibility="collapsed", + ) diff --git a/app/pages/7_Survey.py b/app/pages/7_Survey.py new file mode 100644 index 0000000..d5f00ed --- /dev/null +++ b/app/pages/7_Survey.py @@ -0,0 +1,274 @@ +# app/pages/7_Survey.py +""" +Survey Assistant β€” real-time help with culture-fit surveys. + +Supports text paste and screenshot (via clipboard or file upload). +Quick mode: "pick B" + one-liner. Detailed mode: option-by-option breakdown. 
+""" +import base64 +import io +import sys +from datetime import datetime +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + +import requests +import streamlit as st + +from scripts.db import ( + DEFAULT_DB, init_db, + get_interview_jobs, get_job_by_id, + insert_survey_response, get_survey_responses, +) +from scripts.llm_router import LLMRouter + +st.title("πŸ“‹ Survey Assistant") + +init_db(DEFAULT_DB) + + +# ── Vision service health check ──────────────────────────────────────────────── +def _vision_available() -> bool: + try: + r = requests.get("http://localhost:8002/health", timeout=2) + return r.status_code == 200 + except Exception: + return False + + +vision_up = _vision_available() + +# ── Job selector ─────────────────────────────────────────────────────────────── +jobs_by_stage = get_interview_jobs(DEFAULT_DB) +survey_jobs = jobs_by_stage.get("survey", []) +other_jobs = ( + jobs_by_stage.get("applied", []) + + jobs_by_stage.get("phone_screen", []) + + jobs_by_stage.get("interviewing", []) + + jobs_by_stage.get("offer", []) +) +all_jobs = survey_jobs + other_jobs + +if not all_jobs: + st.info("No active jobs found. Add jobs in Job Review first.") + st.stop() + +job_labels = {j["id"]: f"{j.get('company', '?')} β€” {j.get('title', '')}" for j in all_jobs} +selected_job_id = st.selectbox( + "Job", + options=[j["id"] for j in all_jobs], + format_func=lambda jid: job_labels[jid], + index=0, +) +selected_job = get_job_by_id(DEFAULT_DB, selected_job_id) + +# ── LLM prompt builders ──────────────────────────────────────────────────────── +_SURVEY_SYSTEM = ( + "You are a job application advisor helping a candidate answer a culture-fit survey. " + "The candidate values collaborative teamwork, clear communication, growth, and impact. " + "Choose answers that present them in the best professional light." 
+) + + +def _build_text_prompt(text: str, mode: str) -> str: + if mode == "Quick": + return ( + "Answer each survey question below. For each, give ONLY the letter of the best " + "option and a single-sentence reason. Format exactly as:\n" + "1. B β€” reason here\n2. A β€” reason here\n\n" + f"Survey:\n{text}" + ) + return ( + "Analyze each survey question below. For each question:\n" + "- Briefly evaluate each option (1 sentence each)\n" + "- State your recommendation with reasoning\n\n" + f"Survey:\n{text}" + ) + + +def _build_image_prompt(mode: str) -> str: + if mode == "Quick": + return ( + "This is a screenshot of a culture-fit survey. Read all questions and answer each " + "with the letter of the best option for a collaborative, growth-oriented candidate. " + "Format: '1. B β€” brief reason' on separate lines." + ) + return ( + "This is a screenshot of a culture-fit survey. For each question, evaluate each option " + "and recommend the best choice for a collaborative, growth-oriented candidate. " + "Include a brief breakdown per option and a clear recommendation." + ) + + +# ── Layout ───────────────────────────────────────────────────────────────────── +left_col, right_col = st.columns([1, 1], gap="large") + +with left_col: + survey_name = st.text_input( + "Survey name (optional)", + placeholder="e.g. Culture Fit Round 1", + key="survey_name", + ) + mode = st.radio("Mode", ["Quick", "Detailed"], horizontal=True, key="survey_mode") + st.caption( + "**Quick** β€” best answer + one-liner per question | " + "**Detailed** β€” option-by-option breakdown" + ) + + # Input tabs + if vision_up: + tab_text, tab_screenshot = st.tabs(["πŸ“ Paste Text", "πŸ–ΌοΈ Screenshot"]) + else: + st.info( + "πŸ“· Screenshot input unavailable β€” vision service not running. 
\n" + "Start it with: `bash scripts/manage-vision.sh start`" + ) + tab_text = st.container() + tab_screenshot = None + + image_b64: str | None = None + raw_text: str = "" + + with tab_text: + raw_text = st.text_area( + "Paste survey questions here", + height=280, + placeholder=( + "Q1: Which describes your ideal work environment?\n" + "A. Solo focused work\nB. Collaborative team\n" + "C. Mix of both\nD. Depends on the task" + ), + key="survey_text", + ) + + if tab_screenshot is not None: + with tab_screenshot: + st.caption("Paste from clipboard or upload a screenshot file.") + paste_col, upload_col = st.columns(2) + + with paste_col: + try: + from streamlit_paste_button import paste_image_button + paste_result = paste_image_button("πŸ“‹ Paste from clipboard", key="paste_btn") + if paste_result and paste_result.image_data: + buf = io.BytesIO() + paste_result.image_data.save(buf, format="PNG") + image_b64 = base64.b64encode(buf.getvalue()).decode() + st.image( + paste_result.image_data, + caption="Pasted image", + use_container_width=True, + ) + except ImportError: + st.warning("streamlit-paste-button not installed. 
Use file upload.") + + with upload_col: + uploaded = st.file_uploader( + "Upload screenshot", + type=["png", "jpg", "jpeg"], + key="survey_upload", + label_visibility="collapsed", + ) + if uploaded: + image_b64 = base64.b64encode(uploaded.read()).decode() + st.image(uploaded, caption="Uploaded image", use_container_width=True) + + # Analyze button + has_input = bool(raw_text.strip()) or bool(image_b64) + if st.button("πŸ” Analyze", type="primary", disabled=not has_input, use_container_width=True): + with st.spinner("Analyzing…"): + try: + router = LLMRouter() + if image_b64: + prompt = _build_image_prompt(mode) + output = router.complete( + prompt, + images=[image_b64], + fallback_order=router.config.get("vision_fallback_order"), + ) + source = "screenshot" + else: + prompt = _build_text_prompt(raw_text, mode) + output = router.complete( + prompt, + system=_SURVEY_SYSTEM, + fallback_order=router.config.get("research_fallback_order"), + ) + source = "text_paste" + st.session_state["survey_output"] = output + st.session_state["survey_source"] = source + st.session_state["survey_image_b64"] = image_b64 + st.session_state["survey_raw_text"] = raw_text + except Exception as e: + st.error(f"Analysis failed: {e}") + +with right_col: + output = st.session_state.get("survey_output") + if output: + st.markdown("### Analysis") + st.markdown(output) + + st.divider() + with st.form("save_survey_form"): + reported_score = st.text_input( + "Reported score (optional)", + placeholder="e.g. 
82% or 4.2/5", + key="reported_score_input", + ) + if st.form_submit_button("πŸ’Ύ Save to Job"): + source = st.session_state.get("survey_source", "text_paste") + image_b64_saved = st.session_state.get("survey_image_b64") + raw_text_saved = st.session_state.get("survey_raw_text", "") + + image_path = "" + if image_b64_saved: + ts = datetime.now().strftime("%Y%m%d_%H%M%S") + save_dir = ( + Path(__file__).parent.parent.parent + / "data" + / "survey_screenshots" + / str(selected_job_id) + ) + save_dir.mkdir(parents=True, exist_ok=True) + img_file = save_dir / f"{ts}.png" + img_file.write_bytes(base64.b64decode(image_b64_saved)) + image_path = str(img_file) + + insert_survey_response( + DEFAULT_DB, + job_id=selected_job_id, + survey_name=survey_name, + source=source, + raw_input=raw_text_saved, + image_path=image_path, + mode=mode.lower(), + llm_output=output, + reported_score=reported_score, + ) + st.success("Saved!") + del st.session_state["survey_output"] + st.rerun() + else: + st.markdown("### Analysis") + st.caption("Results will appear here after analysis.") + +# ── History ──────────────────────────────────────────────────────────────────── +st.divider() +st.subheader("πŸ“‚ Response History") +history = get_survey_responses(DEFAULT_DB, job_id=selected_job_id) + +if not history: + st.caption("No saved responses for this job yet.") +else: + for resp in history: + label = resp.get("survey_name") or "Survey response" + ts = (resp.get("created_at") or "")[:16] + score = resp.get("reported_score") + score_str = f" Β· Score: {score}" if score else "" + with st.expander(f"{label} Β· {ts}{score_str}"): + st.caption(f"Mode: {resp.get('mode', '?')} Β· Source: {resp.get('source', '?')}") + if resp.get("raw_input"): + with st.expander("Original input"): + st.text(resp["raw_input"]) + st.markdown(resp.get("llm_output", "")) diff --git a/config/adzuna.yaml.example b/config/adzuna.yaml.example new file mode 100644 index 0000000..e58a46f --- /dev/null +++ 
b/config/adzuna.yaml.example @@ -0,0 +1,5 @@ +# Adzuna Jobs API credentials +# Register at https://developer.adzuna.com/admin/applications +# Both app_id and app_key are required. +app_id: "" # short alphanumeric ID from your developer dashboard +app_key: "" # 32-character hex key from your developer dashboard diff --git a/config/blocklist.yaml b/config/blocklist.yaml new file mode 100644 index 0000000..398064d --- /dev/null +++ b/config/blocklist.yaml @@ -0,0 +1,15 @@ +# Discovery blocklist β€” entries matching any rule are silently dropped before DB insert. +# Applies globally across all search profiles and custom boards. + +# Company name blocklist β€” partial case-insensitive match on the company field. +# e.g. "Amazon" blocks any listing where company contains "amazon". +companies: [] + +# Industry/content blocklist β€” blocked if company name OR job description contains any keyword. +# Use this for industries you will never work in regardless of company. +# e.g. "gambling", "crypto", "tobacco", "defense" +industries: [] + +# Location blocklist β€” blocked if the location field contains any of these strings. +# e.g. "Dallas", "Austin, TX" +locations: [] diff --git a/config/craigslist.yaml.example b/config/craigslist.yaml.example new file mode 100644 index 0000000..578dcb8 --- /dev/null +++ b/config/craigslist.yaml.example @@ -0,0 +1,24 @@ +# Craigslist metro subdomains to search. +# Copy to config/craigslist.yaml and adjust for your markets. +# Full subdomain list: https://www.craigslist.org/about/sites +metros: + - sfbay + - newyork + - chicago + - losangeles + - seattle + - austin + +# Maps search profile location strings β†’ Craigslist metro subdomain. +# Locations not listed here are silently skipped. +location_map: + "San Francisco Bay Area, CA": sfbay + "New York, NY": newyork + "Chicago, IL": chicago + "Los Angeles, CA": losangeles + "Seattle, WA": seattle + "Austin, TX": austin + +# Craigslist job category. Defaults to 'jjj' (general jobs) if omitted. 
+# Other options: csr (customer service), mar (marketing), sof (software/qa/dba) +# category: jjj diff --git a/config/email.yaml.example b/config/email.yaml.example new file mode 100644 index 0000000..b234cc1 --- /dev/null +++ b/config/email.yaml.example @@ -0,0 +1,38 @@ +# config/email.yaml β€” IMAP email sync configuration +# Copy this to config/email.yaml and fill in your credentials. +# config/email.yaml is gitignored β€” never commit real credentials. +# +# Gmail setup: +# 1. Enable IMAP: Gmail Settings β†’ See all settings β†’ Forwarding and POP/IMAP +# 2. Create App Password: myaccount.google.com/apppasswords +# (requires 2-Step Verification to be enabled) +# 3. Use your Gmail address as username, App Password as password. +# +# Outlook / Office 365: +# host: outlook.office365.com +# port: 993 +# use_ssl: true +# (Use your regular email + password, or an App Password if MFA is enabled) + +host: imap.gmail.com +port: 993 +use_ssl: true + +# Your full email address +username: your.email@gmail.com + +# Gmail: use an App Password (16-char code, no spaces) +# Other providers: use your regular password (or App Password if MFA enabled) +password: xxxx-xxxx-xxxx-xxxx + +# Sent folder name β€” leave blank to auto-detect +# Gmail: "[Gmail]/Sent Mail" Outlook: "Sent Items" Generic: "Sent" +sent_folder: "" + +# How many days back to search (90 = ~3 months) +lookback_days: 90 + +# Optional: Gmail label to scan for action-needed emails (e.g. "TO DO JOBS"). +# Emails in this label are matched to pipeline jobs by company name, then +# filtered by action keywords in the subject. Leave blank to disable. 
+todo_label: "" diff --git a/config/llm.yaml b/config/llm.yaml new file mode 100644 index 0000000..e5a58e5 --- /dev/null +++ b/config/llm.yaml @@ -0,0 +1,66 @@ +backends: + anthropic: + api_key_env: ANTHROPIC_API_KEY + enabled: false + model: claude-sonnet-4-6 + type: anthropic + supports_images: true + claude_code: + api_key: any + base_url: http://localhost:3009/v1 + enabled: false + model: claude-code-terminal + type: openai_compat + supports_images: true + github_copilot: + api_key: any + base_url: http://localhost:3010/v1 + enabled: false + model: gpt-4o + type: openai_compat + supports_images: false + ollama: + api_key: ollama + base_url: http://localhost:11434/v1 + enabled: true + model: alex-cover-writer:latest + type: openai_compat + supports_images: false + ollama_research: + api_key: ollama + base_url: http://localhost:11434/v1 + enabled: true + model: llama3.1:8b + type: openai_compat + supports_images: false + vllm: + api_key: '' + base_url: http://localhost:8000/v1 + enabled: true + model: __auto__ + type: openai_compat + supports_images: false + vision_service: + base_url: http://localhost:8002 + enabled: false + type: vision_service + supports_images: true +fallback_order: +- ollama +- claude_code +- vllm +- github_copilot +- anthropic +research_fallback_order: +- claude_code +- vllm +- ollama_research +- github_copilot +- anthropic +vision_fallback_order: +- vision_service +- claude_code +- anthropic +# Note: 'ollama' (alex-cover-writer) intentionally excluded β€” research +# must never use the fine-tuned writer model, and this also avoids evicting +# the writer from GPU memory while a cover letter task is in flight. 
diff --git a/config/llm.yaml.example b/config/llm.yaml.example new file mode 100644 index 0000000..e5a58e5 --- /dev/null +++ b/config/llm.yaml.example @@ -0,0 +1,66 @@ +backends: + anthropic: + api_key_env: ANTHROPIC_API_KEY + enabled: false + model: claude-sonnet-4-6 + type: anthropic + supports_images: true + claude_code: + api_key: any + base_url: http://localhost:3009/v1 + enabled: false + model: claude-code-terminal + type: openai_compat + supports_images: true + github_copilot: + api_key: any + base_url: http://localhost:3010/v1 + enabled: false + model: gpt-4o + type: openai_compat + supports_images: false + ollama: + api_key: ollama + base_url: http://localhost:11434/v1 + enabled: true + model: alex-cover-writer:latest + type: openai_compat + supports_images: false + ollama_research: + api_key: ollama + base_url: http://localhost:11434/v1 + enabled: true + model: llama3.1:8b + type: openai_compat + supports_images: false + vllm: + api_key: '' + base_url: http://localhost:8000/v1 + enabled: true + model: __auto__ + type: openai_compat + supports_images: false + vision_service: + base_url: http://localhost:8002 + enabled: false + type: vision_service + supports_images: true +fallback_order: +- ollama +- claude_code +- vllm +- github_copilot +- anthropic +research_fallback_order: +- claude_code +- vllm +- ollama_research +- github_copilot +- anthropic +vision_fallback_order: +- vision_service +- claude_code +- anthropic +# Note: 'ollama' (alex-cover-writer) intentionally excluded β€” research +# must never use the fine-tuned writer model, and this also avoids evicting +# the writer from GPU memory while a cover letter task is in flight. diff --git a/config/notion.yaml.example b/config/notion.yaml.example new file mode 100644 index 0000000..55977dd --- /dev/null +++ b/config/notion.yaml.example @@ -0,0 +1,24 @@ +# Copy to config/notion.yaml and fill in your values. +# notion.yaml is gitignored β€” never commit it. 
+#
+# Get your integration token from: https://www.notion.so/my-integrations
+# Then share the "Tracking Job Applications" database with your integration:
+#   Open the DB in Notion → ... menu → Add connections → select your integration
+#
+token: "secret_XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
+# Your database's UUID — find it in the database URL (the 32-hex-char segment).
+database_id: "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
+
+field_map:
+  title_field: "Salary"
+  job_title: "Job Title"
+  company: "Company Name"
+  url: "Role Link"
+  source: "Job Source"
+  status: "Status of Application"
+  status_new: "Application Submitted"
+  date_found: "Date Found"
+  remote: "Remote"
+  match_score: "Match Score"
+  keyword_gaps: "Keyword Gaps"
+  notes: "Notes"
+  job_description: "Job Description"
diff --git a/config/resume_keywords.yaml b/config/resume_keywords.yaml
new file mode 100644
index 0000000..7cfdab3
--- /dev/null
+++ b/config/resume_keywords.yaml
@@ -0,0 +1,23 @@
+domains:
+- B2B SaaS
+- enterprise software
+- security
+- compliance
+- post-sale lifecycle
+- SaaS metrics
+- web security
+keywords:
+- churn reduction
+- escalation management
+- cross-functional
+- product feedback loop
+- customer advocacy
+skills:
+- Customer Success
+- Technical Account Management
+- Revenue Operations
+- data analysis
+- stakeholder management
+- project management
+- onboarding
+- renewal management
diff --git a/config/resume_keywords.yaml.example b/config/resume_keywords.yaml.example
new file mode 100644
index 0000000..6ff978c
--- /dev/null
+++ b/config/resume_keywords.yaml.example
@@ -0,0 +1,33 @@
+skills:
+  - Customer Success
+  - Technical Account Management
+  - Revenue Operations
+  - Salesforce
+  - Gainsight
+  - data analysis
+  - stakeholder management
+  - project management
+  - onboarding
+  - renewal management
+
+domains:
+  - B2B SaaS
+  - enterprise software
+  - security
+  - compliance
+  - post-sale lifecycle
+  - SaaS metrics
+
+keywords:
+  - QBR
+  - churn reduction
+  - NRR
+  - ARR
+  - MRR
+  - executive sponsorship
+  - VOC
+  - health score
+  -
escalation management + - cross-functional + - product feedback loop + - customer advocacy diff --git a/config/search_profiles.yaml b/config/search_profiles.yaml new file mode 100644 index 0000000..bada59a --- /dev/null +++ b/config/search_profiles.yaml @@ -0,0 +1,123 @@ +profiles: +- boards: + - linkedin + - indeed + - glassdoor + - zip_recruiter + - google + custom_boards: + - adzuna + - theladders + - craigslist + exclude_keywords: + - sales + - account executive + - sales engineer + - SDR + - BDR + - business development + - sales development + - sales manager + - sales representative + - sales rep + hours_old: 240 + locations: + - Remote + - San Francisco Bay Area, CA + name: cs_leadership + results_per_board: 75 + titles: + - Customer Success Manager + - Customer Engagement Manager + - Director of Customer Success + - VP Customer Success + - Head of Customer Success + - Technical Account Manager + - TAM + - Customer Experience Lead + - CSM + - CX + - Customer Success Consultant +- boards: + - linkedin + - indeed + custom_boards: + - adzuna + - craigslist + exclude_keywords: + - sales + - account executive + - SDR + - BDR + - sales development + hours_old: 336 + locations: + - Remote + - San Francisco Bay Area, CA + mission_tags: + - music + name: music_industry + results_per_board: 50 + titles: + - Customer Success Manager + - Partner Success Manager + - Artist Success Manager + - Creator Success Manager + - Technical Account Manager + - Community Manager + - Account Manager + - Label Relations Manager +- boards: + - linkedin + - indeed + custom_boards: + - adzuna + - craigslist + exclude_keywords: + - sales + - account executive + - SDR + - BDR + hours_old: 336 + locations: + - Remote + - San Francisco Bay Area, CA + mission_tags: + - animal_welfare + name: animal_welfare + results_per_board: 50 + titles: + - Customer Success Manager + - Program Manager + - Community Engagement Manager + - Operations Manager + - Partner Success Manager + - Account Manager + 
- Development Manager +- boards: + - linkedin + - indeed + custom_boards: + - adzuna + - craigslist + exclude_keywords: + - sales + - account executive + - SDR + - BDR + hours_old: 336 + locations: + - Remote + - San Francisco Bay Area, CA + mission_tags: + - education + name: education + results_per_board: 50 + titles: + - Customer Success Manager + - District Success Manager + - Implementation Specialist + - Partner Success Manager + - Account Manager + - School Success Manager + - Customer Experience Manager diff --git a/data/survey_screenshots/.gitkeep b/data/survey_screenshots/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..d381d9d --- /dev/null +++ b/environment.yml @@ -0,0 +1,68 @@ +name: job-seeker +# Recreate: conda env create -f environment.yml +# Update pinned snapshot: conda env export --no-builds > environment.yml +channels: + - conda-forge + - defaults +dependencies: + - python=3.12 + - pip + - pip: + # ── Web UI ──────────────────────────────────────────────────────────────── + - streamlit>=1.35 + - watchdog # live reload + - reportlab>=4.0 # PDF cover letter export + - pandas>=2.0 + - pyarrow # streamlit data tables + - streamlit-paste-button>=0.1.0 + + # ── Job scraping ────────────────────────────────────────────────────────── + - python-jobspy>=1.1 + - playwright # browser automation (run: playwright install chromium) + - selenium + - undetected-chromedriver + - webdriver-manager + - beautifulsoup4 + - requests + - curl_cffi # Chrome TLS fingerprint β€” bypasses Cloudflare on The Ladders + - fake-useragent # company scraper rotation + + # ── LLM / AI backends ───────────────────────────────────────────────────── + - openai>=1.0 # used for OpenAI-compat backends (ollama, vllm, wrappers) + - anthropic>=0.80 # direct Anthropic API fallback + - ollama # Python client for Ollama management + - langchain>=0.2 + - langchain-openai + - langchain-anthropic + - 
langchain-ollama + - langchain-community + - langchain-google-genai + - google-generativeai + - tiktoken + + # ── Resume matching ─────────────────────────────────────────────────────── + - scikit-learn>=1.3 + - rapidfuzz + - lib-resume-builder-aihawk + + # ── Notion integration ──────────────────────────────────────────────────── + - notion-client>=3.0 + + # ── Document handling ───────────────────────────────────────────────────── + - pypdf + - pdfminer-six + - pyyaml>=6.0 + - python-dotenv + + # ── Utilities ───────────────────────────────────────────────────────────── + - sqlalchemy + - tqdm + - loguru + - rich + - tenacity + - httpx + + # ── Testing ─────────────────────────────────────────────────────────────── + - pytest>=9.0 + - pytest-cov + - pytest-mock diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..5ee6477 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +testpaths = tests diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/scripts/company_research.py b/scripts/company_research.py new file mode 100644 index 0000000..3c7069c --- /dev/null +++ b/scripts/company_research.py @@ -0,0 +1,468 @@ +# scripts/company_research.py +""" +Pre-interview company research generator. + +Three-phase approach: + 1. If SearXNG is available (port 8888), use companyScraper.py to fetch live + data: CEO name, HQ address, LinkedIn, contact info. + 1b. Use Phase 1 data (company name + CEO if found) to query SearXNG for + recent news snippets (funding, launches, leadership changes, etc.). + 2. Feed all real data into an LLM prompt to synthesise a structured brief + covering company overview, leadership, recent developments, and talking + points tailored to Alex. + +Falls back to pure LLM knowledge when SearXNG is offline. 
+
+Usage (standalone):
+    conda run -n job-seeker python scripts/company_research.py --job-id 42
+    conda run -n job-seeker python scripts/company_research.py --job-id 42 --no-scrape
+"""
+import re
+import sys
+from pathlib import Path
+from types import SimpleNamespace
+
+# Make the repo root importable so `scripts.*` resolves when run standalone.
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+# ── SearXNG scraper integration ──────────────────────────────────────────────
+# companyScraper lives outside this repo; it is imported opportunistically and
+# everything below degrades gracefully when it (or SearXNG) is unavailable.
+_SCRAPER_DIR = Path("/Library/Development/scrapers")
+_SCRAPER_AVAILABLE = False
+
+if _SCRAPER_DIR.exists():
+    sys.path.insert(0, str(_SCRAPER_DIR))
+    try:
+        from companyScraper import EnhancedCompanyScraper, Config as _ScraperConfig
+        _SCRAPER_AVAILABLE = True
+    except (ImportError, SystemExit):
+        # companyScraper calls sys.exit(1) if bs4/fake-useragent aren't installed
+        pass
+
+
+def _searxng_running() -> bool:
+    """Quick check whether SearXNG is reachable."""
+    try:
+        import requests
+        r = requests.get("http://localhost:8888/", timeout=3)
+        return r.status_code == 200
+    except Exception:
+        return False
+
+
+def _scrape_company(company: str) -> dict:
+    """
+    Use companyScraper in minimal mode to pull live CEO / HQ data.
+    Returns a dict with keys: ceo, headquarters, linkedin (may be 'Not found').
+    """
+    # companyScraper is CLI-oriented; fake its argparse namespace here.
+    mock_args = SimpleNamespace(
+        mode="minimal",
+        verbose=False,
+        dry_run=False,
+        debug=False,
+        use_cache=True,
+        save_raw=False,
+        target_staff=None,
+        include_types=None,
+        exclude_types=None,
+        include_contact=False,
+        include_address=False,
+        include_social=True,  # grab LinkedIn while we're at it
+        timeout=20,
+        input_file=None,
+        output_file="/dev/null",
+        searxng_url="http://localhost:8888/",
+    )
+    # Override the singleton Config URL
+    _ScraperConfig.SEARXNG_URL = "http://localhost:8888/"
+
+    scraper = EnhancedCompanyScraper(mock_args)
+    scraper.companies = [company]
+
+    # "Not found" is the sentinel the extractors themselves return on a miss.
+    result: dict = {"ceo": "Not found", "headquarters": "Not found", "linkedin": "Not found"}
+    for search_type in ["ceo", "hq", "social"]:
+        html = scraper.search_company(company, search_type)
+        if search_type == "ceo":
+            result["ceo"] = scraper.extract_ceo(html, company)
+        elif search_type == "hq":
+            result["headquarters"] = scraper.extract_address(html, company)
+        elif search_type == "social":
+            social = scraper.extract_social(html, company)
+            # Pull out just the LinkedIn entry
+            for part in (social or "").split(";"):
+                if "linkedin" in part.lower():
+                    result["linkedin"] = part.strip()
+                    break
+
+    return result
+
+
+# Query templates per search facet; "{company}"/"{ceo}" are substituted with
+# str.replace (not .format) in _fetch_search_data.
+_SEARCH_QUERIES = {
+    "news": '"{company}" news 2025 2026',
+    "funding": '"{company}" funding round investors Series valuation',
+    "tech": '"{company}" tech stack engineering technology platform',
+    "competitors": '"{company}" competitors alternatives vs market',
+    "culture": '"{company}" glassdoor culture reviews employees',
+    "accessibility": '"{company}" ADA accessibility disability inclusion accommodation ERG',
+    "ceo_press": '"{ceo}" "{company}"',  # only used if ceo is known
+}
+
+
+def _run_search_query(query: str, results: dict, key: str) -> None:
+    """Thread target: run one SearXNG JSON query, store up to 4 snippets in results[key]."""
+    import requests
+
+    snippets: list[str] = []
+    seen: set[str] = set()
+    try:
+        resp = requests.get(
"http://localhost:8888/search", + params={"q": query, "format": "json", "language": "en-US"}, + timeout=12, + ) + if resp.status_code != 200: + return + for r in resp.json().get("results", [])[:4]: + url = r.get("url", "") + if url in seen: + continue + seen.add(url) + title = r.get("title", "").strip() + content = r.get("content", "").strip() + if title or content: + snippets.append(f"- **{title}**\n {content}\n <{url}>") + except Exception: + pass + results[key] = "\n\n".join(snippets) + + +def _fetch_search_data(company: str, ceo: str = "") -> dict[str, str]: + """ + Run all search queries in parallel threads. + Returns dict keyed by search type (news, funding, tech, competitors, culture, ceo_press). + Missing/failed queries produce empty strings. + """ + import threading + + results: dict[str, str] = {} + threads = [] + + keys: list[str] = [] + for key, pattern in _SEARCH_QUERIES.items(): + if key == "ceo_press" and not ceo or (ceo or "").lower() == "not found": + continue + # Use replace() not .format() β€” company names may contain curly braces + query = pattern.replace("{company}", company).replace("{ceo}", ceo) + t = threading.Thread( + target=_run_search_query, + args=(query, results, key), + daemon=True, + ) + threads.append(t) + keys.append(key) + t.start() + + for t, key in zip(threads, keys): + t.join(timeout=15) + # Thread may still be alive after timeout β€” pre-populate key so + # the results dict contract ("missing queries β†’ empty string") holds + if t.is_alive(): + results.setdefault(key, "") + + return results + + +def _parse_sections(text: str) -> dict[str, str]: + """Split LLM markdown output on ## headers into named sections.""" + sections: dict[str, str] = {} + pattern = re.compile(r"^##\s+(.+)$", re.MULTILINE) + matches = list(pattern.finditer(text)) + for i, match in enumerate(matches): + name = match.group(1).strip() + start = match.end() + end = matches[i + 1].start() if i + 1 < len(matches) else len(text) + sections[name] = 
text[start:end].strip()
+    return sections
+
+
+# Resume + keyword config locations (both relative to the repo root).
+_RESUME_YAML = Path(__file__).parent.parent / "aihawk" / "data_folder" / "plain_text_resume.yaml"
+_KEYWORDS_YAML = Path(__file__).parent.parent / "config" / "resume_keywords.yaml"
+
+# Companies where Alex has an NDA — reference as generic label unless
+# the role is security-focused (score >= 3 matching JD keywords).
+_NDA_COMPANIES = {"upguard"}
+
+
+def _score_experiences(experiences: list[dict], keywords: list[str], jd: str) -> list[dict]:
+    """Score each experience entry by keyword overlap with JD; return sorted descending."""
+    jd_lower = jd.lower()
+    scored = []
+    for exp in experiences:
+        # A keyword only counts when it appears in BOTH the experience text and the JD.
+        text = " ".join([
+            exp.get("position", ""),
+            exp.get("company", ""),
+            " ".join(
+                v
+                for resp in exp.get("key_responsibilities", [])
+                for v in resp.values()
+            ),
+        ]).lower()
+        score = sum(1 for kw in keywords if kw.lower() in text and kw.lower() in jd_lower)
+        scored.append({**exp, "score": score})
+    return sorted(scored, key=lambda x: x["score"], reverse=True)
+
+
+def _build_resume_context(resume: dict, keywords: list[str], jd: str) -> str:
+    """
+    Build the resume section of the LLM context block.
+    Top 2 scored experiences included in full detail; rest as one-liners.
+    Applies UpGuard NDA rule: reference as 'enterprise security vendor (NDA)'
+    unless the role is security-focused (score >= 3).
+    """
+    experiences = resume.get("experience_details", [])
+    if not experiences:
+        return ""
+
+    scored = _score_experiences(experiences, keywords, jd)
+    top2 = scored[:2]
+    rest = scored[2:]
+
+    def _company_label(exp: dict) -> str:
+        # NDA rule: mask the company name unless the JD match is security-heavy.
+        company = exp.get("company", "")
+        if company.lower() in _NDA_COMPANIES and exp.get("score", 0) < 3:
+            return "enterprise security vendor (NDA)"
+        return company
+
+    def _exp_header(exp: dict) -> str:
+        return f"{exp.get('position', '')} @ {_company_label(exp)} ({exp.get('employment_period', '')})"
+
+    def _exp_bullets(exp: dict) -> str:
+        bullets = [v for resp in exp.get("key_responsibilities", []) for v in resp.values()]
+        return "\n".join(f"  - {b}" for b in bullets)
+
+    lines = ["## Alex's Matched Experience"]
+    for exp in top2:
+        lines.append(f"\n**{_exp_header(exp)}** (match score: {exp['score']})")
+        lines.append(_exp_bullets(exp))
+
+    if rest:
+        condensed = ", ".join(_exp_header(e) for e in rest)
+        lines.append(f"\nAlso in Alex's background: {condensed}")
+
+    return "\n".join(lines)
+
+
+def _load_resume_and_keywords() -> tuple[dict, list[str]]:
+    """Load resume YAML and keywords config. Returns (resume_dict, all_keywords_list)."""
+    import yaml as _yaml
+
+    resume = {}
+    if _RESUME_YAML.exists():
+        resume = _yaml.safe_load(_RESUME_YAML.read_text()) or {}
+
+    keywords: list[str] = []
+    if _KEYWORDS_YAML.exists():
+        kw_cfg = _yaml.safe_load(_KEYWORDS_YAML.read_text()) or {}
+        # Flatten every list-valued section (skills/domains/keywords) into one list.
+        for lst in kw_cfg.values():
+            if isinstance(lst, list):
+                keywords.extend(lst)
+
+    return resume, keywords
+
+
+def research_company(job: dict, use_scraper: bool = True, on_stage=None) -> dict:
+    """
+    Generate a pre-interview research brief for a job.
+
+    Parameters
+    ----------
+    job : dict
+        Job row from the DB (needs at least 'company', 'title', 'description').
+    use_scraper : bool
+        Whether to attempt live data via SearXNG before falling back to LLM.
+    on_stage : callable, optional
+        Progress callback invoked with a short status string at each phase.
+        Exceptions raised by the callback are swallowed.
+
+    Returns
+    -------
+    dict with keys: raw_output, company_brief, ceo_brief, tech_brief,
+    funding_brief, competitors_brief, red_flags, accessibility_brief,
+    talking_points, scrape_used
+    """
+    from scripts.llm_router import LLMRouter
+
+    router = LLMRouter()
+    # Research uses its own fallback order when configured (keeps the
+    # fine-tuned cover-letter model out of research — see config/llm.yaml).
+    research_order = router.config.get("research_fallback_order") or router.config["fallback_order"]
+    company = job.get("company") or "the company"
+    title = job.get("title") or "this role"
+    jd_excerpt = (job.get("description") or "")[:1500]
+
+    resume, keywords = _load_resume_and_keywords()
+    matched_keywords = [kw for kw in keywords if kw.lower() in jd_excerpt.lower()]
+    resume_context = _build_resume_context(resume, keywords, jd_excerpt)
+    keywords_note = (
+        f"\n\n## Matched Skills & Keywords\nSkills matching this JD: {', '.join(matched_keywords)}"
+        if matched_keywords else ""
+    )
+
+    def _stage(msg: str) -> None:
+        if on_stage:
+            try:
+                on_stage(msg)
+            except Exception:
+                pass  # never let stage callbacks break the task
+
+    # ── Phase 1: live scrape (optional) ──────────────────────────────────────
+    live_data: dict = {}
+    scrape_note = ""
+    _stage("Checking for live company data…")
+    if use_scraper and _SCRAPER_AVAILABLE and _searxng_running():
+        _stage("Scraping CEO & HQ data…")
+        try:
+            live_data = _scrape_company(company)
+            # Only surface fields the scraper actually resolved.
+            parts = []
+            if live_data.get("ceo") not in (None, "Not found"):
+                parts.append(f"CEO: {live_data['ceo']}")
+            if live_data.get("headquarters") not in (None, "Not found"):
+                parts.append(f"HQ: {live_data['headquarters']}")
+            if live_data.get("linkedin") not in (None, "Not found"):
+                parts.append(f"LinkedIn: {live_data['linkedin']}")
+            if parts:
+                scrape_note = (
+                    "\n\n**Live data retrieved via SearXNG:**\n"
+                    + "\n".join(f"- {p}" for p in parts)
+                    + "\n\nIncorporate these facts where relevant."
+ ) + except BaseException as e: + scrape_note = f"\n\n_(Live scrape attempted but failed: {e})_" + + # ── Phase 1b: parallel search queries ──────────────────────────────────── + search_data: dict[str, str] = {} + _stage("Running web searches…") + if use_scraper and _searxng_running(): + _stage("Running web searches (news, funding, tech, culture)…") + try: + ceo_name = (live_data.get("ceo") or "") if live_data else "" + search_data = _fetch_search_data(company, ceo=ceo_name) + except BaseException: + pass # best-effort; never fail the whole task + + # Track whether SearXNG actually contributed usable data to this brief. + scrape_used = 1 if (live_data or any(v.strip() for v in search_data.values())) else 0 + + def _section_note(key: str, label: str) -> str: + text = search_data.get(key, "").strip() + return f"\n\n## {label} (live web search)\n\n{text}" if text else "" + + news_note = _section_note("news", "News & Press") + funding_note = _section_note("funding", "Funding & Investors") + tech_note = _section_note("tech", "Tech Stack") + competitors_note = _section_note("competitors", "Competitors") + culture_note = _section_note("culture", "Culture & Employee Signals") + accessibility_note = _section_note("accessibility", "Accessibility & Disability Inclusion") + ceo_press_note = _section_note("ceo_press", "CEO in the News") + + # ── Phase 2: LLM synthesis ──────────────────────────────────────────────── + _stage("Generating brief with LLM… (30–90 seconds)") + prompt = f"""You are preparing Alex Rivera for a job interview. 
+ +Role: **{title}** at **{company}** + +## Job Description +{jd_excerpt} +{resume_context}{keywords_note} + +## Live Company Data +{scrape_note.strip() or "_(scrape unavailable)_"} +{news_note}{funding_note}{tech_note}{competitors_note}{culture_note}{accessibility_note}{ceo_press_note} + +--- + +Produce a structured research brief using **exactly** these eight markdown section headers +(include all eight even if a section has limited data β€” say so honestly): + +## Company Overview +What {company} does, core product/service, business model, size/stage (startup / scale-up / enterprise), market positioning. + +## Leadership & Culture +CEO background and leadership style, key execs, mission/values statements, Glassdoor themes. + +## Tech Stack & Product +Technologies, platforms, and product direction relevant to the {title} role. + +## Funding & Market Position +Funding stage, key investors, recent rounds, burn/growth signals, competitor landscape. + +## Recent Developments +News, launches, acquisitions, exec moves, pivots, or press from the past 12–18 months. +Draw on the live snippets above; if none available, note what is publicly known. + +## Red Flags & Watch-outs +Culture issues, layoffs, exec departures, financial stress, or Glassdoor concerns worth knowing before the call. +If nothing notable, write "No significant red flags identified." + +## Inclusion & Accessibility +Assess {company}'s commitment to disability inclusion and accessibility. Cover: +- ADA accommodation language in job postings or company policy +- Disability Employee Resource Group (ERG) or affinity group +- Product or service accessibility (WCAG compliance, adaptive features, AT integrations) +- Any public disability/accessibility advocacy, partnerships, or certifications +- Glassdoor or press signals about how employees with disabilities experience the company +If no specific signals are found, say so clearly β€” absence of public commitment is itself signal. 
+This section is for Alex's personal decision-making only and will not appear in any application. + +## Talking Points for Alex +Five specific talking points for the phone screen. Each must: +- Reference a concrete experience from Alex's matched background by name + (UpGuard NDA rule: say "enterprise security vendor" unless the role has a clear security/compliance focus) +- Connect to a specific signal from the JD or company context above +- Be 1–2 sentences, ready to speak aloud +- Never give generic advice + +--- +⚠️ This brief combines live web data and LLM training knowledge. Verify key facts before the call. +""" + + raw = router.complete(prompt, fallback_order=research_order) + # Strip … blocks emitted by reasoning models (e.g. DeepSeek, Qwen-R) + raw = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() + sections = _parse_sections(raw) + + return { + "raw_output": raw, + "company_brief": sections.get("Company Overview", ""), + "ceo_brief": sections.get("Leadership & Culture", ""), + "tech_brief": sections.get("Tech Stack & Product", ""), + "funding_brief": sections.get("Funding & Market Position", ""), + "competitors_brief": sections.get("Funding & Market Position", ""), # competitor landscape is in the funding section + "red_flags": sections.get("Red Flags & Watch-outs", ""), + "accessibility_brief": sections.get("Inclusion & Accessibility", ""), + "talking_points": sections.get("Talking Points for Alex", ""), + "scrape_used": scrape_used, + } + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Generate company research brief") + parser.add_argument("--job-id", type=int, required=True, help="Job ID in staging.db") + parser.add_argument("--no-scrape", action="store_true", help="Skip SearXNG live scrape") + args = parser.parse_args() + + from scripts.db import DEFAULT_DB, init_db, save_research + import sqlite3 + + init_db(DEFAULT_DB) + conn = sqlite3.connect(DEFAULT_DB) + conn.row_factory = sqlite3.Row + row = 
conn.execute("SELECT * FROM jobs WHERE id = ?", (args.job_id,)).fetchone() + conn.close() + + if not row: + sys.exit(f"Job {args.job_id} not found in {DEFAULT_DB}") + + job = dict(row) + print(f"Researching: {job['title']} @ {job['company']} …\n") + if _SCRAPER_AVAILABLE and not args.no_scrape: + print(f"SearXNG available: {_searxng_running()}") + + result = research_company(job, use_scraper=not args.no_scrape) + save_research(DEFAULT_DB, job_id=args.job_id, **result) + print(result["raw_output"]) + print(f"\n[Saved to company_research for job {args.job_id}]") diff --git a/scripts/custom_boards/__init__.py b/scripts/custom_boards/__init__.py new file mode 100644 index 0000000..7b12ac1 --- /dev/null +++ b/scripts/custom_boards/__init__.py @@ -0,0 +1 @@ +# Custom job board scrapers β€” each module exposes scrape(profile, location, results_wanted) -> list[dict] diff --git a/scripts/custom_boards/adzuna.py b/scripts/custom_boards/adzuna.py new file mode 100644 index 0000000..fa57bdc --- /dev/null +++ b/scripts/custom_boards/adzuna.py @@ -0,0 +1,160 @@ +"""Adzuna Jobs API scraper. + +API docs: https://developer.adzuna.com/docs/search +Config: config/adzuna.yaml (gitignored β€” contains app_id + app_key) + +Each title in the search profile is queried as an exact phrase per location. +Returns a list of dicts compatible with scripts.db.insert_job(). +""" +from __future__ import annotations + +import time +from pathlib import Path + +import requests +import yaml + +_CONFIG_PATH = Path(__file__).parent.parent.parent / "config" / "adzuna.yaml" +_BASE_URL = "https://api.adzuna.com/v1/api/jobs/us/search" + + +def _load_config() -> tuple[str, str]: + if not _CONFIG_PATH.exists(): + raise FileNotFoundError( + f"Adzuna config not found: {_CONFIG_PATH}\n" + "Copy config/adzuna.yaml.example β†’ config/adzuna.yaml and fill in credentials." 
+ ) + cfg = yaml.safe_load(_CONFIG_PATH.read_text()) + app_id = (cfg.get("app_id") or "").strip() + app_key = (cfg.get("app_key") or "").strip() + if not app_id or not app_key: + raise ValueError( + "config/adzuna.yaml requires both 'app_id' and 'app_key'.\n" + "Find your App ID at https://developer.adzuna.com/admin/applications" + ) + return app_id, app_key + + +def _salary_str(job: dict) -> str: + lo = job.get("salary_min") + hi = job.get("salary_max") + try: + if lo and hi: + return f"${int(lo):,} – ${int(hi):,}" + if lo: + return f"${int(lo):,}+" + except (TypeError, ValueError): + pass + return "" + + +def _is_remote(location_display: str) -> bool: + return "remote" in location_display.lower() + + +def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]: + """Fetch jobs from the Adzuna API for a single location. + + Args: + profile: Search profile dict from search_profiles.yaml. + location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA"). + results_wanted: Maximum results to return across all titles. + + Returns: + List of job dicts with keys: title, company, url, source, location, + is_remote, salary, description. + """ + try: + app_id, app_key = _load_config() + except (FileNotFoundError, ValueError) as exc: + print(f" [adzuna] Skipped β€” {exc}") + return [] + + titles = profile.get("titles", []) + hours_old = profile.get("hours_old", 240) + max_days_old = max(1, hours_old // 24) + is_remote_search = location.lower() == "remote" + + session = requests.Session() + session.headers.update({"Accept": "application/json", "User-Agent": "Mozilla/5.0"}) + + seen_ids: set[str] = set() + results: list[dict] = [] + + for title in titles: + if len(results) >= results_wanted: + break + + page = 1 + while len(results) < results_wanted: + # Adzuna doesn't support where=remote β€” it treats it as a city name and + # returns 0 results. For remote searches, append "remote" to the what param. 
+ if is_remote_search: + params = { + "app_id": app_id, + "app_key": app_key, + "results_per_page": 50, + "what": f'"{title}" remote', + "sort_by": "date", + "max_days_old": max_days_old, + } + else: + params = { + "app_id": app_id, + "app_key": app_key, + "results_per_page": 50, + "what_phrase": title, + "where": location, + "sort_by": "date", + "max_days_old": max_days_old, + } + try: + resp = session.get(f"{_BASE_URL}/{page}", params=params, timeout=20) + except requests.RequestException as exc: + print(f" [adzuna] Request error ({title}): {exc}") + break + + if resp.status_code == 401: + print(" [adzuna] Auth failed β€” check app_id and app_key in config/adzuna.yaml") + return results + if resp.status_code != 200: + print(f" [adzuna] HTTP {resp.status_code} for '{title}' page {page}") + break + + data = resp.json() + jobs = data.get("results", []) + if not jobs: + break + + for job in jobs: + job_id = str(job.get("id", "")) + if job_id in seen_ids: + continue + seen_ids.add(job_id) + + loc_display = job.get("location", {}).get("display_name", "") + redirect_url = job.get("redirect_url", "") + if not redirect_url: + continue + + results.append({ + "title": job.get("title", ""), + "company": job.get("company", {}).get("display_name", ""), + "url": redirect_url, + "source": "adzuna", + "location": loc_display, + "is_remote": is_remote_search or _is_remote(loc_display), + "salary": _salary_str(job), + "description": job.get("description", ""), + }) + + total = data.get("count", 0) + if len(results) >= total or len(jobs) < 50: + break # last page + + page += 1 + time.sleep(0.5) # polite pacing between pages + + time.sleep(0.5) # between titles + + return results[:results_wanted] diff --git a/scripts/custom_boards/craigslist.py b/scripts/custom_boards/craigslist.py new file mode 100644 index 0000000..30226ae --- /dev/null +++ b/scripts/custom_boards/craigslist.py @@ -0,0 +1,177 @@ +"""Craigslist job scraper β€” RSS-based. 
+ +Uses Craigslist's native RSS feed endpoint for discovery. +Full job description is populated by the scrape_url background task. +Company name and salary (not structured in Craigslist listings) are +extracted from the description body by the enrich_craigslist task. + +Config: config/craigslist.yaml (gitignored β€” metro list + location map) + config/craigslist.yaml.example (committed template) + +Returns a list of dicts compatible with scripts.db.insert_job(). +""" +from __future__ import annotations + +import time +import xml.etree.ElementTree as ET +from datetime import datetime, timezone +from email.utils import parsedate_to_datetime +from pathlib import Path +from urllib.parse import quote_plus + +import requests +import yaml + +_CONFIG_PATH = Path(__file__).parent.parent.parent / "config" / "craigslist.yaml" +_DEFAULT_CATEGORY = "jjj" +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) +} +_TIMEOUT = 15 +_SLEEP = 0.5 # seconds between requests β€” easy to make configurable later + + +def _load_config() -> dict: + if not _CONFIG_PATH.exists(): + raise FileNotFoundError( + f"Craigslist config not found: {_CONFIG_PATH}\n" + "Copy config/craigslist.yaml.example β†’ config/craigslist.yaml " + "and configure your target metros." + ) + cfg = yaml.safe_load(_CONFIG_PATH.read_text()) or {} + if not cfg.get("metros"): + raise ValueError( + "config/craigslist.yaml must contain at least one entry under 'metros'." 
+ ) + return cfg + + +def _rss_url(metro: str, category: str, query: str) -> str: + return ( + f"https://{metro}.craigslist.org/search/{category}" + f"?query={quote_plus(query)}&format=rss&sort=date" + ) + + +def _parse_pubdate(pubdate_str: str) -> datetime | None: + """Parse an RSS pubDate string to a timezone-aware datetime.""" + try: + return parsedate_to_datetime(pubdate_str) + except Exception: + return None + + +def _fetch_rss(url: str) -> list[dict]: + """Fetch and parse a Craigslist RSS feed. Returns list of raw item dicts.""" + resp = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT) + resp.raise_for_status() + try: + root = ET.fromstring(resp.content) + except ET.ParseError as exc: + raise ValueError(f"Malformed RSS XML: {exc}") from exc + + items = [] + for item in root.findall(".//item"): + def _text(tag: str, _item=item) -> str: + el = _item.find(tag) + return (el.text or "").strip() if el is not None else "" + + items.append({ + "title": _text("title"), + "link": _text("link"), + "description": _text("description"), + "pubDate": _text("pubDate"), + }) + return items + + +def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]: + """Fetch jobs from Craigslist RSS for a single location. + + Args: + profile: Search profile dict from search_profiles.yaml. + location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA"). + results_wanted: Maximum results to return across all metros and titles. + + Returns: + List of job dicts with keys: title, company, url, source, location, + is_remote, salary, description. + company/salary are empty β€” filled later by enrich_craigslist task. 
+ """ + try: + cfg = _load_config() + except (FileNotFoundError, ValueError) as exc: + print(f" [craigslist] Skipped β€” {exc}") + return [] + + metros_all: list[str] = cfg.get("metros", []) + location_map: dict[str, str] = cfg.get("location_map", {}) + category: str = cfg.get("category") or _DEFAULT_CATEGORY + + is_remote_search = location.lower() == "remote" + if is_remote_search: + metros = metros_all + else: + metro = location_map.get(location) + if not metro: + print(f" [craigslist] No metro mapping for '{location}' β€” skipping") + return [] + metros = [metro] + + titles: list[str] = profile.get("titles", []) + hours_old: int = profile.get("hours_old", 240) + cutoff = datetime.now(tz=timezone.utc).timestamp() - (hours_old * 3600) + + seen_urls: set[str] = set() + results: list[dict] = [] + + for metro in metros: + if len(results) >= results_wanted: + break + + for title in titles: + if len(results) >= results_wanted: + break + + url = _rss_url(metro, category, title) + try: + items = _fetch_rss(url) + except requests.RequestException as exc: + print(f" [craigslist] HTTP error ({metro}/{title}): {exc}") + time.sleep(_SLEEP) + continue + except ValueError as exc: + print(f" [craigslist] Parse error ({metro}/{title}): {exc}") + time.sleep(_SLEEP) + continue + + for item in items: + if len(results) >= results_wanted: + break + + item_url = item.get("link", "") + if not item_url or item_url in seen_urls: + continue + + pub = _parse_pubdate(item.get("pubDate", "")) + if pub and pub.timestamp() < cutoff: + continue + + seen_urls.add(item_url) + results.append({ + "title": item.get("title", ""), + "company": "", + "url": item_url, + "source": "craigslist", + "location": f"{metro} (Craigslist)", + "is_remote": is_remote_search, + "salary": "", + "description": "", + }) + + time.sleep(_SLEEP) + + return results[:results_wanted] diff --git a/scripts/custom_boards/theladders.py b/scripts/custom_boards/theladders.py new file mode 100644 index 0000000..d7330af --- 
/dev/null +++ b/scripts/custom_boards/theladders.py @@ -0,0 +1,179 @@ +"""The Ladders scraper β€” Playwright-based (requires chromium installed). + +The Ladders is a client-side React app (no SSR __NEXT_DATA__). We use Playwright +to execute JS, wait for job cards to render, then extract from the DOM. + +Company names are hidden from guest (non-logged-in) users, but are encoded in +the job URL slug: /job/{title-slug}-{company-slug}-{location-slug}_{id} + +curl_cffi is no longer needed for this scraper; plain Playwright is sufficient. +playwright must be installed: `conda run -n job-seeker python -m playwright install chromium` + +Returns a list of dicts compatible with scripts.db.insert_job(). +""" +from __future__ import annotations + +import re +import time +from typing import Any + +_BASE = "https://www.theladders.com" +_SEARCH_PATH = "/jobs/searchjobs/{slug}" + +# Location slug in URLs for remote jobs +_REMOTE_SLUG = "virtual-travel" + + +def _company_from_url(href: str, title_slug: str) -> str: + """ + Extract company name from The Ladders job URL slug. + + URL format: /job/{title-slug}-{company-slug}-{location-slug}_{id}?ir=1 + Example: /job/customer-success-manager-gainsight-virtual-travel_85434789 + β†’ "Gainsight" + """ + # Strip path prefix and query + slug = href.split("/job/", 1)[-1].split("?")[0] + # Strip numeric ID suffix (e.g. 
_85434789) + slug = re.sub(r"_\d+$", "", slug) + # Strip known title prefix + if slug.startswith(title_slug + "-"): + slug = slug[len(title_slug) + 1:] + # Strip common location suffixes + for loc_suffix in [f"-{_REMOTE_SLUG}", "-new-york", "-los-angeles", + "-san-francisco", "-chicago", "-austin", "-seattle", + "-boston", "-atlanta", "-remote"]: + if slug.endswith(loc_suffix): + slug = slug[: -len(loc_suffix)] + break + # Convert kebab-case β†’ title case + return slug.replace("-", " ").title() if slug else "" + + +def _extract_jobs_js() -> str: + """JS to run in page context β€” extracts job data from rendered card elements.""" + return """() => { + const cards = document.querySelectorAll('[class*=job-card-container]'); + return Array.from(cards).map(card => { + const link = card.querySelector('p.job-link-wrapper a, a.clipped-text'); + const salary = card.querySelector('p.salary, .salary-info p'); + const locEl = card.querySelector('.remote-location-text, .location-info'); + const remoteEl = card.querySelector('.remote-flag-badge-remote'); + return { + title: link ? link.textContent.trim() : null, + href: link ? link.getAttribute('href') : null, + salary: salary ? salary.textContent.replace('*','').trim() : null, + location: locEl ? locEl.textContent.trim() : null, + is_remote: !!remoteEl, + }; + }).filter(j => j.title && j.href); + }""" + + +def scrape(profile: dict, location: str, results_wanted: int = 50) -> list[dict]: + """ + Scrape job listings from The Ladders using Playwright. + + Args: + profile: Search profile dict (uses 'titles'). + location: Location string (e.g. "Remote" or "San Francisco Bay Area, CA"). + results_wanted: Maximum results to return across all titles. + + Returns: + List of job dicts with keys: title, company, url, source, location, + is_remote, salary, description. 
+ """ + try: + from playwright.sync_api import sync_playwright + except ImportError: + print( + " [theladders] playwright not installed.\n" + " Install: conda run -n job-seeker pip install playwright && " + "conda run -n job-seeker python -m playwright install chromium" + ) + return [] + + is_remote_search = location.lower() == "remote" + results: list[dict] = [] + seen_urls: set[str] = set() + + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + ctx = browser.new_context( + user_agent=( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) + ) + page = ctx.new_page() + + for title in profile.get("titles", []): + if len(results) >= results_wanted: + break + + slug = title.lower().replace(" ", "-").replace("/", "-") + title_slug = slug # used for company extraction from URL + + params: dict[str, str] = {} + if is_remote_search: + params["remote"] = "true" + elif location: + params["location"] = location + + url = _BASE + _SEARCH_PATH.format(slug=slug) + if params: + query = "&".join(f"{k}={v}" for k, v in params.items()) + url = f"{url}?{query}" + + try: + page.goto(url, timeout=30_000) + page.wait_for_load_state("networkidle", timeout=20_000) + except Exception as exc: + print(f" [theladders] Page load error for '{title}': {exc}") + continue + + try: + raw_jobs: list[dict[str, Any]] = page.evaluate(_extract_jobs_js()) + except Exception as exc: + print(f" [theladders] JS extract error for '{title}': {exc}") + continue + + if not raw_jobs: + print(f" [theladders] No cards found for '{title}' β€” selector may need updating") + continue + + for job in raw_jobs: + href = job.get("href", "") + if not href: + continue + full_url = _BASE + href if href.startswith("/") else href + if full_url in seen_urls: + continue + seen_urls.add(full_url) + + company = _company_from_url(href, title_slug) + loc_text = (job.get("location") or "").replace("Remote", "").strip(", ") + if is_remote_search 
or job.get("is_remote"): + loc_display = "Remote" + (f" β€” {loc_text}" if loc_text and loc_text != "US-Anywhere" else "") + else: + loc_display = loc_text or location + + results.append({ + "title": job.get("title", ""), + "company": company, + "url": full_url, + "source": "theladders", + "location": loc_display, + "is_remote": bool(job.get("is_remote") or is_remote_search), + "salary": job.get("salary") or "", + "description": "", # not available in card view; scrape_url will fill in + }) + + if len(results) >= results_wanted: + break + + time.sleep(1) # polite pacing between titles + + browser.close() + + return results[:results_wanted] diff --git a/scripts/db.py b/scripts/db.py new file mode 100644 index 0000000..b2443a1 --- /dev/null +++ b/scripts/db.py @@ -0,0 +1,728 @@ +""" +SQLite staging layer for job listings. +Jobs flow: pending β†’ approved/rejected β†’ applied β†’ synced + applied β†’ phone_screen β†’ interviewing β†’ offer β†’ hired (or rejected) +""" +import sqlite3 +from datetime import datetime +from pathlib import Path +from typing import Optional + +DEFAULT_DB = Path(__file__).parent.parent / "staging.db" + +CREATE_JOBS = """ +CREATE TABLE IF NOT EXISTS jobs ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + title TEXT, + company TEXT, + url TEXT UNIQUE, + source TEXT, + location TEXT, + is_remote INTEGER DEFAULT 0, + salary TEXT, + description TEXT, + match_score REAL, + keyword_gaps TEXT, + date_found TEXT, + status TEXT DEFAULT 'pending', + notion_page_id TEXT, + cover_letter TEXT, + applied_at TEXT +); +""" + +CREATE_JOB_CONTACTS = """ +CREATE TABLE IF NOT EXISTS job_contacts ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + job_id INTEGER NOT NULL, + direction TEXT DEFAULT 'inbound', + subject TEXT, + from_addr TEXT, + to_addr TEXT, + body TEXT, + received_at TEXT, + is_response_needed INTEGER DEFAULT 0, + responded_at TEXT, + message_id TEXT, + FOREIGN KEY (job_id) REFERENCES jobs(id) +); +""" + +_CONTACT_MIGRATIONS = [ + ("message_id", "TEXT"), + 
("stage_signal", "TEXT"), + ("suggestion_dismissed", "INTEGER DEFAULT 0"), +] + +_RESEARCH_MIGRATIONS = [ + ("tech_brief", "TEXT"), + ("funding_brief", "TEXT"), + ("competitors_brief", "TEXT"), + ("red_flags", "TEXT"), + ("scrape_used", "INTEGER"), # 1 = SearXNG contributed data, 0 = LLM-only + ("accessibility_brief", "TEXT"), # Inclusion & Accessibility section +] + +CREATE_COMPANY_RESEARCH = """ +CREATE TABLE IF NOT EXISTS company_research ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + job_id INTEGER NOT NULL UNIQUE, + generated_at TEXT, + company_brief TEXT, + ceo_brief TEXT, + talking_points TEXT, + raw_output TEXT, + tech_brief TEXT, + funding_brief TEXT, + competitors_brief TEXT, + red_flags TEXT, + FOREIGN KEY (job_id) REFERENCES jobs(id) +); +""" + +CREATE_BACKGROUND_TASKS = """ +CREATE TABLE IF NOT EXISTS background_tasks ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + task_type TEXT NOT NULL, + job_id INTEGER NOT NULL, + status TEXT NOT NULL DEFAULT 'queued', + error TEXT, + created_at DATETIME DEFAULT (datetime('now')), + started_at DATETIME, + finished_at DATETIME, + stage TEXT, + updated_at DATETIME +) +""" + +CREATE_SURVEY_RESPONSES = """ +CREATE TABLE IF NOT EXISTS survey_responses ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + job_id INTEGER NOT NULL REFERENCES jobs(id), + survey_name TEXT, + received_at DATETIME, + source TEXT, + raw_input TEXT, + image_path TEXT, + mode TEXT, + llm_output TEXT, + reported_score TEXT, + created_at DATETIME DEFAULT CURRENT_TIMESTAMP +); +""" + +_MIGRATIONS = [ + ("cover_letter", "TEXT"), + ("applied_at", "TEXT"), + ("interview_date", "TEXT"), + ("rejection_stage", "TEXT"), + ("phone_screen_at", "TEXT"), + ("interviewing_at", "TEXT"), + ("offer_at", "TEXT"), + ("hired_at", "TEXT"), + ("survey_at", "TEXT"), +] + + +def _migrate_db(db_path: Path) -> None: + """Add new columns to existing tables without breaking old data.""" + conn = sqlite3.connect(db_path) + for col, coltype in _MIGRATIONS: + try: + conn.execute(f"ALTER TABLE 
# Default staging database location.
DEFAULT_DB = Path(__file__).parent.parent / "staging.db"


def init_db(db_path: Path = DEFAULT_DB) -> None:
    """Create tables if they don't exist, then apply column migrations."""
    conn = sqlite3.connect(db_path)
    for ddl in (CREATE_JOBS, CREATE_JOB_CONTACTS, CREATE_COMPANY_RESEARCH,
                CREATE_BACKGROUND_TASKS, CREATE_SURVEY_RESPONSES):
        conn.execute(ddl)
    conn.commit()
    conn.close()
    _migrate_db(db_path)


def insert_job(db_path: Path = DEFAULT_DB, job: dict = None) -> Optional[int]:
    """Insert a job row. Returns the new row id, or None when the URL is a
    duplicate (or *job* is None)."""
    if job is None:
        return None
    columns = ("title", "company", "url", "source", "location",
               "is_remote", "salary", "description", "date_found")
    values = [job.get(col, "") for col in columns]
    values[5] = int(bool(job.get("is_remote", False)))  # normalize to 0/1
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.execute(
            f"INSERT INTO jobs ({', '.join(columns)})"
            f" VALUES ({', '.join('?' * len(columns))})",
            values,
        )
        conn.commit()
        return cur.lastrowid
    except sqlite3.IntegrityError:
        return None  # duplicate URL (jobs.url is UNIQUE)
    finally:
        conn.close()


def get_job_by_id(db_path: Path = DEFAULT_DB, job_id: int = None) -> Optional[dict]:
    """Look up a single job by primary key; None when absent."""
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    row = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
    conn.close()
    return dict(row) if row else None
# Default staging database location.
DEFAULT_DB = Path(__file__).parent.parent / "staging.db"


def get_email_leads(db_path: Path = DEFAULT_DB) -> list[dict]:
    """Return pending jobs whose source is 'email', newest first."""
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    leads = [
        dict(r)
        for r in conn.execute(
            "SELECT * FROM jobs WHERE source = 'email' AND status = 'pending' "
            "ORDER BY date_found DESC, id DESC"
        ).fetchall()
    ]
    conn.close()
    return leads


def get_job_counts(db_path: Path = DEFAULT_DB) -> dict:
    """Map each job status to the number of rows in that status."""
    conn = sqlite3.connect(db_path)
    pairs = conn.execute(
        "SELECT status, COUNT(*) as n FROM jobs GROUP BY status"
    ).fetchall()
    conn.close()
    return dict(pairs)


def update_job_status(db_path: Path = DEFAULT_DB, ids: list[int] = None, status: str = "approved") -> None:
    """Set *status* on every job whose id is in *ids* (no-op for empty/None)."""
    if not ids:
        return
    placeholders = ",".join("?" * len(ids))
    conn = sqlite3.connect(db_path)
    conn.execute(
        f"UPDATE jobs SET status = ? WHERE id IN ({placeholders})",
        [status, *ids],
    )
    conn.commit()
    conn.close()


def get_existing_urls(db_path: Path = DEFAULT_DB) -> set[str]:
    """Return every URL already staged, regardless of status (dedup source)."""
    conn = sqlite3.connect(db_path)
    urls = {u for (u,) in conn.execute("SELECT url FROM jobs")}
    conn.close()
    return urls
WHERE id = ?", + (score, gaps, job_id), + ) + conn.commit() + conn.close() + + +def update_cover_letter(db_path: Path = DEFAULT_DB, job_id: int = None, text: str = "") -> None: + """Persist a generated/edited cover letter for a job.""" + if job_id is None: + return + conn = sqlite3.connect(db_path) + conn.execute("UPDATE jobs SET cover_letter = ? WHERE id = ?", (text, job_id)) + conn.commit() + conn.close() + + +_UPDATABLE_JOB_COLS = { + "title", "company", "url", "source", "location", "is_remote", + "salary", "description", "match_score", "keyword_gaps", +} + + +def update_job_fields(db_path: Path = DEFAULT_DB, job_id: int = None, + fields: dict = None) -> None: + """Update arbitrary job columns. Unknown keys are silently ignored.""" + if job_id is None or not fields: + return + safe = {k: v for k, v in fields.items() if k in _UPDATABLE_JOB_COLS} + if not safe: + return + conn = sqlite3.connect(db_path) + sets = ", ".join(f"{col} = ?" for col in safe) + conn.execute( + f"UPDATE jobs SET {sets} WHERE id = ?", + (*safe.values(), job_id), + ) + conn.commit() + conn.close() + + +def mark_applied(db_path: Path = DEFAULT_DB, ids: list[int] = None) -> None: + """Set status='applied' and record today's date for a list of job IDs.""" + if not ids: + return + today = datetime.now().isoformat()[:10] + conn = sqlite3.connect(db_path) + conn.execute( + f"UPDATE jobs SET status = 'applied', applied_at = ? WHERE id IN ({','.join('?' * len(ids))})", + [today] + list(ids), + ) + conn.commit() + conn.close() + + +def kill_stuck_tasks(db_path: Path = DEFAULT_DB) -> int: + """Mark all queued/running background tasks as failed. 
Returns count killed.""" + conn = sqlite3.connect(db_path) + count = conn.execute( + "UPDATE background_tasks SET status='failed', error='Killed by user'," + " finished_at=datetime('now') WHERE status IN ('queued','running')" + ).rowcount + conn.commit() + conn.close() + return count + + +def purge_email_data(db_path: Path = DEFAULT_DB) -> tuple[int, int]: + """Delete all job_contacts rows and email-sourced pending jobs. + Returns (contacts_deleted, jobs_deleted). + """ + conn = sqlite3.connect(db_path) + c1 = conn.execute("DELETE FROM job_contacts").rowcount + c2 = conn.execute("DELETE FROM jobs WHERE source='email'").rowcount + conn.commit() + conn.close() + return c1, c2 + + +def purge_jobs(db_path: Path = DEFAULT_DB, statuses: list[str] = None) -> int: + """Delete jobs matching given statuses. Returns number of rows deleted. + If statuses is None or empty, deletes ALL jobs (full reset). + """ + conn = sqlite3.connect(db_path) + if statuses: + placeholders = ",".join("?" * len(statuses)) + cur = conn.execute(f"DELETE FROM jobs WHERE status IN ({placeholders})", statuses) + else: + cur = conn.execute("DELETE FROM jobs") + count = cur.rowcount + conn.commit() + conn.close() + return count + + +def purge_non_remote(db_path: Path = DEFAULT_DB) -> int: + """Delete non-remote jobs that are not yet in the active pipeline. + Preserves applied, phone_screen, interviewing, offer, hired, and synced records. + Returns number of rows deleted. + """ + _safe = ("applied", "phone_screen", "interviewing", "offer", "hired", "synced") + placeholders = ",".join("?" * len(_safe)) + conn = sqlite3.connect(db_path) + count = conn.execute( + f"DELETE FROM jobs WHERE (is_remote = 0 OR is_remote IS NULL)" + f" AND status NOT IN ({placeholders})", + _safe, + ).rowcount + conn.commit() + conn.close() + return count + + +def archive_jobs(db_path: Path = DEFAULT_DB, statuses: list[str] = None) -> int: + """Set status='archived' for jobs matching given statuses. 
# ── Interview pipeline helpers ────────────────────────────────────────────────

# Default staging database location.
DEFAULT_DB = Path(__file__).parent.parent / "staging.db"

# Maps an interview stage to the jobs-table column holding its timestamp.
_STAGE_TS_COL = {
    "phone_screen": "phone_screen_at",
    "interviewing": "interviewing_at",
    "offer": "offer_at",
    "hired": "hired_at",
    "survey": "survey_at",
}


def get_interview_jobs(db_path: Path = DEFAULT_DB) -> dict[str, list[dict]]:
    """Return jobs grouped by post-application stage (applied … rejected)."""
    stages = ["applied", "survey", "phone_screen", "interviewing", "offer", "hired", "rejected"]
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    grouped = {
        stage: [
            dict(r)
            for r in conn.execute(
                "SELECT * FROM jobs WHERE status = ? ORDER BY applied_at DESC, id DESC",
                (stage,),
            ).fetchall()
        ]
        for stage in stages
    }
    conn.close()
    return grouped


def advance_to_stage(db_path: Path = DEFAULT_DB, job_id: int = None, stage: str = "") -> None:
    """Move a job to *stage*, stamping the stage's timestamp column if it has one."""
    now = datetime.now().isoformat()[:16]  # minute precision
    ts_col = _STAGE_TS_COL.get(stage)
    conn = sqlite3.connect(db_path)
    if ts_col is None:
        conn.execute("UPDATE jobs SET status = ? WHERE id = ?", (stage, job_id))
    else:
        # ts_col comes from _STAGE_TS_COL above, so the f-string is safe.
        conn.execute(
            f"UPDATE jobs SET status = ?, {ts_col} = ? WHERE id = ?",
            (stage, now, job_id),
        )
    conn.commit()
    conn.close()


def reject_at_stage(db_path: Path = DEFAULT_DB, job_id: int = None,
                    rejection_stage: str = "") -> None:
    """Mark a job rejected, remembering which stage the rejection came at."""
    conn = sqlite3.connect(db_path)
    conn.execute(
        "UPDATE jobs SET status = 'rejected', rejection_stage = ? WHERE id = ?",
        (rejection_stage, job_id),
    )
    conn.commit()
    conn.close()


def set_interview_date(db_path: Path = DEFAULT_DB, job_id: int = None,
                       date_str: str = "") -> None:
    """Persist an interview date string on a job row."""
    conn = sqlite3.connect(db_path)
    conn.execute("UPDATE jobs SET interview_date = ? WHERE id = ?", (date_str, job_id))
    conn.commit()
    conn.close()


# ── Contact log helpers ───────────────────────────────────────────────────────

def add_contact(db_path: Path = DEFAULT_DB, job_id: int = None,
                direction: str = "inbound", subject: str = "",
                from_addr: str = "", to_addr: str = "",
                body: str = "", received_at: str = "",
                message_id: str = "",
                stage_signal: str = "") -> int:
    """Log an email contact against a job. Returns the new row id."""
    stamp = received_at or datetime.now().isoformat()[:16]
    conn = sqlite3.connect(db_path)
    cur = conn.execute(
        """INSERT INTO job_contacts
               (job_id, direction, subject, from_addr, to_addr, body,
                received_at, message_id, stage_signal)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (job_id, direction, subject, from_addr, to_addr, body,
         stamp, message_id, stage_signal or None),
    )
    conn.commit()
    new_id = cur.lastrowid
    conn.close()
    return new_id
# Default staging database location.
DEFAULT_DB = Path(__file__).parent.parent / "staging.db"


def get_unread_stage_signals(db_path: Path = DEFAULT_DB,
                             job_id: int = None) -> list[dict]:
    """Inbound contacts carrying a stage signal that is neither 'neutral'
    nor already dismissed, oldest first."""
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    hits = conn.execute(
        """SELECT * FROM job_contacts
            WHERE job_id = ?
              AND direction = 'inbound'
              AND stage_signal IS NOT NULL
              AND stage_signal != 'neutral'
              AND (suggestion_dismissed IS NULL OR suggestion_dismissed = 0)
            ORDER BY received_at ASC""",
        (job_id,),
    ).fetchall()
    conn.close()
    return [dict(h) for h in hits]


def dismiss_stage_signal(db_path: Path = DEFAULT_DB,
                         contact_id: int = None) -> None:
    """Hide one stage-signal suggestion permanently."""
    conn = sqlite3.connect(db_path)
    conn.execute(
        "UPDATE job_contacts SET suggestion_dismissed = 1 WHERE id = ?",
        (contact_id,),
    )
    conn.commit()
    conn.close()


def get_all_message_ids(db_path: Path = DEFAULT_DB) -> set[str]:
    """Every non-empty email Message-ID seen across the contact log."""
    conn = sqlite3.connect(db_path)
    ids = {
        mid
        for (mid,) in conn.execute(
            "SELECT message_id FROM job_contacts WHERE message_id IS NOT NULL AND message_id != ''"
        )
    }
    conn.close()
    return ids


# ── Company research helpers ──────────────────────────────────────────────────

def save_research(db_path: Path = DEFAULT_DB, job_id: int = None,
                  company_brief: str = "", ceo_brief: str = "",
                  talking_points: str = "", raw_output: str = "",
                  tech_brief: str = "", funding_brief: str = "",
                  competitors_brief: str = "", red_flags: str = "",
                  accessibility_brief: str = "",
                  scrape_used: int = 0) -> None:
    """Upsert the company research record for a job (one row per job_id)."""
    stamp = datetime.now().isoformat()[:16]  # minute precision
    cols = ("job_id", "generated_at", "company_brief", "ceo_brief",
            "talking_points", "raw_output", "tech_brief", "funding_brief",
            "competitors_brief", "red_flags", "accessibility_brief",
            "scrape_used")
    # Every column except the conflict key is refreshed on re-generation.
    updates = ", ".join(f"{c} = excluded.{c}" for c in cols[1:])
    sql = (
        f"INSERT INTO company_research ({', '.join(cols)})"
        f" VALUES ({', '.join('?' * len(cols))})"
        f" ON CONFLICT(job_id) DO UPDATE SET {updates}"
    )
    conn = sqlite3.connect(db_path)
    conn.execute(sql, (job_id, stamp, company_brief, ceo_brief, talking_points,
                       raw_output, tech_brief, funding_brief, competitors_brief,
                       red_flags, accessibility_brief, scrape_used))
    conn.commit()
    conn.close()


def get_research(db_path: Path = DEFAULT_DB, job_id: int = None) -> Optional[dict]:
    """Company research record for *job_id*, or None if never generated."""
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    rec = conn.execute(
        "SELECT * FROM company_research WHERE job_id = ?", (job_id,)
    ).fetchone()
    conn.close()
    return dict(rec) if rec else None
# Default staging database location.
DEFAULT_DB = Path(__file__).parent.parent / "staging.db"


def get_survey_responses(db_path: Path = DEFAULT_DB, job_id: int = None) -> list[dict]:
    """All survey responses logged for a job, newest first."""
    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    responses = [
        dict(r)
        for r in conn.execute(
            "SELECT * FROM survey_responses WHERE job_id = ? ORDER BY created_at DESC",
            (job_id,),
        ).fetchall()
    ]
    conn.close()
    return responses


# ── Background task helpers ───────────────────────────────────────────────────

def insert_task(db_path: Path = DEFAULT_DB, task_type: str = "",
                job_id: int = None) -> tuple[int, bool]:
    """Queue a background task, deduplicating against live ones.

    Returns (task_id, True) when a new row was created, or
    (existing_id, False) when a queued/running task for the same
    (task_type, job_id) pair already exists.
    """
    conn = sqlite3.connect(db_path)
    live = conn.execute(
        "SELECT id FROM background_tasks WHERE task_type=? AND job_id=? AND status IN ('queued','running')",
        (task_type, job_id),
    ).fetchone()
    if live is not None:
        conn.close()
        return live[0], False
    cur = conn.execute(
        "INSERT INTO background_tasks (task_type, job_id, status) VALUES (?, ?, 'queued')",
        (task_type, job_id),
    )
    new_id = cur.lastrowid
    conn.commit()
    conn.close()
    return new_id, True


def update_task_status(db_path: Path = DEFAULT_DB, task_id: int = None,
                       status: str = "", error: Optional[str] = None) -> None:
    """Advance a task's status and stamp the matching timestamp column."""
    stamp = datetime.now().isoformat()[:16]  # minute precision
    conn = sqlite3.connect(db_path)
    if status == "running":
        conn.execute(
            "UPDATE background_tasks SET status=?, started_at=?, updated_at=? WHERE id=?",
            (status, stamp, stamp, task_id),
        )
    elif status in ("completed", "failed"):
        conn.execute(
            "UPDATE background_tasks SET status=?, finished_at=?, updated_at=?, error=? WHERE id=?",
            (status, stamp, stamp, error, task_id),
        )
    else:
        # Any other status just refreshes the heartbeat column.
        conn.execute(
            "UPDATE background_tasks SET status=?, updated_at=? WHERE id=?",
            (status, stamp, task_id),
        )
    conn.commit()
    conn.close()
WHERE id=?", (stage, task_id)) + conn.commit() + conn.close() + + +def get_active_tasks(db_path: Path = DEFAULT_DB) -> list[dict]: + """Return all queued/running tasks with job title and company joined in.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute(""" + SELECT bt.*, j.title, j.company + FROM background_tasks bt + LEFT JOIN jobs j ON j.id = bt.job_id + WHERE bt.status IN ('queued', 'running') + ORDER BY bt.created_at ASC + """).fetchall() + conn.close() + return [dict(r) for r in rows] + + +def get_task_for_job(db_path: Path = DEFAULT_DB, task_type: str = "", + job_id: int = None) -> Optional[dict]: + """Return the most recent task row for a (task_type, job_id) pair, or None.""" + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + row = conn.execute( + """SELECT * FROM background_tasks + WHERE task_type=? AND job_id=? + ORDER BY id DESC LIMIT 1""", + (task_type, job_id), + ).fetchone() + conn.close() + return dict(row) if row else None diff --git a/scripts/discover.py b/scripts/discover.py new file mode 100644 index 0000000..bd7530a --- /dev/null +++ b/scripts/discover.py @@ -0,0 +1,285 @@ +# scripts/discover.py +""" +JobSpy β†’ SQLite staging pipeline (default) or Notion (notion_push=True). 

Usage:
    conda run -n job-seeker python scripts/discover.py
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))  # allow `scripts.` imports when run directly

import yaml
from datetime import datetime

import pandas as pd
from jobspy import scrape_jobs
from notion_client import Client

from scripts.db import DEFAULT_DB, init_db, insert_job, get_existing_urls as db_existing_urls
from scripts.custom_boards import adzuna as _adzuna
from scripts.custom_boards import theladders as _theladders
from scripts.custom_boards import craigslist as _craigslist

CONFIG_DIR = Path(__file__).parent.parent / "config"
NOTION_CFG = CONFIG_DIR / "notion.yaml"
PROFILES_CFG = CONFIG_DIR / "search_profiles.yaml"
BLOCKLIST_CFG = CONFIG_DIR / "blocklist.yaml"

# Registry of custom board scrapers keyed by name used in search_profiles.yaml
CUSTOM_SCRAPERS: dict[str, object] = {
    "adzuna": _adzuna.scrape,
    "theladders": _theladders.scrape,
    "craigslist": _craigslist.scrape,
}


def load_config() -> tuple[dict, dict]:
    """Load (search profiles, Notion config) from the YAML files in config/."""
    profiles = yaml.safe_load(PROFILES_CFG.read_text())
    notion_cfg = yaml.safe_load(NOTION_CFG.read_text())
    return profiles, notion_cfg


def load_blocklist() -> dict:
    """Load global blocklist config.

    Returns dict with companies, industries, locations lists.
    """
    if not BLOCKLIST_CFG.exists():
        return {"companies": [], "industries": [], "locations": []}
    raw = yaml.safe_load(BLOCKLIST_CFG.read_text()) or {}
    # Lower-case once here so all matching in _is_blocklisted is case-insensitive.
    return {
        "companies": [c.lower() for c in raw.get("companies", []) if c],
        "industries": [i.lower() for i in raw.get("industries", []) if i],
        "locations": [loc.lower() for loc in raw.get("locations", []) if loc],
    }


def _is_blocklisted(job_row: dict, blocklist: dict) -> bool:
    """Return True if this job matches any global blocklist rule."""
    company_lower = (job_row.get("company") or "").lower()
    location_lower = (job_row.get("location") or "").lower()
    desc_lower = (job_row.get("description") or "").lower()
    # Industry terms are matched against company name AND description text.
    content_lower = f"{company_lower} {desc_lower}"

    if any(bl in company_lower for bl in blocklist["companies"]):
        return True
    if any(bl in content_lower for bl in blocklist["industries"]):
        return True
    if any(bl in location_lower for bl in blocklist["locations"]):
        return True
    return False


def get_existing_urls(notion: Client, db_id: str, url_field: str) -> set[str]:
    """Return the set of all job URLs already tracked in Notion (for notion_push mode)."""
    existing: set[str] = set()
    has_more = True
    start_cursor = None
    while has_more:  # paginate through the database 100 pages at a time
        kwargs: dict = {"database_id": db_id, "page_size": 100}
        if start_cursor:
            kwargs["start_cursor"] = start_cursor
        resp = notion.databases.query(**kwargs)
        for page in resp["results"]:
            url = page["properties"].get(url_field, {}).get("url")
            if url:
                existing.add(url)
        has_more = resp.get("has_more", False)
        start_cursor = resp.get("next_cursor")
    return existing


def push_to_notion(notion: Client, db_id: str, job: dict, fm: dict) -> None:
    """Create a new page in the Notion jobs database for a single listing."""
    min_amt = job.get("min_amount")
    max_amt = job.get("max_amount")
    # Page title prefers a parsed salary range, then the raw salary source,
    # then falls back to the job title.
    if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)):
        title_content = f"${int(min_amt):,} – ${int(max_amt):,}"
    elif job.get("salary_source") and str(job["salary_source"]) not in ("nan", "None", ""):
        title_content = str(job["salary_source"])
    else:
        title_content = str(job.get("title", "Unknown"))

    job_url = str(job.get("job_url", "") or "")
    if job_url in ("nan", "None"):  # pandas NaN stringified
        job_url = ""

    notion.pages.create(
        parent={"database_id": db_id},
        properties={
            fm["title_field"]: {"title": [{"text": {"content": title_content}}]},
            fm["job_title"]: {"rich_text": [{"text": {"content": str(job.get("title", "Unknown"))}}]},
            fm["company"]: {"rich_text": [{"text": {"content": str(job.get("company", "") or "")}}]},
            fm["url"]: {"url": job_url or None},
            fm["source"]: {"multi_select": [{"name": str(job.get("site", "unknown")).title()}]},
            fm["status"]: {"select": {"name": fm["status_new"]}},
            fm["remote"]: {"checkbox": bool(job.get("is_remote", False))},
            fm["date_found"]: {"date": {"start": datetime.now().isoformat()[:10]}},
        },
    )


def run_discovery(db_path: Path = DEFAULT_DB, notion_push: bool = False) -> int:
    """Scrape all configured boards for every profile/location and stage new jobs.

    Returns the number of newly inserted listings.  (Return annotation fixed
    from ``None`` — the function ends with ``return new_count``.)
    """
    profiles_cfg, notion_cfg = load_config()
    fm = notion_cfg["field_map"]
    blocklist = load_blocklist()

    _bl_summary = {k: len(v) for k, v in blocklist.items() if v}
    if _bl_summary:
        print(f"[discover] Blocklist active: {_bl_summary}")

    # SQLite dedup — by URL and by (title, company) to catch cross-board reposts
    init_db(db_path)
    existing_urls = db_existing_urls(db_path)

    import sqlite3 as _sqlite3
    _conn = _sqlite3.connect(db_path)
    # Title truncated to 80 chars so minor suffix differences still dedup.
    existing_tc = {
        (r[0].lower().strip()[:80], r[1].lower().strip())
        for r in _conn.execute("SELECT title, company FROM jobs").fetchall()
    }
    _conn.close()

    # Notion dedup (only in notion_push mode)
    notion = None
    if notion_push:
        notion = Client(auth=notion_cfg["token"])
        existing_urls |= get_existing_urls(notion, notion_cfg["database_id"], fm["url"])

    print(f"[discover] {len(existing_urls)} existing listings in DB")
    new_count = 0

    def _s(val, default="") -> str:
        """Convert a value to str, treating pandas NaN/None as default."""
        if val is None:
            return default
        s = str(val)
        return default if s in ("nan", "None", "NaN") else s

    def _insert_if_new(job_row: dict, source_label: str) -> bool:
        """Dedup-check, blocklist-check, and insert a job dict. Returns True if inserted."""
        url = job_row.get("url", "")
        if not url or url in existing_urls:
            return False

        # Global blocklist — checked before anything else
        if _is_blocklisted(job_row, blocklist):
            return False

        # Per-profile exclude keywords (matched in title OR description).
        title_lower = job_row.get("title", "").lower()
        desc_lower = job_row.get("description", "").lower()
        exclude_kw = job_row.get("_exclude_kw", [])
        if any(kw in title_lower or kw in desc_lower for kw in exclude_kw):
            return False

        tc_key = (title_lower[:80], job_row.get("company", "").lower().strip())
        if tc_key in existing_tc:
            return False
        existing_tc.add(tc_key)

        insert_job(db_path, {
            "title": job_row.get("title", ""),
            "company": job_row.get("company", ""),
            "url": url,
            "source": job_row.get("source", source_label),
            "location": job_row.get("location", ""),
            "is_remote": bool(job_row.get("is_remote", False)),
            "salary": job_row.get("salary", ""),
            "description": job_row.get("description", ""),
            "date_found": datetime.now().isoformat()[:10],
        })
        existing_urls.add(url)
        return True

    for profile in profiles_cfg["profiles"]:
        print(f"\n[discover] ── Profile: {profile['name']} ──")
        boards = profile.get("boards", [])
        custom_boards = profile.get("custom_boards", [])
        exclude_kw = [kw.lower() for kw in profile.get("exclude_keywords", [])]
        results_per_board = profile.get("results_per_board", 25)

        for location in profile["locations"]:

            # ── JobSpy boards ──────────────────────────────────────────────────
            if boards:
                print(f"  [jobspy] {location} β€” boards: {', '.join(boards)}")
                try:
                    jobs: pd.DataFrame = scrape_jobs(
                        site_name=boards,
                        # quoted OR-search over all profile titles
                        search_term=" OR ".join(f'"{t}"' for t in profile["titles"]),
                        location=location,
                        results_wanted=results_per_board,
                        hours_old=profile.get("hours_old", 72),
                        linkedin_fetch_description=True,
                    )
                    print(f"  [jobspy] {len(jobs)} raw results")
                except Exception as exc:
                    # A failed board scrape should not abort the whole run.
                    print(f"  [jobspy] ERROR: {exc}")
                    jobs = pd.DataFrame()

                jobspy_new = 0
                for _, job in jobs.iterrows():
                    url = str(job.get("job_url", "") or "")
                    if not url or url in ("nan", "None"):
                        continue

                    job_dict = job.to_dict()

                    # Build salary string from JobSpy numeric fields
                    min_amt = job_dict.get("min_amount")
                    max_amt = job_dict.get("max_amount")
                    salary_str = ""
                    if min_amt and max_amt and not (pd.isna(min_amt) or pd.isna(max_amt)):
                        salary_str = f"${int(min_amt):,} – ${int(max_amt):,}"
                    elif job_dict.get("salary_source") and str(job_dict["salary_source"]) not in ("nan", "None", ""):
                        salary_str = str(job_dict["salary_source"])

                    row = {
                        "url": url,
                        "title": _s(job_dict.get("title")),
                        "company": _s(job_dict.get("company")),
                        "source": _s(job_dict.get("site")),
                        "location": _s(job_dict.get("location")),
                        "is_remote": bool(job_dict.get("is_remote", False)),
                        "salary": salary_str,
                        "description": _s(job_dict.get("description")),
                        "_exclude_kw": exclude_kw,
                    }
                    if _insert_if_new(row, _s(job_dict.get("site"))):
                        if notion_push:
                            push_to_notion(notion, notion_cfg["database_id"], job_dict, fm)
                        new_count += 1
                        jobspy_new += 1
                        print(f"    + {row['title']} @ {row['company']} [{row['source']}]")

                print(f"  [jobspy] {jobspy_new} new listings from {location}")

            # ── Custom boards ──────────────────────────────────────────────────
            for board_name in custom_boards:
                scraper_fn = CUSTOM_SCRAPERS.get(board_name)
                if scraper_fn is None:
                    print(f"  [{board_name}] Unknown scraper β€” skipping (not in CUSTOM_SCRAPERS registry)")
                    continue

                print(f"  [{board_name}] {location} β€” fetching up to {results_per_board} results …")
                try:
                    custom_jobs = scraper_fn(profile, location, results_wanted=results_per_board)
                except Exception as exc:
                    print(f"  [{board_name}] ERROR: {exc}")
                    custom_jobs = []

                print(f"  [{board_name}] {len(custom_jobs)} raw results")
                board_new = 0
                for job in custom_jobs:
                    row = {**job, "_exclude_kw": exclude_kw}
                    if _insert_if_new(row, board_name):
                        new_count += 1
                        board_new += 1
                        print(f"    + {job.get('title')} @ {job.get('company')} [{board_name}]")

                print(f"  [{board_name}] {board_new} new listings from {location}")

    print(f"\n[discover] Done β€” {new_count} new listings staged total.")
    return new_count


if __name__ == "__main__":
    run_discovery()
diff --git a/scripts/enrich_descriptions.py b/scripts/enrich_descriptions.py
new file mode 100644
index 0000000..dce1cae
--- /dev/null
+++ b/scripts/enrich_descriptions.py
@@ -0,0 +1,284 @@
# scripts/enrich_descriptions.py
"""
Post-discovery enrichment: retry Glassdoor job description fetches that
returned empty/null during the initial scrape (usually rate-limit 429s or
expired listings mid-batch).

Fetches descriptions one at a time with a configurable delay between
requests to stay under Glassdoor's rate limit.

Usage:
    conda run -n job-seeker python scripts/enrich_descriptions.py
    conda run -n job-seeker python scripts/enrich_descriptions.py --dry-run
    conda run -n job-seeker python scripts/enrich_descriptions.py --delay 2.0
"""
import re
import sqlite3
import sys
import time
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

from scripts.db import DEFAULT_DB, init_db

DELAY_SECS = 1.5  # seconds between description fetches

def _extract_job_id(url: str) -> str | None:
    """Pull the Glassdoor listing ID from a job URL (…?jl=1234567890)."""
    m = re.search(r"jl=(\d+)", url or "")
    return m.group(1) if m else None


def _setup_scraper():
    """
    Create a Glassdoor scraper instance initialised just enough to call
    _fetch_job_description() — skips the full job-search setup.
+ """ + from jobspy.glassdoor import Glassdoor + from jobspy.glassdoor.constant import fallback_token, headers + from jobspy.model import ScraperInput, Site + from jobspy.util import create_session + + scraper = Glassdoor() + scraper.base_url = "https://www.glassdoor.com/" + scraper.session = create_session(has_retry=True) + token = scraper._get_csrf_token() + headers["gd-csrf-token"] = token if token else fallback_token + scraper.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR]) + return scraper + + +def enrich_glassdoor_descriptions( + db_path: Path = DEFAULT_DB, + dry_run: bool = False, + delay: float = DELAY_SECS, +) -> dict: + """ + Find Glassdoor jobs with missing descriptions and re-fetch them. + + Returns: + {"attempted": N, "succeeded": N, "failed": N, "errors": [...]} + """ + init_db(db_path) + + conn = sqlite3.connect(db_path) + rows = conn.execute( + """SELECT id, url, company, title FROM jobs + WHERE source = 'glassdoor' + AND (description IS NULL OR TRIM(description) = '') + ORDER BY id ASC""" + ).fetchall() + conn.close() + + result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []} + + if not rows: + print("[enrich] No Glassdoor jobs missing descriptions.") + return result + + print(f"[enrich] {len(rows)} Glassdoor job(s) missing descriptions β€” fetching…") + + try: + scraper = _setup_scraper() + except Exception as e: + msg = f"Glassdoor scraper init failed: {e}" + result["errors"].append(msg) + result["failed"] = len(rows) + print(f"[enrich] ERROR β€” {msg}") + return result + + for db_id, url, company, title in rows: + job_id = _extract_job_id(url) + if not job_id: + msg = f"job #{db_id}: cannot extract listing ID from URL: {url}" + result["errors"].append(msg) + result["failed"] += 1 + print(f"[enrich] SKIP β€” {msg}") + continue + + try: + description = scraper._fetch_job_description(int(job_id)) + if description and description.strip(): + if not dry_run: + upd = sqlite3.connect(db_path) + upd.execute( + "UPDATE jobs 
SET description = ? WHERE id = ?", + (description, db_id), + ) + upd.commit() + upd.close() + tag = "[DRY-RUN] " if dry_run else "" + print(f"[enrich] {tag}{company} β€” {title}: {len(description)} chars") + result["succeeded"] += 1 + else: + print(f"[enrich] {company} β€” {title}: empty response (expired listing?)") + result["failed"] += 1 + except Exception as e: + msg = f"job #{db_id} ({company}): {e}" + result["errors"].append(msg) + result["failed"] += 1 + print(f"[enrich] ERROR β€” {msg}") + + if delay > 0: + time.sleep(delay) + + return result + + +def enrich_all_descriptions( + db_path: Path = DEFAULT_DB, + dry_run: bool = False, + delay: float = DELAY_SECS, +) -> dict: + """ + Find ALL jobs with missing/empty descriptions (any source) and re-fetch them. + + Uses scrape_job_url for every source β€” it handles LinkedIn, Indeed, Glassdoor, + Adzuna, The Ladders, and any generic URL via JSON-LD / og: tags. + + Returns: + {"attempted": N, "succeeded": N, "failed": N, "errors": [...]} + """ + from scripts.scrape_url import scrape_job_url + + init_db(db_path) + + conn = sqlite3.connect(db_path) + rows = conn.execute( + """SELECT id, url, company, title, source FROM jobs + WHERE (description IS NULL OR TRIM(description) = '') + AND url IS NOT NULL AND url != '' + ORDER BY source, id ASC""" + ).fetchall() + conn.close() + + result = {"attempted": len(rows), "succeeded": 0, "failed": 0, "errors": []} + + if not rows: + print("[enrich] No jobs with missing descriptions.") + return result + + print(f"[enrich] {len(rows)} job(s) missing descriptions β€” fetching…") + + for db_id, url, company, title, source in rows: + if not url.startswith("http"): + result["failed"] += 1 + continue + + tag = "[DRY-RUN] " if dry_run else "" + try: + fields = {} if dry_run else scrape_job_url(db_path, db_id) + if fields or dry_run: + desc_len = len(fields.get("description", "") or "") + print(f"[enrich] {tag}[{source}] {company} β€” {title}: {desc_len} chars") + result["succeeded"] += 1 
+ else: + print(f"[enrich] [{source}] {company} β€” {title}: no data returned") + result["failed"] += 1 + except Exception as e: + msg = f"job #{db_id} ({company}): {e}" + result["errors"].append(msg) + result["failed"] += 1 + print(f"[enrich] ERROR β€” {msg}") + + if delay > 0: + time.sleep(delay) + + return result + + +def enrich_craigslist_fields( + db_path: Path = DEFAULT_DB, + job_id: int = None, +) -> dict: + """ + Use LLM to extract company name and salary from a Craigslist job description. + + Called after scrape_url populates the description for a craigslist job. + Only runs when: source='craigslist', company='', description non-empty. + + Returns dict with keys 'company' and/or 'salary' (may be empty strings). + """ + import json + + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + row = conn.execute( + "SELECT id, description, company, source FROM jobs WHERE id=?", (job_id,) + ).fetchone() + conn.close() + + if not row: + return {} + if row["source"] != "craigslist": + return {} + if row["company"]: # already populated + return {} + if not (row["description"] or "").strip(): + return {} + + from scripts.llm_router import LLMRouter + + prompt = ( + "Extract the following from this job posting. 
" + "Return JSON only, no commentary.\n\n" + '{"company": "", ' + '"salary": ""}\n\n' + f"Posting:\n{row['description'][:3000]}" + ) + + try: + router = LLMRouter() + raw = router.complete(prompt) + except Exception as exc: + print(f"[enrich_craigslist] LLM error for job {job_id}: {exc}") + return {} + + try: + clean = re.sub(r"```(?:json)?|```", "", raw).strip() + fields = json.loads(clean) + except (json.JSONDecodeError, ValueError): + print(f"[enrich_craigslist] Could not parse LLM response for job {job_id}: {raw!r}") + return {} + + extracted = { + k: (fields.get(k) or "").strip() + for k in ("company", "salary") + if (fields.get(k) or "").strip() + } + + if extracted: + from scripts.db import update_job_fields + update_job_fields(db_path, job_id, extracted) + print(f"[enrich_craigslist] job {job_id}: " + f"company={extracted.get('company', 'β€”')} " + f"salary={extracted.get('salary', 'β€”')}") + + return extracted + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Re-fetch missing job descriptions (all sources)" + ) + parser.add_argument("--glassdoor-only", action="store_true", + help="Only re-fetch Glassdoor listings (legacy behaviour)") + parser.add_argument("--dry-run", action="store_true", + help="Show what would be fetched without saving") + parser.add_argument("--delay", type=float, default=DELAY_SECS, + help=f"Seconds between requests (default: {DELAY_SECS})") + args = parser.parse_args() + + if args.glassdoor_only: + r = enrich_glassdoor_descriptions(dry_run=args.dry_run, delay=args.delay) + else: + r = enrich_all_descriptions(dry_run=args.dry_run, delay=args.delay) + + print( + f"\n[enrich] Done β€” {r['succeeded']} fetched, {r['failed']} failed" + + (f", {len(r['errors'])} error(s)" if r["errors"] else "") + ) diff --git a/scripts/finetune_local.py b/scripts/finetune_local.py new file mode 100644 index 0000000..6dfa406 --- /dev/null +++ b/scripts/finetune_local.py @@ -0,0 +1,248 @@ +#!/usr/bin/env 
python3
# scripts/finetune_local.py
"""
Local LoRA fine-tune on Alex's cover letter corpus.
No HuggingFace account or internet required after the base model is cached.

Usage:
    conda run -n ogma python scripts/finetune_local.py
    conda run -n ogma python scripts/finetune_local.py --model unsloth/Llama-3.2-3B-Instruct
    conda run -n ogma python scripts/finetune_local.py --epochs 15 --rank 16

After training, follow the printed instructions to load the model into Ollama.
"""
import argparse
import json
import os
import sys
from pathlib import Path

# Limit CUDA to GPU 0. device_map={"":0} in FastLanguageModel.from_pretrained
# pins every layer to GPU 0, avoiding the accelerate None-device bug that
# occurs with device_map="auto" on multi-GPU machines with 4-bit quantisation.
# Do NOT set WORLD_SIZE/RANK — that triggers torch.distributed initialisation.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

# ── Config ────────────────────────────────────────────────────────────────────
DEFAULT_MODEL = "unsloth/Llama-3.2-3B-Instruct"  # safe on 8 GB VRAM
LETTERS_JSONL = Path("/Library/Documents/JobSearch/training_data/cover_letters.jsonl")
OUTPUT_DIR = Path("/Library/Documents/JobSearch/training_data/finetune_output")
GGUF_DIR = Path("/Library/Documents/JobSearch/training_data/gguf")
OLLAMA_NAME = "alex-cover-writer"

SYSTEM_PROMPT = (
    "You are Alex Rivera's personal cover letter writer. "
    "Write professional, warm, and results-focused cover letters in Alex's voice. "
    "Draw on her background in customer success, technical account management, "
    "and revenue operations. Be specific and avoid generic filler."
)

# ── Args ──────────────────────────────────────────────────────────────────────
parser = argparse.ArgumentParser()
parser.add_argument("--model", default=DEFAULT_MODEL, help="Base model (HF repo id or local path)")
parser.add_argument("--epochs", type=int, default=10, help="Training epochs (default: 10)")
parser.add_argument("--rank", type=int, default=16, help="LoRA rank (default: 16)")
parser.add_argument("--batch", type=int, default=2, help="Per-device batch size (default: 2)")
parser.add_argument("--no-gguf", action="store_true", help="Skip GGUF export")
parser.add_argument("--max-length", type=int, default=1024, help="Max token length (default: 1024)")
args = parser.parse_args()

print(f"\n{'='*60}")
print(f"  Alex Cover Letter Fine-Tuner")
print(f"  Base model : {args.model}")
print(f"  Epochs     : {args.epochs}")
print(f"  LoRA rank  : {args.rank}")
print(f"  Dataset    : {LETTERS_JSONL}")
print(f"{'='*60}\n")

# ── Load dataset ──────────────────────────────────────────────────────────────
if not LETTERS_JSONL.exists():
    sys.exit(f"ERROR: Dataset not found at {LETTERS_JSONL}\n"
             "Run: conda run -n job-seeker python scripts/prepare_training_data.py")

records = [json.loads(l) for l in LETTERS_JSONL.read_text().splitlines() if l.strip()]
print(f"Loaded {len(records)} training examples.")

# Convert to chat format expected by SFTTrainer
def to_messages(rec: dict) -> dict:
    """Wrap one {instruction, output} record as a system/user/assistant chat."""
    return {"messages": [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": rec["instruction"]},
        {"role": "assistant", "content": rec["output"]},
    ]}

chat_data = [to_messages(r) for r in records]

# ── Load model with unsloth ───────────────────────────────────────────────────
try:
    from unsloth import FastLanguageModel
    USE_UNSLOTH = True
except ImportError:
    USE_UNSLOTH = False
    print("WARNING: unsloth not found β€” falling back to standard transformers + PEFT")
    print("         Install: pip install 'unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git'")

import torch

if USE_UNSLOTH:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name     = args.model,
        max_seq_length = args.max_length,
        load_in_4bit   = True,       # QLoRA — fits 7-9B in 8 GB VRAM
        dtype          = None,       # auto-detect
        device_map     = {"": 0},    # pin everything to GPU 0; avoids accelerate None-device bug
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r              = args.rank,
        lora_alpha     = args.rank * 2,
        lora_dropout   = 0,          # 0 = full unsloth kernel patching (faster)
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
        bias           = "none",
        use_gradient_checkpointing = "unsloth",
    )
else:
    # Fallback path: plain transformers 4-bit + PEFT LoRA.
    from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
    from peft import LoraConfig, get_peft_model, TaskType

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        quantization_config=bnb_config,
        device_map="auto",
    )
    lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank * 2,
        lora_dropout=0.05,
        task_type=TaskType.CAUSAL_LM,
    )
    model = get_peft_model(model, lora_config)
    model.print_trainable_parameters()

# ── Build HF Dataset ──────────────────────────────────────────────────────────
from datasets import Dataset

raw = Dataset.from_list(chat_data)
split = raw.train_test_split(test_size=0.1, seed=42)  # fixed seed: reproducible split
train_ds = split["train"]
eval_ds = split["test"]
print(f"Train: {len(train_ds)}  Eval: {len(eval_ds)}")

# formatting_func must ALWAYS return a list of strings.
# Unsloth tests it with a single example dict; during training it gets batches.
# Gemma 2 has no "system" role — fold it into the first user turn.
def _apply_template(msgs):
    """Render one message list to a prompt string via the tokenizer's chat template."""
    msgs = list(msgs)
    if msgs and msgs[0]["role"] == "system":
        sys_text = msgs.pop(0)["content"]
        if msgs and msgs[0]["role"] == "user":
            msgs[0] = {"role": "user", "content": f"{sys_text}\n\n{msgs[0]['content']}"}
    return tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=False)

def formatting_func(example):
    """Format a single example or a batch into a list of template strings."""
    msgs_field = example["messages"]
    # Single example: messages is a list of role dicts {"role":..., "content":...}
    # Batched example: messages is a list of those lists
    if msgs_field and isinstance(msgs_field[0], dict):
        return [_apply_template(msgs_field)]
    return [_apply_template(m) for m in msgs_field]

# ── Train ─────────────────────────────────────────────────────────────────────
from trl import SFTTrainer, SFTConfig

OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    formatting_func=formatting_func,
    args=SFTConfig(
        output_dir                  = str(OUTPUT_DIR),
        num_train_epochs            = args.epochs,
        per_device_train_batch_size = args.batch,
        gradient_accumulation_steps = max(1, 8 // args.batch),  # effective batch ≈ 8
        learning_rate               = 2e-4,
        warmup_ratio                = 0.1,
        lr_scheduler_type           = "cosine",
        fp16                        = not torch.cuda.is_bf16_supported(),
        bf16                        = torch.cuda.is_bf16_supported(),
        logging_steps               = 5,
        eval_strategy               = "epoch",
        save_strategy               = "epoch",
        load_best_model_at_end      = True,
        max_length                  = args.max_length,
        report_to                   = "none",
        push_to_hub                 = False,  # local only
    ),
)

print("\nStarting training…")
trainer.train()
print("Training complete.")

# ── Save adapter ──────────────────────────────────────────────────────────────
adapter_path = OUTPUT_DIR / "adapter"
model.save_pretrained(str(adapter_path))
tokenizer.save_pretrained(str(adapter_path))
print(f"\nLoRA adapter saved to: {adapter_path}")

# ── GGUF export ───────────────────────────────────────────────────────────────
if not args.no_gguf and USE_UNSLOTH:
    GGUF_DIR.mkdir(parents=True, exist_ok=True)
    gguf_path = GGUF_DIR / f"{OLLAMA_NAME}.gguf"
    print(f"\nExporting GGUF β†’ {gguf_path} …")
    model.save_pretrained_gguf(
        str(GGUF_DIR / OLLAMA_NAME),
        tokenizer,
        quantization_method="q4_k_m",
    )
    # unsloth names the file automatically — find it
    gguf_files = list(GGUF_DIR.glob("*.gguf"))
    if gguf_files:
        gguf_path = gguf_files[0]
        print(f"GGUF written: {gguf_path}")
    else:
        print("GGUF export may have succeeded β€” check GGUF_DIR above.")
else:
    gguf_path = None

# ── Print next steps ──────────────────────────────────────────────────────────
print(f"\n{'='*60}")
print(" DONE β€” next steps to load into Ollama:")
print(f"{'='*60}")

if gguf_path and gguf_path.exists():
    modelfile = OUTPUT_DIR / "Modelfile"
    modelfile.write_text(f"""FROM {gguf_path}
SYSTEM \"\"\"
{SYSTEM_PROMPT}
\"\"\"
PARAMETER temperature 0.7
PARAMETER top_p 0.9
PARAMETER num_ctx 32768
""")
    print(f"\n1. Modelfile written to: {modelfile}")
    print(f"\n2. Create the Ollama model:")
    print(f"   ollama create {OLLAMA_NAME} -f {modelfile}")
    print(f"\n3. Test it:")
    print(f"   ollama run {OLLAMA_NAME} 'Write a cover letter for a Senior Customer Success Manager position at Acme Corp.'")
    print(f"\n4. Update llm.yaml to use '{OLLAMA_NAME}:latest' as the ollama model,")
    print(f"   then pick it in Settings β†’ LLM Backends β†’ Ollama β†’ Model.")
else:
    print(f"\n  Adapter only (no GGUF). To convert manually:")
    print(f"  1. Merge adapter:")
    print(f"     conda run -n ogma python -c \"")
    print(f"     from peft import AutoPeftModelForCausalLM")
    print(f"     m = AutoPeftModelForCausalLM.from_pretrained('{adapter_path}')")
    print(f"     m.merge_and_unload().save_pretrained('{OUTPUT_DIR}/merged')\"")
    print(f"  2. Convert to GGUF using textgen env's convert_hf_to_gguf.py")
    print(f"  3. ollama create {OLLAMA_NAME} -f Modelfile")
print()
diff --git a/scripts/generate_cover_letter.py b/scripts/generate_cover_letter.py
new file mode 100644
index 0000000..071dd41
--- /dev/null
+++ b/scripts/generate_cover_letter.py
@@ -0,0 +1,224 @@
# scripts/generate_cover_letter.py
"""
Generate a cover letter in Alex's voice using few-shot examples from her corpus.

Usage:
    conda run -n job-seeker python scripts/generate_cover_letter.py \
        --title "Director of Customer Success" \
        --company "Acme Corp" \
        --description "We are looking for..."

    Or pass a staging DB job ID:
    conda run -n job-seeker python scripts/generate_cover_letter.py --job-id 42
"""
import argparse
import re
import sys
from pathlib import Path

LETTERS_DIR = Path("/Library/Documents/JobSearch")
LETTER_GLOB = "*Cover Letter*.md"

# Background injected into every prompt so the model has Alex's facts
SYSTEM_CONTEXT = """You are writing cover letters for Alex Rivera, a customer success leader.

Background:
- 6+ years in customer success, technical account management, and CS leadership
- Most recent role: led Americas Customer Success at UpGuard (cybersecurity SaaS), managing enterprise + Fortune 500 accounts, drove NPS consistently above 95
- Also founder of M3 Consulting, a CS advisory practice for SaaS startups
- Attended Texas State (2 yrs), CSU East Bay (1 yr); completed degree elsewhere
- Based in San Francisco Bay Area; open to remote/hybrid
- Pronouns: any

Voice guidelines:
- Warm, confident, and specific β€” never generic
- Opens with "I'm delighted/thrilled to apply for [role] at [company]."
- 3–4 focused paragraphs, ~250–350 words total
- Para 2: concrete experience (cite UpGuard and/or M3 Consulting with a specific metric)
- Para 3: genuine connection to THIS company's mission/product
- Closes with "Thank you for considering my application."
+ warm sign-off +- Never use: "I am writing to express my interest", "passionate about making a difference", + "I look forward to hearing from you", or any hollow filler phrases +""" + + +# ── Mission-alignment detection ─────────────────────────────────────────────── +# When a company/JD signals one of these preferred industries, the cover letter +# prompt injects a hint so Para 3 can reflect genuine personal connection. +# This does NOT disclose any personal disability or family information. + +_MISSION_SIGNALS: dict[str, list[str]] = { + "music": [ + "music", "spotify", "tidal", "soundcloud", "bandcamp", "apple music", + "distrokid", "cd baby", "landr", "beatport", "reverb", "vinyl", + "streaming", "artist", "label", "live nation", "ticketmaster", "aeg", + "songkick", "concert", "venue", "festival", "audio", "podcast", + "studio", "record", "musician", "playlist", + ], + "animal_welfare": [ + "animal", "shelter", "rescue", "humane society", "spca", "aspca", + "veterinary", "vet ", "wildlife", "pet ", "adoption", "foster", + "dog", "cat", "feline", "canine", "sanctuary", "zoo", + ], + "education": [ + "education", "school", "learning", "student", "edtech", "classroom", + "curriculum", "tutoring", "academic", "university", "kids", "children", + "youth", "literacy", "khan academy", "duolingo", "chegg", "coursera", + "instructure", "canvas lms", "clever", "district", "teacher", + "k-12", "k12", "grade", "pedagogy", + ], +} + +_MISSION_NOTES: dict[str, str] = { + "music": ( + "This company is in the music industry, which is one of Alex's genuinely " + "ideal work environments β€” she has a real personal passion for the music scene. " + "Para 3 should warmly and specifically reflect this authentic alignment, not as " + "a generic fan statement, but as an honest statement of where she'd love to apply " + "her CS skills." 
+ ), + "animal_welfare": ( + "This organization works in animal welfare/rescue β€” one of Alex's dream-job " + "domains and a genuine personal passion. Para 3 should reflect this authentic " + "connection warmly and specifically, tying her CS skills to this mission." + ), + "education": ( + "This company works in children's education or EdTech β€” one of Alex's ideal " + "work domains, reflecting genuine personal values around learning and young people. " + "Para 3 should reflect this authentic connection specifically and warmly." + ), +} + + +def detect_mission_alignment(company: str, description: str) -> str | None: + """Return a mission hint string if company/JD matches a preferred industry, else None.""" + text = f"{company} {description}".lower() + for industry, signals in _MISSION_SIGNALS.items(): + if any(sig in text for sig in signals): + return _MISSION_NOTES[industry] + return None + + +def load_corpus() -> list[dict]: + """Load all .md cover letters from LETTERS_DIR. Returns list of {path, company, text}.""" + corpus = [] + for path in sorted(LETTERS_DIR.glob(LETTER_GLOB)): + text = path.read_text(encoding="utf-8", errors="ignore").strip() + if not text: + continue + # Extract company from filename: "Tailscale Cover Letter.md" β†’ "Tailscale" + company = re.sub(r"\s*Cover Letter.*", "", path.stem, flags=re.IGNORECASE).strip() + corpus.append({"path": path, "company": company, "text": text}) + return corpus + + +def find_similar_letters(job_description: str, corpus: list[dict], top_k: int = 3) -> list[dict]: + """Return the top_k letters most similar to the job description by TF-IDF cosine sim.""" + from sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.metrics.pairwise import cosine_similarity + + if not corpus: + return [] + + docs = [job_description] + [c["text"] for c in corpus] + vectorizer = TfidfVectorizer(stop_words="english", max_features=500) + tfidf = vectorizer.fit_transform(docs) + sims = cosine_similarity(tfidf[0:1], 
tfidf[1:])[0] + + ranked = sorted(zip(sims, corpus), key=lambda x: x[0], reverse=True) + return [entry for _, entry in ranked[:top_k]] + + +def build_prompt( + title: str, + company: str, + description: str, + examples: list[dict], + mission_hint: str | None = None, +) -> str: + parts = [SYSTEM_CONTEXT.strip(), ""] + if examples: + parts.append("=== STYLE EXAMPLES (Alex's past letters) ===\n") + for i, ex in enumerate(examples, 1): + parts.append(f"--- Example {i} ({ex['company']}) ---") + parts.append(ex["text"]) + parts.append("") + parts.append("=== END EXAMPLES ===\n") + + if mission_hint: + parts.append(f"⭐ Mission alignment note (for Para 3): {mission_hint}\n") + + parts.append(f"Now write a new cover letter for:") + parts.append(f" Role: {title}") + parts.append(f" Company: {company}") + if description: + snippet = description[:1500].strip() + parts.append(f"\nJob description excerpt:\n{snippet}") + parts.append("\nWrite the full cover letter now:") + return "\n".join(parts) + + +def generate(title: str, company: str, description: str = "", _router=None) -> str: + """Generate a cover letter and return it as a string. + + _router is an optional pre-built LLMRouter (used in tests to avoid real LLM calls). 
+ """ + corpus = load_corpus() + examples = find_similar_letters(description or f"{title} {company}", corpus) + mission_hint = detect_mission_alignment(company, description) + if mission_hint: + print(f"[cover-letter] Mission alignment detected for {company}", file=sys.stderr) + prompt = build_prompt(title, company, description, examples, mission_hint=mission_hint) + + if _router is None: + sys.path.insert(0, str(Path(__file__).parent.parent)) + from scripts.llm_router import LLMRouter + _router = LLMRouter() + + print(f"[cover-letter] Generating for: {title} @ {company}", file=sys.stderr) + print(f"[cover-letter] Style examples: {[e['company'] for e in examples]}", file=sys.stderr) + + result = _router.complete(prompt) + return result.strip() + + +def main() -> None: + parser = argparse.ArgumentParser(description="Generate a cover letter in Alex's voice") + parser.add_argument("--title", help="Job title") + parser.add_argument("--company", help="Company name") + parser.add_argument("--description", default="", help="Job description text") + parser.add_argument("--job-id", type=int, help="Load job from staging.db by ID") + parser.add_argument("--output", help="Write output to this file path") + args = parser.parse_args() + + title, company, description = args.title, args.company, args.description + + if args.job_id is not None: + from scripts.db import DEFAULT_DB + import sqlite3 + conn = sqlite3.connect(DEFAULT_DB) + conn.row_factory = sqlite3.Row + row = conn.execute("SELECT * FROM jobs WHERE id = ?", (args.job_id,)).fetchone() + conn.close() + if not row: + print(f"No job with id={args.job_id} in staging.db", file=sys.stderr) + sys.exit(1) + job = dict(row) + title = title or job.get("title", "") + company = company or job.get("company", "") + description = description or job.get("description", "") + + if not title or not company: + parser.error("--title and --company are required (or use --job-id)") + + letter = generate(title, company, description) + + if 
args.output: + Path(args.output).write_text(letter) + print(f"Saved to {args.output}", file=sys.stderr) + else: + print(letter) + + +if __name__ == "__main__": + main() diff --git a/scripts/imap_sync.py b/scripts/imap_sync.py new file mode 100644 index 0000000..220a54f --- /dev/null +++ b/scripts/imap_sync.py @@ -0,0 +1,906 @@ +# scripts/imap_sync.py +""" +IMAP email sync β€” associates recruitment emails with job applications. + +Safety / privacy design: + - Only imports emails that pass BOTH checks: + 1. Sender or subject contains the exact company name (or derived domain) + 2. Subject contains at least one recruitment keyword + - Fuzzy / partial company name matches are rejected + - Emails between known personal contacts are never imported + - Only the INBOX and Sent folders are touched; no other folders + - Credentials stored in config/email.yaml (gitignored) + +Config: config/email.yaml (see config/email.yaml.example) + +Usage: + conda run -n job-seeker python scripts/imap_sync.py + conda run -n job-seeker python scripts/imap_sync.py --job-id 42 + conda run -n job-seeker python scripts/imap_sync.py --dry-run +""" +import email +import imaplib +import re +import sys +from datetime import datetime, timedelta +from email.header import decode_header as _raw_decode_header +from pathlib import Path +from typing import Optional +from urllib.parse import urlparse + +import yaml + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.db import DEFAULT_DB, init_db, get_interview_jobs, add_contact, get_contacts +from scripts.llm_router import LLMRouter + +_CLASSIFIER_ROUTER = LLMRouter() + +_CLASSIFY_SYSTEM = ( + "You are an email classifier. 
Classify the recruitment email into exactly ONE of these categories:\n" + " interview_scheduled, offer_received, rejected, positive_response, survey_received, neutral\n\n" + "Rules:\n" + "- interview_scheduled: recruiter wants to book a call/interview\n" + "- offer_received: job offer is being extended\n" + "- rejected: explicitly not moving forward\n" + "- positive_response: interested/impressed but no interview booked yet\n" + "- survey_received: link or request to complete a survey, assessment, or questionnaire\n" + "- neutral: auto-confirmation, generic update, no clear signal\n\n" + "Respond with ONLY the category name. No explanation." +) + +_CLASSIFY_LABELS = [ + "interview_scheduled", "offer_received", "rejected", + "positive_response", "survey_received", "neutral", +] + +CONFIG_PATH = Path(__file__).parent.parent / "config" / "email.yaml" + +# ── Recruitment keyword filter ──────────────────────────────────────────────── +# An email must match at least one of these in its subject line to be imported. 
+RECRUITMENT_KEYWORDS = { + # Application lifecycle + "interview", "application", "applicant", "apply", "applied", + "position", "opportunity", "role", "opening", "vacancy", + "offer", "offer letter", "schedule", "scheduling", + "screening", "screen", "phone screen", "video call", + "assessment", "hiring", "hired", "recruiter", "recruitment", + "talent", "candidate", "recruiting", "next steps", "follow up", "follow-up", + "onboarding", "start date", "background check", "reference", + "congratulations", "unfortunately", "decision", "update", + # Job board / ATS notifications + "viewed your profile", "interested in your background", + "job alert", "new job", "job match", "job opportunity", + "your application", "application received", "application status", + "application update", "we received", "thank you for applying", + "thanks for applying", "moved forward", "moving forward", + "not moving forward", "decided to", "other candidates", + "keep your resume", "keep you in mind", + # Recruiter outreach + "reaching out", "i came across", "your experience", + "connect with you", "exciting opportunity", "great fit", + "perfect fit", "right fit", "strong fit", "ideal candidate", +} + +# ── Rejection / ATS-confirm phrase filter ───────────────────────────────────── +# Checked against subject + first 800 chars of body BEFORE calling any LLM. +# Covers the cases phi3:mini consistently mis-classifies as "neutral". 
+_REJECTION_PHRASES = [ + # Explicit rejection β€” safe to check subject + body + "not moving forward", "decided not to move forward", + "not selected", "not be moving forward", "will not be moving forward", + "unfortunately", "regret to inform", "regret to let you know", + "decided to go with other", "decided to pursue other", + "other candidates", "other applicants", "position has been filled", + "filled the position", "no longer moving forward", + "we have decided", "we've decided", "after careful consideration", + "at this time we", "at this point we", + "we will not", "we won't be", "we are not able", + "wish you the best", "best of luck in your", + "keep your resume on file", +] + +# ATS-confirm phrases β€” checked against SUBJECT ONLY. +# Do NOT check these in the body: recruiters often quote ATS thread history, +# so "thank you for applying" can appear in a genuine follow-up body. +_ATS_CONFIRM_SUBJECTS = [ + "application received", "application confirmation", + "thanks for applying", "thank you for applying", + "thank you for your application", + "we received your application", + "application has been received", + "has received your application", + "successfully submitted", + "your application for", + "you applied to", +] + +# Phrases that immediately identify a non-recruitment email (retail, spam, etc.) 
+_SPAM_PHRASES = [ + # Retail / commerce offers + "special offer", "private offer", "exclusive offer", "limited time offer", + "limited-time offer", "sent you a special offer", "sent you an offer", + "holiday offer", "seasonal offer", "membership offer", + "round trip from $", "bonus points", + "% off", "% discount", "save up to", "free shipping", + "unsubscribe", "view in browser", "view this email in", + "update your preferences", "email preferences", + # LinkedIn apply confirmations & digests (not new inbound leads) + "your application was sent to", + "your application was viewed by", + "application updates this week", + "don't forget to complete your application", + "view your application updates", + "you have new application updates", + # Indeed apply confirmations + "indeed application:", + # DocuSign / e-signature + "requests you to sign", + "has sent you a reminder", + "please sign", + # Security / MFA codes + "security code for your application", + "verification code", +] + +# Subject prefixes that identify non-job emails +_SPAM_SUBJECT_PREFIXES = [ + "@", # "@user sent you a special offer" β€” Depop / social commerce + "re: fw:", # forwarded chains unlikely to be first-contact recruitment + "accepted:", # Google Calendar accepted invite + "notification:", # Google Calendar notification + "[meeting reminder]", # Google Calendar meeting reminder + "updated invitation:", # Google Calendar update + "[updated]", # Google Calendar update + "reminder:", # Generic reminder (AAA digital interview reminders, etc.) 
+ "πŸ“„", # Newsletter/article emoji prefix + "invitation from", # Google Calendar invite forwarded by name +] + +# Unicode-safe "don't forget" variants (Gmail renders typographic apostrophes) +_DONT_FORGET_VARIANTS = [ + "don't forget to complete your application", # straight apostrophe + "don\u2019t forget to complete your application", # right single quotation mark ' + "don\u2018t forget to complete your application", # left single quotation mark ' +] + + +def _has_rejection_or_ats_signal(subject: str, body: str) -> bool: + """Return True if the email is a rejection, ATS auto-confirmation, or non-recruitment spam.""" + subject_lower = subject.lower().strip() + + # Fast subject-prefix checks (Depop "@user", etc.) + if any(subject_lower.startswith(p) for p in _SPAM_SUBJECT_PREFIXES): + return True + + # Fast subject-only check for ATS confirmations + if any(phrase in subject_lower for phrase in _ATS_CONFIRM_SUBJECTS): + return True + + # Check subject + opening body for rejection and spam phrases + haystack = subject_lower + " " + body[:1500].lower() + if any(phrase in haystack for phrase in _REJECTION_PHRASES + _SPAM_PHRASES): + return True + # Unicode-safe "don't forget" check (handles straight, right, and left apostrophes) + raw = (subject + " " + body[:1500]).lower() + return any(phrase in raw for phrase in _DONT_FORGET_VARIANTS) + + +# Legal entity suffixes to strip when normalising company names +_LEGAL_SUFFIXES = re.compile( + r",?\s*\b(Inc|LLC|Ltd|Limited|Corp|Corporation|Co|GmbH|AG|plc|PLC|SAS|SA|NV|BV|LP|LLP)\b\.?\s*$", + re.IGNORECASE, +) + +# Job-board SLDs that must never be used as company-match search terms. +# A LinkedIn job URL has domain "linkedin.com" β†’ SLD "linkedin", which would +# incorrectly match every LinkedIn notification email against every LinkedIn job. 
+_JOB_BOARD_SLDS = { + "linkedin", "indeed", "glassdoor", "ziprecruiter", "monster", + "careerbuilder", "dice", "simplyhired", "wellfound", "angellist", + "greenhouse", "lever", "workday", "taleo", "icims", "smartrecruiters", + "bamboohr", "ashby", "rippling", "jobvite", "workable", "gusto", + "paylocity", "paycom", "adp", "breezy", "recruitee", "jazz", +} + + +# ── Helpers ─────────────────────────────────────────────────────────────────── + +def _decode_str(value: Optional[str]) -> str: + """Decode an RFC2047-encoded header value to a plain Python string.""" + if not value: + return "" + parts = _raw_decode_header(value) + result = [] + for part, encoding in parts: + if isinstance(part, bytes): + result.append(part.decode(encoding or "utf-8", errors="replace")) + else: + result.append(str(part)) + return " ".join(result).strip() + + +def _extract_domain(url_or_email: str) -> str: + """ + Pull the bare domain from a URL (https://company.com/jobs/...) or + an email address (recruiter@company.com). Returns '' if none found. + """ + url_or_email = url_or_email.strip() + if "@" in url_or_email: + return url_or_email.split("@")[-1].split(">")[0].strip().lower() + try: + parsed = urlparse(url_or_email) + host = parsed.netloc or parsed.path + # strip www. + return re.sub(r"^www\.", "", host).lower() + except Exception: + return "" + + +def _normalise_company(company: str) -> str: + """Strip legal suffixes and extra whitespace from a company name.""" + return _LEGAL_SUFFIXES.sub("", company).strip() + + +def _company_search_terms(company: str, job_url: str = "") -> list[str]: + """ + Return a list of strings that must appear (case-insensitively) in the + email's from-address or subject for it to be considered a match. + + We are deliberately conservative: + - Use the full normalised company name (not just the first word) + - Also include the company domain derived from the job URL, but ONLY + when the domain belongs to the actual company (not a job board). 
+ LinkedIn jobs link to linkedin.com β€” if we used "linkedin" as a term + we'd match every LinkedIn notification email against every LinkedIn job. + """ + terms = [] + clean = _normalise_company(company) + if len(clean) >= 3: + terms.append(clean.lower()) + + domain = _extract_domain(job_url) + if domain and len(domain) > 4: + sld = domain.split(".")[0] + if len(sld) >= 3 and sld not in terms and sld not in _JOB_BOARD_SLDS: + terms.append(sld) + + return terms + + +def _has_recruitment_keyword(subject: str) -> bool: + """Return True if the subject contains at least one recruitment keyword.""" + subject_lower = subject.lower() + return any(kw in subject_lower for kw in RECRUITMENT_KEYWORDS) + + +def _email_is_relevant(from_addr: str, subject: str, search_terms: list[str]) -> bool: + """ + Two-gate filter: + Gate 1 β€” from-address OR subject must contain an exact company term + Gate 2 β€” subject must contain a recruitment keyword + + Both gates must pass. This prevents importing unrelated emails that + happen to mention a company name in passing. + """ + combined = (from_addr + " " + subject).lower() + + gate1 = any(term in combined for term in search_terms) + gate2 = _has_recruitment_keyword(subject) + + return gate1 and gate2 + + +def _get_existing_message_ids(job_id: int, db_path: Path) -> set[str]: + contacts = get_contacts(db_path, job_id=job_id) + return {c.get("message_id", "") for c in contacts if c.get("message_id")} + + +def classify_stage_signal(subject: str, body: str) -> Optional[str]: + """Classify an inbound email into a pipeline stage signal. + + Returns one of the 5 label strings, or None on failure. + Uses phi3:mini via Ollama (benchmarked 100% on 12-case test set). 
+ """ + try: + prompt = f"Subject: {subject}\n\nEmail: {body[:400]}" + raw = _CLASSIFIER_ROUTER.complete( + prompt, + system=_CLASSIFY_SYSTEM, + model_override="llama3.1:8b", + fallback_order=["ollama_research"], + ) + # Strip blocks (in case a reasoning model slips through) + text = re.sub(r".*?", "", raw, flags=re.DOTALL) + text = text.lower().strip() + for label in _CLASSIFY_LABELS: + if text.startswith(label) or label in text: + return label + return "neutral" + except Exception: + return None + + +_EXTRACT_SYSTEM = ( + "Extract the hiring company name and job title from this recruitment email, " + "but ONLY if it represents genuine new recruiter outreach β€” i.e. a recruiter " + "contacting you about an open role for the first time.\n\n" + "Return {\"company\": null, \"title\": null} if the email is any of:\n" + " - A rejection or 'not moving forward' notice\n" + " - An ATS auto-confirmation ('we received your application')\n" + " - A status update for an application already in progress\n" + " - A generic job-alert digest or newsletter\n" + " - A follow-up you sent, not a reply from a recruiter\n\n" + "Otherwise respond with ONLY valid JSON: " + '{"company": "Company Name", "title": "Job Title"}.' +) + + +def extract_lead_info(subject: str, body: str, + from_addr: str) -> tuple[Optional[str], Optional[str]]: + """Use LLM to extract (company, title) from an unmatched recruitment email. + + Returns (company, title) or (None, None) on failure / low confidence. 
+ """ + import json as _json + try: + prompt = ( + f"From: {from_addr}\n" + f"Subject: {subject}\n\n" + f"Email excerpt:\n{body[:600]}" + ) + raw = _CLASSIFIER_ROUTER.complete( + prompt, + system=_EXTRACT_SYSTEM, + fallback_order=["ollama_research"], + ) + text = re.sub(r".*?", "", raw, flags=re.DOTALL).strip() + m = re.search(r'\{.*\}', text, re.DOTALL) + if not m: + return None, None + data = _json.loads(m.group()) + company = data.get("company") or None + title = data.get("title") or None + return company, title + except Exception: + return None, None + + +# Keywords that indicate an email in a curated label needs attention. +# Intentionally separate from RECRUITMENT_KEYWORDS β€” these are action-oriented. +_TODO_LABEL_KEYWORDS = { + "action needed", "action required", + "please complete", "please submit", "please respond", "please reply", + "response needed", "response required", + "next steps", "next step", + "follow up", "follow-up", + "deadline", "by end of", + "your offer", "offer letter", + "background check", "reference check", + "onboarding", "start date", + "congrats", "congratulations", + "we'd like to", "we would like to", + "interview", "schedule", "scheduling", +} + + +def _has_todo_keyword(subject: str) -> bool: + """Return True if the subject contains a TODO-label action keyword.""" + subject_lower = subject.lower() + return any(kw in subject_lower for kw in _TODO_LABEL_KEYWORDS) + + +_LINKEDIN_ALERT_SENDER = "jobalerts-noreply@linkedin.com" + +# Social-proof / nav lines to skip when parsing alert blocks +_ALERT_SKIP_PHRASES = { + "school alumni", "apply with", "actively hiring", "manage alerts", + "view all jobs", "your job alert", "new jobs match", + "unsubscribe", "linkedin corporation", +} + + +def parse_linkedin_alert(body: str) -> list[dict]: + """ + Parse the plain-text body of a LinkedIn Job Alert digest email. + + Returns a list of dicts: {title, company, location, url}. 
+ URL is canonicalized to https://www.linkedin.com/jobs/view// + (tracking parameters stripped). + """ + jobs = [] + # Split on separator lines (10+ dashes) + blocks = re.split(r"\n\s*-{10,}\s*\n", body) + for block in blocks: + lines = [ln.strip() for ln in block.strip().splitlines() if ln.strip()] + + # Find "View job:" URL + url = None + for line in lines: + m = re.search(r"View job:\s*(https?://\S+)", line, re.IGNORECASE) + if m: + raw_url = m.group(1) + job_id_m = re.search(r"/jobs/view/(\d+)", raw_url) + if job_id_m: + url = f"https://www.linkedin.com/jobs/view/{job_id_m.group(1)}/" + break + if not url: + continue + + # Filter noise lines + content = [ + ln for ln in lines + if not any(p in ln.lower() for p in _ALERT_SKIP_PHRASES) + and not ln.lower().startswith("view job:") + and not ln.startswith("http") + ] + if len(content) < 2: + continue + + jobs.append({ + "title": content[0], + "company": content[1], + "location": content[2] if len(content) > 2 else "", + "url": url, + }) + return jobs + + +def _scan_todo_label(conn: imaplib.IMAP4, cfg: dict, db_path: Path, + active_jobs: list[dict], + known_message_ids: set) -> int: + """Scan the configured Gmail label for action emails, matching them to pipeline jobs. + + Two gates per email: + 1. Company name appears in from-address or subject (same as sync_job_emails) + 2. Subject contains a TODO-label action keyword + + Returns count of new contacts attached. + """ + label = cfg.get("todo_label", "").strip() + if not label: + return 0 + + lookback = int(cfg.get("lookback_days", 90)) + since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y") + + # Search the label folder for any emails (no keyword pre-filter β€” it's curated) + uids = _search_folder(conn, label, "ALL", since) + if not uids: + return 0 + + # Build a lookup: search_term β†’ [job, ...] 
for all active jobs + term_to_jobs: dict[str, list[dict]] = {} + for job in active_jobs: + for term in _company_search_terms(job.get("company", ""), job.get("url", "")): + term_to_jobs.setdefault(term, []).append(job) + + added = 0 + for uid in uids: + parsed = _parse_message(conn, uid) + if not parsed: + continue + mid = parsed["message_id"] + if mid in known_message_ids: + continue + + # Gate 1: company name match β€” from_addr + subject + first 300 chars of body + # Body fallback catches ATS emails (e.g. noreply@greenhouse.io) where the + # company name only appears in the email body, not the sender or subject. + combined = ( + parsed["from_addr"] + " " + + parsed["subject"] + " " + + parsed["body"][:300] + ).lower() + matched_jobs = [] + for term, jobs in term_to_jobs.items(): + if term in combined: + matched_jobs.extend(jobs) + # Deduplicate by job id + seen_ids: set[int] = set() + matched_jobs = [j for j in matched_jobs if not (j["id"] in seen_ids or seen_ids.add(j["id"]))] # type: ignore[func-returns-value] + if not matched_jobs: + continue + + # Gate 2: action keyword in subject + if not _has_todo_keyword(parsed["subject"]): + continue + + for job in matched_jobs: + contact_id = add_contact( + db_path, job_id=job["id"], direction="inbound", + subject=parsed["subject"], + from_addr=parsed["from_addr"], + to_addr=parsed["to_addr"], + body=parsed["body"], + received_at=parsed["date"][:16] if parsed["date"] else since, + message_id=mid, + ) + signal = classify_stage_signal(parsed["subject"], parsed["body"]) + if signal and signal != "neutral": + _update_contact_signal(db_path, contact_id, signal) + + known_message_ids.add(mid) + added += 1 + print(f"[imap] TODO label β†’ {matched_jobs[0].get('company')} β€” {parsed['subject'][:60]}") + + return added + + +def _scan_unmatched_leads(conn: imaplib.IMAP4, cfg: dict, + db_path: Path, + known_message_ids: set) -> int: + """Scan INBOX for recruitment emails not matched to any pipeline job. 
+ + Calls LLM to extract company/title; inserts qualifying emails as pending jobs. + Returns the count of new leads inserted. + """ + from scripts.db import get_existing_urls, insert_job, add_contact as _add_contact + + lookback = int(cfg.get("lookback_days", 90)) + since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y") + + broad_terms = ["interview", "opportunity", "offer letter", "job offer", "application", "recruiting"] + all_uids: set = set() + for term in broad_terms: + uids = _search_folder(conn, "INBOX", f'(SUBJECT "{term}")', since) + all_uids.update(uids) + + existing_urls = get_existing_urls(db_path) + new_leads = 0 + + for uid in all_uids: + parsed = _parse_message(conn, uid) + if not parsed: + continue + mid = parsed["message_id"] + if mid in known_message_ids: + continue + + # ── LinkedIn Job Alert digest β€” parse each card individually ────── + if _LINKEDIN_ALERT_SENDER in parsed["from_addr"].lower(): + cards = parse_linkedin_alert(parsed["body"]) + for card in cards: + if card["url"] in existing_urls: + continue + job_id = insert_job(db_path, { + "title": card["title"], + "company": card["company"], + "url": card["url"], + "source": "linkedin", + "location": card["location"], + "is_remote": 0, + "salary": "", + "description": "", + "date_found": datetime.now().isoformat()[:10], + }) + if job_id: + from scripts.task_runner import submit_task + submit_task(db_path, "scrape_url", job_id) + existing_urls.add(card["url"]) + new_leads += 1 + print(f"[imap] LinkedIn alert β†’ {card['company']} β€” {card['title']}") + known_message_ids.add(mid) + continue # skip normal LLM extraction path + + if not _has_recruitment_keyword(parsed["subject"]): + continue + + # Fast phrase-based rejection / ATS-confirm filter (catches what phi3 misses) + if _has_rejection_or_ats_signal(parsed["subject"], parsed["body"]): + continue + + # LLM classification as secondary gate β€” skip on rejection or classifier failure + signal = 
classify_stage_signal(parsed["subject"], parsed["body"]) + if signal is None or signal == "rejected": + continue + + company, title = extract_lead_info( + parsed["subject"], parsed["body"], parsed["from_addr"] + ) + if not company: + continue + + from_domain = _extract_domain(parsed["from_addr"]) or "unknown" + mid_hash = str(abs(hash(mid)))[:10] + synthetic_url = f"email://{from_domain}/{mid_hash}" + + if synthetic_url in existing_urls: + continue + + job_id = insert_job(db_path, { + "title": title or "(untitled)", + "company": company, + "url": synthetic_url, + "source": "email", + "location": "", + "is_remote": 0, + "salary": "", + "description": parsed["body"][:2000], + "date_found": datetime.now().isoformat()[:10], + }) + if job_id: + _add_contact(db_path, job_id=job_id, direction="inbound", + subject=parsed["subject"], + from_addr=parsed["from_addr"], + body=parsed["body"], + received_at=parsed["date"][:16] if parsed["date"] else "", + message_id=mid) + known_message_ids.add(mid) + existing_urls.add(synthetic_url) + new_leads += 1 + + return new_leads + + +# ── IMAP connection ─────────────────────────────────────────────────────────── + +def load_config() -> dict: + if not CONFIG_PATH.exists(): + raise FileNotFoundError( + f"Email config not found: {CONFIG_PATH}\n" + f"Copy config/email.yaml.example β†’ config/email.yaml and fill it in." 
+ ) + return yaml.safe_load(CONFIG_PATH.read_text()) or {} + + +def connect(cfg: dict) -> imaplib.IMAP4: + host = cfg.get("host", "imap.gmail.com") + port = int(cfg.get("port", 993)) + use_ssl = cfg.get("use_ssl", True) + conn = (imaplib.IMAP4_SSL if use_ssl else imaplib.IMAP4)(host, port) + conn.login(cfg["username"], cfg["password"]) + return conn + + +def _detect_sent_folder(conn: imaplib.IMAP4) -> str: + """Try to auto-detect the Sent folder name.""" + candidates = ["[Gmail]/Sent Mail", "Sent", "Sent Items", "Sent Messages", "INBOX.Sent"] + try: + _, folder_list = conn.list() + flat = " ".join(f.decode() for f in (folder_list or [])) + for candidate in candidates: + if candidate.lower() in flat.lower(): + return candidate + except Exception: + pass + return "Sent" + + +def _quote_folder(name: str) -> str: + """Quote an IMAP folder name if it contains spaces. + Escapes internal backslashes and double-quotes per RFC 3501. + e.g. 'TO DO JOBS' β†’ '"TO DO JOBS"', 'My "Jobs"' β†’ '"My \\"Jobs\\""' + """ + if " " in name: + escaped = name.replace("\\", "\\\\").replace('"', '\\"') + return f'"{escaped}"' + return name + + +def _search_folder(conn: imaplib.IMAP4, folder: str, criteria: str, + since: str) -> list[bytes]: + """SELECT a folder and return matching UID list (empty on any error).""" + try: + conn.select(_quote_folder(folder), readonly=True) + _, data = conn.search(None, f'(SINCE "{since}" {criteria})') + return data[0].split() if data and data[0] else [] + except Exception: + return [] + + +def _parse_message(conn: imaplib.IMAP4, uid: bytes) -> Optional[dict]: + """Fetch and parse one message. 
Returns None on failure.""" + try: + _, data = conn.fetch(uid, "(RFC822)") + if not data or not data[0]: + return None + msg = email.message_from_bytes(data[0][1]) + + body = "" + if msg.is_multipart(): + for part in msg.walk(): + if part.get_content_type() == "text/plain": + try: + body = part.get_payload(decode=True).decode("utf-8", errors="replace") + except Exception: + pass + break + else: + try: + body = msg.get_payload(decode=True).decode("utf-8", errors="replace") + except Exception: + pass + + mid = msg.get("Message-ID", "").strip() + if not mid: + return None # No Message-ID β†’ can't dedup; skip to avoid repeat inserts + + return { + "message_id": mid, + "subject": _decode_str(msg.get("Subject")), + "from_addr": _decode_str(msg.get("From")), + "to_addr": _decode_str(msg.get("To")), + "date": _decode_str(msg.get("Date")), + "body": body[:4000], + } + except Exception: + return None + + +# ── Per-job sync ────────────────────────────────────────────────────────────── + +def _update_contact_signal(db_path: Path, contact_id: int, signal: str) -> None: + """Write a stage signal onto an existing contact row.""" + import sqlite3 as _sqlite3 + conn = _sqlite3.connect(db_path) + conn.execute( + "UPDATE job_contacts SET stage_signal = ? WHERE id = ?", + (signal, contact_id), + ) + conn.commit() + conn.close() + + +def sync_job_emails(job: dict, conn: imaplib.IMAP4, cfg: dict, + db_path: Path, dry_run: bool = False) -> tuple[int, int]: + """ + Sync recruitment emails for one job. + Returns (inbound_added, outbound_added). 
+ """ + company = (job.get("company") or "").strip() + if not company: + return 0, 0 + + search_terms = _company_search_terms(company, job.get("url", "")) + if not search_terms: + return 0, 0 + + lookback = int(cfg.get("lookback_days", 90)) + since = (datetime.now() - timedelta(days=lookback)).strftime("%d-%b-%Y") + existing_ids = _get_existing_message_ids(job["id"], db_path) + + inbound = outbound = 0 + + for term in search_terms: + # ── INBOX β€” inbound ─────────────────────────────────────────────── + uids = _search_folder( + conn, "INBOX", + f'(OR FROM "{term}" SUBJECT "{term}")', + since, + ) + for uid in uids: + parsed = _parse_message(conn, uid) + if not parsed: + continue + if parsed["message_id"] in existing_ids: + continue + if not _email_is_relevant(parsed["from_addr"], parsed["subject"], search_terms): + continue + + if not dry_run: + contact_id = add_contact( + db_path, job_id=job["id"], direction="inbound", + subject=parsed["subject"], from_addr=parsed["from_addr"], + to_addr=parsed["to_addr"], body=parsed["body"], + received_at=parsed["date"][:16] if parsed["date"] else since, + message_id=parsed["message_id"], + ) + signal = classify_stage_signal(parsed["subject"], parsed["body"]) + if signal and signal != "neutral": + _update_contact_signal(db_path, contact_id, signal) + existing_ids.add(parsed["message_id"]) + inbound += 1 + + # ── Sent β€” outbound ─────────────────────────────────────────────── + sent_folder = cfg.get("sent_folder") or _detect_sent_folder(conn) + uids = _search_folder( + conn, sent_folder, + f'(OR TO "{term}" SUBJECT "{term}")', + since, + ) + for uid in uids: + parsed = _parse_message(conn, uid) + if not parsed: + continue + if parsed["message_id"] in existing_ids: + continue + if not _email_is_relevant(parsed["to_addr"], parsed["subject"], search_terms): + continue + + if not dry_run: + add_contact( + db_path, job_id=job["id"], direction="outbound", + subject=parsed["subject"], from_addr=parsed["from_addr"], + 
to_addr=parsed["to_addr"], body=parsed["body"], + received_at=parsed["date"][:16] if parsed["date"] else since, + message_id=parsed["message_id"], + ) + existing_ids.add(parsed["message_id"]) + outbound += 1 + + return inbound, outbound + + +# ── Main entry ──────────────────────────────────────────────────────────────── + +def sync_all(db_path: Path = DEFAULT_DB, + dry_run: bool = False, + job_ids: Optional[list[int]] = None, + on_stage=None) -> dict: + """ + Sync emails for all active pipeline jobs (or a specific subset). + + Returns a summary dict: + {"synced": N, "inbound": N, "outbound": N, "errors": [...]} + """ + def _stage(msg: str) -> None: + if on_stage: + on_stage(msg) + + cfg = load_config() + init_db(db_path) + + jobs_by_stage = get_interview_jobs(db_path) + active_stages = ["applied", "phone_screen", "interviewing", "offer", "hired"] + all_active = [j for stage in active_stages for j in jobs_by_stage.get(stage, [])] + + if job_ids: + all_active = [j for j in all_active if j["id"] in job_ids] + + if not all_active: + return {"synced": 0, "inbound": 0, "outbound": 0, "new_leads": 0, "todo_attached": 0, "errors": []} + + _stage("connecting") + print(f"[imap] Connecting to {cfg.get('host', 'imap.gmail.com')} …") + conn = connect(cfg) + summary = {"synced": 0, "inbound": 0, "outbound": 0, "new_leads": 0, "errors": []} + + try: + for i, job in enumerate(all_active, 1): + _stage(f"job {i}/{len(all_active)}") + try: + inb, out = sync_job_emails(job, conn, cfg, db_path, dry_run=dry_run) + label = "DRY-RUN " if dry_run else "" + print(f"[imap] {label}{job.get('company'):30s} +{inb} in +{out} out") + if inb + out > 0: + summary["synced"] += 1 + summary["inbound"] += inb + summary["outbound"] += out + except Exception as e: + msg = f"{job.get('company')}: {e}" + summary["errors"].append(msg) + print(f"[imap] ERROR β€” {msg}") + + _stage("scanning todo label") + from scripts.db import get_all_message_ids + known_mids = get_all_message_ids(db_path) + 
summary["todo_attached"] = _scan_todo_label(conn, cfg, db_path, all_active, known_mids) + + _stage("scanning leads") + summary["new_leads"] = _scan_unmatched_leads(conn, cfg, db_path, known_mids) + finally: + try: + conn.logout() + except Exception: + pass + + return summary + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser(description="Sync IMAP emails to job contacts") + parser.add_argument("--job-id", type=int, nargs="+", help="Sync only these job IDs") + parser.add_argument("--dry-run", action="store_true", help="Show matches without saving") + args = parser.parse_args() + + result = sync_all( + dry_run=args.dry_run, + job_ids=args.job_id, + ) + print(f"\n[imap] Done β€” {result['synced']} jobs updated, " + f"{result['inbound']} inbound, {result['outbound']} outbound" + + (f", {len(result['errors'])} errors" if result["errors"] else "")) diff --git a/scripts/llm_router.py b/scripts/llm_router.py new file mode 100644 index 0000000..d4eb237 --- /dev/null +++ b/scripts/llm_router.py @@ -0,0 +1,170 @@ +""" +LLM abstraction layer with priority fallback chain. +Reads config/llm.yaml. Tries backends in order; falls back on any error. +""" +import os +import yaml +import requests +from pathlib import Path +from openai import OpenAI + +CONFIG_PATH = Path(__file__).parent.parent / "config" / "llm.yaml" + + +class LLMRouter: + def __init__(self, config_path: Path = CONFIG_PATH): + with open(config_path) as f: + self.config = yaml.safe_load(f) + + def _is_reachable(self, base_url: str) -> bool: + """Quick health-check ping. 
Returns True if backend is up.""" + health_url = base_url.rstrip("/").removesuffix("/v1") + "/health" + try: + resp = requests.get(health_url, timeout=2) + return resp.status_code < 500 + except Exception: + return False + + def _resolve_model(self, client: OpenAI, model: str) -> str: + """Resolve __auto__ to the first model served by vLLM.""" + if model != "__auto__": + return model + models = client.models.list() + return models.data[0].id + + def complete(self, prompt: str, system: str | None = None, + model_override: str | None = None, + fallback_order: list[str] | None = None, + images: list[str] | None = None) -> str: + """ + Generate a completion. Tries each backend in fallback_order. + + model_override: when set, replaces the configured model for + openai_compat backends (e.g. pass a research-specific ollama model). + fallback_order: when set, overrides config fallback_order for this + call (e.g. pass config["research_fallback_order"] for research tasks). + images: optional list of base64-encoded PNG/JPG strings. When provided, + backends without supports_images=true are skipped. vision_service backends + are only tried when images is provided. + Raises RuntimeError if all backends are exhausted. 
+ """ + order = fallback_order if fallback_order is not None else self.config["fallback_order"] + for name in order: + backend = self.config["backends"][name] + + if not backend.get("enabled", True): + print(f"[LLMRouter] {name}: disabled, skipping") + continue + + supports_images = backend.get("supports_images", False) + is_vision_service = backend["type"] == "vision_service" + + # vision_service only used when images provided + if is_vision_service and not images: + print(f"[LLMRouter] {name}: vision_service skipped (no images)") + continue + + # non-vision backends skipped when images provided and they don't support it + if images and not supports_images and not is_vision_service: + print(f"[LLMRouter] {name}: no image support, skipping") + continue + + if is_vision_service: + if not self._is_reachable(backend["base_url"]): + print(f"[LLMRouter] {name}: unreachable, skipping") + continue + try: + resp = requests.post( + backend["base_url"].rstrip("/") + "/analyze", + json={ + "prompt": prompt, + "image_base64": images[0] if images else "", + }, + timeout=60, + ) + resp.raise_for_status() + print(f"[LLMRouter] Used backend: {name} (vision_service)") + return resp.json()["text"] + except Exception as e: + print(f"[LLMRouter] {name}: error β€” {e}, trying next") + continue + + elif backend["type"] == "openai_compat": + if not self._is_reachable(backend["base_url"]): + print(f"[LLMRouter] {name}: unreachable, skipping") + continue + try: + client = OpenAI( + base_url=backend["base_url"], + api_key=backend.get("api_key") or "any", + ) + raw_model = model_override or backend["model"] + model = self._resolve_model(client, raw_model) + messages = [] + if system: + messages.append({"role": "system", "content": system}) + if images and supports_images: + content = [{"type": "text", "text": prompt}] + for img in images: + content.append({ + "type": "image_url", + "image_url": {"url": f"data:image/png;base64,{img}"}, + }) + messages.append({"role": "user", "content": 
content}) + else: + messages.append({"role": "user", "content": prompt}) + + resp = client.chat.completions.create( + model=model, messages=messages + ) + print(f"[LLMRouter] Used backend: {name} ({model})") + return resp.choices[0].message.content + + except Exception as e: + print(f"[LLMRouter] {name}: error β€” {e}, trying next") + continue + + elif backend["type"] == "anthropic": + api_key = os.environ.get(backend["api_key_env"], "") + if not api_key: + print(f"[LLMRouter] {name}: {backend['api_key_env']} not set, skipping") + continue + try: + import anthropic as _anthropic + client = _anthropic.Anthropic(api_key=api_key) + if images and supports_images: + content = [] + for img in images: + content.append({ + "type": "image", + "source": {"type": "base64", "media_type": "image/png", "data": img}, + }) + content.append({"type": "text", "text": prompt}) + else: + content = prompt + kwargs: dict = { + "model": backend["model"], + "max_tokens": 4096, + "messages": [{"role": "user", "content": content}], + } + if system: + kwargs["system"] = system + msg = client.messages.create(**kwargs) + print(f"[LLMRouter] Used backend: {name}") + return msg.content[0].text + except Exception as e: + print(f"[LLMRouter] {name}: error β€” {e}, trying next") + continue + + raise RuntimeError("All LLM backends exhausted") + + +# Module-level singleton for convenience +_router: LLMRouter | None = None + + +def complete(prompt: str, system: str | None = None) -> str: + global _router + if _router is None: + _router = LLMRouter() + return _router.complete(prompt, system) diff --git a/scripts/manage-ui.sh b/scripts/manage-ui.sh new file mode 100755 index 0000000..55cadd9 --- /dev/null +++ b/scripts/manage-ui.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash +# scripts/manage-ui.sh β€” manage the Streamlit job-seeker web UI +# Usage: bash scripts/manage-ui.sh [start|stop|restart|status|logs] + +set -euo pipefail + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" +STREAMLIT_BIN="/devl/miniconda3/envs/job-seeker/bin/streamlit" +APP_ENTRY="$REPO_DIR/app/app.py" +PID_FILE="$REPO_DIR/.streamlit.pid" +LOG_FILE="$REPO_DIR/.streamlit.log" +PORT="${STREAMLIT_PORT:-8501}" + +start() { + if is_running; then + echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload." + return 0 + fi + + echo "Starting Streamlit on http://localhost:$PORT …" + "$STREAMLIT_BIN" run "$APP_ENTRY" \ + --server.port "$PORT" \ + --server.headless true \ + --server.fileWatcherType none \ + > "$LOG_FILE" 2>&1 & + echo $! > "$PID_FILE" + sleep 2 + + if is_running; then + echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE" + else + echo "Failed to start. Check logs: $LOG_FILE" + tail -20 "$LOG_FILE" + exit 1 + fi +} + +stop() { + if ! is_running; then + echo "Not running." + rm -f "$PID_FILE" + return 0 + fi + + PID=$(cat "$PID_FILE") + echo "Stopping PID $PID …" + kill "$PID" 2>/dev/null || true + sleep 1 + if kill -0 "$PID" 2>/dev/null; then + kill -9 "$PID" 2>/dev/null || true + fi + rm -f "$PID_FILE" + echo "Stopped." +} + +restart() { + stop + sleep 1 + start +} + +status() { + if is_running; then + echo "Running (PID $(cat "$PID_FILE")) on http://localhost:$PORT" + else + echo "Not running." 
+ fi +} + +logs() { + if [[ -f "$LOG_FILE" ]]; then + tail -50 "$LOG_FILE" + else + echo "No log file found at $LOG_FILE" + fi +} + +is_running() { + if [[ -f "$PID_FILE" ]]; then + PID=$(cat "$PID_FILE") + if kill -0 "$PID" 2>/dev/null; then + return 0 + fi + fi + return 1 +} + +CMD="${1:-help}" +case "$CMD" in + start) start ;; + stop) stop ;; + restart) restart ;; + status) status ;; + logs) logs ;; + *) + echo "Usage: bash scripts/manage-ui.sh [start|stop|restart|status|logs]" + echo "" + echo " start Start the Streamlit UI (default port: $PORT)" + echo " stop Stop the running UI" + echo " restart Stop then start" + echo " status Show whether it's running" + echo " logs Tail the last 50 lines of the log" + echo "" + echo " STREAMLIT_PORT=8502 bash scripts/manage-ui.sh start (custom port)" + ;; +esac diff --git a/scripts/manage-vision.sh b/scripts/manage-vision.sh new file mode 100755 index 0000000..43b089c --- /dev/null +++ b/scripts/manage-vision.sh @@ -0,0 +1,113 @@ +#!/usr/bin/env bash +# scripts/manage-vision.sh β€” manage the moondream2 vision service +# Usage: bash scripts/manage-vision.sh start|stop|restart|status|logs +# +# First-time setup: +# conda env create -f scripts/vision_service/environment.yml +# +# On first start, moondream2 is downloaded from HuggingFace (~1.8GB). +# Model stays resident in memory between requests. + +set -euo pipefail + +CONDA_ENV="job-seeker-vision" +UVICORN_BIN="/devl/miniconda3/envs/${CONDA_ENV}/bin/uvicorn" +PID_FILE="/tmp/vision-service.pid" +LOG_FILE="/tmp/vision-service.log" +PORT=8002 +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(dirname "$SCRIPT_DIR")" + +is_running() { + if [[ -f "$PID_FILE" ]]; then + PID=$(cat "$PID_FILE") + if kill -0 "$PID" 2>/dev/null; then + return 0 + fi + fi + return 1 +} + +start() { + if is_running; then + echo "Already running (PID $(cat "$PID_FILE"))." + return 0 + fi + + if [[ ! -f "$UVICORN_BIN" ]]; then + echo "ERROR: conda env '$CONDA_ENV' not found." 
+ echo "Install with: conda env create -f scripts/vision_service/environment.yml" + exit 1 + fi + + echo "Starting vision service (moondream2) on port $PORT…" + cd "$REPO_ROOT" + PYTHONPATH="$REPO_ROOT" "$UVICORN_BIN" \ + scripts.vision_service.main:app \ + --host 0.0.0.0 \ + --port "$PORT" \ + > "$LOG_FILE" 2>&1 & + echo $! > "$PID_FILE" + sleep 2 + + if is_running; then + echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE" + echo "Health: http://localhost:$PORT/health" + else + echo "Failed to start. Check logs: $LOG_FILE" + tail -20 "$LOG_FILE" + rm -f "$PID_FILE" + exit 1 + fi +} + +stop() { + if ! is_running; then + echo "Not running." + rm -f "$PID_FILE" + return 0 + fi + PID=$(cat "$PID_FILE") + echo "Stopping PID $PID…" + kill "$PID" 2>/dev/null || true + sleep 2 + if kill -0 "$PID" 2>/dev/null; then + kill -9 "$PID" 2>/dev/null || true + fi + rm -f "$PID_FILE" + echo "Stopped." +} + +restart() { stop; sleep 1; start; } + +status() { + if is_running; then + echo "Running (PID $(cat "$PID_FILE")) β€” http://localhost:$PORT" + curl -s "http://localhost:$PORT/health" | python3 -m json.tool 2>/dev/null || true + else + echo "Not running." + fi +} + +logs() { + if [[ -f "$LOG_FILE" ]]; then + tail -50 "$LOG_FILE" + else + echo "No log file at $LOG_FILE" + fi +} + +CMD="${1:-help}" +case "$CMD" in + start) start ;; + stop) stop ;; + restart) restart ;; + status) status ;; + logs) logs ;; + *) + echo "Usage: bash scripts/manage-vision.sh start|stop|restart|status|logs" + echo "" + echo " Manages the moondream2 vision service on port $PORT." 
+ echo " First-time setup: conda env create -f scripts/vision_service/environment.yml" + ;; +esac diff --git a/scripts/manage-vllm.sh b/scripts/manage-vllm.sh new file mode 100755 index 0000000..8386e20 --- /dev/null +++ b/scripts/manage-vllm.sh @@ -0,0 +1,160 @@ +#!/usr/bin/env bash +# scripts/manage-vllm.sh β€” manage the vLLM inference server +# Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list] + +set -euo pipefail + +VLLM_BIN="/devl/miniconda3/envs/vllm/bin/python" +MODEL_DIR="/Library/Assets/LLM/vllm/models" +PID_FILE="/tmp/vllm-server.pid" +LOG_FILE="/tmp/vllm-server.log" +MODEL_FILE="/tmp/vllm-server.model" +PORT=8000 +GPU=1 + +_list_model_names() { + if [[ -d "$MODEL_DIR" ]]; then + find "$MODEL_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null | sort + fi +} + +is_running() { + if [[ -f "$PID_FILE" ]]; then + PID=$(cat "$PID_FILE") + if kill -0 "$PID" 2>/dev/null; then + return 0 + fi + fi + return 1 +} + +start() { + local model_name="${1:-}" + + if [[ -z "$model_name" ]]; then + model_name=$(_list_model_names | head -1) + if [[ -z "$model_name" ]]; then + echo "No models found in $MODEL_DIR" + exit 1 + fi + fi + + local model_path + if [[ "$model_name" == /* ]]; then + model_path="$model_name" + model_name=$(basename "$model_path") + else + model_path="$MODEL_DIR/$model_name" + fi + + if [[ ! -d "$model_path" ]]; then + echo "Model not found: $model_path" + exit 1 + fi + + if is_running; then + echo "Already running (PID $(cat "$PID_FILE")). Use 'restart' to reload." + return 0 + fi + + echo "Starting vLLM with model: $model_name (GPU $GPU, port $PORT)…" + echo "$model_name" > "$MODEL_FILE" + + # Ouro LoopLM uses total_ut_steps=4 which multiplies KV cache by 4x vs a standard + # transformer. On 8 GiB GPUs: 1.4B models support ~4096 tokens; 2.6B only ~928. 
+ CUDA_VISIBLE_DEVICES="$GPU" "$VLLM_BIN" -m vllm.entrypoints.openai.api_server \ + --model "$model_path" \ + --trust-remote-code \ + --max-model-len 3072 \ + --gpu-memory-utilization 0.75 \ + --enforce-eager \ + --max-num-seqs 8 \ + --port "$PORT" \ + > "$LOG_FILE" 2>&1 & + echo $! > "$PID_FILE" + sleep 3 + + if is_running; then + echo "Started (PID $(cat "$PID_FILE")). Logs: $LOG_FILE" + else + echo "Failed to start. Check logs: $LOG_FILE" + tail -20 "$LOG_FILE" + rm -f "$PID_FILE" "$MODEL_FILE" + exit 1 + fi +} + +stop() { + if ! is_running; then + echo "Not running." + rm -f "$PID_FILE" + return 0 + fi + + PID=$(cat "$PID_FILE") + echo "Stopping PID $PID …" + kill "$PID" 2>/dev/null || true + sleep 2 + if kill -0 "$PID" 2>/dev/null; then + kill -9 "$PID" 2>/dev/null || true + fi + rm -f "$PID_FILE" "$MODEL_FILE" + echo "Stopped." +} + +restart() { + local model_name="${1:-}" + stop + sleep 1 + start "$model_name" +} + +status() { + if is_running; then + local model="" + if [[ -f "$MODEL_FILE" ]]; then + model=" β€” model: $(cat "$MODEL_FILE")" + fi + echo "Running (PID $(cat "$PID_FILE")) on http://localhost:$PORT$model" + else + echo "Not running." 
+ fi +} + +logs() { + if [[ -f "$LOG_FILE" ]]; then + tail -50 "$LOG_FILE" + else + echo "No log file found at $LOG_FILE" + fi +} + +list() { + echo "Available models in $MODEL_DIR:" + _list_model_names | while read -r name; do + echo " - $name" + done +} + +CMD="${1:-help}" +case "$CMD" in + start) start "${2:-}" ;; + stop) stop ;; + restart) restart "${2:-}" ;; + status) status ;; + logs) logs ;; + list) list ;; + *) + echo "Usage: bash scripts/manage-vllm.sh [start [model]|stop|restart [model]|status|logs|list]" + echo "" + echo " start [model] Start vLLM with the specified model (default: first in $MODEL_DIR)" + echo " stop Stop the running vLLM server" + echo " restart [model] Stop then start (pass a new model name to swap)" + echo " status Show whether it's running and which model is loaded" + echo " logs Tail the last 50 lines of the log" + echo " list List available models" + echo "" + echo " GPU: $GPU (CUDA_VISIBLE_DEVICES)" + echo " Port: $PORT" + ;; +esac diff --git a/scripts/match.py b/scripts/match.py new file mode 100644 index 0000000..af1d000 --- /dev/null +++ b/scripts/match.py @@ -0,0 +1,156 @@ +""" +Resume match scoring. + +Two modes: + 1. SQLite batch β€” score all unscored pending/approved jobs in staging.db + Usage: python scripts/match.py + + 2. 
Notion single β€” score one Notion page by URL/ID and write results back + Usage: python scripts/match.py +""" +import re +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import requests +import yaml +from bs4 import BeautifulSoup +from notion_client import Client + +CONFIG_DIR = Path(__file__).parent.parent / "config" +RESUME_PATH = Path("/Library/Documents/JobSearch/Alex_Rivera_Resume_02-19-2025.pdf") + + +def load_notion() -> tuple[Client, dict]: + cfg = yaml.safe_load((CONFIG_DIR / "notion.yaml").read_text()) + return Client(auth=cfg["token"]), cfg["field_map"] + + +def extract_page_id(url_or_id: str) -> str: + """Extract 32-char Notion page ID from a URL or return as-is.""" + clean = url_or_id.replace("-", "") + match = re.search(r"[0-9a-f]{32}", clean) + return match.group(0) if match else url_or_id.strip() + + +def get_job_url_from_notion(notion: Client, page_id: str, url_field: str) -> str: + page = notion.pages.retrieve(page_id) + return page["properties"][url_field]["url"] or "" + + +def extract_job_description(url: str) -> str: + """Fetch a job listing URL and return its visible text.""" + resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + for tag in soup(["script", "style", "nav", "header", "footer"]): + tag.decompose() + return " ".join(soup.get_text(separator=" ").split()) + + +def read_resume_text() -> str: + """Extract text from the ATS-clean PDF resume.""" + import pypdf + reader = pypdf.PdfReader(str(RESUME_PATH)) + return " ".join(page.extract_text() or "" for page in reader.pages) + + +def match_score(resume_text: str, job_text: str) -> tuple[float, list[str]]: + """ + Score resume against job description using TF-IDF cosine similarity. + Returns (score 0–100, list of high-value job keywords missing from resume). 
+ """ + import numpy as np + from sklearn.feature_extraction.text import TfidfVectorizer + from sklearn.metrics.pairwise import cosine_similarity + + vectorizer = TfidfVectorizer(stop_words="english", max_features=200) + tfidf = vectorizer.fit_transform([resume_text, job_text]) + score = float(cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0]) * 100 + + resume_terms = set(resume_text.lower().split()) + feature_names = vectorizer.get_feature_names_out() + job_tfidf = tfidf[1].toarray()[0] + top_indices = np.argsort(job_tfidf)[::-1][:30] + top_job_terms = [feature_names[i] for i in top_indices if job_tfidf[i] > 0] + gaps = [t for t in top_job_terms if t not in resume_terms and t == t][:10] # t==t drops NaN + + return round(score, 1), gaps + + +def write_match_to_notion(notion: Client, page_id: str, score: float, gaps: list[str], fm: dict) -> None: + notion.pages.update( + page_id=page_id, + properties={ + fm["match_score"]: {"number": score}, + fm["keyword_gaps"]: {"rich_text": [{"text": {"content": ", ".join(gaps)}}]}, + }, + ) + + +def run_match(page_url_or_id: str) -> None: + notion, fm = load_notion() + page_id = extract_page_id(page_url_or_id) + + print(f"[match] Page ID: {page_id}") + job_url = get_job_url_from_notion(notion, page_id, fm["url"]) + print(f"[match] Fetching job description from: {job_url}") + + job_text = extract_job_description(job_url) + resume_text = read_resume_text() + + score, gaps = match_score(resume_text, job_text) + print(f"[match] Score: {score}/100") + print(f"[match] Keyword gaps: {', '.join(gaps) or 'none'}") + + write_match_to_notion(notion, page_id, score, gaps, fm) + print("[match] Written to Notion.") + + +def score_pending_jobs(db_path: Path = None) -> int: + """ + Score all unscored jobs (any status) in SQLite using the description + already scraped during discovery. Writes match_score + keyword_gaps back. + Returns the number of jobs scored. 
+ """ + from scripts.db import DEFAULT_DB, write_match_scores + + if db_path is None: + db_path = DEFAULT_DB + + import sqlite3 + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + rows = conn.execute( + "SELECT id, title, company, description FROM jobs " + "WHERE match_score IS NULL " + "AND description IS NOT NULL AND description != '' AND description != 'nan'" + ).fetchall() + conn.close() + + if not rows: + print("[match] No unscored jobs with descriptions found.") + return 0 + + resume_text = read_resume_text() + scored = 0 + for row in rows: + job_id, title, company, description = row["id"], row["title"], row["company"], row["description"] + try: + score, gaps = match_score(resume_text, description) + write_match_scores(db_path, job_id, score, ", ".join(gaps)) + print(f"[match] {title} @ {company}: {score}/100 gaps: {', '.join(gaps) or 'none'}") + scored += 1 + except Exception as e: + print(f"[match] Error scoring job {job_id}: {e}") + + print(f"[match] Done β€” {scored} jobs scored.") + return scored + + +if __name__ == "__main__": + if len(sys.argv) < 2: + score_pending_jobs() + else: + run_match(sys.argv[1]) diff --git a/scripts/prepare_training_data.py b/scripts/prepare_training_data.py new file mode 100644 index 0000000..5b2010b --- /dev/null +++ b/scripts/prepare_training_data.py @@ -0,0 +1,134 @@ +# scripts/prepare_training_data.py +""" +Extract training pairs from Alex's cover letter corpus for LoRA fine-tuning. 
+ +Outputs a JSONL file where each line is: + {"instruction": "Write a cover letter for the [role] position at [company].", + "output": ""} + +Usage: + conda run -n job-seeker python scripts/prepare_training_data.py + conda run -n job-seeker python scripts/prepare_training_data.py --output /path/to/out.jsonl +""" +import argparse +import json +import re +import sys +from pathlib import Path + +LETTERS_DIR = Path("/Library/Documents/JobSearch") +# Use two globs to handle mixed capitalisation ("Cover Letter" vs "cover letter") +LETTER_GLOBS = ["*Cover Letter*.md", "*cover letter*.md"] +DEFAULT_OUTPUT = LETTERS_DIR / "training_data" / "cover_letters.jsonl" + +# Patterns that appear in opening sentences to extract role +ROLE_PATTERNS = [ + r"apply for (?:the )?(.+?) (?:position|role|opportunity) at", + r"apply for (?:the )?(.+?) (?:at|with)\b", +] + + +def extract_role_from_text(text: str) -> str: + """Try to extract the role title from the first ~500 chars of a cover letter.""" + # Search the opening of the letter, skipping past any greeting line + search_text = text[:600] + for pattern in ROLE_PATTERNS: + m = re.search(pattern, search_text, re.IGNORECASE) + if m: + role = m.group(1).strip().rstrip(".") + # Filter out noise β€” role should be ≀6 words + if 1 <= len(role.split()) <= 6: + return role + return "" + + +def extract_company_from_filename(stem: str) -> str: + """Extract company name from cover letter filename stem.""" + return re.sub(r"\s*Cover Letter.*", "", stem, flags=re.IGNORECASE).strip() + + +def strip_greeting(text: str) -> str: + """Remove the 'Dear X,' line so the output is just the letter body + sign-off.""" + lines = text.splitlines() + for i, line in enumerate(lines): + if line.strip().lower().startswith("dear "): + # Skip the greeting line and any following blank lines + rest = lines[i + 1:] + while rest and not rest[0].strip(): + rest = rest[1:] + return "\n".join(rest).strip() + return text.strip() + + +def build_records(letters_dir: Path = 
LETTERS_DIR) -> list[dict]: + """Parse all cover letters and return list of training records.""" + records = [] + seen: set[Path] = set() + all_paths = [] + for glob in LETTER_GLOBS: + for p in letters_dir.glob(glob): + if p not in seen: + seen.add(p) + all_paths.append(p) + for path in sorted(all_paths): + text = path.read_text(encoding="utf-8", errors="ignore").strip() + if not text or len(text) < 100: + continue + + company = extract_company_from_filename(path.stem) + role = extract_role_from_text(text) + body = strip_greeting(text) + + if not role: + # Use a generic instruction when role extraction fails + instruction = f"Write a cover letter for a position at {company}." + else: + instruction = f"Write a cover letter for the {role} position at {company}." + + records.append({ + "instruction": instruction, + "output": body, + "source_file": path.name, + }) + + return records + + +def write_jsonl(records: list[dict], output_path: Path) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, "w", encoding="utf-8") as f: + for record in records: + f.write(json.dumps(record, ensure_ascii=False) + "\n") + + +def main() -> None: + parser = argparse.ArgumentParser(description="Prepare LoRA training data from cover letter corpus") + parser.add_argument("--output", default=str(DEFAULT_OUTPUT), help="Output JSONL path") + parser.add_argument("--letters-dir", default=str(LETTERS_DIR), help="Directory of cover letters") + parser.add_argument("--stats", action="store_true", help="Print statistics and exit") + args = parser.parse_args() + + records = build_records(Path(args.letters_dir)) + + if args.stats: + print(f"Total letters: {len(records)}") + with_role = sum(1 for r in records if not r["instruction"].startswith("Write a cover letter for a position")) + print(f"Role extracted: {with_role}/{len(records)}") + avg_len = sum(len(r["output"]) for r in records) / max(len(records), 1) + print(f"Avg letter length: {avg_len:.0f} chars") + for r 
in records: + print(f" {r['source_file']!r:55s} β†’ {r['instruction'][:70]}") + return + + output_path = Path(args.output) + write_jsonl(records, output_path) + print(f"Wrote {len(records)} training records to {output_path}") + print() + print("Next step for LoRA fine-tuning:") + print(" 1. Download base model: huggingface-cli download meta-llama/Meta-Llama-3.1-8B-Instruct") + print(" 2. Fine-tune with TRL: see docs/plans/lora-finetune.md (to be created)") + print(" 3. Or use HuggingFace Jobs: bash scripts/manage-ui.sh β€” hugging-face-model-trainer skill") + + +if __name__ == "__main__": + main() diff --git a/scripts/scrape_url.py b/scripts/scrape_url.py new file mode 100644 index 0000000..e577fe6 --- /dev/null +++ b/scripts/scrape_url.py @@ -0,0 +1,228 @@ +# scripts/scrape_url.py +""" +Scrape a job listing from its URL and update the job record. + +Supports: + - LinkedIn (guest jobs API β€” no auth required) + - Indeed (HTML parse) + - Glassdoor (JobSpy internal scraper, same as enrich_descriptions.py) + - Generic (JSON-LD β†’ og:tags fallback) + +Usage (background task β€” called by task_runner): + from scripts.scrape_url import scrape_job_url + scrape_job_url(db_path, job_id) +""" +import json +import re +import sqlite3 +import sys +from pathlib import Path +from typing import Optional +from urllib.parse import urlparse, urlencode, parse_qsl + +import requests +from bs4 import BeautifulSoup + +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.db import DEFAULT_DB, update_job_fields + +_STRIP_PARAMS = { + "utm_source", "utm_medium", "utm_campaign", "utm_content", "utm_term", + "trk", "trkEmail", "refId", "trackingId", "lipi", "midToken", "midSig", + "eid", "otpToken", "ssid", "fmid", +} + +_HEADERS = { + "User-Agent": ( + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36" + ) +} +_TIMEOUT = 12 + + +def _detect_board(url: str) -> str: + """Return 'linkedin', 'indeed', 'glassdoor', or 
'generic'.""" + url_lower = url.lower() + if "linkedin.com" in url_lower: + return "linkedin" + if "indeed.com" in url_lower: + return "indeed" + if "glassdoor.com" in url_lower: + return "glassdoor" + return "generic" + + +def _extract_linkedin_job_id(url: str) -> Optional[str]: + """Extract numeric job ID from a LinkedIn job URL.""" + m = re.search(r"/jobs/view/(\d+)", url) + return m.group(1) if m else None + + +def canonicalize_url(url: str) -> str: + """ + Strip tracking parameters from a job URL and return a clean canonical form. + + LinkedIn: https://www.linkedin.com/jobs/view//?trk=... β†’ https://www.linkedin.com/jobs/view// + Others: strips utm_source/utm_medium/utm_campaign/trk/refId/trackingId + """ + url = url.strip() + if "linkedin.com" in url.lower(): + job_id = _extract_linkedin_job_id(url) + if job_id: + return f"https://www.linkedin.com/jobs/view/{job_id}/" + parsed = urlparse(url) + clean_qs = urlencode([(k, v) for k, v in parse_qsl(parsed.query) if k not in _STRIP_PARAMS]) + return parsed._replace(query=clean_qs).geturl() + + +def _scrape_linkedin(url: str) -> dict: + """Fetch via LinkedIn guest jobs API (no auth required).""" + job_id = _extract_linkedin_job_id(url) + if not job_id: + return {} + api_url = f"https://www.linkedin.com/jobs-guest/jobs/api/jobPosting/{job_id}" + resp = requests.get(api_url, headers=_HEADERS, timeout=_TIMEOUT) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "html.parser") + + def _text(selector, **kwargs): + tag = soup.find(selector, **kwargs) + return tag.get_text(strip=True) if tag else "" + + title = _text("h2", class_="top-card-layout__title") + company = _text("a", class_="topcard__org-name-link") or _text("span", class_="topcard__org-name-link") + location = _text("span", class_="topcard__flavor--bullet") + desc_div = soup.find("div", class_="show-more-less-html__markup") + description = desc_div.get_text(separator="\n", strip=True) if desc_div else "" + + return {k: v for k, v in { + "title": 
def _scrape_glassdoor(url: str) -> dict:
    """Re-use JobSpy's Glassdoor scraper for description fetch.

    Extracts the numeric listing id from the ``jl=`` query parameter, then
    drives JobSpy's internal Glassdoor scraper directly to fetch only the
    description (no full search run).

    Returns:
        ``{"description": ...}`` on success; ``{}`` when the URL carries no
        ``jl=`` id or when anything in the JobSpy call chain fails.

    NOTE(review): this reaches into JobSpy private API (``_get_csrf_token``,
    ``_fetch_job_description``) and may break on a JobSpy upgrade — confirm
    the pinned version before bumping the dependency.
    """
    m = re.search(r"jl=(\d+)", url)
    if not m:
        return {}
    try:
        # Imports are function-local, so this module stays importable even
        # when jobspy is not installed — a failure degrades to {} below.
        from jobspy.glassdoor import Glassdoor
        from jobspy.glassdoor.constant import fallback_token, headers
        from jobspy.model import ScraperInput, Site
        from jobspy.util import create_session

        scraper = Glassdoor()
        scraper.base_url = "https://www.glassdoor.com/"
        scraper.session = create_session(has_retry=True)
        # Glassdoor requires a CSRF token; fall back to JobSpy's bundled
        # token when the live token fetch returns nothing.
        token = scraper._get_csrf_token()
        headers["gd-csrf-token"] = token if token else fallback_token
        scraper.scraper_input = ScraperInput(site_type=[Site.GLASSDOOR])
        description = scraper._fetch_job_description(int(m.group(1)))
        return {"description": description} if description else {}
    except Exception:
        # Best-effort by design: any failure means "no fields scraped".
        return {}
def _scrape_generic(url: str) -> dict:
    """Fetch an arbitrary job page and extract fields via JSON-LD / og: tags."""
    response = requests.get(url, headers=_HEADERS, timeout=_TIMEOUT)
    response.raise_for_status()
    parsed = _parse_json_ld_or_og(response.text)
    return parsed if parsed else {}
+ """ + if job_id is None: + return {} + + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + row = conn.execute("SELECT url FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + if not row: + return {} + + url = row["url"] or "" + if not url.startswith("http"): + return {} + + board = _detect_board(url) + try: + if board == "linkedin": + fields = _scrape_linkedin(url) + elif board == "indeed": + fields = _scrape_indeed(url) + elif board == "glassdoor": + fields = _scrape_glassdoor(url) + else: + fields = _scrape_generic(url) + except requests.RequestException as exc: + print(f"[scrape_url] HTTP error for job {job_id} ({url}): {exc}") + return {} + except Exception as exc: + print(f"[scrape_url] Error scraping job {job_id} ({url}): {exc}") + return {} + + if fields: + fields.pop("url", None) + update_job_fields(db_path, job_id, fields) + print(f"[scrape_url] job {job_id}: scraped '{fields.get('title', '?')}' @ {fields.get('company', '?')}") + + return fields diff --git a/scripts/sync.py b/scripts/sync.py new file mode 100644 index 0000000..ddb5634 --- /dev/null +++ b/scripts/sync.py @@ -0,0 +1,97 @@ +# scripts/sync.py +""" +Push approved jobs from SQLite staging to Notion. + +Usage: + conda run -n job-seeker python scripts/sync.py +""" +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +import yaml +from datetime import datetime + +from notion_client import Client + +from scripts.db import DEFAULT_DB, get_jobs_by_status, update_job_status + +CONFIG_DIR = Path(__file__).parent.parent / "config" + + +def load_notion_config() -> dict: + return yaml.safe_load((CONFIG_DIR / "notion.yaml").read_text()) + + +def _build_properties(job: dict, fm: dict, include_optional: bool = True) -> dict: + """Build the Notion properties dict for a job. 
Optional fields (match_score, + keyword_gaps) are included by default but can be dropped for DBs that don't + have those columns yet.""" + props = { + fm["title_field"]: {"title": [{"text": {"content": job.get("salary") or job.get("title", "")}}]}, + fm["job_title"]: {"rich_text": [{"text": {"content": job.get("title", "")}}]}, + fm["company"]: {"rich_text": [{"text": {"content": job.get("company", "")}}]}, + fm["url"]: {"url": job.get("url") or None}, + fm["source"]: {"multi_select": [{"name": job.get("source", "unknown").title()}]}, + fm["status"]: {"select": {"name": fm["status_new"]}}, + fm["remote"]: {"checkbox": bool(job.get("is_remote", 0))}, + fm["date_found"]: {"date": {"start": job.get("date_found", datetime.now().isoformat()[:10])}}, + } + if include_optional: + score = job.get("match_score") + if score is not None and fm.get("match_score"): + props[fm["match_score"]] = {"number": score} + gaps = job.get("keyword_gaps") + if gaps and fm.get("keyword_gaps"): + props[fm["keyword_gaps"]] = {"rich_text": [{"text": {"content": gaps}}]} + return props + + +def sync_to_notion(db_path: Path = DEFAULT_DB) -> int: + """Push all approved and applied jobs to Notion. Returns count synced.""" + cfg = load_notion_config() + notion = Client(auth=cfg["token"]) + db_id = cfg["database_id"] + fm = cfg["field_map"] + + approved = get_jobs_by_status(db_path, "approved") + applied = get_jobs_by_status(db_path, "applied") + pending_sync = approved + applied + if not pending_sync: + print("[sync] No approved/applied jobs to sync.") + return 0 + + synced_ids = [] + for job in pending_sync: + try: + notion.pages.create( + parent={"database_id": db_id}, + properties=_build_properties(job, fm, include_optional=True), + ) + synced_ids.append(job["id"]) + print(f"[sync] + {job.get('title')} @ {job.get('company')}") + except Exception as e: + err = str(e) + # Notion returns 400 validation_error when a property column doesn't exist yet. 
+ # Fall back to core fields only and warn the user. + if "validation_error" in err or "Could not find property" in err: + try: + notion.pages.create( + parent={"database_id": db_id}, + properties=_build_properties(job, fm, include_optional=False), + ) + synced_ids.append(job["id"]) + print(f"[sync] + {job.get('title')} @ {job.get('company')} " + f"(skipped optional fields β€” add Match Score / Keyword Gaps columns to Notion DB)") + except Exception as e2: + print(f"[sync] Error syncing {job.get('url')}: {e2}") + else: + print(f"[sync] Error syncing {job.get('url')}: {e}") + + update_job_status(db_path, synced_ids, "synced") + print(f"[sync] Done β€” {len(synced_ids)} jobs synced to Notion.") + return len(synced_ids) + + +if __name__ == "__main__": + sync_to_notion() diff --git a/scripts/task_runner.py b/scripts/task_runner.py new file mode 100644 index 0000000..9e6cafd --- /dev/null +++ b/scripts/task_runner.py @@ -0,0 +1,155 @@ +# scripts/task_runner.py +""" +Background task runner for LLM generation tasks. + +Submitting a task inserts a row in background_tasks and spawns a daemon thread. +The thread calls the appropriate generator, writes results to existing tables, +and marks the task completed or failed. + +Deduplication: only one queued/running task per (task_type, job_id) is allowed. +Different task types for the same job run concurrently (e.g. cover letter + research). +""" +import sqlite3 +import threading +from pathlib import Path + +from scripts.db import ( + DEFAULT_DB, + insert_task, + update_task_status, + update_task_stage, + update_cover_letter, + save_research, +) + + +def submit_task(db_path: Path = DEFAULT_DB, task_type: str = "", + job_id: int = None) -> tuple[int, bool]: + """Submit a background LLM task. + + Returns (task_id, True) if a new task was queued and a thread spawned. + Returns (existing_id, False) if an identical task is already in-flight. 
def _run_task(db_path: Path, task_id: int, task_type: str, job_id: int) -> None:
    """Thread body: run the generator for *task_type* and persist the result.

    Dispatches on ``task_type``, runs the matching generator (each imported
    lazily inside its branch, so one broken dependency only fails its own
    task type), and marks the background_tasks row completed/failed.

    NOTE: for several branches the ``error`` column doubles as a
    human-readable success summary — readers of background_tasks should not
    treat a non-empty ``error`` as failure on its own.
    """
    # job_id == 0 means a global task (e.g. discovery) with no associated job row.
    job: dict = {}
    if job_id:
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        row = conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()
        conn.close()
        if row is None:
            # Job was deleted between submit and run — fail fast.
            update_task_status(db_path, task_id, "failed", error=f"Job {job_id} not found")
            return
        job = dict(row)

    update_task_status(db_path, task_id, "running")

    try:
        if task_type == "discovery":
            from scripts.discover import run_discovery
            new_count = run_discovery(db_path)
            n = new_count or 0
            # Success summary goes through the error field (see docstring).
            update_task_status(
                db_path, task_id, "completed",
                error=f"{n} new listing{'s' if n != 1 else ''} added",
            )
            return

        elif task_type == "cover_letter":
            from scripts.generate_cover_letter import generate
            result = generate(
                job.get("title", ""),
                job.get("company", ""),
                job.get("description", ""),
            )
            update_cover_letter(db_path, job_id, result)
            # Falls through to the generic "completed" status at the bottom.

        elif task_type == "company_research":
            from scripts.company_research import research_company
            # on_stage lets the long-running research report progress into
            # the task row as it goes.
            result = research_company(
                job,
                on_stage=lambda s: update_task_stage(db_path, task_id, s),
            )
            save_research(db_path, job_id=job_id, **result)

        elif task_type == "enrich_descriptions":
            from scripts.enrich_descriptions import enrich_all_descriptions
            r = enrich_all_descriptions(db_path)
            errs = len(r.get("errors", []))
            msg = (
                f"{r['succeeded']} description(s) fetched, {r['failed']} failed"
                + (f", {errs} error(s)" if errs else "")
            )
            update_task_status(db_path, task_id, "completed", error=msg)
            return

        elif task_type == "scrape_url":
            from scripts.scrape_url import scrape_job_url
            fields = scrape_job_url(db_path, job_id)
            title = fields.get("title") or job.get("url", "?")
            company = fields.get("company", "")
            msg = f"{title}" + (f" @ {company}" if company else "")
            update_task_status(db_path, task_id, "completed", error=msg)
            # Auto-enrich company/salary for Craigslist jobs: re-read the row
            # because scrape_job_url may have just updated it.
            conn = sqlite3.connect(db_path)
            conn.row_factory = sqlite3.Row
            job_row = conn.execute(
                "SELECT source, company FROM jobs WHERE id=?", (job_id,)
            ).fetchone()
            conn.close()
            if job_row and job_row["source"] == "craigslist" and not job_row["company"]:
                # Chain a follow-up task; dedup in submit_task prevents doubles.
                submit_task(db_path, "enrich_craigslist", job_id)
            return

        elif task_type == "enrich_craigslist":
            from scripts.enrich_descriptions import enrich_craigslist_fields
            extracted = enrich_craigslist_fields(db_path, job_id)
            company = extracted.get("company", "")
            msg = f"company={company}" if company else "no company found"
            update_task_status(db_path, task_id, "completed", error=msg)
            return

        elif task_type == "email_sync":
            try:
                from scripts.imap_sync import sync_all
                result = sync_all(db_path,
                                  on_stage=lambda s: update_task_stage(db_path, task_id, s))
                leads = result.get("new_leads", 0)
                todo = result.get("todo_attached", 0)
                errs = len(result.get("errors", []))
                msg = (
                    f"{result['synced']} jobs updated, "
                    f"+{result['inbound']} in, +{result['outbound']} out"
                    + (f", {leads} new lead(s)" if leads else "")
                    + (f", {todo} todo attached" if todo else "")
                    + (f", {errs} error(s)" if errs else "")
                )
                update_task_status(db_path, task_id, "completed", error=msg)
                return
            except FileNotFoundError:
                # Raised when the email config file is absent — turn it into
                # an actionable message instead of a stack trace.
                update_task_status(db_path, task_id, "failed",
                                   error="Email not configured — go to Settings → Email")
                return

        else:
            raise ValueError(f"Unknown task_type: {task_type!r}")

        # Reached only by branches without an early return (cover_letter,
        # company_research).
        update_task_status(db_path, task_id, "completed")

    except BaseException as exc:
        # BaseException catches SystemExit (from companyScraper sys.exit calls)
        # in addition to regular exceptions.
        update_task_status(db_path, task_id, "failed", error=str(exc))
({text[:30]})" + except Exception as e: + return f"ERR: {e!s:.20}" + + +def _short(s: str, n: int = 55) -> str: + return s if len(s) <= n else s[:n - 1] + "…" + + +def _explain_block(subject: str, body: str) -> str: + """Return the first phrase/rule that triggered a block.""" + subject_lower = subject.lower().strip() + for p in _SPAM_SUBJECT_PREFIXES: + if subject_lower.startswith(p): + return f"subject prefix: {p!r}" + for p in _ATS_CONFIRM_SUBJECTS: + if p in subject_lower: + return f"ATS subject: {p!r}" + haystack = subject_lower + " " + body[:800].lower() + for p in _REJECTION_PHRASES + _SPAM_PHRASES: + if p in haystack: + return f"phrase: {p!r}" + return "unknown" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--limit", type=int, default=20, help="Max emails to test") + parser.add_argument("--days", type=int, default=90) + parser.add_argument("--dry-run", action="store_true", + help="Skip LLM calls β€” show phrase filter only") + parser.add_argument("--verbose", action="store_true", + help="Show which phrase triggered each BLOCK") + args = parser.parse_args() + + cfg = load_config() + since = (datetime.now() - timedelta(days=args.days)).strftime("%d-%b-%Y") + + print(f"Connecting to {cfg.get('host')} …") + conn = connect(cfg) + + # Collect unique UIDs across broad terms + all_uids: dict[bytes, None] = {} + for term in BROAD_TERMS: + for uid in _search_folder(conn, "INBOX", f'(SUBJECT "{term}")', since): + all_uids[uid] = None + + sample = list(all_uids.keys())[: args.limit] + print(f"Fetched {len(all_uids)} matching UIDs, testing {len(sample)}\n") + + # Header + if args.dry_run: + print(f"{'Subject':<56} {'RK':3} {'Phrase':7}") + print("-" * 72) + else: + print(f"{'Subject':<56} {'RK':3} {'Phrase':7} {'phi3':<20} {'llama3':<20} {'vllm':<20}") + print("-" * 130) + + passed = skipped = 0 + rows = [] + + for uid in sample: + parsed = _parse_message(conn, uid) + if not parsed: + continue + subj = parsed["subject"] + body = 
parsed["body"] + + has_rk = _has_recruitment_keyword(subj) + phrase_block = _has_rejection_or_ats_signal(subj, body) + + if args.dry_run: + rk_mark = "βœ“" if has_rk else "βœ—" + pb_mark = "BLOCK" if phrase_block else "pass" + line = f"{_short(subj):<56} {rk_mark:3} {pb_mark:7}" + if phrase_block and args.verbose: + reason = _explain_block(subj, body) + line += f" [{reason}]" + print(line) + continue + + if phrase_block or not has_rk: + skipped += 1 + rk_mark = "βœ“" if has_rk else "βœ—" + pb_mark = "BLOCK" if phrase_block else "pass" + print(f"{_short(subj):<56} {rk_mark:3} {pb_mark:7} {'β€”':<20} {'β€”':<20} {'β€”':<20}") + continue + + passed += 1 + results = {} + for name, (model, fallback) in MODELS.items(): + results[name] = _classify(subj, body, model, fallback) + + pb_mark = "pass" + print(f"{_short(subj):<56} {'βœ“':3} {pb_mark:7} " + f"{results['phi3']:<20} {results['llama3']:<20} {results['vllm']:<20}") + + if not args.dry_run: + print(f"\nPhrase-blocked or no-keyword: {skipped} | Reached LLMs: {passed}") + + try: + conn.logout() + except Exception: + pass + + +if __name__ == "__main__": + main() diff --git a/scripts/vision_service/environment.yml b/scripts/vision_service/environment.yml new file mode 100644 index 0000000..bbbe697 --- /dev/null +++ b/scripts/vision_service/environment.yml @@ -0,0 +1,17 @@ +name: job-seeker-vision +channels: + - conda-forge + - defaults +dependencies: + - python=3.11 + - pip + - pip: + - torch>=2.0.0 + - torchvision>=0.15.0 + - transformers>=4.40.0 + - accelerate>=0.26.0 + - bitsandbytes>=0.43.0 + - einops>=0.7.0 + - Pillow>=10.0.0 + - fastapi>=0.110.0 + - "uvicorn[standard]>=0.27.0" diff --git a/scripts/vision_service/main.py b/scripts/vision_service/main.py new file mode 100644 index 0000000..0cdbf3d --- /dev/null +++ b/scripts/vision_service/main.py @@ -0,0 +1,98 @@ +""" +Vision service β€” moondream2 inference for survey screenshot analysis. 
def _load_model() -> None:
    """Load moondream2 into module-level state (idempotent; loads once).

    Sets the module globals ``_model``, ``_tokenizer``, ``_device`` and
    toggles ``_loading`` around the load so ``/health`` can report progress.
    On CUDA the model is loaded 4-bit quantized; on CPU it is loaded
    unquantized and moved explicitly.
    """
    global _model, _tokenizer, _device, _loading
    # Already resident — nothing to do.
    if _model is not None:
        return
    _loading = True
    print("[vision] Loading moondream2…")
    # Heavy imports kept function-local so the FastAPI app starts fast.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "vikhyatk/moondream2"
    # Pinned revision: trust_remote_code executes repo code, so pin it.
    revision = "2025-01-09"
    _device = "cuda" if torch.cuda.is_available() else "cpu"

    if _device == "cuda":
        from transformers import BitsAndBytesConfig
        # 4-bit quantization to keep the VRAM footprint small.
        bnb = BitsAndBytesConfig(load_in_4bit=True)
        _model = AutoModelForCausalLM.from_pretrained(
            model_id, revision=revision,
            quantization_config=bnb,
            trust_remote_code=True,
            device_map="auto",
        )
    else:
        _model = AutoModelForCausalLM.from_pretrained(
            model_id, revision=revision,
            trust_remote_code=True,
        )
        _model.to(_device)

    _tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    # NOTE(review): if from_pretrained raises, _loading stays True and
    # /health reports "loading" forever — consider try/finally; confirm.
    _loading = False
    print(f"[vision] moondream2 ready on {_device}")
response_model=AnalyzeResponse) +def analyze(req: AnalyzeRequest): + from PIL import Image + import torch + + _load_model() + + try: + image_data = base64.b64decode(req.image_base64) + image = Image.open(io.BytesIO(image_data)).convert("RGB") + except Exception as e: + raise HTTPException(status_code=400, detail=f"Invalid image: {e}") + + with torch.no_grad(): + enc_image = _model.encode_image(image) + answer = _model.answer_question(enc_image, req.prompt, _tokenizer) + + return AnalyzeResponse(text=answer) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_company_research.py b/tests/test_company_research.py new file mode 100644 index 0000000..ea696dd --- /dev/null +++ b/tests/test_company_research.py @@ -0,0 +1,84 @@ +import sys +from pathlib import Path +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from scripts.company_research import _score_experiences, _build_resume_context, _load_resume_and_keywords + + +RESUME = { + "experience_details": [ + { + "position": "Lead Technical Account Manager", + "company": "UpGuard", + "employment_period": "10/2022 - 05/2023", + "key_responsibilities": [ + {"r1": "Managed enterprise security accounts worth $2M ARR"}, + {"r2": "Led QBR cadence with C-suite stakeholders"}, + ], + }, + { + "position": "Founder and Principal Consultant", + "company": "M3 Consulting Services", + "employment_period": "07/2023 - Present", + "key_responsibilities": [ + {"r1": "Revenue operations consulting for SaaS clients"}, + {"r2": "Built customer success frameworks"}, + ], + }, + { + "position": "Customer Success Manager", + "company": "Generic Co", + "employment_period": "01/2020 - 09/2022", + "key_responsibilities": [ + {"r1": "Managed SMB portfolio"}, + ], + }, + ] +} + +KEYWORDS = ["ARR", "QBR", "enterprise", "security", "stakeholder"] +JD = "Looking for a TAM with enterprise ARR experience and QBR facilitation skills." 
+ + +def test_score_experiences_returns_sorted(): + """UpGuard entry should score highest β€” most keywords present in text and JD.""" + scored = _score_experiences(RESUME["experience_details"], KEYWORDS, JD) + assert scored[0]["company"] == "UpGuard" + + +def test_score_experiences_adds_score_key(): + """Each returned entry has a 'score' integer key.""" + scored = _score_experiences(RESUME["experience_details"], KEYWORDS, JD) + for e in scored: + assert isinstance(e["score"], int) + + +def test_build_resume_context_top2_in_full(): + """Top 2 experiences appear with full bullet detail.""" + ctx = _build_resume_context(RESUME, KEYWORDS, JD) + assert "Lead Technical Account Manager" in ctx + assert "Managed enterprise security accounts" in ctx + assert "Founder and Principal Consultant" in ctx + + +def test_build_resume_context_rest_condensed(): + """Remaining experiences appear as condensed one-liners, not full bullets.""" + ctx = _build_resume_context(RESUME, KEYWORDS, JD) + assert "Also in Alex" in ctx + assert "Generic Co" in ctx + # Generic Co bullets should NOT appear in full + assert "Managed SMB portfolio" not in ctx + + +def test_upguard_nda_low_score(): + """UpGuard name replaced with 'enterprise security vendor' when score < 3.""" + ctx = _build_resume_context(RESUME, ["python", "kubernetes"], "python kubernetes devops") + assert "enterprise security vendor" in ctx + + +def test_load_resume_and_keywords_returns_lists(): + """_load_resume_and_keywords returns a tuple of (dict, list[str]).""" + resume, keywords = _load_resume_and_keywords() + assert isinstance(resume, dict) + assert isinstance(keywords, list) + assert all(isinstance(k, str) for k in keywords) diff --git a/tests/test_cover_letter.py b/tests/test_cover_letter.py new file mode 100644 index 0000000..558d261 --- /dev/null +++ b/tests/test_cover_letter.py @@ -0,0 +1,120 @@ +# tests/test_cover_letter.py +import pytest +from pathlib import Path +from unittest.mock import patch, MagicMock + + +# ── 
def test_extract_role_from_text() -> None:
    """extract_role_from_text pulls the role title from the opening sentence."""
    # NOTE(review): imports are function-local in these tests — presumably so
    # pytest collection succeeds even when the module under test is broken;
    # confirm before hoisting them.
    from scripts.prepare_training_data import extract_role_from_text

    text = "Dear Tailscale Hiring Team,\n\nI'm delighted to apply for the Customer Support Manager position at Tailscale."
    assert extract_role_from_text(text) == "Customer Support Manager"


def test_extract_role_handles_missing() -> None:
    """extract_role_from_text returns empty string if no role found."""
    from scripts.prepare_training_data import extract_role_from_text

    assert extract_role_from_text("Dear Team,\n\nHello there.") == ""


def test_extract_company_from_filename() -> None:
    """extract_company_from_filename strips 'Cover Letter' suffix."""
    from scripts.prepare_training_data import extract_company_from_filename

    assert extract_company_from_filename("Tailscale Cover Letter") == "Tailscale"
    assert extract_company_from_filename("Dagster Labs Cover Letter.md") == "Dagster Labs"


def test_strip_greeting() -> None:
    """strip_greeting removes the 'Dear X,' line and returns the body."""
    from scripts.prepare_training_data import strip_greeting

    text = "Dear Hiring Team,\n\nI'm delighted to apply for the CSM role.\n\nBest regards,\nAlex"
    result = strip_greeting(text)
    assert result.startswith("I'm delighted")
    assert "Dear" not in result
" + "With six years of experience, I bring strong skills.\n\n" + "Best regards,\nAlex Rivera" + ) + + records = build_records(tmp_path) + assert len(records) == 1 + assert "Acme Corp" in records[0]["instruction"] + assert "Director of Customer Success" in records[0]["instruction"] + assert records[0]["output"].startswith("I'm delighted") + + +def test_build_records_skips_empty_files(tmp_path): + """build_records ignores empty or very short files.""" + from scripts.prepare_training_data import build_records + + (tmp_path / "Empty Cover Letter.md").write_text("") + (tmp_path / "Tiny Cover Letter.md").write_text("Hi") + + records = build_records(tmp_path) + assert len(records) == 0 + + +# ── generate_cover_letter tests ─────────────────────────────────────────────── + +def test_find_similar_letters_returns_top_k(): + """find_similar_letters returns at most top_k entries.""" + from scripts.generate_cover_letter import find_similar_letters + + corpus = [ + {"company": "Acme", "text": "customer success technical account management SaaS"}, + {"company": "Beta", "text": "software engineering backend python"}, + {"company": "Gamma", "text": "customer onboarding enterprise NPS"}, + {"company": "Delta", "text": "customer success manager renewal QBR"}, + ] + results = find_similar_letters("customer success manager enterprise SaaS", corpus, top_k=2) + assert len(results) == 2 + # Should prefer customer success companies over software engineering + companies = [r["company"] for r in results] + assert "Beta" not in companies + + +def test_load_corpus_returns_list(): + """load_corpus returns a list (may be empty if LETTERS_DIR absent, must not crash).""" + from scripts.generate_cover_letter import load_corpus, LETTERS_DIR + + if LETTERS_DIR.exists(): + corpus = load_corpus() + assert isinstance(corpus, list) + if corpus: + assert "company" in corpus[0] + assert "text" in corpus[0] + else: + pytest.skip("LETTERS_DIR not present in this environment") + + +def 
def test_generate_calls_llm_router():
    """generate() calls the router's complete() and returns its output."""
    from scripts.generate_cover_letter import generate

    fake_corpus = [
        {"company": "Acme", "text": "I'm delighted to apply for the CSM role at Acme."},
    ]
    mock_router = MagicMock()
    mock_router.complete.return_value = (
        "Dear Hiring Team,\n\nI'm delighted to apply.\n\nWarm regards,\nAlex Rivera"
    )

    with patch("scripts.generate_cover_letter.load_corpus", return_value=fake_corpus):
        result = generate("Customer Success Manager", "TestCo", "Looking for a CSM",
                          _router=mock_router)

    mock_router.complete.assert_called_once()
    assert "Alex Rivera" in result


# ═══ tests/test_craigslist.py (new file in this patch) ═══════════════════════

"""Tests for Craigslist RSS scraper."""
from datetime import datetime, timezone, timedelta
from email.utils import format_datetime
from unittest.mock import patch, MagicMock
import xml.etree.ElementTree as ET

import pytest
import requests


# ── RSS fixture helpers ──────────────────────────────────────────────────────

def _make_rss(items: list[dict]) -> bytes:
    """Build minimal Craigslist-style RSS XML from a list of item dicts."""
    channel = ET.Element("channel")
    for entry in items:
        item_el = ET.SubElement(channel, "item")
        for tag, text in entry.items():
            ET.SubElement(item_el, tag).text = text
    root = ET.Element("rss")
    root.append(channel)
    return ET.tostring(root, encoding="utf-8", xml_declaration=True)


def _pubdate(hours_ago: float = 1.0) -> str:
    """Return an RFC 2822 pubDate string for N hours ago."""
    moment = datetime.now(tz=timezone.utc) - timedelta(hours=hours_ago)
    return format_datetime(moment)


def _mock_resp(content: bytes, status_code: int = 200) -> MagicMock:
    """Fake a requests.Response carrying *content*; raise_for_status fires on >=400."""
    resp = MagicMock()
    resp.status_code = status_code
    resp.content = content
    resp.raise_for_status = MagicMock()
    if status_code >= 400:
        resp.raise_for_status.side_effect = requests.HTTPError(f"HTTP {status_code}")
    return resp


# ── Fixtures ─────────────────────────────────────────────────────────────────

_SAMPLE_RSS = _make_rss([{
    "title": "Customer Success Manager",
    "link": "https://sfbay.craigslist.org/jjj/d/csm-role/1234567890.html",
    "description": "Great CSM role at Acme Corp. Salary $120k.",
    "pubDate": _pubdate(1),
}])

_TWO_ITEM_RSS = _make_rss([
    {
        "title": "Customer Success Manager",
        "link": "https://sfbay.craigslist.org/jjj/d/csm-role/1111111111.html",
        "description": "CSM role 1.",
        "pubDate": _pubdate(1),
    },
    {
        "title": "Account Manager",
        "link": "https://sfbay.craigslist.org/jjj/d/am-role/2222222222.html",
        "description": "AM role.",
        "pubDate": _pubdate(2),
    },
])

_OLD_ITEM_RSS = _make_rss([{
    "title": "Old Job",
    "link": "https://sfbay.craigslist.org/jjj/d/old-job/9999999999.html",
    "description": "Very old posting.",
    "pubDate": _pubdate(hours_ago=500),
}])

_TWO_METRO_CONFIG = {
    "metros": ["sfbay", "newyork"],
    "location_map": {
        "San Francisco Bay Area, CA": "sfbay",
        "New York, NY": "newyork",
    },
    "category": "jjj",
}

_SINGLE_METRO_CONFIG = {
    "metros": ["sfbay"],
    "location_map": {"San Francisco Bay Area, CA": "sfbay"},
}

_PROFILE = {"titles": ["Customer Success Manager"], "hours_old": 240}


# ── Tests ────────────────────────────────────────────────────────────────────

def test_scrape_returns_empty_on_missing_config():
    """Missing craigslist.yaml → returns [] without raising."""
    from scripts.custom_boards import craigslist
    with patch("scripts.custom_boards.craigslist._load_config",
               side_effect=FileNotFoundError("config not found")):
        result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")
    assert result == []


def test_scrape_remote_hits_all_metros():
    """location='Remote' triggers one RSS fetch per configured metro."""
    from scripts.custom_boards import craigslist
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_TWO_METRO_CONFIG), \
         patch("scripts.custom_boards.craigslist.requests.get",
               return_value=_mock_resp(_SAMPLE_RSS)) as mock_get:
        result = craigslist.scrape(_PROFILE, "Remote")

    assert mock_get.call_count == 2
    fetched = [call.args[0] for call in mock_get.call_args_list]
    assert any("sfbay" in url for url in fetched)
    assert any("newyork" in url for url in fetched)
    assert all(row["is_remote"] for row in result)


def test_scrape_location_map_resolves():
    """Known location string maps to exactly one metro."""
    from scripts.custom_boards import craigslist
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_TWO_METRO_CONFIG), \
         patch("scripts.custom_boards.craigslist.requests.get",
               return_value=_mock_resp(_SAMPLE_RSS)) as mock_get:
        result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")

    assert mock_get.call_count == 1
    assert "sfbay" in mock_get.call_args.args[0]
    assert len(result) == 1
    assert result[0]["is_remote"] is False


def test_scrape_location_not_in_map_returns_empty():
    """Location not in location_map → [] without raising."""
    from scripts.custom_boards import craigslist
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_SINGLE_METRO_CONFIG), \
         patch("scripts.custom_boards.craigslist.requests.get") as mock_get:
        result = craigslist.scrape(_PROFILE, "Portland, OR")

    assert result == []
    mock_get.assert_not_called()


def test_hours_old_filter():
    """Items older than hours_old are excluded."""
    from scripts.custom_boards import craigslist
    narrow_profile = {"titles": ["Customer Success Manager"], "hours_old": 48}
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_SINGLE_METRO_CONFIG), \
         patch("scripts.custom_boards.craigslist.requests.get",
               return_value=_mock_resp(_OLD_ITEM_RSS)):
        result = craigslist.scrape(narrow_profile, "San Francisco Bay Area, CA")

    assert result == []


def test_dedup_within_run():
    """Same URL from two different metros is only returned once."""
    from scripts.custom_boards import craigslist
    same_url_rss = _make_rss([{
        "title": "CSM Role",
        "link": "https://sfbay.craigslist.org/jjj/d/csm/1234.html",
        "description": "Same job.",
        "pubDate": _pubdate(1),
    }])
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_TWO_METRO_CONFIG), \
         patch("scripts.custom_boards.craigslist.requests.get",
               return_value=_mock_resp(same_url_rss)):
        result = craigslist.scrape(_PROFILE, "Remote")

    urls = [row["url"] for row in result]
    assert len(urls) == len(set(urls))


def test_http_error_graceful():
    """HTTP error → [] without raising."""
    from scripts.custom_boards import craigslist
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_SINGLE_METRO_CONFIG), \
         patch("scripts.custom_boards.craigslist.requests.get",
               side_effect=requests.RequestException("timeout")):
        result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")

    assert result == []


def test_malformed_xml_graceful():
    """Malformed RSS XML → [] without raising."""
    from scripts.custom_boards import craigslist
    bad_resp = MagicMock()
    bad_resp.content = b"this is not xml <<<<"
    bad_resp.raise_for_status = MagicMock()
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_SINGLE_METRO_CONFIG), \
         patch("scripts.custom_boards.craigslist.requests.get",
               return_value=bad_resp):
        result = craigslist.scrape(_PROFILE, "San Francisco Bay Area, CA")
    assert result == []


def test_results_wanted_cap():
    """Never returns more than results_wanted items."""
    from scripts.custom_boards import craigslist
    with patch("scripts.custom_boards.craigslist._load_config",
               return_value=_TWO_METRO_CONFIG), \
         patch("scripts.custom_boards.craigslist.requests.get",
               return_value=_mock_resp(_TWO_ITEM_RSS)):
        result = craigslist.scrape(_PROFILE, "Remote", results_wanted=1)

    assert len(result) <= 1


# ═══ tests/test_db.py (new file in this patch) ═══════════════════════════════

import pytest
import sqlite3
from pathlib import Path
from unittest.mock import patch


def _sample_job(url="https://example.com/1", **overrides):
    """Minimal job dict accepted by insert_job; per-test fields via overrides."""
    job = {
        "title": "CSM", "company": "Acme", "url": url,
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-20",
    }
    job.update(overrides)
    return job


def test_init_db_creates_jobs_table(tmp_path):
    """init_db creates a jobs table with correct schema."""
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    conn = sqlite3.connect(db_path)
    found = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='jobs'"
    ).fetchone()
    conn.close()
    assert found is not None


def test_insert_job_returns_id(tmp_path):
    """insert_job inserts a row and returns its id."""
    from scripts.db import init_db, insert_job
    db_path = tmp_path / "test.db"
    init_db(db_path)
    row_id = insert_job(db_path, _sample_job(salary="$100k", description="Great role"))
    assert isinstance(row_id, int)
    assert row_id > 0


def test_insert_job_skips_duplicate_url(tmp_path):
    """insert_job returns None if URL already exists."""
    from scripts.db import init_db, insert_job
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job = _sample_job()
    insert_job(db_path, job)
    assert insert_job(db_path, job) is None


def test_get_jobs_by_status(tmp_path):
    """get_jobs_by_status returns only jobs with matching status."""
    from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    row_id = insert_job(db_path, _sample_job())
    update_job_status(db_path, [row_id], "approved")
    assert len(get_jobs_by_status(db_path, "approved")) == 1
    assert len(get_jobs_by_status(db_path, "pending")) == 0


def test_update_job_status_batch(tmp_path):
    """update_job_status updates multiple rows at once."""
    from scripts.db import init_db, insert_job, update_job_status, get_jobs_by_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    row_ids = [
        insert_job(db_path, _sample_job(
            url=f"https://example.com/{i}", title=f"Job {i}",
            company="Co", source="indeed",
        ))
        for i in range(3)
    ]
    update_job_status(db_path, row_ids, "rejected")
    assert len(get_jobs_by_status(db_path, "rejected")) == 3


def test_migrate_db_adds_columns_to_existing_db(tmp_path):
    """_migrate_db adds cover_letter and applied_at to a db created without them."""
    from scripts.db import _migrate_db
    db_path = tmp_path / "legacy.db"
    # Simulate a pre-migration schema lacking the two new columns.
    conn = sqlite3.connect(db_path)
    conn.execute("""CREATE TABLE jobs (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT, company TEXT, url TEXT UNIQUE, status TEXT DEFAULT 'pending'
    )""")
    conn.commit()
    conn.close()
    _migrate_db(db_path)
    conn = sqlite3.connect(db_path)
    cols = {row[1] for row in conn.execute("PRAGMA table_info(jobs)").fetchall()}
    conn.close()
    assert "cover_letter" in cols
    assert "applied_at" in cols


def test_update_cover_letter(tmp_path):
    """update_cover_letter persists text to the DB."""
    from scripts.db import init_db, insert_job, update_cover_letter, get_jobs_by_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    update_cover_letter(db_path, job_id, "Dear Hiring Manager,\nGreat role!")
    rows = get_jobs_by_status(db_path, "pending")
    assert rows[0]["cover_letter"] == "Dear Hiring Manager,\nGreat role!"


def test_mark_applied_sets_status_and_date(tmp_path):
    """mark_applied sets status='applied' and populates applied_at."""
    from scripts.db import init_db, insert_job, mark_applied, get_jobs_by_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    mark_applied(db_path, [job_id])
    applied = get_jobs_by_status(db_path, "applied")
    assert len(applied) == 1
    assert applied[0]["status"] == "applied"
    assert applied[0]["applied_at"] is not None


# ── background_tasks tests ───────────────────────────────────────────────────

def test_init_db_creates_background_tasks_table(tmp_path):
    """init_db creates a background_tasks table."""
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    conn = sqlite3.connect(db_path)
    found = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='background_tasks'"
    ).fetchone()
    conn.close()
    assert found is not None


def test_insert_task_returns_id_and_true(tmp_path):
    """insert_task returns (task_id, True) for a new task."""
    from scripts.db import init_db, insert_job, insert_task
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    task_id, is_new = insert_task(db_path, "cover_letter", job_id)
    assert isinstance(task_id, int) and task_id > 0
    assert is_new is True


def test_insert_task_deduplicates_active_task(tmp_path):
    """insert_task returns (existing_id, False) if a queued/running task already exists."""
    from scripts.db import init_db, insert_job, insert_task
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    first_id, _ = insert_task(db_path, "cover_letter", job_id)
    second_id, is_new = insert_task(db_path, "cover_letter", job_id)
    assert second_id == first_id
    assert is_new is False


def test_insert_task_allows_different_types_same_job(tmp_path):
    """insert_task allows cover_letter and company_research for the same job concurrently."""
    from scripts.db import init_db, insert_job, insert_task
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    _, cl_new = insert_task(db_path, "cover_letter", job_id)
    _, res_new = insert_task(db_path, "company_research", job_id)
    assert cl_new is True
    assert res_new is True


def test_update_task_status_running(tmp_path):
    """update_task_status('running') sets started_at."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    task_id, _ = insert_task(db_path, "cover_letter", job_id)
    update_task_status(db_path, task_id, "running")
    conn = sqlite3.connect(db_path)
    row = conn.execute(
        "SELECT status, started_at FROM background_tasks WHERE id=?", (task_id,)
    ).fetchone()
    conn.close()
    assert row[0] == "running"
    assert row[1] is not None


def test_update_task_status_completed(tmp_path):
    """update_task_status('completed') sets finished_at."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    task_id, _ = insert_task(db_path, "cover_letter", job_id)
    update_task_status(db_path, task_id, "completed")
    conn = sqlite3.connect(db_path)
    row = conn.execute(
        "SELECT status, finished_at FROM background_tasks WHERE id=?", (task_id,)
    ).fetchone()
    conn.close()
    assert row[0] == "completed"
    assert row[1] is not None


def test_update_task_status_failed_stores_error(tmp_path):
    """update_task_status('failed') stores error message and sets finished_at."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    task_id, _ = insert_task(db_path, "cover_letter", job_id)
    update_task_status(db_path, task_id, "failed", error="LLM timeout")
    conn = sqlite3.connect(db_path)
    row = conn.execute(
        "SELECT status, error, finished_at FROM background_tasks WHERE id=?", (task_id,)
    ).fetchone()
    conn.close()
    assert row[0] == "failed"
    assert row[1] == "LLM timeout"
    assert row[2] is not None


def test_get_active_tasks_returns_only_active(tmp_path):
    """get_active_tasks returns only queued/running tasks with job info joined."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status, get_active_tasks
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    active_id, _ = insert_task(db_path, "cover_letter", job_id)
    done_id, _ = insert_task(db_path, "company_research", job_id)
    update_task_status(db_path, done_id, "completed")

    tasks = get_active_tasks(db_path)
    assert len(tasks) == 1
    assert tasks[0]["id"] == active_id
    assert tasks[0]["company"] == "Acme"
    assert tasks[0]["title"] == "CSM"


def test_get_task_for_job_returns_latest(tmp_path):
    """get_task_for_job returns the most recent task for the given type+job."""
    from scripts.db import init_db, insert_job, insert_task, update_task_status, get_task_for_job
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    first_id, _ = insert_task(db_path, "cover_letter", job_id)
    update_task_status(db_path, first_id, "completed")
    second_id, _ = insert_task(db_path, "cover_letter", job_id)  # allowed since first is done

    task = get_task_for_job(db_path, "cover_letter", job_id)
    assert task is not None
    assert task["id"] == second_id


def test_get_task_for_job_returns_none_when_absent(tmp_path):
    """get_task_for_job returns None when no task exists for that job+type."""
    from scripts.db import init_db, insert_job, get_task_for_job
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, _sample_job(url="https://ex.com/1"))
    assert get_task_for_job(db_path, "cover_letter", job_id) is None


# ── company_research new-column tests ────────────────────────────────────────

def test_company_research_has_new_columns(tmp_path):
    """init_db creates company_research with the four extended columns."""
    from scripts.db import init_db
    db = tmp_path / "test.db"
    init_db(db)
    conn = sqlite3.connect(db)
    cols = [row[1] for row in conn.execute("PRAGMA table_info(company_research)").fetchall()]
    conn.close()
    assert "tech_brief" in cols
    assert "funding_brief" in cols
    assert "competitors_brief" in cols
    assert "red_flags" in cols


def test_save_and_get_research_new_fields(tmp_path):
    """save_research persists and get_research returns the four new brief fields."""
    from scripts.db import init_db, insert_job, save_research, get_research
    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, _sample_job(
        url="https://ex.com/1", title="TAM", date_found="2026-02-21",
    ))

    save_research(db, job_id=job_id,
                  company_brief="overview", ceo_brief="ceo",
                  talking_points="points", raw_output="raw",
                  tech_brief="tech stack", funding_brief="series B",
                  competitors_brief="vs competitors", red_flags="none")
    research = get_research(db, job_id=job_id)
    assert research["tech_brief"] == "tech stack"
    assert research["funding_brief"] == "series B"
    assert research["competitors_brief"] == "vs competitors"
    assert research["red_flags"] == "none"
# ── stage_signal / suggestion_dismissed tests ────────────────────────────────

def test_stage_signal_columns_exist(tmp_path):
    """init_db creates stage_signal and suggestion_dismissed columns on job_contacts."""
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    conn = sqlite3.connect(db_path)
    cols = {row[1] for row in conn.execute("PRAGMA table_info(job_contacts)").fetchall()}
    conn.close()
    assert "stage_signal" in cols
    assert "suggestion_dismissed" in cols


def test_add_contact_with_stage_signal(tmp_path):
    """add_contact stores stage_signal when provided."""
    from scripts.db import init_db, insert_job, add_contact, get_contacts
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-21",
    })
    add_contact(db_path, job_id=job_id, direction="inbound",
                subject="Interview invite", stage_signal="interview_scheduled")
    contacts = get_contacts(db_path, job_id=job_id)
    assert contacts[0]["stage_signal"] == "interview_scheduled"


def test_get_unread_stage_signals(tmp_path):
    """get_unread_stage_signals returns only non-neutral, non-dismissed signals."""
    from scripts.db import (init_db, insert_job, add_contact,
                            get_unread_stage_signals, dismiss_stage_signal)
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-21",
    })
    c1 = add_contact(db_path, job_id=job_id, direction="inbound",
                     subject="Interview invite", stage_signal="interview_scheduled")
    add_contact(db_path, job_id=job_id, direction="inbound",
                subject="Auto-confirm", stage_signal="neutral")
    signals = get_unread_stage_signals(db_path, job_id)
    assert len(signals) == 1
    assert signals[0]["stage_signal"] == "interview_scheduled"

    dismiss_stage_signal(db_path, c1)
    assert get_unread_stage_signals(db_path, job_id) == []


def test_get_email_leads(tmp_path):
    """get_email_leads returns only source='email' pending jobs."""
    from scripts.db import init_db, insert_job, get_email_leads
    db_path = tmp_path / "test.db"
    init_db(db_path)
    insert_job(db_path, {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-21",
    })
    insert_job(db_path, {
        "title": "TAM", "company": "Wiz", "url": "email://wiz.com/abc123",
        "source": "email", "location": "", "is_remote": 0,
        "salary": "", "description": "Hi Alex…", "date_found": "2026-02-21",
    })
    leads = get_email_leads(db_path)
    assert len(leads) == 1
    assert leads[0]["company"] == "Wiz"
    assert leads[0]["source"] == "email"


def test_get_all_message_ids(tmp_path):
    """get_all_message_ids returns all message IDs across jobs."""
    from scripts.db import init_db, insert_job, add_contact, get_all_message_ids
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-21",
    })
    # FIX: the previous revision inserted two contacts with message_id="" and
    # asserted "" in mids twice, which proves nothing about multiple IDs.
    # Use two distinct RFC 5322-style Message-IDs and assert both round-trip.
    add_contact(db_path, job_id=job_id, message_id="<msg-1@example.com>")
    add_contact(db_path, job_id=job_id, message_id="<msg-2@example.com>")
    mids = get_all_message_ids(db_path)
    assert "<msg-1@example.com>" in mids
    assert "<msg-2@example.com>" in mids


# ── survey_responses tests ───────────────────────────────────────────────────

def test_survey_responses_table_created(tmp_path):
    """init_db creates survey_responses table."""
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    conn = sqlite3.connect(db_path)
    cur = conn.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='survey_responses'"
    )
    assert cur.fetchone() is not None
    conn.close()


def test_survey_at_column_exists(tmp_path):
    """jobs table has survey_at column after init_db."""
    from scripts.db import init_db
    db_path = tmp_path / "test.db"
    init_db(db_path)
    conn = sqlite3.connect(db_path)
    cols = [row[1] for row in conn.execute("PRAGMA table_info(jobs)").fetchall()]
    assert "survey_at" in cols
    conn.close()


def test_insert_and_get_survey_response(tmp_path):
    """insert_survey_response inserts a row; get_survey_responses returns it."""
    from scripts.db import init_db, insert_job, insert_survey_response, get_survey_responses
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/1",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-23",
    })
    row_id = insert_survey_response(
        db_path, job_id=job_id, survey_name="Culture Fit",
        source="text_paste", raw_input="Q1: A B C", mode="quick",
        llm_output="1. B — collaborative", reported_score="82%",
    )
    assert isinstance(row_id, int)
    responses = get_survey_responses(db_path, job_id=job_id)
    assert len(responses) == 1
    assert responses[0]["survey_name"] == "Culture Fit"
    assert responses[0]["reported_score"] == "82%"


def test_get_interview_jobs_includes_survey(tmp_path):
    """get_interview_jobs returns survey-stage jobs."""
    from scripts.db import init_db, insert_job, update_job_status, get_interview_jobs
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/2",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-23",
    })
    update_job_status(db_path, [job_id], "survey")
    result = get_interview_jobs(db_path)
    assert any(j["id"] == job_id for j in result.get("survey", []))


def test_advance_to_survey_sets_survey_at(tmp_path):
    """advance_to_stage('survey') sets survey_at timestamp."""
    from scripts.db import init_db, insert_job, update_job_status, advance_to_stage, get_job_by_id
    db_path = tmp_path / "test.db"
    init_db(db_path)
    job_id = insert_job(db_path, {
        "title": "CSM", "company": "Acme", "url": "https://ex.com/3",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-02-23",
    })
    update_job_status(db_path, [job_id], "applied")
    advance_to_stage(db_path, job_id=job_id, stage="survey")
    job = get_job_by_id(db_path, job_id=job_id)
    assert job["status"] == "survey"
    assert job["survey_at"] is not None


def test_update_job_fields(tmp_path):
    """update_job_fields overwrites only the supplied columns on an existing job."""
    from scripts.db import init_db, insert_job, update_job_fields
    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "Importing…", "company": "", "url": "https://example.com/job/1",
        "source": "manual", "location": "", "description": "", "date_found": "2026-02-24",
    })
    update_job_fields(db, job_id, {
        "title": "Customer Success Manager",
        "company": "Acme Corp",
        "location": "San Francisco, CA",
        "description": "Great role.",
        "salary": "$120k",
        "is_remote": 1,
    })
    conn = sqlite3.connect(db)
    conn.row_factory = sqlite3.Row
    row = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone())
    conn.close()
    assert row["title"] == "Customer Success Manager"
    assert row["company"] == "Acme Corp"
    assert row["description"] == "Great role."
    assert row["is_remote"] == 1


def test_update_job_fields_ignores_unknown_columns(tmp_path):
    """update_job_fields silently drops keys that are not jobs-table columns."""
    from scripts.db import init_db, insert_job, update_job_fields
    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "Importing…", "company": "", "url": "https://example.com/job/2",
        "source": "manual", "location": "", "description": "", "date_found": "2026-02-24",
    })
    # Should not raise even with an unknown column
    update_job_fields(db, job_id, {"title": "Real Title", "nonexistent_col": "ignored"})
    conn = sqlite3.connect(db)
    conn.row_factory = sqlite3.Row
    row = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone())
    conn.close()
    assert row["title"] == "Real Title"


# ═══ tests/test_discover.py (new file in this patch) ═════════════════════════

# tests/test_discover.py
import pytest
from unittest.mock import patch, MagicMock
import pandas as pd
from pathlib import Path

SAMPLE_JOB = {
    "title": "Customer Success Manager",
    "company": "Acme Corp",
    "location": "Remote",
    "is_remote": True,
    "job_url": "https://linkedin.com/jobs/view/123456",
    "site": "linkedin",
    "min_amount": 90000,
    "max_amount": 120000,
    "salary_source": "$90,000 - $120,000",
    "description": "Great CS role",
}

SAMPLE_FM = {
    "title_field": "Salary", "job_title": "Job Title", "company": "Company Name",
    "url": "Role Link", "source": "Job Source", "status": "Status of Application",
    "status_new": "Application Submitted", "date_found": "Date Found",
    "remote": "Remote", "match_score": "Match Score",
    "keyword_gaps": "Keyword Gaps", "notes": "Notes", "job_description": "Job Description",
}

SAMPLE_NOTION_CFG = {"token": "secret_test", "database_id": "fake-db-id", "field_map": SAMPLE_FM}
SAMPLE_PROFILES_CFG = {
    "profiles": [{"name": "cs", "titles": ["Customer Success Manager"],
                  "locations": ["Remote"], "boards": ["linkedin"],
                  "results_per_board": 5, "hours_old": 72}]
}


def make_jobs_df(jobs=None):
    """Build the scrape_jobs-shaped DataFrame the discover module consumes."""
    return pd.DataFrame(jobs or [SAMPLE_JOB])


def test_discover_writes_to_sqlite(tmp_path):
    """run_discovery inserts new jobs into SQLite staging db."""
    from scripts.discover import run_discovery
    from scripts.db import get_jobs_by_status

    db_path = tmp_path / "test.db"
    with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \
         patch("scripts.discover.Client"):
        run_discovery(db_path=db_path)

    jobs = get_jobs_by_status(db_path, "pending")
    assert len(jobs) == 1
    assert jobs[0]["title"] == "Customer Success Manager"


def test_discover_skips_duplicate_urls(tmp_path):
    """run_discovery does not insert a job whose URL is already in SQLite."""
    from scripts.discover import run_discovery
    from scripts.db import init_db, insert_job, get_jobs_by_status

    db_path = tmp_path / "test.db"
    init_db(db_path)
    insert_job(db_path, {
        "title": "Old", "company": "X", "url": "https://linkedin.com/jobs/view/123456",
        "source": "linkedin", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-01-01",
    })

    with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \
         patch("scripts.discover.Client"):
        run_discovery(db_path=db_path)

    jobs = get_jobs_by_status(db_path, "pending")
    assert len(jobs) == 1  # only the pre-existing one, not a duplicate


def test_discover_pushes_new_jobs(tmp_path):
    """Legacy: discover still calls push_to_notion when notion_push=True."""
    from scripts.discover import run_discovery
    db_path = tmp_path / "test.db"
    with patch("scripts.discover.load_config", return_value=(SAMPLE_PROFILES_CFG, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=make_jobs_df()), \
         patch("scripts.discover.push_to_notion") as mock_push, \
         patch("scripts.discover.get_existing_urls", return_value=set()), \
         patch("scripts.discover.Client"):
        run_discovery(db_path=db_path, notion_push=True)
    assert mock_push.call_count == 1


def test_push_to_notion_sets_status_new():
    """push_to_notion always sets Status to the configured status_new value."""
    from scripts.discover import push_to_notion
    mock_notion = MagicMock()
    push_to_notion(mock_notion, "fake-db-id", SAMPLE_JOB, SAMPLE_FM)
    call_kwargs = mock_notion.pages.create.call_args[1]
    status = call_kwargs["properties"]["Status of Application"]["select"]["name"]
    assert status == "Application Submitted"


# ── Custom boards integration ────────────────────────────────────────────────

_PROFILE_WITH_CUSTOM = {
    "profiles": [{
        "name": "cs", "titles": ["Customer Success Manager"],
        "locations": ["Remote"], "boards": [],
        "custom_boards": ["adzuna"],
        "results_per_board": 5, "hours_old": 72,
    }]
}

_ADZUNA_JOB = {
    "title": "Customer Success Manager",
    "company": "TestCo",
    "url": "https://www.adzuna.com/jobs/details/999",
    "source": "adzuna",
    "location": "Remote",
    "is_remote": True,
    "salary": "$90,000 – $120,000",
    "description": "Great remote CSM role",
}


def test_discover_custom_board_inserts_jobs(tmp_path):
    """run_discovery dispatches custom_boards scrapers and inserts returned jobs."""
    from scripts.discover import run_discovery
    from scripts.db import get_jobs_by_status

    db_path = tmp_path / "test.db"
    with patch("scripts.discover.load_config", return_value=(_PROFILE_WITH_CUSTOM, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame()), \
         patch("scripts.discover.CUSTOM_SCRAPERS", {"adzuna": lambda *a, **kw: [_ADZUNA_JOB]}), \
         patch("scripts.discover.Client"):
        count = run_discovery(db_path=db_path)

    assert count == 1
    jobs = get_jobs_by_status(db_path, "pending")
    assert jobs[0]["title"] == "Customer Success Manager"
    assert jobs[0]["source"] == "adzuna"


def test_discover_custom_board_skips_unknown(tmp_path, capsys):
    """run_discovery logs and skips an unregistered custom board name."""
    from scripts.discover import run_discovery

    profile_unknown = {
        "profiles": [{
            "name": "cs", "titles": ["CSM"], "locations": ["Remote"],
            "boards": [], "custom_boards": ["nonexistent_board"],
            "results_per_board": 5, "hours_old": 72,
        }]
    }
    db_path = tmp_path / "test.db"
    with patch("scripts.discover.load_config", return_value=(profile_unknown, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame()), \
         patch("scripts.discover.Client"):
        run_discovery(db_path=db_path)

    captured = capsys.readouterr()
    assert "nonexistent_board" in captured.out
    assert "Unknown scraper" in captured.out


def test_discover_custom_board_deduplicates(tmp_path):
    """Custom board results are deduplicated by URL against pre-existing jobs."""
    from scripts.discover import run_discovery
    from scripts.db import init_db, insert_job, get_jobs_by_status

    db_path = tmp_path / "test.db"
    init_db(db_path)
    insert_job(db_path, {
        "title": "CSM", "company": "TestCo",
        "url": "https://www.adzuna.com/jobs/details/999",
        "source": "adzuna", "location": "Remote", "is_remote": True,
        "salary": "", "description": "", "date_found": "2026-01-01",
    })

    with patch("scripts.discover.load_config", return_value=(_PROFILE_WITH_CUSTOM, SAMPLE_NOTION_CFG)), \
         patch("scripts.discover.scrape_jobs", return_value=pd.DataFrame()), \
         patch("scripts.discover.CUSTOM_SCRAPERS", {"adzuna": lambda *a, **kw: [_ADZUNA_JOB]}), \
         patch("scripts.discover.Client"):
        count = run_discovery(db_path=db_path)

    assert count == 0  # duplicate skipped
    assert len(get_jobs_by_status(db_path, "pending")) == 1


# ═══ tests/test_enrich_descriptions.py (new file in this patch) ══════════════

# tests/test_enrich_descriptions.py
"""Tests for scripts/enrich_descriptions.py — enrich_craigslist_fields()."""
from unittest.mock import patch, MagicMock
import sqlite3


def test_enrich_craigslist_fields_skips_non_craigslist(tmp_path):
    """Non-craigslist source → returns {} without calling LLM."""
    from scripts.db import init_db, insert_job
    from scripts.enrich_descriptions import enrich_craigslist_fields
    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "CSM", "company": "", "url": "https://example.com/1",
        "source": "linkedin", "location": "", "description": "Some company here.",
        "date_found": "2026-02-24",
    })
    with patch("scripts.llm_router.LLMRouter") as mock_llm:
        result = enrich_craigslist_fields(db, job_id)
    assert result == {}
    mock_llm.assert_not_called()


def test_enrich_craigslist_fields_skips_populated_company(tmp_path):
    """Company already set → returns {} without calling LLM."""
    from scripts.db import init_db, insert_job
    from scripts.enrich_descriptions import enrich_craigslist_fields
    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "CSM", "company": "Acme Corp", "url": "https://sfbay.craigslist.org/jjj/d/1.html",
        "source": "craigslist", "location": "", "description": "Join Acme Corp today.",
        "date_found": "2026-02-24",
    })
with patch("scripts.llm_router.LLMRouter") as mock_llm: + result = enrich_craigslist_fields(db, job_id) + assert result == {} + mock_llm.assert_not_called() + + +def test_enrich_craigslist_fields_skips_empty_description(tmp_path): + """Empty description β†’ returns {} without calling LLM.""" + from scripts.db import init_db, insert_job + from scripts.enrich_descriptions import enrich_craigslist_fields + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/2.html", + "source": "craigslist", "location": "", "description": "", + "date_found": "2026-02-24", + }) + with patch("scripts.llm_router.LLMRouter") as mock_llm: + result = enrich_craigslist_fields(db, job_id) + assert result == {} + mock_llm.assert_not_called() + + +def test_enrich_craigslist_fields_extracts_and_updates(tmp_path): + """Valid LLM response β†’ updates company/salary in DB, returns extracted dict.""" + from scripts.db import init_db, insert_job + from scripts.enrich_descriptions import enrich_craigslist_fields + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/3.html", + "source": "craigslist", "location": "", "description": "Join Acme Corp. 
Pay: $120k/yr.", + "date_found": "2026-02-24", + }) + mock_router = MagicMock() + mock_router.complete.return_value = '{"company": "Acme Corp", "salary": "$120k/yr"}' + with patch("scripts.llm_router.LLMRouter", return_value=mock_router): + result = enrich_craigslist_fields(db, job_id) + assert result == {"company": "Acme Corp", "salary": "$120k/yr"} + conn = sqlite3.connect(db) + row = conn.execute("SELECT company, salary FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + assert row[0] == "Acme Corp" + assert row[1] == "$120k/yr" + + +def test_enrich_craigslist_fields_handles_bad_llm_json(tmp_path): + """Unparseable LLM response β†’ returns {} without raising.""" + from scripts.db import init_db, insert_job + from scripts.enrich_descriptions import enrich_craigslist_fields + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/4.html", + "source": "craigslist", "location": "", "description": "Great opportunity.", + "date_found": "2026-02-24", + }) + mock_router = MagicMock() + mock_router.complete.return_value = "Sorry, I cannot extract that." 
+ with patch("scripts.llm_router.LLMRouter", return_value=mock_router): + result = enrich_craigslist_fields(db, job_id) + assert result == {} diff --git a/tests/test_imap_sync.py b/tests/test_imap_sync.py new file mode 100644 index 0000000..d6d057b --- /dev/null +++ b/tests/test_imap_sync.py @@ -0,0 +1,330 @@ +"""Tests for imap_sync helpers (no live IMAP connection required).""" +import pytest +from unittest.mock import patch, MagicMock + + +def test_classify_stage_signal_interview(): + """classify_stage_signal returns interview_scheduled for a call-scheduling email.""" + from scripts.imap_sync import classify_stage_signal + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = "interview_scheduled" + result = classify_stage_signal( + "Let's schedule a call", + "Hi Alex, we'd love to book a 30-min phone screen with you.", + ) + assert result == "interview_scheduled" + + +def test_classify_stage_signal_returns_none_on_error(): + """classify_stage_signal returns None when LLM call raises.""" + from scripts.imap_sync import classify_stage_signal + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.side_effect = RuntimeError("model not loaded") + result = classify_stage_signal("subject", "body") + assert result is None + + +def test_classify_stage_signal_strips_think_tags(): + """classify_stage_signal strips ... 
blocks before parsing.""" + from scripts.imap_sync import classify_stage_signal + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = "Let me think...\nrejected" + result = classify_stage_signal("Update on your application", "We went with another candidate.") + assert result == "rejected" + + +def test_normalise_company(): + """_normalise_company strips legal suffixes.""" + from scripts.imap_sync import _normalise_company + assert _normalise_company("DataStax, Inc.") == "DataStax" + assert _normalise_company("Wiz Ltd") == "Wiz" + assert _normalise_company("Crusoe Energy") == "Crusoe Energy" + + +def test_company_search_terms_excludes_job_board_sld(): + """Job-board domains like linkedin.com are never used as match terms.""" + from scripts.imap_sync import _company_search_terms + # LinkedIn-sourced job: SLD "linkedin" must not appear in the terms + terms = _company_search_terms("Bamboo Health", "https://www.linkedin.com/jobs/view/123") + assert "linkedin" not in terms + assert "bamboo health" in terms + + # Company with its own domain: SLD should be included + terms = _company_search_terms("Crusoe Energy", "https://crusoe.ai/jobs/456") + assert "crusoe" in terms + + # Indeed-sourced job: "indeed" excluded + terms = _company_search_terms("DoorDash", "https://www.indeed.com/viewjob?jk=abc") + assert "indeed" not in terms + assert "doordash" in terms + + +def test_has_recruitment_keyword(): + """_has_recruitment_keyword matches known keywords.""" + from scripts.imap_sync import _has_recruitment_keyword + assert _has_recruitment_keyword("Interview Invitation β€” Senior TAM") + assert _has_recruitment_keyword("Your application with DataStax") + assert not _has_recruitment_keyword("Team lunch tomorrow") + + +def test_extract_lead_info_returns_company_and_title(): + """extract_lead_info parses LLM JSON response into (company, title).""" + from scripts.imap_sync import extract_lead_info + with 
patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = '{"company": "Wiz", "title": "Senior TAM"}' + result = extract_lead_info("Senior TAM at Wiz", "Hi Alex, we have a role…", "recruiter@wiz.com") + assert result == ("Wiz", "Senior TAM") + + +def test_extract_lead_info_returns_none_on_bad_json(): + """extract_lead_info returns (None, None) when LLM returns unparseable output.""" + from scripts.imap_sync import extract_lead_info + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = "I cannot determine the company." + result = extract_lead_info("Job opportunity", "blah", "noreply@example.com") + assert result == (None, None) + + +def test_classify_labels_includes_survey_received(): + """_CLASSIFY_LABELS includes survey_received.""" + from scripts.imap_sync import _CLASSIFY_LABELS + assert "survey_received" in _CLASSIFY_LABELS + + +def test_classify_stage_signal_returns_survey_received(): + """classify_stage_signal returns 'survey_received' when LLM outputs that label.""" + from unittest.mock import patch + from scripts.imap_sync import classify_stage_signal + + with patch("scripts.imap_sync._CLASSIFIER_ROUTER") as mock_router: + mock_router.complete.return_value = "survey_received" + result = classify_stage_signal("Complete our culture survey", "Please fill out this form") + assert result == "survey_received" + + +def test_sync_job_emails_classifies_inbound(tmp_path): + """sync_job_emails classifies inbound emails and stores the stage_signal.""" + from scripts.db import init_db, insert_job, get_contacts + from scripts.imap_sync import sync_job_emails + + db_path = tmp_path / "test.db" + init_db(db_path) + job_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", + "url": "https://acme.com/jobs/1", + "source": "linkedin", "location": "Remote", + "is_remote": True, "salary": "", "description": "", + "date_found": "2026-02-21", + }) + job = {"id": job_id, 
"company": "Acme", "url": "https://acme.com/jobs/1"} + + fake_msg_bytes = ( + b"From: recruiter@acme.com\r\n" + b"To: alex@example.com\r\n" + b"Subject: Interview Invitation\r\n" + b"Message-ID: \r\n" + b"\r\n" + b"Hi Alex, we'd like to schedule a phone screen." + ) + + conn_mock = MagicMock() + conn_mock.select.return_value = ("OK", [b"1"]) + conn_mock.search.return_value = ("OK", [b"1"]) + conn_mock.fetch.return_value = ("OK", [(b"1 (RFC822 {123})", fake_msg_bytes)]) + + with patch("scripts.imap_sync.classify_stage_signal", return_value="interview_scheduled"): + inb, out = sync_job_emails(job, conn_mock, {"lookback_days": 90}, db_path) + + assert inb == 1 + contacts = get_contacts(db_path, job_id=job_id) + assert contacts[0]["stage_signal"] == "interview_scheduled" + + +def test_parse_linkedin_alert_extracts_jobs(): + from scripts.imap_sync import parse_linkedin_alert + body = """\ +Your job alert for customer success manager in United States +New jobs match your preferences. +Manage alerts: https://www.linkedin.com/comm/jobs/alerts?... 
+ +Customer Success Manager +Reflow +California, United States +View job: https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc%3D%3D&refId=xyz + +--------------------------------------------------------- + +Customer Engagement Manager +Bitwarden +United States + +2 school alumni +Apply with resume & profile +View job: https://www.linkedin.com/comm/jobs/view/4359824983/?trackingId=def%3D%3D + +--------------------------------------------------------- + +""" + jobs = parse_linkedin_alert(body) + assert len(jobs) == 2 + assert jobs[0]["title"] == "Customer Success Manager" + assert jobs[0]["company"] == "Reflow" + assert jobs[0]["location"] == "California, United States" + assert jobs[0]["url"] == "https://www.linkedin.com/jobs/view/4376518925/" + assert jobs[1]["title"] == "Customer Engagement Manager" + assert jobs[1]["company"] == "Bitwarden" + assert jobs[1]["url"] == "https://www.linkedin.com/jobs/view/4359824983/" + + +def test_parse_linkedin_alert_skips_blocks_without_view_job(): + from scripts.imap_sync import parse_linkedin_alert + body = """\ +Customer Success Manager +Some Company +United States + +--------------------------------------------------------- + +Valid Job Title +Valid Company +Remote +View job: https://www.linkedin.com/comm/jobs/view/1111111/?x=y + +--------------------------------------------------------- +""" + jobs = parse_linkedin_alert(body) + assert len(jobs) == 1 + assert jobs[0]["title"] == "Valid Job Title" + + +def test_parse_linkedin_alert_empty_body(): + from scripts.imap_sync import parse_linkedin_alert + assert parse_linkedin_alert("") == [] + assert parse_linkedin_alert("No jobs here.") == [] + + +# ── _scan_unmatched_leads integration ───────────────────────────────────────── + +_ALERT_BODY = """\ +Your job alert for customer success manager in United States +New jobs match your preferences. 
+ +Customer Success Manager +Acme Corp +California, United States +View job: https://www.linkedin.com/comm/jobs/view/9999001/?trackingId=abc + +--------------------------------------------------------- + +Director of Customer Success +Beta Inc +Remote +View job: https://www.linkedin.com/comm/jobs/view/9999002/?trackingId=def + +--------------------------------------------------------- +""" + +_ALERT_EMAIL = { + "message_id": "", + "from_addr": "jobalerts-noreply@linkedin.com", + "to_addr": "alex@example.com", + "subject": "2 new jobs for customer success manager", + "body": _ALERT_BODY, + "date": "2026-02-24 12:00:00", +} + + +def test_scan_unmatched_leads_linkedin_alert_inserts_jobs(tmp_path): + """_scan_unmatched_leads detects a LinkedIn alert and inserts each job card.""" + import sqlite3 + from unittest.mock import patch, MagicMock + from scripts.db import init_db + + db_path = tmp_path / "test.db" + init_db(db_path) + + conn_mock = MagicMock() + + with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \ + patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \ + patch("scripts.task_runner.submit_task") as mock_submit: + + from scripts.imap_sync import _scan_unmatched_leads + known_ids: set = set() + new_leads = _scan_unmatched_leads(conn_mock, {"lookback_days": 90}, db_path, known_ids) + + assert new_leads == 2 + + # Message ID added so it won't be reprocessed + assert "" in known_ids + + # Both jobs inserted with correct fields + conn = sqlite3.connect(db_path) + conn.row_factory = sqlite3.Row + jobs = conn.execute("SELECT * FROM jobs ORDER BY id").fetchall() + conn.close() + + assert len(jobs) == 2 + assert jobs[0]["title"] == "Customer Success Manager" + assert jobs[0]["company"] == "Acme Corp" + assert jobs[0]["url"] == "https://www.linkedin.com/jobs/view/9999001/" + assert jobs[0]["source"] == "linkedin" + assert jobs[1]["title"] == "Director of Customer Success" + assert jobs[1]["url"] == 
"https://www.linkedin.com/jobs/view/9999002/" + + # scrape_url task submitted for each inserted job + assert mock_submit.call_count == 2 + task_types = [call.args[1] for call in mock_submit.call_args_list] + assert task_types == ["scrape_url", "scrape_url"] + + +def test_scan_unmatched_leads_linkedin_alert_skips_duplicates(tmp_path): + """URLs already in the DB are not re-inserted.""" + from unittest.mock import patch, MagicMock + from scripts.db import init_db, insert_job + + db_path = tmp_path / "test.db" + init_db(db_path) + + # Pre-insert one of the two URLs + insert_job(db_path, { + "title": "Customer Success Manager", "company": "Acme Corp", + "url": "https://www.linkedin.com/jobs/view/9999001/", + "source": "linkedin", "location": "", "is_remote": 0, + "salary": "", "description": "", "date_found": "2026-02-24", + }) + + conn_mock = MagicMock() + + with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \ + patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \ + patch("scripts.task_runner.submit_task") as mock_submit: + + from scripts.imap_sync import _scan_unmatched_leads + new_leads = _scan_unmatched_leads(conn_mock, {"lookback_days": 90}, db_path, set()) + + # Only one new job (the duplicate was skipped) + assert new_leads == 1 + assert mock_submit.call_count == 1 + + +def test_scan_unmatched_leads_linkedin_alert_skips_llm_path(tmp_path): + """After a LinkedIn alert email, the LLM extraction path is never reached.""" + from unittest.mock import patch, MagicMock + from scripts.db import init_db + + db_path = tmp_path / "test.db" + init_db(db_path) + + conn_mock = MagicMock() + + with patch("scripts.imap_sync._search_folder", return_value=[b"1"]), \ + patch("scripts.imap_sync._parse_message", return_value=_ALERT_EMAIL), \ + patch("scripts.task_runner.submit_task"), \ + patch("scripts.imap_sync.extract_lead_info") as mock_llm: + + from scripts.imap_sync import _scan_unmatched_leads + _scan_unmatched_leads(conn_mock, 
{"lookback_days": 90}, db_path, set()) + + # LLM extraction must never be called for alert emails + mock_llm.assert_not_called() diff --git a/tests/test_llm_router.py b/tests/test_llm_router.py new file mode 100644 index 0000000..0d5a897 --- /dev/null +++ b/tests/test_llm_router.py @@ -0,0 +1,135 @@ +import pytest +from unittest.mock import patch, MagicMock +from pathlib import Path +import yaml + +CONFIG_PATH = Path(__file__).parent.parent / "config" / "llm.yaml" + + +def test_config_loads(): + """Config file is valid YAML with required keys.""" + cfg = yaml.safe_load(CONFIG_PATH.read_text()) + assert "fallback_order" in cfg + assert "backends" in cfg + assert len(cfg["fallback_order"]) >= 1 + + +def test_router_uses_first_reachable_backend(): + """Router skips unreachable backends and uses the first that responds.""" + from scripts.llm_router import LLMRouter + + router = LLMRouter(CONFIG_PATH) + + mock_response = MagicMock() + mock_response.choices[0].message.content = "hello" + + with patch.object(router, "_is_reachable", side_effect=[False, True, True, True, True]), \ + patch("scripts.llm_router.OpenAI") as MockOpenAI: + instance = MockOpenAI.return_value + instance.chat.completions.create.return_value = mock_response + mock_model = MagicMock() + mock_model.id = "test-model" + instance.models.list.return_value.data = [mock_model] + + result = router.complete("say hello") + + assert result == "hello" + + +def test_router_raises_when_all_backends_fail(): + """Router raises RuntimeError when every backend is unreachable or errors.""" + from scripts.llm_router import LLMRouter + + router = LLMRouter(CONFIG_PATH) + + with patch.object(router, "_is_reachable", return_value=False): + with pytest.raises(RuntimeError, match="All LLM backends exhausted"): + router.complete("say hello") + + +def test_is_reachable_returns_false_on_connection_error(): + """_is_reachable returns False when the health endpoint is unreachable.""" + from scripts.llm_router import LLMRouter + 
import requests + + router = LLMRouter(CONFIG_PATH) + + with patch("scripts.llm_router.requests.get", side_effect=requests.ConnectionError): + result = router._is_reachable("http://localhost:9999/v1") + + assert result is False + + +def test_complete_skips_backend_without_image_support(tmp_path): + """When images= is passed, backends without supports_images are skipped.""" + import yaml + from scripts.llm_router import LLMRouter + + cfg = { + "fallback_order": ["ollama", "vision_service"], + "backends": { + "ollama": { + "type": "openai_compat", + "base_url": "http://localhost:11434/v1", + "model": "llava", + "api_key": "ollama", + "enabled": True, + "supports_images": False, + }, + "vision_service": { + "type": "vision_service", + "base_url": "http://localhost:8002", + "enabled": True, + "supports_images": True, + }, + }, + } + cfg_file = tmp_path / "llm.yaml" + cfg_file.write_text(yaml.dump(cfg)) + + from unittest.mock import patch, MagicMock + mock_resp = MagicMock() + mock_resp.status_code = 200 + mock_resp.json.return_value = {"text": "B β€” collaborative"} + + with patch("scripts.llm_router.requests.get") as mock_get, \ + patch("scripts.llm_router.requests.post") as mock_post: + # health check returns ok for vision_service + mock_get.return_value = MagicMock(status_code=200) + mock_post.return_value = mock_resp + + router = LLMRouter(config_path=cfg_file) + result = router.complete("Which option?", images=["base64data"]) + + assert result == "B β€” collaborative" + # vision_service POST /analyze should have been called + assert mock_post.called + + +def test_complete_without_images_skips_vision_service(tmp_path): + """When images=None, vision_service backend is skipped.""" + import yaml + from scripts.llm_router import LLMRouter + from unittest.mock import patch, MagicMock + + cfg = { + "fallback_order": ["vision_service"], + "backends": { + "vision_service": { + "type": "vision_service", + "base_url": "http://localhost:8002", + "enabled": True, + 
"supports_images": True, + }, + }, + } + cfg_file = tmp_path / "llm.yaml" + cfg_file.write_text(yaml.dump(cfg)) + + router = LLMRouter(config_path=cfg_file) + with patch("scripts.llm_router.requests.post") as mock_post: + try: + router.complete("text only prompt") + except RuntimeError: + pass # all backends exhausted is expected + assert not mock_post.called diff --git a/tests/test_match.py b/tests/test_match.py new file mode 100644 index 0000000..25a823e --- /dev/null +++ b/tests/test_match.py @@ -0,0 +1,47 @@ +import pytest +from unittest.mock import patch, MagicMock + + +def test_extract_job_description_from_url(): + """extract_job_description fetches and returns visible text from a URL.""" + from scripts.match import extract_job_description + + with patch("scripts.match.requests.get") as mock_get: + mock_get.return_value.text = "

We need a CSM with Salesforce.

" + mock_get.return_value.raise_for_status = MagicMock() + result = extract_job_description("https://example.com/job/123") + + assert "CSM" in result + assert "Salesforce" in result + + +def test_score_is_between_0_and_100(): + """match_score returns a float in [0, 100] and a list of keyword gaps.""" + from scripts.match import match_score + + score, gaps = match_score( + resume_text="Customer Success Manager with Salesforce experience", + job_text="Looking for a Customer Success Manager who knows Salesforce and Gainsight", + ) + assert 0 <= score <= 100 + assert isinstance(gaps, list) + + +def test_write_score_to_notion(): + """write_match_to_notion updates the Notion page with score and gaps.""" + from scripts.match import write_match_to_notion + + mock_notion = MagicMock() + + SAMPLE_FM = { + "match_score": "Match Score", + "keyword_gaps": "Keyword Gaps", + } + + write_match_to_notion(mock_notion, "page-id-abc", 85.5, ["Gainsight", "Churnzero"], SAMPLE_FM) + + mock_notion.pages.update.assert_called_once() + call_kwargs = mock_notion.pages.update.call_args[1] + assert call_kwargs["page_id"] == "page-id-abc" + score_val = call_kwargs["properties"]["Match Score"]["number"] + assert score_val == 85.5 diff --git a/tests/test_scrape_url.py b/tests/test_scrape_url.py new file mode 100644 index 0000000..37eace4 --- /dev/null +++ b/tests/test_scrape_url.py @@ -0,0 +1,135 @@ +"""Tests for URL-based job scraping.""" +from unittest.mock import patch, MagicMock + + +def _make_db(tmp_path, url="https://www.linkedin.com/jobs/view/99999/"): + from scripts.db import init_db, insert_job + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "Importing…", "company": "", "url": url, + "source": "manual", "location": "", "description": "", "date_found": "2026-02-24", + }) + return db, job_id + + +def test_canonicalize_url_linkedin(): + from scripts.scrape_url import canonicalize_url + messy = ( + "https://www.linkedin.com/jobs/view/4376518925/" + 
"?trk=eml-email_job_alert&refId=abc%3D%3D&trackingId=xyz" + ) + assert canonicalize_url(messy) == "https://www.linkedin.com/jobs/view/4376518925/" + + +def test_canonicalize_url_linkedin_comm(): + from scripts.scrape_url import canonicalize_url + comm = "https://www.linkedin.com/comm/jobs/view/4376518925/?trackingId=abc" + assert canonicalize_url(comm) == "https://www.linkedin.com/jobs/view/4376518925/" + + +def test_canonicalize_url_generic_strips_utm(): + from scripts.scrape_url import canonicalize_url + url = "https://jobs.example.com/post/42?utm_source=linkedin&utm_medium=email&jk=real_param" + result = canonicalize_url(url) + assert "utm_source" not in result + assert "real_param" in result + + +def test_detect_board_linkedin(): + from scripts.scrape_url import _detect_board + assert _detect_board("https://www.linkedin.com/jobs/view/12345/") == "linkedin" + assert _detect_board("https://linkedin.com/jobs/view/12345/?tracking=abc") == "linkedin" + + +def test_detect_board_indeed(): + from scripts.scrape_url import _detect_board + assert _detect_board("https://www.indeed.com/viewjob?jk=abc123") == "indeed" + + +def test_detect_board_glassdoor(): + from scripts.scrape_url import _detect_board + assert _detect_board("https://www.glassdoor.com/job-listing/foo-bar-123.htm") == "glassdoor" + + +def test_detect_board_generic(): + from scripts.scrape_url import _detect_board + assert _detect_board("https://jobs.example.com/posting/42") == "generic" + + +def test_extract_linkedin_job_id(): + from scripts.scrape_url import _extract_linkedin_job_id + assert _extract_linkedin_job_id("https://www.linkedin.com/jobs/view/4376518925/") == "4376518925" + assert _extract_linkedin_job_id("https://www.linkedin.com/comm/jobs/view/4376518925/?tracking=x") == "4376518925" + assert _extract_linkedin_job_id("https://example.com/no-id") is None + + +def test_scrape_linkedin_updates_job(tmp_path): + db, job_id = _make_db(tmp_path) + + linkedin_html = """ +

Customer Success Manager

+ Acme Corp + San Francisco, CA +
Exciting CSM role with great benefits.
+ """ + + mock_resp = MagicMock() + mock_resp.text = linkedin_html + mock_resp.raise_for_status = MagicMock() + + with patch("scripts.scrape_url.requests.get", return_value=mock_resp): + from scripts.scrape_url import scrape_job_url + result = scrape_job_url(db, job_id) + + assert result.get("title") == "Customer Success Manager" + assert result.get("company") == "Acme Corp" + assert "CSM role" in result.get("description", "") + + import sqlite3 + conn = sqlite3.connect(db) + conn.row_factory = sqlite3.Row + row = dict(conn.execute("SELECT * FROM jobs WHERE id=?", (job_id,)).fetchone()) + conn.close() + assert row["title"] == "Customer Success Manager" + assert row["company"] == "Acme Corp" + + +def test_scrape_url_generic_json_ld(tmp_path): + db, job_id = _make_db(tmp_path, url="https://jobs.example.com/post/42") + + json_ld_html = """ + + """ + + mock_resp = MagicMock() + mock_resp.text = json_ld_html + mock_resp.raise_for_status = MagicMock() + + with patch("scripts.scrape_url.requests.get", return_value=mock_resp): + from scripts.scrape_url import scrape_job_url + result = scrape_job_url(db, job_id) + + assert result.get("title") == "TAM Role" + assert result.get("company") == "TechCo" + + +def test_scrape_url_graceful_on_http_error(tmp_path): + db, job_id = _make_db(tmp_path) + import requests as req + + with patch("scripts.scrape_url.requests.get", side_effect=req.RequestException("timeout")): + from scripts.scrape_url import scrape_job_url + result = scrape_job_url(db, job_id) + + # Should return empty dict and not raise; job row still exists + assert isinstance(result, dict) + import sqlite3 + conn = sqlite3.connect(db) + row = conn.execute("SELECT id FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + assert row is not None diff --git a/tests/test_sync.py b/tests/test_sync.py new file mode 100644 index 0000000..21c3eea --- /dev/null +++ b/tests/test_sync.py @@ -0,0 +1,88 @@ +# tests/test_sync.py +import pytest +from unittest.mock import patch, 
MagicMock +from pathlib import Path + + +SAMPLE_FM = { + "title_field": "Salary", "job_title": "Job Title", "company": "Company Name", + "url": "Role Link", "source": "Job Source", "status": "Status of Application", + "status_new": "Application Submitted", "date_found": "Date Found", + "remote": "Remote", "match_score": "Match Score", + "keyword_gaps": "Keyword Gaps", "notes": "Notes", "job_description": "Job Description", +} + +SAMPLE_NOTION_CFG = {"token": "secret_test", "database_id": "fake-db-id", "field_map": SAMPLE_FM} + + +def test_sync_pushes_approved_jobs(tmp_path): + """sync_to_notion pushes approved jobs and marks them synced.""" + from scripts.sync import sync_to_notion + from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status + + db_path = tmp_path / "test.db" + init_db(db_path) + row_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": "https://example.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "$100k", "description": "Good role", "date_found": "2026-02-20", + }) + update_job_status(db_path, [row_id], "approved") + + mock_notion = MagicMock() + mock_notion.pages.create.return_value = {"id": "notion-page-abc"} + + with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \ + patch("scripts.sync.Client", return_value=mock_notion): + count = sync_to_notion(db_path=db_path) + + assert count == 1 + mock_notion.pages.create.assert_called_once() + synced = get_jobs_by_status(db_path, "synced") + assert len(synced) == 1 + + +def test_sync_falls_back_to_core_fields_on_validation_error(tmp_path): + """When Notion returns a validation_error (missing column), sync retries without optional fields.""" + from scripts.sync import sync_to_notion + from scripts.db import init_db, insert_job, get_jobs_by_status, update_job_status + + db_path = tmp_path / "test.db" + init_db(db_path) + row_id = insert_job(db_path, { + "title": "CSM", "company": "Acme", "url": 
"https://example.com/2", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "", "date_found": "2026-02-20", + }) + update_job_status(db_path, [row_id], "approved") + + mock_notion = MagicMock() + # First call raises validation_error; second call (fallback) succeeds + mock_notion.pages.create.side_effect = [ + Exception("validation_error: Could not find property with name: Match Score"), + {"id": "notion-page-fallback"}, + ] + + with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \ + patch("scripts.sync.Client", return_value=mock_notion): + count = sync_to_notion(db_path=db_path) + + assert count == 1 + assert mock_notion.pages.create.call_count == 2 + synced = get_jobs_by_status(db_path, "synced") + assert len(synced) == 1 + + +def test_sync_returns_zero_when_nothing_approved(tmp_path): + """sync_to_notion returns 0 when there are no approved jobs.""" + from scripts.sync import sync_to_notion + from scripts.db import init_db + + db_path = tmp_path / "test.db" + init_db(db_path) + + with patch("scripts.sync.load_notion_config", return_value=SAMPLE_NOTION_CFG), \ + patch("scripts.sync.Client"): + count = sync_to_notion(db_path=db_path) + + assert count == 0 diff --git a/tests/test_task_runner.py b/tests/test_task_runner.py new file mode 100644 index 0000000..3ea5090 --- /dev/null +++ b/tests/test_task_runner.py @@ -0,0 +1,210 @@ +import threading +import time +import pytest +from pathlib import Path +from unittest.mock import patch +import sqlite3 + + +def _make_db(tmp_path): + from scripts.db import init_db, insert_job + db = tmp_path / "test.db" + init_db(db) + job_id = insert_job(db, { + "title": "CSM", "company": "Acme", "url": "https://ex.com/1", + "source": "linkedin", "location": "Remote", "is_remote": True, + "salary": "", "description": "Great role.", "date_found": "2026-02-20", + }) + return db, job_id + + +def test_submit_task_returns_id_and_true(tmp_path): + """submit_task returns 
(task_id, True) and spawns a thread.""" + db, job_id = _make_db(tmp_path) + with patch("scripts.task_runner._run_task"): # don't actually call LLM + from scripts.task_runner import submit_task + task_id, is_new = submit_task(db, "cover_letter", job_id) + assert isinstance(task_id, int) and task_id > 0 + assert is_new is True + + +def test_submit_task_deduplicates(tmp_path): + """submit_task returns (existing_id, False) for a duplicate in-flight task.""" + db, job_id = _make_db(tmp_path) + with patch("scripts.task_runner._run_task"): + from scripts.task_runner import submit_task + first_id, _ = submit_task(db, "cover_letter", job_id) + second_id, is_new = submit_task(db, "cover_letter", job_id) + assert second_id == first_id + assert is_new is False + + +def test_run_task_cover_letter_success(tmp_path): + """_run_task marks running→completed and saves cover letter to DB.""" + db, job_id = _make_db(tmp_path) + from scripts.db import insert_task, get_task_for_job + task_id, _ = insert_task(db, "cover_letter", job_id) + + with patch("scripts.generate_cover_letter.generate", return_value="Dear Hiring Manager,\nGreat fit!"): + from scripts.task_runner import _run_task + _run_task(db, task_id, "cover_letter", job_id) + + task = get_task_for_job(db, "cover_letter", job_id) + assert task["status"] == "completed" + assert task["error"] is None + + conn = sqlite3.connect(db) + row = conn.execute("SELECT cover_letter FROM jobs WHERE id=?", (job_id,)).fetchone() + conn.close() + assert row[0] == "Dear Hiring Manager,\nGreat fit!" 
def test_run_task_company_research_success(tmp_path):
    """_run_task marks running→completed and saves research to DB."""
    db, job_id = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job, get_research

    task_id, _ = insert_task(db, "company_research", job_id)
    fake_result = {
        "raw_output": "raw", "company_brief": "brief",
        "ceo_brief": "ceo", "talking_points": "points",
    }
    with patch("scripts.company_research.research_company", return_value=fake_result):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "company_research", job_id)

    task = get_task_for_job(db, "company_research", job_id)
    assert task["status"] == "completed"

    research = get_research(db, job_id=job_id)
    assert research["company_brief"] == "brief"


def test_run_task_marks_failed_on_exception(tmp_path):
    """_run_task marks status=failed and stores error when generator raises."""
    db, job_id = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job

    task_id, _ = insert_task(db, "cover_letter", job_id)
    with patch("scripts.generate_cover_letter.generate",
               side_effect=RuntimeError("LLM timeout")):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "cover_letter", job_id)

    task = get_task_for_job(db, "cover_letter", job_id)
    assert task["status"] == "failed"
    assert "LLM timeout" in task["error"]


def test_run_task_discovery_success(tmp_path):
    """_run_task with task_type=discovery calls run_discovery and stores count in error field."""
    from scripts.db import init_db, insert_task, get_task_for_job

    db = tmp_path / "test.db"
    init_db(db)
    # Discovery is not tied to a job row; job_id 0 is the sentinel.
    task_id, _ = insert_task(db, "discovery", 0)

    with patch("scripts.discover.run_discovery", return_value=7):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "discovery", 0)

    task = get_task_for_job(db, "discovery", 0)
    assert task["status"] == "completed"
    # NOTE: informational tasks reuse the error column for their summary text.
    assert "7 new listings" in task["error"]


def test_run_task_email_sync_success(tmp_path):
    """email_sync task calls sync_all and marks completed with summary."""
    db, _ = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job

    task_id, _ = insert_task(db, "email_sync", 0)
    summary = {"synced": 3, "inbound": 5, "outbound": 2, "new_leads": 1, "errors": []}
    with patch("scripts.imap_sync.sync_all", return_value=summary):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "email_sync", 0)

    task = get_task_for_job(db, "email_sync", 0)
    assert task["status"] == "completed"
    assert "3 jobs" in task["error"]


def test_run_task_email_sync_file_not_found(tmp_path):
    """email_sync marks failed with helpful message when config is missing."""
    db, _ = _make_db(tmp_path)
    from scripts.db import insert_task, get_task_for_job

    task_id, _ = insert_task(db, "email_sync", 0)
    with patch("scripts.imap_sync.sync_all",
               side_effect=FileNotFoundError("config/email.yaml")):
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "email_sync", 0)

    task = get_task_for_job(db, "email_sync", 0)
    assert task["status"] == "failed"
    assert "email" in task["error"].lower()


def test_submit_task_actually_completes(tmp_path):
    """Integration: submit_task spawns a thread that completes asynchronously."""
    db, job_id = _make_db(tmp_path)
    from scripts.db import get_task_for_job

    with patch("scripts.generate_cover_letter.generate", return_value="Cover letter text"):
        from scripts.task_runner import submit_task
        task_id, _ = submit_task(db, "cover_letter", job_id)
        # Poll with an explicit deadline (5 s budget, 0.1 s step) rather than
        # a fixed iteration count, so slow sleeps can't silently shorten it.
        deadline = time.monotonic() + 5.0
        while time.monotonic() < deadline:
            task = get_task_for_job(db, "cover_letter", job_id)
            if task and task["status"] in ("completed", "failed"):
                break
            time.sleep(0.1)

    task = get_task_for_job(db, "cover_letter", job_id)
    assert task["status"] == "completed"


def test_run_task_enrich_craigslist_success(tmp_path):
    """enrich_craigslist task calls enrich_craigslist_fields and marks completed."""
    from scripts.db import init_db, insert_job, insert_task, get_task_for_job

    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/9.html",
        "source": "craigslist", "location": "", "description": "Join Acme Corp. Pay: $100k.",
        "date_found": "2026-02-24",
    })
    task_id, _ = insert_task(db, "enrich_craigslist", job_id)

    with patch("scripts.enrich_descriptions.enrich_craigslist_fields",
               return_value={"company": "Acme Corp", "salary": "$100k"}) as mock_enrich:
        from scripts.task_runner import _run_task
        _run_task(db, task_id, "enrich_craigslist", job_id)

    mock_enrich.assert_called_once_with(db, job_id)
    task = get_task_for_job(db, "enrich_craigslist", job_id)
    assert task["status"] == "completed"


def test_scrape_url_submits_enrich_craigslist_for_craigslist_job(tmp_path):
    """After scrape_url completes for a craigslist job with empty company, enrich_craigslist is queued."""
    from scripts.db import init_db, insert_job, insert_task

    db = tmp_path / "test.db"
    init_db(db)
    job_id = insert_job(db, {
        "title": "CSM", "company": "", "url": "https://sfbay.craigslist.org/jjj/d/10.html",
        "source": "craigslist", "location": "", "description": "",
        "date_found": "2026-02-24",
    })
    task_id, _ = insert_task(db, "scrape_url", job_id)

    with patch("scripts.scrape_url.scrape_job_url", return_value={"title": "CSM", "company": ""}):
        # A bare MagicMock records calls without calling through, so no
        # follow-up thread is spawned. (Previously passed wraps=None, which
        # is already the default and did nothing.)
        with patch("scripts.task_runner.submit_task") as mock_submit:
            mock_submit.return_value = (99, True)
            from scripts.task_runner import _run_task
            _run_task(db, task_id, "scrape_url", job_id)

    # submit_task should have been called with enrich_craigslist for this job.
    assert mock_submit.called
    call_args = mock_submit.call_args
    assert call_args[0][1] == "enrich_craigslist"
    assert call_args[0][2] == job_id